# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations

__all__ = (
    "Butler",
    "ButlerValidationError",
    "PruneCollectionsArgsError",
    "PurgeWithoutUnstorePruneCollectionsError",
    "RunWithoutPurgePruneCollectionsError",
    "PurgeUnsupportedPruneCollectionsError",
)

from collections import defaultdict
import contextlib
import logging
import numbers
import os
from typing import (
    Any,
    ClassVar,
    Counter,
    Dict,
    Iterable,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    TextIO,
    Tuple,
    Type,
    Union,
)

try:
    import boto3
except ImportError:
    boto3 = None

from lsst.utils import doImport
from .core import (
    AmbiguousDatasetError,
    ButlerURI,
    Config,
    ConfigSubset,
    DataCoordinate,
    DataId,
    DataIdValue,
    DatasetRef,
    DatasetType,
    Datastore,
    Dimension,
    DimensionConfig,
    FileDataset,
    Progress,
    StorageClassFactory,
    Timespan,
    ValidationError,
)
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.utils import transactional, getClassOf
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._butlerConfig import ButlerConfig
from .registry import Registry, RegistryConfig, RegistryDefaults, CollectionType, ConflictingDefinitionError
from .registry.wildcards import CollectionSearch
from .transfers import RepoExportContext

log = logging.getLogger(__name__)


class ButlerValidationError(ValidationError):
    """There is a problem with the Butler configuration."""
    pass


class PruneCollectionsArgsError(TypeError):
    """Base class for errors relating to Butler.pruneCollections input
    arguments.
    """
    pass


class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when purge and unstore are both required to be True, and
    purge is True but unstore is False.
    """

    def __init__(self) -> None:
        super().__init__("Cannot pass purge=True without unstore=True.")


class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when pruning a RUN collection but purge is False."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")


class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
    """Raised when purge is True but is not supported for the given
    collection."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(
            f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")


class Butler:
    """Main entry point for the data access system.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration. Anything acceptable to the `ButlerConfig` constructor.
        If a directory path is given the configuration will be read from a
        ``butler.yaml`` file in that location.  If `None` is given default
        values will be used.
    butler : `Butler`, optional
        If provided, construct a new Butler that uses the same registry and
        datastore as the given one, but with the given collection and run.
        Incompatible with the ``config``, ``searchPaths``, and ``writeable``
        arguments.
    collections : `str` or `Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order)
        when reading datasets.
        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.
        These collections are not registered automatically and must be
        manually registered before they are used by any method, but they may
        be manually registered after the `Butler` is initialized.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into.  If ``collections`` is `None` and ``run`` is not
        `None`, ``collections`` will be set to ``[run]``.  If not `None`, this
        collection will automatically be registered.  If this is not set (and
        ``writeable`` is not set either), a read-only butler will be created.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration.  Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations.  If not
        provided, a read-write butler is created if any of ``run``, ``tags``,
        or ``chains`` is non-empty.
    inferDefaults : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have
        the same value (or no value) for a governor dimension, that value
        will be the default for that dimension.  Nonexistent collections are
        ignored.  If a default value is provided explicitly for a governor
        dimension via ``**kwargs``, no default will be inferred for that
        dimension.
    **kwargs : `str`
        Default data ID key-value pairs.  These may only identify "governor"
        dimensions like ``instrument`` and ``skymap``.

    Examples
    --------
    While there are many ways to control exactly how a `Butler` interacts
    with the collections in its `Registry`, the most common cases are still
    simple.

    For a read-only `Butler` that searches one collection, do::

        butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])

    For a read-write `Butler` that writes to and reads from a
    `~CollectionType.RUN` collection::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")

    The `Butler` passed to a ``PipelineTask`` is often much more complex,
    because we want to write to one `~CollectionType.RUN` collection but read
    from several others (as well)::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
                        collections=["u/alice/DM-50000/a",
                                     "u/bob/DM-49998",
                                     "HSC/defaults"])

    This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
    Datasets will be read first from that run (since it appears first in the
    chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.

    Finally, one can always create a `Butler` with no collections::

        butler = Butler("/path/to/repo", writeable=True)

    This can be extremely useful when you just want to use ``butler.registry``,
    e.g. for inserting dimension data or managing collections, or when the
    collections you want to use with the butler are not consistent.
    Passing ``writeable`` explicitly here is only necessary if you want to be
    able to make changes to the repo - usually the value for ``writeable`` can
    be guessed from the collection arguments provided, but it defaults to
    `False` when there are no collection arguments.
    """

    def __init__(self, config: Union[Config, str, None] = None, *,
                 butler: Optional[Butler] = None,
                 collections: Any = None,
                 run: Optional[str] = None,
                 searchPaths: Optional[List[str]] = None,
                 writeable: Optional[bool] = None,
                 inferDefaults: bool = True,
                 **kwargs: str,
                 ):
        defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
        # Load registry, datastore, etc. from config or existing butler.
        if butler is not None:
            if config is not None or searchPaths is not None or writeable is not None:
                raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
                                "arguments with 'butler' argument.")
            self.registry = butler.registry.copy(defaults)
            self.datastore = butler.datastore
            self.storageClasses = butler.storageClasses
            self._config: ButlerConfig = butler._config
        else:
            self._config = ButlerConfig(config, searchPaths=searchPaths)
            if "root" in self._config:
                butlerRoot = self._config["root"]
            else:
                butlerRoot = self._config.configDir
            if writeable is None:
                writeable = run is not None
            self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
                                                defaults=defaults)
            self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
                                                  butlerRoot=butlerRoot)
            self.storageClasses = StorageClassFactory()
            self.storageClasses.addFromConfig(self._config)
        if "run" in self._config or "collection" in self._config:
            raise ValueError("Passing a run or collection via configuration is no longer supported.")

    GENERATION: ClassVar[int] = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

    @staticmethod
    def makeRepo(root: str, config: Union[Config, str, None] = None,
                 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
                 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
                 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `str` or `ButlerURI`
            Path or URI to the root location of the new repository.  Will be
            created if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options.  Can not
            be a `ButlerConfig` or a `ConfigSubset`.  If `None`, default
            configuration will be used.  Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        dimensionConfig : `Config` or `str`, optional
            Configuration for dimensions, will be used to initialize the
            registry database.
        standalone : `bool`
            If `True`, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config.  This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `str`, optional
            If not `None`, the output configuration will be written to this
            location rather than into the repository itself.  Can be a URI
            string.  Can refer to a directory that will be used to write
            ``butler.yaml``.
        overwrite : `bool`, optional
            Create a new configuration file even if one already exists
            in the specified output location.  Default is to raise
            an exception.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        FileExistsError
            Raised if the output config file already exists.
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
        """
        if isinstance(config, (ButlerConfig, ConfigSubset)):
            raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

        # Ensure that the root of the repository exists or can be made
        uri = ButlerURI(root, forceDirectory=True)
        uri.mkdir()

        config = Config(config)

        # If we are creating a new repo from scratch with relative roots,
        # do not propagate an explicit root from the config file
        if "root" in config:
            del config["root"]

        full = ButlerConfig(config, searchPaths=searchPaths)  # this applies defaults
        datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
        datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)

        # if key exists in given config, parse it, otherwise parse the defaults
        # in the expanded config
        if config.get(("registry", "db")):
            registryConfig = RegistryConfig(config)
        else:
            registryConfig = RegistryConfig(full)
        defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
        if defaultDatabaseUri is not None:
            Config.updateParameters(RegistryConfig, config, full,
                                    toUpdate={"db": defaultDatabaseUri},
                                    overwrite=forceConfigRoot)
        else:
            Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
                                    overwrite=forceConfigRoot)

        if standalone:
            config.merge(full)
        else:
            # Always expand the registry.managers section into the per-repo
            # config, because after the database schema is created, it's not
            # allowed to change anymore.  Note that in the standalone=True
            # branch, _everything_ in the config is expanded, so there's no
            # need to special case this.
            Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
        configURI: Union[str, ButlerURI]
        if outfile is not None:
            # When writing to a separate location we must include
            # the root of the butler repo in the config else it won't know
            # where to look.
            config["root"] = uri.geturl()
            configURI = outfile
        else:
            configURI = uri
        config.dumpToUri(configURI, overwrite=overwrite)

        # Create Registry and populate tables
        registryConfig = RegistryConfig(config.get("registry"))
        dimensionConfig = DimensionConfig(dimensionConfig)
        Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)

        return config
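
    # Illustrative usage sketch (not part of the original source): creating a
    # new repository and then opening it for writing.  The repository path and
    # run name below are hypothetical.
    #
    #     config = Butler.makeRepo("/path/to/new/repo")
    #     butler = Butler("/path/to/new/repo", run="u/alice/ingest")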

    @classmethod
    def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
                  defaultDataId: Dict[str, str], writeable: bool) -> Butler:
        """Callable used to unpickle a Butler.

        We prefer not to use ``Butler.__init__`` directly so we can force some
        of its many arguments to be keyword-only (note that ``__reduce__``
        can only invoke callables with positional arguments).

        Parameters
        ----------
        config : `ButlerConfig`
            Butler configuration, already coerced into a true `ButlerConfig`
            instance (and hence after any search paths for overrides have been
            utilized).
        collections : `CollectionSearch`
            Names of the default collections to read from.
        run : `str`, optional
            Name of the default `~CollectionType.RUN` collection to write to.
        defaultDataId : `dict` [ `str`, `str` ]
            Default data ID values.
        writeable : `bool`
            Whether the Butler should support write operations.

        Returns
        -------
        butler : `Butler`
            A new `Butler` instance.
        """
        # MyPy doesn't recognize that the kwargs below are totally valid; it
        # seems to think ``**defaultDataId`` is a _positional_ argument!
        return cls(config=config, collections=collections, run=run, writeable=writeable,
                   **defaultDataId)  # type: ignore

    def __reduce__(self) -> tuple:
        """Support pickling."""
        return (Butler._unpickle, (self._config, self.collections, self.run,
                                   self.registry.defaults.dataId.byName(),
                                   self.registry.isWriteable()))

    def __str__(self) -> str:
        return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
            self.collections, self.run, self.datastore, self.registry)

    def isWriteable(self) -> bool:
        """Return `True` if this `Butler` supports write operations."""
        return self.registry.isWriteable()

    @contextlib.contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
        """
        with self.registry.transaction():
            with self.datastore.transaction():
                yield
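
    # Illustrative usage sketch (not part of the original source): grouping
    # several writes so they are committed or rolled back together.  The
    # dataset type names and data ID keys below are hypothetical.
    #
    #     with butler.transaction():
    #         butler.put(bias, "bias", instrument="HSC", detector=0)
    #         butler.put(dark, "dark", instrument="HSC", detector=0)
    #
    # If the second put raises, the registry and datastore changes made by the
    # first are rolled back as well.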

    def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                         dataId: Optional[DataId] = None, **kwds: Any
                         ) -> Tuple[DatasetType, Optional[DataId]]:
        """Standardize the arguments passed to several Butler APIs.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
            should be provided as the second argument.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`.  See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        datasetType : `DatasetType`
            A `DatasetType` instance extracted from ``datasetRefOrType``.
        dataId : `dict` or `DataId`, optional
            Argument that can be used (along with ``kwds``) to construct a
            `DataId`.

        Notes
        -----
        Butler APIs that conceptually need a DatasetRef also allow passing a
        `DatasetType` (or the name of one) and a `DataId` (or a dict and
        keyword arguments that can be used to construct one) separately.  This
        method accepts those arguments and always returns a true `DatasetType`
        and a `DataId` or `dict`.

        Standardization of `dict` vs `DataId` is best handled by passing the
        returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
        generally similarly flexible.
        """
        externalDatasetType: Optional[DatasetType] = None
        internalDatasetType: Optional[DatasetType] = None
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None or kwds:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            externalDatasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
        else:
            # Don't check whether DataId is provided, because Registry APIs
            # can usually construct a better error message when it wasn't.
            if isinstance(datasetRefOrType, DatasetType):
                externalDatasetType = datasetRefOrType
            else:
                internalDatasetType = self.registry.getDatasetType(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
                                 f"registry definition ({internalDatasetType})")

        assert internalDatasetType is not None
        return internalDatasetType, dataId

    def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                        dataId: Optional[DataId] = None, *,
                        collections: Any = None,
                        allowUnresolved: bool = False,
                        **kwds: Any) -> DatasetRef:
        """Shared logic for methods that start with a search for a dataset in
        the registry.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        allowUnresolved : `bool`, optional
            If `True`, return an unresolved `DatasetRef` if finding a resolved
            one in the `Registry` fails.  Defaults to `False`.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`.  See `DataId` parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset identified by the given arguments.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef):
            idNumber = datasetRefOrType.id
        else:
            idNumber = None
        timespan: Optional[Timespan] = None

        # Process dimension records that are using record information
        # rather than ids
        newDataId: Dict[str, DataIdValue] = {}
        byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)

        # if all the dataId comes from keyword parameters we do not need
        # to do anything here because they can't be of the form
        # exposure.obs_id because a "." is not allowed in a keyword parameter.
        if dataId:
            for k, v in dataId.items():
                # If we have a Dimension we do not need to do anything
                # because it cannot be a compound key.
                if isinstance(k, str) and "." in k:
                    # Someone is using a more human-readable dataId
                    dimensionName, record = k.split(".", 1)
                    byRecord[dimensionName][record] = v
                elif isinstance(k, Dimension):
                    newDataId[k.name] = v
                else:
                    newDataId[k] = v

        # Go through the updated dataId and check the type in case someone is
        # using an alternate key.  We have already filtered out the compound
        # keys dimensions.record format.
        not_dimensions = {}

        # Will need to look in the dataId and the keyword arguments
        # and will remove them if they need to be fixed or are unrecognized.
        for dataIdDict in (newDataId, kwds):
            # Use a list so we can adjust the dict safely in the loop
            for dimensionName in list(dataIdDict):
                value = dataIdDict[dimensionName]
                try:
                    dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                except KeyError:
                    # This is not a real dimension
                    not_dimensions[dimensionName] = value
                    del dataIdDict[dimensionName]
                    continue

                # Convert an integral type to an explicit int to simplify
                # comparisons here
                if isinstance(value, numbers.Integral):
                    value = int(value)

                if not isinstance(value, dimension.primaryKey.getPythonType()):
                    for alternate in dimension.alternateKeys:
                        if isinstance(value, alternate.getPythonType()):
                            byRecord[dimensionName][alternate.name] = value
                            del dataIdDict[dimensionName]
                            log.debug("Converting dimension %s to %s.%s=%s",
                                      dimensionName, dimensionName, alternate.name, value)
                            break
                    else:
                        log.warning("Type mismatch found for value '%r' provided for dimension %s. "
                                    "Could not find matching alternative (primary key has type %s) "
                                    "so attempting to use as-is.",
                                    value, dimensionName, dimension.primaryKey.getPythonType())

        # If we have some unrecognized dimensions we have to try to connect
        # them to records in other dimensions.  This is made more complicated
        # by some dimensions having records with clashing names.  A mitigation
        # is that we can tell by this point which dimensions are missing
        # for the DatasetType but this does not work for calibrations
        # where additional dimensions can be used to constrain the temporal
        # axis.
        if not_dimensions:
            # Calculate missing dimensions
            provided = set(newDataId) | set(kwds) | set(byRecord)
            missingDimensions = datasetType.dimensions.names - provided

            # For calibrations we may well be needing temporal dimensions
            # so rather than always including all dimensions in the scan
            # restrict things a little.  It is still possible for there
            # to be confusion over day_obs in visit vs exposure for example.
            # If we are not searching calibration collections things may
            # fail but they are going to fail anyway because of the
            # ambiguousness of the dataId...
            candidateDimensions: Set[str] = set()
            candidateDimensions.update(missingDimensions)
            if datasetType.isCalibration():
                for dim in self.registry.dimensions.getStaticDimensions():
                    if dim.temporal:
                        candidateDimensions.add(str(dim))

            # Look up table for the first association with a dimension
            guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)

            # Keep track of whether an item is associated with multiple
            # dimensions.
            counter: Counter[str] = Counter()
            assigned: Dict[str, Set[str]] = defaultdict(set)

            # Go through the missing dimensions and associate the
            # given names with records within those dimensions
            for dimensionName in candidateDimensions:
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                fields = dimension.metadata.names | dimension.uniqueKeys.names
                for field in not_dimensions:
                    if field in fields:
                        guessedAssociation[dimensionName][field] = not_dimensions[field]
                        counter[dimensionName] += 1
                        assigned[field].add(dimensionName)

            # There is a chance we have allocated a single dataId item
            # to multiple dimensions.  Need to decide which should be retained.
            # For now assume that the most popular alternative wins.
            # This means that day_obs with seq_num will result in
            # exposure.day_obs and not visit.day_obs
            # Also prefer an explicitly missing dimension over an inferred
            # temporal dimension.
            for fieldName, assignedDimensions in assigned.items():
                if len(assignedDimensions) > 1:
                    # Pick the most popular (preferring mandatory dimensions)
                    requiredButMissing = assignedDimensions.intersection(missingDimensions)
                    if requiredButMissing:
                        candidateDimensions = requiredButMissing
                    else:
                        candidateDimensions = assignedDimensions

                    # Select the relevant items and get a new restricted
                    # counter.
                    theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
                    duplicatesCounter: Counter[str] = Counter()
                    duplicatesCounter.update(theseCounts)

                    # Choose the most common.  If they are equally common
                    # we will pick the one that was found first.
                    # Returns a list of tuples
                    selected = duplicatesCounter.most_common(1)[0][0]

                    log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
                              " Removed ambiguity by choosing dimension %s.",
                              fieldName, ", ".join(assignedDimensions), selected)

                    for candidateDimension in assignedDimensions:
                        if candidateDimension != selected:
                            del guessedAssociation[candidateDimension][fieldName]

            # Update the record look up dict with the new associations
            for dimensionName, values in guessedAssociation.items():
                if values:  # A dict might now be empty
                    log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
                              dimensionName, values)
                    byRecord[dimensionName].update(values)

        if byRecord:
            # Some record specifiers were found so we need to convert
            # them to the Id form
            for dimensionName, values in byRecord.items():
                if dimensionName in newDataId:
                    log.warning("DataId specified explicit %s dimension value of %s in addition to"
                                " general record specifiers for it of %s.  Ignoring record information.",
                                dimensionName, newDataId[dimensionName], str(values))
                    continue

                # Build up a WHERE expression -- use single quotes
                def quote(s: Any) -> str:
                    if isinstance(s, str):
                        return f"'{s}'"
                    else:
                        return s

                where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
                                     for k, v in values.items())

                # Hopefully we get a single record that matches
                records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
                                                                  where=where, **kwds))

                if len(records) != 1:
                    if len(records) > 1:
                        log.debug("Received %d records from constraints of %s", len(records), str(values))
                        for r in records:
                            log.debug("- %s", str(r))
                        raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
                                           f" uniquely constrained to a single dataset by {values}."
                                           f" Got {len(records)} results.")
                    raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
                                       f" records when constrained by {values}")

                # Get the primary key from the real dimension object
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                if not isinstance(dimension, Dimension):
                    raise RuntimeError(
                        f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
                    )
                newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)

            # We have modified the dataId so need to switch to it
            dataId = newDataId

        if datasetType.isCalibration():
            # Because this is a calibration dataset, first try to standardize
            # the data ID without restricting the dimensions to those of the
            # dataset type requested, because there may be extra dimensions
            # that provide temporal information for a validity-range lookup.
            dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
            if dataId.graph.temporal:
                dataId = self.registry.expandDataId(dataId)
                timespan = dataId.timespan
        else:
            # Standardize the data ID to just the dimensions of the dataset
            # type instead of letting registry.findDataset do it, so we get the
            # result even if no dataset is found.
            dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
        if ref is None:
            if allowUnresolved:
                return DatasetRef(datasetType, dataId)
            else:
                if collections is None:
                    collections = self.registry.defaults.collections
                raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
                                  f"could not be found in collections {collections}.")
        if idNumber is not None and idNumber != ref.id:
            if collections is None:
                collections = self.registry.defaults.collections
            raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
                             f"id ({ref.id}) in registry in collections {collections}.")
        return ref

    @transactional
    def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            run: Optional[str] = None,
            **kwds: Any) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
            should be provided as the second argument.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`.  See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
        """
        log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
            raise ValueError("DatasetRef must not be in registry, must have None id")

        # Add Registry Dataset entry.
        dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
        ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])

        # Add Datastore entry.
        self.datastore.put(obj, ref)

        return ref
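
    # Illustrative usage sketch (not part of the original source): a typical
    # put call.  The dataset type name and data ID keys are hypothetical.
    #
    #     butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    #     ref = butler.put(exposure, "calexp", instrument="HSC", visit=903334,
    #                      detector=42)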

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
        """Retrieve a stored dataset.

        Unlike `Butler.get`, this method allows datasets outside the Butler's
        collection to be read as long as the `DatasetRef` that identifies them
        can be obtained separately.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.
        """
        return self.datastore.get(ref, parameters=parameters)

    def getDirectDeferred(self, ref: DatasetRef, *,
                          parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
        from a resolved `DatasetRef`.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id is None``, i.e. the reference is unresolved.
        """
        if ref.id is None:
            raise AmbiguousDatasetError(
                f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
            )
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)

    def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                    dataId: Optional[DataId] = None, *,
                    parameters: Union[dict, None] = None,
                    collections: Any = None,
                    **kwds: Any) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
        after an immediate registry lookup.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`.  See `DataId` parameters.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
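
    # Illustrative usage sketch (not part of the original source): deferring
    # the actual read until the handle is used, e.g. when only some datasets
    # in a loop end up being needed.  The dataset type name and data ID are
    # hypothetical, and the eventual read assumes `DeferredDatasetHandle.get`.
    #
    #     handle = butler.getDeferred("calexp", instrument="HSC", visit=903334,
    #                                 detector=42)
    #     image = handle.get()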

    def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            parameters: Optional[Dict[str, Any]] = None,
            collections: Any = None,
            **kwds: Any) -> Any:
        """Retrieve a stored dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`.  See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        TypeError
            Raised if no collections were provided.

        Notes
        -----
        When looking up datasets in a `~CollectionType.CALIBRATION` collection,
        this method requires that the given data ID include temporal dimensions
        beyond the dimensions of the dataset type itself, in order to find the
        dataset with the appropriate validity range.  For example, a "bias"
        dataset with native dimensions ``{instrument, detector}`` could be
        fetched with a ``{instrument, detector, exposure}`` data ID, because
        ``exposure`` is a temporal dimension.
        """
        log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.getDirect(ref, parameters=parameters)
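
    # Illustrative usage sketch (not part of the original source): reading a
    # regular dataset and a calibration dataset.  Dataset type names, data ID
    # values, and collection names are hypothetical; the "bias" lookup relies
    # on the temporal-dimension behaviour described in the Notes above.
    #
    #     calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=42)
    #     bias = butler.get("bias", instrument="HSC", detector=42, exposure=903334,
    #                       collections="HSC/calib")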

    def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                dataId: Optional[DataId] = None, *,
                predict: bool = False,
                collections: Any = None,
                run: Optional[str] = None,
                **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
        """Return the URIs associated with the dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`.  See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        primary : `ButlerURI`
            The URI to the primary artifact associated with this dataset.
            If the dataset was disassembled within the datastore this
            may be `None`.
        components : `dict`
            URIs to any components associated with the dataset artifact.
            Can be empty if there are no components.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
                                   collections=collections, **kwds)
        if ref.id is None:  # only possible if predict is True
            if run is None:
                run = self.run
                if run is None:
                    raise TypeError("Cannot predict location with run=None.")
            # Lie about ID, because we can't guess it, and only
            # Datastore.getURIs() will ever see it (and it doesn't use it).
            ref = ref.resolved(id=0, run=run)
        return self.datastore.getURIs(ref, predict)

    def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
               dataId: Optional[DataId] = None, *,
               predict: bool = False,
               collections: Any = None,
               run: Optional[str] = None,
               **kwds: Any) -> ButlerURI:
        """Return the URI to the Dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`.  See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        uri : `ButlerURI`
            URI pointing to the Dataset within the datastore.  If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive.  The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        LookupError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        RuntimeError
            Raised if a URI is requested for a dataset that consists of
            multiple artifacts.
        """
        primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
                                           collections=collections, run=run, **kwds)

        if primary is None or components:
            raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
                               "Use Butler.getURIs() instead.")
        return primary
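
    # Illustrative usage sketch (not part of the original source): obtaining
    # the URI of an existing dataset, or a predicted URI for one that has not
    # been written yet.  Names and data ID values are hypothetical.
    #
    #     uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42)
    #     future = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42,
    #                            predict=True, run="u/alice/DM-50000/a")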

    def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                      dataId: Optional[DataId] = None, *,
                      collections: Any = None,
                      **kwds: Any) -> bool:
        """Return True if the Dataset is actually present in the Datastore.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection.  When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`.  See `DataCoordinate.standardize`
            parameters.

        Raises
        ------
        LookupError
            Raised if the dataset is not even present in the Registry.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.datastore.exists(ref)

    def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
        """Remove one or more `~CollectionType.RUN` collections and the
        datasets within them.

        Parameters
        ----------
        names : `Iterable` [ `str` ]
            The names of the collections to remove.
        unstore : `bool`, optional
            If `True` (default), delete datasets from all datastores in which
            they are present, and attempt to rollback the registry deletions
            if datastore deletions fail (which may not always be possible).
            If `False`, datastore records for these datasets are still
            removed, but any artifacts (e.g. files) will not be.

        Raises
        ------
        TypeError
            Raised if one or more collections are not of type
            `~CollectionType.RUN`.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        names = list(names)
        refs: List[DatasetRef] = []
        for name in names:
            collectionType = self.registry.getCollectionType(name)
            if collectionType is not CollectionType.RUN:
                raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
            refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
        with self.registry.transaction():
            if unstore:
                for ref in refs:
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            else:
                self.datastore.forget(refs)
            for name in names:
                self.registry.removeCollection(name)
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
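
    # Illustrative usage sketch (not part of the original source): deleting
    # two RUN collections along with their stored artifacts.  The collection
    # names are hypothetical.
    #
    #     butler.removeRuns(["u/alice/scratch/run1", "u/alice/scratch/run2"],
    #                       unstore=True)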

    def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
                        unlink: Optional[List[str]] = None) -> None:
        """Remove a collection and possibly prune datasets within it.

        Parameters
        ----------
        name : `str`
            Name of the collection to remove.  If this is a
            `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
            datasets within the collection are not modified unless ``unstore``
            is `True`.  If this is a `~CollectionType.RUN` collection,
            ``purge`` and ``unstore`` must be `True`, and all datasets in it
            are fully removed from the data repository.
        purge : `bool`, optional
            If `True`, permit `~CollectionType.RUN` collections to be removed,
            fully removing datasets within them.  Requires ``unstore=True`` as
            well as an added precaution against accidental deletion.  Must be
            `False` (default) if the collection is not a ``RUN``.
        unstore : `bool`, optional
            If `True`, remove all datasets in the collection from all
            datastores in which they appear.
        unlink : `list` [ `str` ], optional
            Before removing the given collection, unlink it from these
            parent collections.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or arguments are mutually
            inconsistent.
        """
        # See pruneDatasets comments for more information about the logic here;
        # the cases are almost the same, but here we can rely on Registry to
        # take care of everything but Datastore deletion when we remove the
        # collection.
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        collectionType = self.registry.getCollectionType(name)
        if purge and not unstore:
            raise PurgeWithoutUnstorePruneCollectionsError()
        if collectionType is CollectionType.RUN and not purge:
            raise RunWithoutPurgePruneCollectionsError(collectionType)
        if collectionType is not CollectionType.RUN and purge:
            raise PurgeUnsupportedPruneCollectionsError(collectionType)

        def remove(child: str, parent: str) -> None:
            """Remove a child collection from a parent collection."""
            # Remove child from parent.
            chain = list(self.registry.getCollectionChain(parent))
            try:
                chain.remove(name)
            except ValueError as e:
                raise RuntimeError(f"{name} is not a child of {parent}") from e
            self.registry.setCollectionChain(parent, chain)

        with self.registry.transaction():
            if unlink:
                for parent in unlink:
                    remove(name, parent)
            if unstore:
                for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            self.registry.removeCollection(name)
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
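
    # Illustrative usage sketch (not part of the original source): fully
    # deleting a RUN collection, versus dropping a TAGGED collection without
    # touching its datasets.  Collection names are hypothetical.
    #
    #     butler.pruneCollection("u/alice/DM-50000/a", purge=True, unstore=True)
    #     butler.pruneCollection("u/alice/my-tagged-selection")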

    def pruneDatasets(self, refs: Iterable[DatasetRef], *,
                      disassociate: bool = True,
                      unstore: bool = False,
                      tags: Iterable[str] = (),
                      purge: bool = False,
                      run: Optional[str] = None) -> None:
        """Remove one or more datasets from a collection and/or storage.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to prune.  These must be "resolved" references (not just
            a `DatasetType` and data ID).
        disassociate : `bool`, optional
            Disassociate pruned datasets from ``tags``, or from all collections
            if ``purge=True``.
        unstore : `bool`, optional
            If `True` (`False` is default) remove these datasets from all
            datastores known to this butler.  Note that this will make it
            impossible to retrieve these datasets even via other collections.
            Datasets that are already not stored are ignored by this option.
        tags : `Iterable` [ `str` ], optional
            `~CollectionType.TAGGED` collections to disassociate the datasets
            from.  Ignored if ``disassociate`` is `False` or ``purge`` is
            `True`.
        purge : `bool`, optional
            If `True` (`False` is default), completely remove the dataset from
            the `Registry`.  To prevent accidental deletions, ``purge`` may
            only be `True` if all of the following conditions are met:

            - All given datasets are in the given run.
            - ``disassociate`` is `True`;
            - ``unstore`` is `True`.

            This mode may remove provenance information from datasets other
            than those provided, and should be used with extreme care.

        Raises
        ------
        TypeError
            Raised if the butler is read-only, if no collection was provided,
            or the conditions for ``purge=True`` were not met.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            tags = tuple(tags)
            if not tags:
                raise TypeError("No tags provided but disassociate=True.")
            for tag in tags:
                collectionType = self.registry.getCollectionType(tag)
                if collectionType is not CollectionType.TAGGED:
                    raise TypeError(f"Cannot disassociate from collection '{tag}' "
                                    f"of non-TAGGED type {collectionType.name}.")
        # Transform possibly-single-pass iterable into something we can iterate
        # over multiple times.
        refs = list(refs)
        # Pruning a component of a DatasetRef makes no sense since registry
        # doesn't know about components and datastore might not store
        # components in a separate file
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
        # We don't need an unreliable Datastore transaction for this, because
        # we've been extra careful to ensure that Datastore.trash only involves
        # mutating the Registry (it can _look_ at Datastore-specific things,
        # but shouldn't change them), and hence all operations here are
        # Registry operations.
        with self.registry.transaction():
            if unstore:
                for ref in refs:
                    # There is a difference between a concrete composite
                    # and virtual composite.  In a virtual composite the
                    # datastore is never given the top level DatasetRef.  In
                    # the concrete composite the datastore knows all the
                    # refs and will clean up itself if asked to remove the
                    # parent ref.  We can not check configuration for this
                    # since we can not trust that the configuration is the
                    # same.  We therefore have to ask if the ref exists or
                    # not.  This is consistent with the fact that we want
                    # to ignore already-removed-from-datastore datasets
                    # anyway.
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            if purge:
                self.registry.removeDatasets(refs)
            elif disassociate:
                assert tags, "Guaranteed by earlier logic in this function."
                for tag in tags:
                    self.registry.disassociate(tag, refs)
        # We've exited the Registry transaction, and apparently committed.
        # (if there was an exception, everything rolled back, and it's as if
        # nothing happened - and we never get here).
        # Datastore artifacts are not yet gone, but they're clearly marked
        # as trash, so if we fail to delete now because of (e.g.) filesystem
        # problems we can try again later, and if manual administrative
        # intervention is required, it's pretty clear what that should entail:
        # deleting everything on disk and in private Datastore tables that is
        # in the dataset_location_trash table.
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
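
    # Illustrative usage sketch (not part of the original source): removing a
    # query result from a TAGGED collection versus fully purging it from a
    # run.  Collection and dataset type names are hypothetical, and the
    # registry query shown assumes `Registry.queryDatasets`.
    #
    #     refs = list(butler.registry.queryDatasets("calexp",
    #                                               collections="u/alice/tagged"))
    #     butler.pruneDatasets(refs, disassociate=True, tags=["u/alice/tagged"])
    #     butler.pruneDatasets(refs, purge=True, unstore=True,
    #                          run="u/alice/DM-50000/a")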
1384 @transactional
1385 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1386 ) -> None:
1387 """Store and register one or more datasets that already exist on disk.
1389 Parameters
1390 ----------
1391 datasets : `FileDataset`
1392 Each positional argument is a struct containing information about
1393 a file to be ingested, including its path (either absolute or
1394 relative to the datastore root, if applicable), a `DatasetRef`,
1395 and optionally a formatter class or its fully-qualified string
1396 name. If a formatter is not provided, the formatter that would be
1397 used for `put` is assumed. On successful return, all
1398 `FileDataset.ref` attributes will have their `DatasetRef.id`
1399 attribute populated and all `FileDataset.formatter` attributes will
1400 be set to the formatter class used. `FileDataset.path` attributes
1401 may be modified to put paths in whatever the datastore considers a
1402 standardized form.
1403 transfer : `str`, optional
1404 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1405 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer
1406 the file.
1407 run : `str`, optional
1408 The name of the run ingested datasets should be added to,
1409 overriding ``self.run``.
1411 Raises
1412 ------
1413 TypeError
1414 Raised if the butler is read-only or if no run was provided.
1415 NotImplementedError
1416 Raised if the `Datastore` does not support the given transfer mode.
1417 DatasetTypeNotSupportedError
1418 Raised if one or more files to be ingested have a dataset type that
1419 is not supported by the `Datastore`..
1420 FileNotFoundError
1421 Raised if one of the given files does not exist.
1422 FileExistsError
1423 Raised if transfer is not `None` but the (internal) location the
1424 file would be moved to is already occupied.
1426 Notes
1427 -----
1428 This operation is not fully exception safe: if a database operation
1429 fails, the given `FileDataset` instances may be only partially updated.
1431 It is atomic in terms of database operations (they will either all
1432 succeed or all fail), provided that the database engine implements
1433 transactions correctly. It will attempt to be atomic in terms of
1434 filesystem operations as well, but this cannot be implemented
1435 rigorously for most datastores.
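Examples
--------
A minimal sketch, assuming a ``raw`` dataset type is already registered;
the file path, data ID, and run name here are hypothetical::

    from lsst.daf.butler import DatasetRef, FileDataset

    datasetType = butler.registry.getDatasetType("raw")
    dataId = {"instrument": "HSC", "exposure": 903334, "detector": 10}
    ref = DatasetRef(datasetType, dataId)
    dataset = FileDataset(path="/data/some_raw_file.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy", run="HSC/raw/all")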
1436 """
1437 if not self.isWriteable():
1438 raise TypeError("Butler is read-only.")
1439 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1440 # Reorganize the inputs so they're grouped by DatasetType and then
1441 # data ID. We also include a list of DatasetRefs for each FileDataset
1442 # to hold the resolved DatasetRefs returned by the Registry, before
1443 # it's safe to swap them into FileDataset.refs.
1444 # Some type annotation aliases to make that clearer:
1445 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1446 GroupedData = MutableMapping[DatasetType, GroupForType]
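# (So groupedData[datasetType][dataId] holds (fileDataset, resolvedRefs),
# as populated by the loop below.)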
1447 # The actual data structure:
1448 groupedData: GroupedData = defaultdict(dict)
1449 # And the nested loop that populates it:
1450 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1451 # This list is intentionally shared across the inner loop, since it's
1452 # associated with `dataset`.
1453 resolvedRefs: List[DatasetRef] = []
1454 for ref in dataset.refs:
1455 if ref.dataId in groupedData[ref.datasetType]:
1456 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same"
1457 " DataId as other ingest dataset"
1458 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1459 f" ({ref.dataId})")
1460 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1462 # Now we can bulk-insert into Registry for each DatasetType.
1463 allResolvedRefs: List[DatasetRef] = []
1464 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(),
1465 desc="Bulk-inserting datasets by type"):
1466 refs = self.registry.insertDatasets(
1467 datasetType,
1468 dataIds=groupForType.keys(),
1469 run=run,
1470 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1471 )
1472 # Append those resolved DatasetRefs to the new lists we set up for
1473 # them.
1474 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1475 resolvedRefs.append(ref)
1477 # Go back to the original FileDatasets to replace their refs with the
1478 # new resolved ones, and also build a big list of all refs.
1479 allResolvedRefs = []
1480 for groupForType in progress.iter_chunks(groupedData.values(),
1481 desc="Reassociating resolved dataset refs with files"):
1482 for dataset, resolvedRefs in groupForType.values():
1483 dataset.refs = resolvedRefs
1484 allResolvedRefs.extend(resolvedRefs)
1486 # Bulk-insert everything into Datastore.
1487 self.datastore.ingest(*datasets, transfer=transfer)
1489 @contextlib.contextmanager
1490 def export(self, *, directory: Optional[str] = None,
1491 filename: Optional[str] = None,
1492 format: Optional[str] = None,
1493 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1494 """Export datasets from the repository represented by this `Butler`.
1496 This method is a context manager that returns a helper object
1497 (`RepoExportContext`) that is used to indicate what information from
1498 the repository should be exported.
1500 Parameters
1501 ----------
1502 directory : `str`, optional
1503 Directory dataset files should be written to if ``transfer`` is not
1504 `None`.
1505 filename : `str`, optional
1506 Name for the file that will include database information associated
1507 with the exported datasets. If this is not an absolute path and
1508 ``directory`` is not `None`, it will be written to ``directory``
1509 instead of the current working directory. Defaults to
1510 "export.{format}".
1511 format : `str`, optional
1512 File format for the database information file. If `None`, the
1513 extension of ``filename`` will be used.
1514 transfer : `str`, optional
1515 Transfer mode passed to `Datastore.export`.
1517 Raises
1518 ------
1519 TypeError
1520 Raised if the set of arguments passed is inconsistent.
1522 Examples
1523 --------
1524 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1525 methods are used to provide the iterables over data IDs and/or datasets
1526 to be exported::
1528 with butler.export(filename="exports.yaml") as export:
1529 # Export all flats, but none of the dimension element rows
1530 # (i.e. data ID information) associated with them.
1531 export.saveDatasets(butler.registry.queryDatasets("flat"),
1532 elements=())
1533 # Export all datasets that start with "deepCoadd_" and all of
1534 # their associated data ID information.
1535 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1536 """
1537 if directory is None and transfer is not None:
1538 raise TypeError("Cannot transfer without providing a directory.")
1539 if transfer == "move":
1540 raise TypeError("Transfer may not be 'move': export is read-only")
1541 if format is None:
1542 if filename is None:
1543 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1544 else:
1545 _, format = os.path.splitext(filename)
1546 elif filename is None:
1547 filename = f"export.{format}"
1548 if directory is not None:
1549 filename = os.path.join(directory, filename)
1550 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1551 with open(filename, 'w') as stream:
1552 backend = BackendClass(stream)
1553 try:
1554 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1555 directory=directory, transfer=transfer)
1556 yield helper
1557 except BaseException:
1558 raise
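# Only finalize the export if the caller's block completed without
# raising; the re-raise above makes that explicit.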
1559 else:
1560 helper._finish()
1562 def import_(self, *, directory: Optional[str] = None,
1563 filename: Union[str, TextIO, None] = None,
1564 format: Optional[str] = None,
1565 transfer: Optional[str] = None,
1566 skip_dimensions: Optional[Set] = None) -> None:
1567 """Import datasets into this repository that were exported from a
1568 different butler repository via `~lsst.daf.butler.Butler.export`.
1570 Parameters
1571 ----------
1572 directory : `str`, optional
1573 Directory containing dataset files to import from. If `None`,
1574 ``filename`` and all dataset file paths specified therein must
1575 be absolute.
1576 filename : `str` or `TextIO`, optional
1577 A stream or name of file that contains database information
1578 associated with the exported datasets, typically generated by
1579 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1580 is not an absolute path, does not exist in the current working
1581 directory, and ``directory`` is not `None`, it is assumed to be in
1582 ``directory``. Defaults to "export.{format}".
1583 format : `str`, optional
1584 File format for ``filename``. If `None`, the extension of
1585 ``filename`` will be used.
1586 transfer : `str`, optional
1587 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1588 skip_dimensions : `set`, optional
1589 Names of dimensions that should be skipped and not imported.
1591 Raises
1592 ------
1593 TypeError
1594 Raised if the set of arguments passed is inconsistent, or if the
1595 butler is read-only.
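Examples
--------
A minimal sketch, assuming an export file previously written by
`~lsst.daf.butler.Butler.export`; the paths here are hypothetical::

    butler.import_(directory="/path/to/exported/files",
                   filename="export.yaml",
                   transfer="copy")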
1596 """
1597 if not self.isWriteable():
1598 raise TypeError("Butler is read-only.")
1599 if format is None:
1600 if filename is None:
1601 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1602 else:
1603 _, format = os.path.splitext(filename) # type: ignore
1604 elif filename is None:
1605 filename = f"export.{format}"
1606 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1607 filename = os.path.join(directory, filename)
1608 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
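# Local helper so the same loading logic can be applied to either a
# filename we open ourselves or an already-open stream.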
1610 def doImport(importStream: TextIO) -> None:
1611 backend = BackendClass(importStream, self.registry)
1612 backend.register()
1613 with self.transaction():
1614 backend.load(self.datastore, directory=directory, transfer=transfer,
1615 skip_dimensions=skip_dimensions)
1617 if isinstance(filename, str):
1618 with open(filename, "r") as stream:
1619 doImport(stream)
1620 else:
1621 doImport(filename)
1623 def validateConfiguration(self, logFailures: bool = False,
1624 datasetTypeNames: Optional[Iterable[str]] = None,
1625 ignore: Optional[Iterable[str]] = None) -> None:
1626 """Validate butler configuration.
1628 Checks that each `DatasetType` can be stored in the `Datastore`.
1630 Parameters
1631 ----------
1632 logFailures : `bool`, optional
1633 If `True`, output a log message for every validation error
1634 detected.
1635 datasetTypeNames : iterable of `str`, optional
1636 The `DatasetType` names that should be checked. This allows
1637 only a subset to be selected.
1638 ignore : iterable of `str`, optional
1639 Names of DatasetTypes to skip over. This can be used to skip
1640 known problems. If a named `DatasetType` corresponds to a
1641 composite, all components of that `DatasetType` will also be
1642 ignored.
1644 Raises
1645 ------
1646 ButlerValidationError
1647 Raised if there is some inconsistency with how this Butler
1648 is configured.
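Examples
--------
A minimal sketch; the dataset type names here are hypothetical::

    # Check every registered dataset type, logging each problem found.
    butler.validateConfiguration(logFailures=True)

    # Restrict the check to a few dataset types, skipping a known problem.
    butler.validateConfiguration(datasetTypeNames=["raw", "calexp"],
                                 ignore=["skyMap"])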
1649 """
1650 if datasetTypeNames:
1651 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1652 else:
1653 datasetTypes = list(self.registry.queryDatasetTypes())
1655 # Filter out anything from the ignore list.
1656 if ignore:
1657 ignore = set(ignore)
1658 datasetTypes = [e for e in datasetTypes
1659 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1660 else:
1661 ignore = set()
1663 # Find all the registered instruments
1664 instruments = set(
1665 record.name for record in self.registry.queryDimensionRecords("instrument")
1666 )
1668 # For each datasetType that has an instrument dimension, create
1669 # a DatasetRef for each defined instrument
1670 datasetRefs = []
1672 for datasetType in datasetTypes:
1673 if "instrument" in datasetType.dimensions:
1674 for instrument in instruments:
1675 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1676 conform=False)
1677 datasetRefs.append(datasetRef)
1679 entities: List[Union[DatasetType, DatasetRef]] = []
1680 entities.extend(datasetTypes)
1681 entities.extend(datasetRefs)
1683 datastoreErrorStr = None
1684 try:
1685 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1686 except ValidationError as e:
1687 datastoreErrorStr = str(e)
1689 # Also check that the LookupKeys used by the datastores match
1690 # registry and storage class definitions
1691 keys = self.datastore.getLookupKeys()
1693 failedNames = set()
1694 failedDataId = set()
1695 for key in keys:
1696 if key.name is not None:
1697 if key.name in ignore:
1698 continue
1700 # Skip if specific datasetType names were requested and this
1701 # name does not match.
1702 if datasetTypeNames and key.name not in datasetTypeNames:
1703 continue
1705 # See if it is a StorageClass or a DatasetType
1706 if key.name in self.storageClasses:
1707 pass
1708 else:
1709 try:
1710 self.registry.getDatasetType(key.name)
1711 except KeyError:
1712 if logFailures:
1713 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1714 failedNames.add(key)
1715 else:
1716 # Dimensions are checked for consistency when the Butler
1717 # is created and rendezvoused with a universe.
1718 pass
1720 # Check that the instrument is a valid instrument.
1721 # Currently only the instrument dimension is supported, so check for that.
1722 if key.dataId:
1723 dataIdKeys = set(key.dataId)
1724 if set(["instrument"]) != dataIdKeys:
1725 if logFailures:
1726 log.critical("Key '%s' has unsupported DataId override", key)
1727 failedDataId.add(key)
1728 elif key.dataId["instrument"] not in instruments:
1729 if logFailures:
1730 log.critical("Key '%s' has unknown instrument", key)
1731 failedDataId.add(key)
1733 messages = []
1735 if datastoreErrorStr:
1736 messages.append(datastoreErrorStr)
1738 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1739 (failedDataId, "Keys with bad DataId entries: ")):
1740 if failed:
1741 msg += ", ".join(str(k) for k in failed)
1742 messages.append(msg)
1744 if messages:
1745 raise ValidationError(";\n".join(messages))
1747 @property
1748 def collections(self) -> CollectionSearch:
1749 """The collections to search by default, in order (`CollectionSearch`).
1751 This is an alias for ``self.registry.defaults.collections``. It cannot
1752 be set directly in isolation, but all defaults may be changed together
1753 by assigning a new `RegistryDefaults` instance to
1754 ``self.registry.defaults``.
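For example (the collection and run names here are hypothetical)::

    from lsst.daf.butler.registry import RegistryDefaults

    butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
                                                run="u/someone/processing")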
1755 """
1756 return self.registry.defaults.collections
1758 @property
1759 def run(self) -> Optional[str]:
1760 """Name of the run this butler writes outputs to by default (`str` or
1761 `None`).
1763 This is an alias for ``self.registry.defaults.run``. It cannot be set
1764 directly in isolation, but all defaults may be changed together by
1765 assigning a new `RegistryDefaults` instance to
1766 ``self.registry.defaults``.
1767 """
1768 return self.registry.defaults.run
1770 registry: Registry
1771 """The object that manages dataset metadata and relationships (`Registry`).
1773 Most operations that don't involve reading or writing butler datasets are
1774 accessible only via `Registry` methods.
1775 """
1777 datastore: Datastore
1778 """The object that manages actual dataset storage (`Datastore`).
1780 Direct user access to the datastore should rarely be necessary; the primary
1781 exception is the case where a `Datastore` implementation provides extra
1782 functionality beyond what the base class defines.
1783 """
1785 storageClasses: StorageClassFactory
1786 """An object that maps known storage class names to objects that fully
1787 describe them (`StorageClassFactory`).
1788 """