Coverage for python/lsst/daf/butler/_butler.py : 9%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
Butler top level classes.
"""
from __future__ import annotations

__all__ = (
    "Butler",
    "ButlerValidationError",
    "PruneCollectionsArgsError",
    "PurgeWithoutUnstorePruneCollectionsError",
    "RunWithoutPurgePruneCollectionsError",
    "PurgeUnsupportedPruneCollectionsError",
)


from collections import defaultdict
import contextlib
import logging
import numbers
import os
from typing import (
    Any,
    ClassVar,
    Counter,
    Dict,
    Iterable,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    TextIO,
    Tuple,
    Type,
    Union,
)

try:
    import boto3
except ImportError:
    boto3 = None

from lsst.utils import doImport
from .core import (
    AmbiguousDatasetError,
    ButlerURI,
    Config,
    ConfigSubset,
    DataCoordinate,
    DataId,
    DataIdValue,
    DatasetRef,
    DatasetType,
    Datastore,
    Dimension,
    DimensionConfig,
    FileDataset,
    StorageClassFactory,
    Timespan,
    ValidationError,
)
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.utils import transactional, getClassOf
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._butlerConfig import ButlerConfig
from .registry import Registry, RegistryConfig, RegistryDefaults, CollectionType
from .registry.wildcards import CollectionSearch
from .transfers import RepoExportContext

log = logging.getLogger(__name__)


class ButlerValidationError(ValidationError):
    """There is a problem with the Butler configuration."""
    pass


class PruneCollectionsArgsError(TypeError):
    """Base class for errors relating to Butler.pruneCollections input
    arguments.
    """
    pass


class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when purge and unstore are both required to be True, and
    purge is True but unstore is False.
    """

    def __init__(self) -> None:
        super().__init__("Cannot pass purge=True without unstore=True.")


class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when pruning a RUN collection but purge is False."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")


class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
    """Raised when purge is True but is not supported for the given
    collection."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(
            f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")


class Butler:
    """Main entry point for the data access system.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration. Anything acceptable to the `ButlerConfig` constructor.
        If a directory path is given the configuration will be read from a
        ``butler.yaml`` file in that location. If `None` is given default
        values will be used.
    butler : `Butler`, optional
        If provided, construct a new Butler that uses the same registry and
        datastore as the given one, but with the given collection and run.
        Incompatible with the ``config``, ``searchPaths``, and ``writeable``
        arguments.
    collections : `str` or `Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order)
        when reading datasets.
        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.
        These collections are not registered automatically and must be
        manually registered before they are used by any method, but they may
        be manually registered after the `Butler` is initialized.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into. If ``collections`` is `None` and ``run`` is not `None`,
        ``collections`` will be set to ``[run]``. If not `None`, this
        collection will automatically be registered. If this is not set (and
        ``writeable`` is not set either), a read-only butler will be created.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration. Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations. If not
        provided, a read-write butler is created if ``run`` is set.
    inferDefaults : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have
        the same value (or no value) for a governor dimension, that value
        will be the default for that dimension. Nonexistent collections are
        ignored. If a default value is provided explicitly for a governor
        dimension via ``**kwargs``, no default will be inferred for that
        dimension.
    **kwargs : `str`
        Default data ID key-value pairs. These may only identify "governor"
        dimensions like ``instrument`` and ``skymap``.

    Examples
    --------
    While there are many ways to control exactly how a `Butler` interacts with
    the collections in its `Registry`, the most common cases are still simple.

    For a read-only `Butler` that searches one collection, do::

        butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])

    For a read-write `Butler` that writes to and reads from a
    `~CollectionType.RUN` collection::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")

    The `Butler` passed to a ``PipelineTask`` is often much more complex,
    because we want to write to one `~CollectionType.RUN` collection but read
    from several others (as well)::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
                        collections=["u/alice/DM-50000/a",
                                     "u/bob/DM-49998",
                                     "HSC/defaults"])

    This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
    Datasets will be read first from that run (since it appears first in the
    chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.

    Finally, one can always create a `Butler` with no collections::

        butler = Butler("/path/to/repo", writeable=True)

    This can be extremely useful when you just want to use ``butler.registry``,
    e.g. for inserting dimension data or managing collections, or when the
    collections you want to use with the butler are not consistent.
    Passing ``writeable`` explicitly here is only necessary if you want to be
    able to make changes to the repo; usually the value for ``writeable`` can
    be guessed from the collection arguments provided, but it defaults to
    `False` when there are no collection arguments.
    """
    def __init__(self, config: Union[Config, str, None] = None, *,
                 butler: Optional[Butler] = None,
                 collections: Any = None,
                 run: Optional[str] = None,
                 searchPaths: Optional[List[str]] = None,
                 writeable: Optional[bool] = None,
                 inferDefaults: bool = True,
                 **kwargs: str,
                 ):
        defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
        # Load registry, datastore, etc. from config or existing butler.
        if butler is not None:
            if config is not None or searchPaths is not None or writeable is not None:
                raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
                                "arguments with 'butler' argument.")
            self.registry = butler.registry.copy(defaults)
            self.datastore = butler.datastore
            self.storageClasses = butler.storageClasses
            self._config: ButlerConfig = butler._config
        else:
            self._config = ButlerConfig(config, searchPaths=searchPaths)
            if "root" in self._config:
                butlerRoot = self._config["root"]
            else:
                butlerRoot = self._config.configDir
            if writeable is None:
                writeable = run is not None
            self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
                                                defaults=defaults)
            self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
                                                  butlerRoot=butlerRoot)
            self.storageClasses = StorageClassFactory()
            self.storageClasses.addFromConfig(self._config)
        if "run" in self._config or "collection" in self._config:
            raise ValueError("Passing a run or collection via configuration is no longer supported.")
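
    # Illustrative usage sketch (not part of the original module): the
    # ``butler`` argument above lets a caller derive a new Butler that shares
    # the registry and datastore of an existing one but writes elsewhere.
    # The repo path and collection names are hypothetical.
    #
    #     base = Butler("/path/to/repo", writeable=True)
    #     writer = Butler(butler=base, run="u/alice/scratch")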

    GENERATION: ClassVar[int] = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

    @staticmethod
    def makeRepo(root: str, config: Union[Config, str, None] = None,
                 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
                 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
                 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `str` or `ButlerURI`
            Path or URI to the root location of the new repository. Will be
            created if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options. Can not
            be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
            configuration will be used. Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        dimensionConfig : `Config` or `str`, optional
            Configuration for dimensions, will be used to initialize the
            registry database.
        standalone : `bool`
            If `True`, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config. This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `str`, optional
            If not `None`, the output configuration will be written to this
            location rather than into the repository itself. Can be a URI
            string. Can refer to a directory that will be used to write
            ``butler.yaml``.
        overwrite : `bool`, optional
            Create a new configuration file even if one already exists
            in the specified output location. Default is to raise
            an exception.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        FileExistsError
            Raised if the output config file already exists.
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
        """
        if isinstance(config, (ButlerConfig, ConfigSubset)):
            raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

        # Ensure that the root of the repository exists or can be made.
        uri = ButlerURI(root, forceDirectory=True)
        uri.mkdir()

        config = Config(config)

        # If we are creating a new repo from scratch with relative roots,
        # do not propagate an explicit root from the config file.
        if "root" in config:
            del config["root"]

        full = ButlerConfig(config, searchPaths=searchPaths)  # this applies defaults
        datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
        datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)

        # If the key exists in the given config, parse it; otherwise parse
        # the defaults in the expanded config.
        if config.get(("registry", "db")):
            registryConfig = RegistryConfig(config)
        else:
            registryConfig = RegistryConfig(full)
        defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
        if defaultDatabaseUri is not None:
            Config.updateParameters(RegistryConfig, config, full,
                                    toUpdate={"db": defaultDatabaseUri},
                                    overwrite=forceConfigRoot)
        else:
            Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
                                    overwrite=forceConfigRoot)

        if standalone:
            config.merge(full)
        else:
            # Always expand the registry.managers section into the per-repo
            # config, because after the database schema is created, it's not
            # allowed to change anymore. Note that in the standalone=True
            # branch, _everything_ in the config is expanded, so there's no
            # need to special case this.
            Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
        configURI: Union[str, ButlerURI]
        if outfile is not None:
            # When writing to a separate location we must include
            # the root of the butler repo in the config else it won't know
            # where to look.
            config["root"] = uri.geturl()
            configURI = outfile
        else:
            configURI = uri
        config.dumpToUri(configURI, overwrite=overwrite)

        # Create Registry and populate tables.
        registryConfig = RegistryConfig(config.get("registry"))
        dimensionConfig = DimensionConfig(dimensionConfig)
        Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)

        return config
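
    # Illustrative usage sketch (not part of the original module): creating a
    # new repository with the default seeded configuration and then opening a
    # Butler against it. The path is hypothetical.
    #
    #     Butler.makeRepo("/path/to/new/repo")
    #     butler = Butler("/path/to/new/repo", writeable=True)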

    @classmethod
    def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
                  defaultDataId: Dict[str, str], writeable: bool) -> Butler:
        """Callable used to unpickle a Butler.

        We prefer not to use ``Butler.__init__`` directly so we can force some
        of its many arguments to be keyword-only (note that ``__reduce__``
        can only invoke callables with positional arguments).

        Parameters
        ----------
        config : `ButlerConfig`
            Butler configuration, already coerced into a true `ButlerConfig`
            instance (and hence after any search paths for overrides have been
            utilized).
        collections : `CollectionSearch`
            Names of the default collections to read from.
        run : `str`, optional
            Name of the default `~CollectionType.RUN` collection to write to.
        defaultDataId : `dict` [ `str`, `str` ]
            Default data ID values.
        writeable : `bool`
            Whether the Butler should support write operations.

        Returns
        -------
        butler : `Butler`
            A new `Butler` instance.
        """
        # MyPy doesn't recognize that the kwargs below are totally valid; it
        # seems to think ``**defaultDataId`` is a _positional_ argument!
        return cls(config=config, collections=collections, run=run, writeable=writeable,
                   **defaultDataId)  # type: ignore

    def __reduce__(self) -> tuple:
        """Support pickling."""
        return (Butler._unpickle, (self._config, self.collections, self.run,
                                   self.registry.defaults.dataId.byName(),
                                   self.registry.isWriteable()))

    def __str__(self) -> str:
        return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
            self.collections, self.run, self.datastore, self.registry)

    def isWriteable(self) -> bool:
        """Return `True` if this `Butler` supports write operations."""
        return self.registry.isWriteable()

    @contextlib.contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
        """
        with self.registry.transaction():
            with self.datastore.transaction():
                yield
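
    # Illustrative usage sketch (not part of the original module): grouping
    # several writes so that registry and datastore changes roll back together
    # if any step raises. The dataset types and data ID values are
    # hypothetical.
    #
    #     with butler.transaction():
    #         butler.put(catalog, "src", visit=903334, detector=20)
    #         butler.put(image, "calexp", visit=903334, detector=20)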

    def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                         dataId: Optional[DataId] = None, **kwds: Any
                         ) -> Tuple[DatasetType, Optional[DataId]]:
        """Standardize the arguments passed to several Butler APIs.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        datasetType : `DatasetType`
            A `DatasetType` instance extracted from ``datasetRefOrType``.
        dataId : `dict` or `DataId`, optional
            Argument that can be used (along with ``kwds``) to construct a
            `DataId`.

        Notes
        -----
        Butler APIs that conceptually need a DatasetRef also allow passing a
        `DatasetType` (or the name of one) and a `DataId` (or a dict and
        keyword arguments that can be used to construct one) separately. This
        method accepts those arguments and always returns a true `DatasetType`
        and a `DataId` or `dict`.

        Standardization of `dict` vs `DataId` is best handled by passing the
        returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
        generally similarly flexible.
        """
        externalDatasetType: Optional[DatasetType] = None
        internalDatasetType: Optional[DatasetType] = None
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None or kwds:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            externalDatasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
        else:
            # Don't check whether DataId is provided, because Registry APIs
            # can usually construct a better error message when it wasn't.
            if isinstance(datasetRefOrType, DatasetType):
                externalDatasetType = datasetRefOrType
            else:
                internalDatasetType = self.registry.getDatasetType(datasetRefOrType)

        # Check that they are self-consistent.
        if externalDatasetType is not None:
            internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
                                 f"registry definition ({internalDatasetType})")

        assert internalDatasetType is not None
        return internalDatasetType, dataId

    def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                        dataId: Optional[DataId] = None, *,
                        collections: Any = None,
                        allowUnresolved: bool = False,
                        **kwds: Any) -> DatasetRef:
        """Shared logic for methods that start with a search for a dataset in
        the registry.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        allowUnresolved : `bool`, optional
            If `True`, return an unresolved `DatasetRef` if finding a resolved
            one in the `Registry` fails. Defaults to `False`.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset identified by the given arguments.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef):
            idNumber = datasetRefOrType.id
        else:
            idNumber = None
        timespan: Optional[Timespan] = None

        # Process dimension records that are using record information
        # rather than ids.
        newDataId: Dict[str, DataIdValue] = {}
        byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)

        # If all of the dataId comes from keyword parameters we do not need
        # to do anything here, because keys of the form "exposure.obs_id"
        # cannot appear: a "." is not allowed in a keyword parameter.
        if dataId:
            for k, v in dataId.items():
                # If we have a Dimension we do not need to do anything
                # because it cannot be a compound key.
                if isinstance(k, str) and "." in k:
                    # Someone is using a more human-readable dataId.
                    dimensionName, record = k.split(".", 1)
                    byRecord[dimensionName][record] = v
                elif isinstance(k, Dimension):
                    newDataId[k.name] = v
                else:
                    newDataId[k] = v

        # Go through the updated dataId and check the type in case someone is
        # using an alternate key. We have already filtered out the compound
        # keys in dimension.record format.
        not_dimensions = {}

        # Will need to look in the dataId and the keyword arguments
        # and will remove them if they need to be fixed or are unrecognized.
        for dataIdDict in (newDataId, kwds):
            # Use a list so we can adjust the dict safely in the loop.
            for dimensionName in list(dataIdDict):
                value = dataIdDict[dimensionName]
                try:
                    dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                except KeyError:
                    # This is not a real dimension.
                    not_dimensions[dimensionName] = value
                    del dataIdDict[dimensionName]
                    continue

                # Convert an integral type to an explicit int to simplify
                # comparisons here.
                if isinstance(value, numbers.Integral):
                    value = int(value)

                if not isinstance(value, dimension.primaryKey.getPythonType()):
                    for alternate in dimension.alternateKeys:
                        if isinstance(value, alternate.getPythonType()):
                            byRecord[dimensionName][alternate.name] = value
                            del dataIdDict[dimensionName]
                            log.debug("Converting dimension %s to %s.%s=%s",
                                      dimensionName, dimensionName, alternate.name, value)
                            break
                    else:
                        log.warning("Type mismatch found for value '%r' provided for dimension %s. "
                                    "Could not find matching alternative (primary key has type %s) "
                                    "so attempting to use as-is.",
                                    value, dimensionName, dimension.primaryKey.getPythonType())

        # If we have some unrecognized dimensions we have to try to connect
        # them to records in other dimensions. This is made more complicated
        # by some dimensions having records with clashing names. A mitigation
        # is that we can tell by this point which dimensions are missing
        # for the DatasetType, but this does not work for calibrations
        # where additional dimensions can be used to constrain the temporal
        # axis.
        if not_dimensions:
            # Calculate missing dimensions.
            provided = set(newDataId) | set(kwds) | set(byRecord)
            missingDimensions = datasetType.dimensions.names - provided

            # For calibrations we may well be needing temporal dimensions,
            # so rather than always including all dimensions in the scan,
            # restrict things a little. It is still possible for there
            # to be confusion over day_obs in visit vs exposure, for example.
            # If we are not searching calibration collections things may
            # fail, but they are going to fail anyway because of the
            # ambiguousness of the dataId...
            candidateDimensions: Set[str] = set()
            candidateDimensions.update(missingDimensions)
            if datasetType.isCalibration():
                for dim in self.registry.dimensions.getStaticDimensions():
                    if dim.temporal:
                        candidateDimensions.add(str(dim))

            # Look up table for the first association with a dimension.
            guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)

            # Keep track of whether an item is associated with multiple
            # dimensions.
            counter: Counter[str] = Counter()
            assigned: Dict[str, Set[str]] = defaultdict(set)

            # Go through the missing dimensions and associate the
            # given names with records within those dimensions.
            for dimensionName in candidateDimensions:
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                fields = dimension.metadata.names | dimension.uniqueKeys.names
                for field in not_dimensions:
                    if field in fields:
                        guessedAssociation[dimensionName][field] = not_dimensions[field]
                        counter[dimensionName] += 1
                        assigned[field].add(dimensionName)

            # There is a chance we have allocated a single dataId item
            # to multiple dimensions. Need to decide which should be retained.
            # For now assume that the most popular alternative wins.
            # This means that day_obs with seq_num will result in
            # exposure.day_obs and not visit.day_obs.
            # Also prefer an explicitly missing dimension over an inferred
            # temporal dimension.
            for fieldName, assignedDimensions in assigned.items():
                if len(assignedDimensions) > 1:
                    # Pick the most popular (preferring mandatory dimensions).
                    requiredButMissing = assignedDimensions.intersection(missingDimensions)
                    if requiredButMissing:
                        candidateDimensions = requiredButMissing
                    else:
                        candidateDimensions = assignedDimensions

                    # Select the relevant items and get a new restricted
                    # counter.
                    theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
                    duplicatesCounter: Counter[str] = Counter()
                    duplicatesCounter.update(theseCounts)

                    # Choose the most common. If they are equally common
                    # we will pick the one that was found first.
                    # Returns a list of tuples.
                    selected = duplicatesCounter.most_common(1)[0][0]

                    log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
                              " Removed ambiguity by choosing dimension %s.",
                              fieldName, ", ".join(assignedDimensions), selected)

                    for candidateDimension in assignedDimensions:
                        if candidateDimension != selected:
                            del guessedAssociation[candidateDimension][fieldName]

            # Update the record look up dict with the new associations.
            for dimensionName, values in guessedAssociation.items():
                if values:  # A dict might now be empty
                    log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
                              dimensionName, values)
                    byRecord[dimensionName].update(values)

        if byRecord:
            # Some record specifiers were found so we need to convert
            # them to the Id form.
            for dimensionName, values in byRecord.items():
                if dimensionName in newDataId:
                    log.warning("DataId specified explicit %s dimension value of %s in addition to"
                                " general record specifiers for it of %s. Ignoring record information.",
                                dimensionName, newDataId[dimensionName], str(values))
                    continue

                # Build up a WHERE expression -- use single quotes.
                def quote(s: Any) -> str:
                    if isinstance(s, str):
                        return f"'{s}'"
                    else:
                        return s

                where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
                                     for k, v in values.items())

                # Hopefully we get a single record that matches.
                records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
                                                                  where=where, **kwds))

                if len(records) != 1:
                    if len(records) > 1:
                        log.debug("Received %d records from constraints of %s", len(records), str(values))
                        for r in records:
                            log.debug("- %s", str(r))
                        raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
                                           f" uniquely constrained to a single dataset by {values}."
                                           f" Got {len(records)} results.")
                    raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
                                       f" records when constrained by {values}")

                # Get the primary key from the real dimension object.
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                if not isinstance(dimension, Dimension):
                    raise RuntimeError(
                        f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
                    )
                newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)

            # We have modified the dataId so need to switch to it.
            dataId = newDataId

        if datasetType.isCalibration():
            # Because this is a calibration dataset, first try to standardize
            # the data ID without restricting the dimensions to those of the
            # dataset type requested, because there may be extra dimensions
            # that provide temporal information for a validity-range lookup.
            dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
            if dataId.graph.temporal:
                dataId = self.registry.expandDataId(dataId)
                timespan = dataId.timespan
        else:
            # Standardize the data ID to just the dimensions of the dataset
            # type instead of letting registry.findDataset do it, so we get the
            # result even if no dataset is found.
            dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
        # Always look up the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
        if ref is None:
            if allowUnresolved:
                return DatasetRef(datasetType, dataId)
            else:
                if collections is None:
                    collections = self.registry.defaults.collections
                raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
                                  f"could not be found in collections {collections}.")
        if idNumber is not None and idNumber != ref.id:
            if collections is None:
                collections = self.registry.defaults.collections
            raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
                             f"id ({ref.id}) in registry in collections {collections}.")
        return ref
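
    # Illustrative usage sketch (not part of the original module): the
    # record-based data ID handling above is what lets callers use keys such
    # as "exposure.obs_id" or an alternate-key value in place of the primary
    # key. The dataset type, instrument, and record values are hypothetical.
    #
    #     raw = butler.get("raw", {"exposure.obs_id": "CC_O_20210101_000001",
    #                              "detector.full_name": "R22_S11"},
    #                      instrument="LSSTCam")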

    @transactional
    def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            run: Optional[str] = None,
            **kwds: Any) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
        """
        log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
            raise ValueError("DatasetRef must not be in registry, must have None id")

        # Add Registry Dataset entry.
        dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
        ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])

        # Add Datastore entry.
        self.datastore.put(obj, ref)

        return ref
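
    # Illustrative usage sketch (not part of the original module): storing an
    # object under a dataset type and data ID. The repo path, run name,
    # dataset type, and data ID values are hypothetical.
    #
    #     butler = Butler("/path/to/repo", run="u/alice/processing-run")
    #     ref = butler.put(calexp, "calexp", instrument="HSC", visit=903334,
    #                      detector=42)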

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
        """Retrieve a stored dataset.

        Unlike `Butler.get`, this method allows datasets outside the Butler's
        collection to be read as long as the `DatasetRef` that identifies them
        can be obtained separately.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.
        """
        return self.datastore.get(ref, parameters=parameters)

    def getDirectDeferred(self, ref: DatasetRef, *,
                          parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
        from a resolved `DatasetRef`.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id is None``, i.e. the reference is unresolved.
        """
        if ref.id is None:
            raise AmbiguousDatasetError(
                f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
            )
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
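
    # Illustrative usage sketch (not part of the original module): reading
    # back a dataset from an already-resolved DatasetRef, bypassing the
    # collection search. The ``bbox`` parameter assumes a storage class that
    # supports subimage reads and is hypothetical here.
    #
    #     exposure = butler.getDirect(ref)
    #     handle = butler.getDirectDeferred(ref, parameters={"bbox": bbox})
    #     cutout = handle.get()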

    def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                    dataId: Optional[DataId] = None, *,
                    parameters: Union[dict, None] = None,
                    collections: Any = None,
                    **kwds: Any) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
        after an immediate registry lookup.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
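
    # Illustrative usage sketch (not part of the original module): deferring
    # the datastore read while still failing fast if the registry lookup
    # fails. The dataset type and data ID values are hypothetical.
    #
    #     handle = butler.getDeferred("deepCoadd", tract=9813, patch=22,
    #                                 band="i", skymap="hsc_rings_v1")
    #     coadd = handle.get()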

    def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            parameters: Optional[Dict[str, Any]] = None,
            collections: Any = None,
            **kwds: Any) -> Any:
        """Retrieve a stored dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        TypeError
            Raised if no collections were provided.

        Notes
        -----
        When looking up datasets in a `~CollectionType.CALIBRATION` collection,
        this method requires that the given data ID include temporal dimensions
        beyond the dimensions of the dataset type itself, in order to find the
        dataset with the appropriate validity range. For example, a "bias"
        dataset with native dimensions ``{instrument, detector}`` could be
        fetched with a ``{instrument, detector, exposure}`` data ID, because
        ``exposure`` is a temporal dimension.
        """
        log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.getDirect(ref, parameters=parameters)
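
    # Illustrative usage sketch (not part of the original module): a
    # calibration lookup as described in the Notes above; the extra
    # ``exposure`` value supplies the temporal information for the
    # validity-range search. The names and collection are hypothetical.
    #
    #     bias = butler.get("bias", instrument="HSC", detector=42,
    #                       exposure=903334, collections="HSC/calib")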

    def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                dataId: Optional[DataId] = None, *,
                predict: bool = False,
                collections: Any = None,
                run: Optional[str] = None,
                **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
        """Return the URIs associated with the dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        primary : `ButlerURI`
            The URI to the primary artifact associated with this dataset.
            If the dataset was disassembled within the datastore this
            may be `None`.
        components : `dict`
            URIs to any components associated with the dataset artifact.
            Can be empty if there are no components.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
                                   collections=collections, **kwds)
        if ref.id is None:  # only possible if predict is True
            if run is None:
                run = self.run
                if run is None:
                    raise TypeError("Cannot predict location with run=None.")
            # Lie about ID, because we can't guess it, and only
            # Datastore.getURIs() will ever see it (and it doesn't use it).
            ref = ref.resolved(id=0, run=run)
        return self.datastore.getURIs(ref, predict)

    def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
               dataId: Optional[DataId] = None, *,
               predict: bool = False,
               collections: Any = None,
               run: Optional[str] = None,
               **kwds: Any) -> ButlerURI:
        """Return the URI to the Dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        uri : `ButlerURI`
            URI pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        LookupError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        RuntimeError
            Raised if a URI is requested for a dataset that consists of
            multiple artifacts.
        """
        primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
                                           collections=collections, run=run, **kwds)

        if primary is None or components:
            raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
                               "Use Butler.getURIs() instead.")
        return primary
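
    # Illustrative usage sketch (not part of the original module): locating
    # the artifact(s) behind a dataset. The dataset type and data ID values
    # are hypothetical; getURIs is needed when the datastore disassembled the
    # dataset into component artifacts.
    #
    #     uri = butler.getURI("calexp", visit=903334, detector=42)
    #     primary, components = butler.getURIs("calexp", visit=903334,
    #                                          detector=42)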

    def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                      dataId: Optional[DataId] = None, *,
                      collections: Any = None,
                      **kwds: Any) -> bool:
        """Return True if the Dataset is actually present in the Datastore.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Raises
        ------
        LookupError
            Raised if the dataset is not even present in the Registry.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.datastore.exists(ref)
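
    # Illustrative usage sketch (not part of the original module): checking
    # that a registered dataset's artifact is actually present in the
    # datastore. The dataset type and data ID values are hypothetical.
    #
    #     if not butler.datasetExists("calexp", visit=903334, detector=42):
    #         log.warning("calexp artifact is missing from the datastore")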

    def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
        """Remove one or more `~CollectionType.RUN` collections and the
        datasets within them.

        Parameters
        ----------
        names : `Iterable` [ `str` ]
            The names of the collections to remove.
        unstore : `bool`, optional
            If `True` (default), delete datasets from all datastores in which
            they are present, and attempt to roll back the registry deletions
            if datastore deletions fail (which may not always be possible).
            If `False`, datastore records for these datasets are still
            removed, but any artifacts (e.g. files) will not be.

        Raises
        ------
        TypeError
            Raised if one or more collections are not of type
            `~CollectionType.RUN`.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        names = list(names)
        refs: List[DatasetRef] = []
        for name in names:
            collectionType = self.registry.getCollectionType(name)
            if collectionType is not CollectionType.RUN:
                raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
            refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
        with self.registry.transaction():
            if unstore:
                for ref in refs:
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            else:
                self.datastore.forget(refs)
            for name in names:
                self.registry.removeCollection(name)
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
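
    # Illustrative usage sketch (not part of the original module): deleting
    # scratch RUN collections together with their stored artifacts. The
    # collection names are hypothetical.
    #
    #     butler.removeRuns(["u/alice/scratch-1", "u/alice/scratch-2"],
    #                       unstore=True)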

    def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False) -> None:
        """Remove a collection and possibly prune datasets within it.

        Parameters
        ----------
        name : `str`
            Name of the collection to remove. If this is a
            `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
            datasets within the collection are not modified unless ``unstore``
            is `True`. If this is a `~CollectionType.RUN` collection,
            ``purge`` and ``unstore`` must be `True`, and all datasets in it
            are fully removed from the data repository.
        purge : `bool`, optional
            If `True`, permit `~CollectionType.RUN` collections to be removed,
            fully removing datasets within them. Requires ``unstore=True`` as
            well as an added precaution against accidental deletion. Must be
            `False` (default) if the collection is not a ``RUN``.
        unstore : `bool`, optional
            If `True`, remove all datasets in the collection from all
            datastores in which they appear.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or arguments are mutually
            inconsistent.
        """
        # See pruneDatasets comments for more information about the logic
        # here; the cases are almost the same, but here we can rely on
        # Registry to take care of everything but Datastore deletion when we
        # remove the collection.
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        collectionType = self.registry.getCollectionType(name)
        if purge and not unstore:
            raise PurgeWithoutUnstorePruneCollectionsError()
        if collectionType is CollectionType.RUN and not purge:
            raise RunWithoutPurgePruneCollectionsError(collectionType)
        if collectionType is not CollectionType.RUN and purge:
            raise PurgeUnsupportedPruneCollectionsError(collectionType)

        with self.registry.transaction():
            if unstore:
                for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            self.registry.removeCollection(name)
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
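
    # Illustrative usage sketch (not part of the original module): the
    # argument combinations enforced above. The collection names are
    # hypothetical.
    #
    #     butler.pruneCollection("u/alice/tagged", unstore=True)      # TAGGED
    #     butler.pruneCollection("u/alice/run-1", purge=True,         # RUN
    #                            unstore=True)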

    def pruneDatasets(self, refs: Iterable[DatasetRef], *,
                      disassociate: bool = True,
                      unstore: bool = False,
                      tags: Iterable[str] = (),
                      purge: bool = False,
                      run: Optional[str] = None) -> None:
        """Remove one or more datasets from a collection and/or storage.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to prune. These must be "resolved" references (not just
            a `DatasetType` and data ID).
        disassociate : `bool`, optional
            Disassociate pruned datasets from ``tags``, or from all collections
            if ``purge=True``.
        unstore : `bool`, optional
            If `True` (`False` is default), remove these datasets from all
            datastores known to this butler. Note that this will make it
            impossible to retrieve these datasets even via other collections.
            Datasets that are already not stored are ignored by this option.
        tags : `Iterable` [ `str` ], optional
            `~CollectionType.TAGGED` collections to disassociate the datasets
            from. Ignored if ``disassociate`` is `False` or ``purge`` is
            `True`.
        purge : `bool`, optional
            If `True` (`False` is default), completely remove the dataset from
            the `Registry`. To prevent accidental deletions, ``purge`` may
            only be `True` if all of the following conditions are met:

            - all given datasets are in the given run;
            - ``disassociate`` is `True`;
            - ``unstore`` is `True`.

            This mode may remove provenance information from datasets other
            than those provided, and should be used with extreme care.

        Raises
        ------
        TypeError
            Raised if the butler is read-only, if no collection was provided,
            or the conditions for ``purge=True`` were not met.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            tags = tuple(tags)
            if not tags:
                raise TypeError("No tags provided but disassociate=True.")
            for tag in tags:
                collectionType = self.registry.getCollectionType(tag)
                if collectionType is not CollectionType.TAGGED:
                    raise TypeError(f"Cannot disassociate from collection '{tag}' "
                                    f"of non-TAGGED type {collectionType.name}.")
        # Transform possibly-single-pass iterable into something we can iterate
        # over multiple times.
        refs = list(refs)
        # Pruning a component of a DatasetRef makes no sense since registry
        # doesn't know about components and datastore might not store
        # components in a separate file.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
        # We don't need an unreliable Datastore transaction for this, because
        # we've been extra careful to ensure that Datastore.trash only involves
        # mutating the Registry (it can _look_ at Datastore-specific things,
        # but shouldn't change them), and hence all operations here are
        # Registry operations.
        with self.registry.transaction():
            if unstore:
                for ref in refs:
                    # There is a difference between a concrete composite
                    # and virtual composite. In a virtual composite the
                    # datastore is never given the top level DatasetRef. In
                    # the concrete composite the datastore knows all the
                    # refs and will clean up itself if asked to remove the
                    # parent ref. We can not check configuration for this
                    # since we can not trust that the configuration is the
                    # same. We therefore have to ask if the ref exists or
                    # not. This is consistent with the fact that we want
                    # to ignore already-removed-from-datastore datasets
                    # anyway.
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            if purge:
                self.registry.removeDatasets(refs)
            elif disassociate:
                assert tags, "Guaranteed by earlier logic in this function."
                for tag in tags:
                    self.registry.disassociate(tag, refs)
        # We've exited the Registry transaction, and apparently committed.
        # (if there was an exception, everything rolled back, and it's as if
        # nothing happened - and we never get here).
        # Datastore artifacts are not yet gone, but they're clearly marked
        # as trash, so if we fail to delete now because of (e.g.) filesystem
        # problems we can try again later, and if manual administrative
        # intervention is required, it's pretty clear what that should entail:
        # deleting everything on disk and in private Datastore tables that is
        # in the dataset_location_trash table.
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
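
    # Illustrative usage sketch (not part of the original module): fully
    # removing a query's worth of datasets while satisfying the purge
    # preconditions described above. The dataset type and run name are
    # hypothetical.
    #
    #     refs = butler.registry.queryDatasets("calexp_bad",
    #                                          collections="u/alice/run-1")
    #     butler.pruneDatasets(refs, purge=True, unstore=True,
    #                          run="u/alice/run-1")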
1366 @transactional
1367 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1368 ) -> None:
1369 """Store and register one or more datasets that already exist on disk.
1371 Parameters
1372 ----------
1373 datasets : `FileDataset`
1374 Each positional argument is a struct containing information about
1375 a file to be ingested, including its path (either absolute or
1376 relative to the datastore root, if applicable), a `DatasetRef`,
1377 and optionally a formatter class or its fully-qualified string
1378 name. If a formatter is not provided, the formatter that would be
1379 used for `put` is assumed. On successful return, all
1380 `FileDataset.ref` attributes will have their `DatasetRef.id`
1381 attribute populated and all `FileDataset.formatter` attributes will
1382 be set to the formatter class used. `FileDataset.path` attributes
1383 may be modified to put paths in whatever the datastore considers a
1384 standardized form.
1385 transfer : `str`, optional
1386 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1387 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer
1388 the file.
1389 run : `str`, optional
1390 The name of the run ingested datasets should be added to,
1391 overriding ``self.run``.
1393 Raises
1394 ------
1395 TypeError
1396 Raised if the butler is read-only or if no run was provided.
1397 NotImplementedError
1398 Raised if the `Datastore` does not support the given transfer mode.
1399 DatasetTypeNotSupportedError
1400 Raised if one or more files to be ingested have a dataset type that
1401 is not supported by the `Datastore`..
1402 FileNotFoundError
1403 Raised if one of the given files does not exist.
1404 FileExistsError
1405 Raised if transfer is not `None` but the (internal) location the
1406 file would be moved to is already occupied.
1408 Notes
1409 -----
1410 This operation is not fully exception safe: if a database operation
1411 fails, the given `FileDataset` instances may be only partially updated.
1413 It is atomic in terms of database operations (they will either all
1414 succeed or all fail) providing the database engine implements
1415 transactions correctly. It will attempt to be atomic in terms of
1416 filesystem operations as well, but this cannot be implemented
1417 rigorously for most datastores.
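Examples
--------
A minimal, hedged sketch of ingesting a single externally produced file.
The dataset type name, dimensions, data ID values, path, and run are
placeholders and must match what is already registered in the
repository::
datasetType = butler.registry.getDatasetType("raw")
ref = DatasetRef(datasetType,
{"instrument": "HSC", "exposure": 903334, "detector": 10})
butler.ingest(FileDataset(path="/data/raw.fits", refs=[ref]),
transfer="copy", run="HSC/raw/all")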
1418 """
1419 if not self.isWriteable():
1420 raise TypeError("Butler is read-only.")
1421 # Reorganize the inputs so they're grouped by DatasetType and then
1422 # data ID. We also include a list of DatasetRefs for each FileDataset
1423 # to hold the resolved DatasetRefs returned by the Registry, before
1424 # it's safe to swap them into FileDataset.refs.
1425 # Some type annotation aliases to make that clearer:
1426 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1427 GroupedData = MutableMapping[DatasetType, GroupForType]
1428 # The actual data structure:
1429 groupedData: GroupedData = defaultdict(dict)
1430 # And the nested loop that populates it:
1431 for dataset in datasets:
1433 # This list is intentionally shared across the inner loop, since it's
1433 # associated with `dataset`.
1434 resolvedRefs: List[DatasetRef] = []
1435 for ref in dataset.refs:
1436 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1438 # Now we can bulk-insert into Registry for each DatasetType.
1440 for datasetType, groupForType in groupedData.items():
1441 refs = self.registry.insertDatasets(datasetType,
1442 dataIds=groupForType.keys(),
1443 run=run)
1444 # Append those resolved DatasetRefs to the new lists we set up for
1445 # them.
1446 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1447 resolvedRefs.append(ref)
1449 # Go back to the original FileDatasets to replace their refs with the
1450 # new resolved ones, and also build a big list of all refs.
1451 allResolvedRefs: List[DatasetRef] = []
1452 for groupForType in groupedData.values():
1453 for dataset, resolvedRefs in groupForType.values():
1454 dataset.refs = resolvedRefs
1455 allResolvedRefs.extend(resolvedRefs)
1457 # Bulk-insert everything into Datastore.
1458 self.datastore.ingest(*datasets, transfer=transfer)
1460 @contextlib.contextmanager
1461 def export(self, *, directory: Optional[str] = None,
1462 filename: Optional[str] = None,
1463 format: Optional[str] = None,
1464 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1465 """Export datasets from the repository represented by this `Butler`.
1467 This method is a context manager that returns a helper object
1468 (`RepoExportContext`) that is used to indicate what information from
1469 the repository should be exported.
1471 Parameters
1472 ----------
1473 directory : `str`, optional
1474 Directory dataset files should be written to if ``transfer`` is not
1475 `None`.
1476 filename : `str`, optional
1477 Name for the file that will include database information associated
1478 with the exported datasets. If this is not an absolute path and
1479 ``directory`` is not `None`, it will be written to ``directory``
1480 instead of the current working directory. Defaults to
1481 "export.{format}".
1482 format : `str`, optional
1483 File format for the database information file. If `None`, the
1484 extension of ``filename`` will be used.
1485 transfer : `str`, optional
1486 Transfer mode passed to `Datastore.export`.
1488 Raises
1489 ------
1490 TypeError
1491 Raised if the set of arguments passed is inconsistent.
1493 Examples
1494 --------
1495 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1496 methods are used to provide the iterables over data IDs and/or datasets
1497 to be exported::
1499 with butler.export(filename="exports.yaml") as export:
1500 # Export all flats, but none of the dimension element rows
1501 # (i.e. data ID information) associated with them.
1502 export.saveDatasets(butler.registry.queryDatasets("flat"),
1503 elements=())
1504 # Export all datasets that start with "deepCoadd_" and all of
1505 # their associated data ID information.
1506 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
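File artifacts can be exported alongside the database information by
providing a ``directory`` and a non-``move`` ``transfer`` mode. This is a
hedged sketch: the paths are placeholders and a copy-style transfer is
assumed to be supported by the datastore::
with butler.export(directory="exports", filename="exports.yaml",
transfer="copy") as export:
export.saveDatasets(butler.registry.queryDatasets("flat"))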
1507 """
1508 if directory is None and transfer is not None:
1509 raise TypeError("Cannot transfer without providing a directory.")
1510 if transfer == "move":
1511 raise TypeError("Transfer may not be 'move': export is read-only")
1512 if format is None:
1513 if filename is None:
1514 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1515 else:
1516 _, format = os.path.splitext(filename)
1517 elif filename is None:
1518 filename = f"export.{format}"
1519 if directory is not None:
1520 filename = os.path.join(directory, filename)
1521 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1522 with open(filename, "w") as stream:
1523 backend = BackendClass(stream)
1524 try:
1525 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1526 directory=directory, transfer=transfer)
1527 yield helper
1528 except BaseException:
1529 raise  # Re-raise; the except exists only so the else clause runs _finish() on success.
1530 else:
1531 helper._finish()
1533 def import_(self, *, directory: Optional[str] = None,
1534 filename: Union[str, TextIO, None] = None,
1535 format: Optional[str] = None,
1536 transfer: Optional[str] = None,
1537 skip_dimensions: Optional[Set] = None) -> None:
1538 """Import datasets into this repository that were exported from a
1539 different butler repository via `~lsst.daf.butler.Butler.export`.
1541 Parameters
1542 ----------
1543 directory : `str`, optional
1544 Directory containing dataset files to import from. If `None`,
1545 ``filename`` and all dataset file paths specified therein must
1546 be absolute.
1547 filename : `str` or `TextIO`, optional
1548 A stream or name of file that contains database information
1549 associated with the exported datasets, typically generated by
1550 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1551 is not an absolute path, does not exist in the current working
1552 directory, and ``directory`` is not `None`, it is assumed to be in
1553 ``directory``. Defaults to "export.{format}".
1554 format : `str`, optional
1555 File format for ``filename``. If `None`, the extension of
1556 ``filename`` will be used.
1557 transfer : `str`, optional
1558 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1559 skip_dimensions : `set`, optional
1560 Names of dimensions that should be skipped and not imported.
1562 Raises
1563 ------
1564 TypeError
1565 Raised if the set of arguments passed is inconsistent, or if the
1566 butler is read-only.
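Examples
--------
A hedged sketch of loading a previously exported repository subset; the
directory and file names are placeholders and should point at the output
of `~lsst.daf.butler.Butler.export`::
butler.import_(directory="exports", filename="exports.yaml",
transfer="symlink")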
1567 """
1568 if not self.isWriteable():
1569 raise TypeError("Butler is read-only.")
1570 if format is None:
1571 if filename is None:
1572 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1573 else:
1574 _, format = os.path.splitext(filename) # type: ignore
1575 elif filename is None:
1576 filename = f"export.{format}"
1577 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1578 filename = os.path.join(directory, filename)
1579 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1581 def doImport(importStream: TextIO) -> None:
1582 backend = BackendClass(importStream, self.registry)
1583 backend.register()
1584 with self.transaction():
1585 backend.load(self.datastore, directory=directory, transfer=transfer,
1586 skip_dimensions=skip_dimensions)
1588 if isinstance(filename, str):
1589 with open(filename, "r") as stream:
1590 doImport(stream)
1591 else:
1592 doImport(filename)
1594 def validateConfiguration(self, logFailures: bool = False,
1595 datasetTypeNames: Optional[Iterable[str]] = None,
1596 ignore: Optional[Iterable[str]] = None) -> None:
1597 """Validate butler configuration.
1599 Checks that each `DatasetType` can be stored in the `Datastore`.
1601 Parameters
1602 ----------
1603 logFailures : `bool`, optional
1604 If `True`, output a log message for every validation error
1605 detected.
1606 datasetTypeNames : iterable of `str`, optional
1607 The `DatasetType` names that should be checked. This allows
1608 only a subset to be selected.
1609 ignore : iterable of `str`, optional
1610 Names of DatasetTypes to skip over. This can be used to skip
1611 known problems. If a named `DatasetType` corresponds to a
1612 composite, all components of that `DatasetType` will also be
1613 ignored.
1615 Raises
1616 ------
1617 ButlerValidationError
1618 Raised if there is some inconsistency with how this Butler
1619 is configured.
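Examples
--------
A hedged sketch: log every detected problem while skipping a dataset type
that is known to be unvalidatable (the name is a placeholder)::
butler.validateConfiguration(logFailures=True, ignore=["raw"])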
1620 """
1621 if datasetTypeNames:
1622 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1623 else:
1624 datasetTypes = list(self.registry.queryDatasetTypes())
1626 # filter out anything from the ignore list
1627 if ignore:
1628 ignore = set(ignore)
1629 datasetTypes = [e for e in datasetTypes
1630 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1631 else:
1632 ignore = set()
1634 # Find all the registered instruments
1635 instruments = {
1636 record.name for record in self.registry.queryDimensionRecords("instrument")
1637 }
1639 # For each datasetType that has an instrument dimension, create
1640 # a DatasetRef for each defined instrument
1641 datasetRefs = []
1643 for datasetType in datasetTypes:
1644 if "instrument" in datasetType.dimensions:
1645 for instrument in instruments:
1646 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1647 conform=False)
1648 datasetRefs.append(datasetRef)
1650 entities: List[Union[DatasetType, DatasetRef]] = []
1651 entities.extend(datasetTypes)
1652 entities.extend(datasetRefs)
1654 datastoreErrorStr = None
1655 try:
1656 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1657 except ValidationError as e:
1658 datastoreErrorStr = str(e)
1660 # Also check that the LookupKeys used by the datastores match
1661 # registry and storage class definitions
1662 keys = self.datastore.getLookupKeys()
1664 failedNames = set()
1665 failedDataId = set()
1666 for key in keys:
1667 if key.name is not None:
1668 if key.name in ignore:
1669 continue
1671 # skip if specific datasetType names were requested and this
1672 # name does not match
1673 if datasetTypeNames and key.name not in datasetTypeNames:
1674 continue
1676 # See if it is a StorageClass or a DatasetType
1677 if key.name in self.storageClasses:
1678 pass
1679 else:
1680 try:
1681 self.registry.getDatasetType(key.name)
1682 except KeyError:
1683 if logFailures:
1684 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1685 failedNames.add(key)
1686 else:
1687 # Dimensions are checked for consistency when the Butler
1688 # is created and rendezvoused with a universe.
1689 pass
1691 # Check that any data ID override refers to a valid instrument.
1692 # Currently only the instrument dimension is supported here.
1693 if key.dataId:
1694 dataIdKeys = set(key.dataId)
1695 if {"instrument"} != dataIdKeys:
1696 if logFailures:
1697 log.critical("Key '%s' has unsupported DataId override", key)
1698 failedDataId.add(key)
1699 elif key.dataId["instrument"] not in instruments:
1700 if logFailures:
1701 log.critical("Key '%s' has unknown instrument", key)
1702 failedDataId.add(key)
1704 messages = []
1706 if datastoreErrorStr:
1707 messages.append(datastoreErrorStr)
1709 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1710 (failedDataId, "Keys with bad DataId entries: ")):
1711 if failed:
1712 msg += ", ".join(str(k) for k in failed)
1713 messages.append(msg)
1715 if messages:
1716 raise ValidationError(";\n".join(messages))
1718 @property
1719 def collections(self) -> CollectionSearch:
1720 """The collections to search by default, in order (`CollectionSearch`).
1722 This is an alias for ``self.registry.defaults.collections``. It cannot
1723 be set directly in isolation, but all defaults may be changed together
1724 by assigning a new `RegistryDefaults` instance to
1725 ``self.registry.defaults``.
1726 """
1727 return self.registry.defaults.collections
1729 @property
1730 def run(self) -> Optional[str]:
1731 """Name of the run this butler writes outputs to by default (`str` or
1732 `None`).
1734 This is an alias for ``self.registry.defaults.run``. It cannot be set
1735 directly in isolation, but all defaults may be changed together by
1736 assigning a new `RegistryDefaults` instance to
1737 ``self.registry.defaults``.
1738 """
1739 return self.registry.defaults.run
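# A hedged sketch of changing the defaults backing the two properties
# above: as their docstrings note, all defaults are replaced together by
# assigning a new RegistryDefaults instance. The collection and run names
# are placeholders; see RegistryDefaults for the full constructor
# signature.
#
#     butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
#                                                 run="u/someone/run")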
1741 registry: Registry
1742 """The object that manages dataset metadata and relationships (`Registry`).
1744 Most operations that don't involve reading or writing butler datasets are
1745 accessible only via `Registry` methods.
1746 """
1748 datastore: Datastore
1749 """The object that manages actual dataset storage (`Datastore`).
1751 Direct user access to the datastore should rarely be necessary; the primary
1752 exception is the case where a `Datastore` implementation provides extra
1753 functionality beyond what the base class defines.
1754 """
1756 storageClasses: StorageClassFactory
1757 """An object that maps known storage class names to objects that fully
1758 describe them (`StorageClassFactory`).
1759 """