Coverage for python/lsst/daf/butler/_butler.py: 10%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41from collections import defaultdict
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImportType
65from lsst.utils.introspection import get_class_of
66from lsst.utils.logging import VERBOSE, getLogger
68from ._butlerConfig import ButlerConfig
69from ._butlerRepoIndex import ButlerRepoIndex
70from ._deferredDatasetHandle import DeferredDatasetHandle
71from .core import (
72 AmbiguousDatasetError,
73 ButlerURI,
74 Config,
75 ConfigSubset,
76 DataCoordinate,
77 DataId,
78 DataIdValue,
79 DatasetRef,
80 DatasetType,
81 Datastore,
82 Dimension,
83 DimensionConfig,
84 FileDataset,
85 Progress,
86 StorageClassFactory,
87 Timespan,
88 ValidationError,
89)
90from .core.repoRelocation import BUTLER_ROOT_TAG
91from .core.utils import transactional
92from .registry import (
93 CollectionSearch,
94 CollectionType,
95 ConflictingDefinitionError,
96 DatasetIdGenEnum,
97 Registry,
98 RegistryConfig,
99 RegistryDefaults,
100)
101from .transfers import RepoExportContext
103log = getLogger(__name__)
106class ButlerValidationError(ValidationError):
107 """There is a problem with the Butler configuration."""
109 pass
112class PruneCollectionsArgsError(TypeError):
113 """Base class for errors relating to Butler.pruneCollections input
114 arguments.
115 """
117 pass
120class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
121 """Raised when purge and unstore are both required to be True, and
122 purge is True but unstore is False.
123 """
125 def __init__(self) -> None:
126 super().__init__("Cannot pass purge=True without unstore=True.")
129class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
130 """Raised when pruning a RUN collection but purge is False."""
132 def __init__(self, collectionType: CollectionType):
133 self.collectionType = collectionType
134 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
137class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
138 """Raised when purge is True but is not supported for the given
139 collection."""
141 def __init__(self, collectionType: CollectionType):
142 self.collectionType = collectionType
143 super().__init__(
144 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
145 )
148class Butler:
149 """Main entry point for the data access system.
151 Parameters
152 ----------
153 config : `ButlerConfig`, `Config` or `str`, optional.
154 Configuration. Anything acceptable to the
155 `ButlerConfig` constructor. If a directory path
156 is given the configuration will be read from a ``butler.yaml`` file in
157 that location. If `None` is given default values will be used.
158 butler : `Butler`, optional.
159 If provided, construct a new Butler that uses the same registry and
160 datastore as the given one, but with the given collection and run.
161 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
162 arguments.
163 collections : `str` or `Iterable` [ `str` ], optional
164 An expression specifying the collections to be searched (in order) when
165 reading datasets.
166 This may be a `str` collection name or an iterable thereof.
167 See :ref:`daf_butler_collection_expressions` for more information.
168 These collections are not registered automatically and must be
169 manually registered before they are used by any method, but they may be
170 manually registered after the `Butler` is initialized.
171 run : `str`, optional
172 Name of the `~CollectionType.RUN` collection new datasets should be
173 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
174 ``collections`` will be set to ``[run]``. If not `None`, this
175 collection will automatically be registered. If this is not set (and
176 ``writeable`` is not set either), a read-only butler will be created.
177 searchPaths : `list` of `str`, optional
178 Directory paths to search when calculating the full Butler
179 configuration. Not used if the supplied config is already a
180 `ButlerConfig`.
181 writeable : `bool`, optional
182 Explicitly sets whether the butler supports write operations. If not
183 provided, a read-write butler is created if ``run`` is non-empty, and a
184 read-only butler otherwise.
185 inferDefaults : `bool`, optional
186 If `True` (default) infer default data ID values from the values
187 present in the datasets in ``collections``: if all collections have the
188 same value (or no value) for a governor dimension, that value will be
189 the default for that dimension. Nonexistent collections are ignored.
190 If a default value is provided explicitly for a governor dimension via
191 ``**kwargs``, no default will be inferred for that dimension.
192 **kwargs : `str`
193 Default data ID key-value pairs. These may only identify "governor"
194 dimensions like ``instrument`` and ``skymap``.
196 Examples
197 --------
198 While there are many ways to control exactly how a `Butler` interacts with
199 the collections in its `Registry`, the most common cases are still simple.
201 For a read-only `Butler` that searches one collection, do::
203 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
205 For a read-write `Butler` that writes to and reads from a
206 `~CollectionType.RUN` collection::
208 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
210 The `Butler` passed to a ``PipelineTask`` is often much more complex,
211 because we want to write to one `~CollectionType.RUN` collection but read
212 from several others (as well)::
214 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
215 collections=["u/alice/DM-50000/a",
216 "u/bob/DM-49998",
217 "HSC/defaults"])
219 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
220 Datasets will be read first from that run (since it appears first in the
221 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
223 Finally, one can always create a `Butler` with no collections::
225 butler = Butler("/path/to/repo", writeable=True)
227 This can be extremely useful when you just want to use ``butler.registry``,
228 e.g. for inserting dimension data or managing collections, or when the
229 collections you want to use with the butler are not consistent.
230 Passing ``writeable`` explicitly here is only necessary if you want to be
231 able to make changes to the repo - usually the value for ``writeable`` can
232 be guessed from the collection arguments provided, but it defaults to
233 `False` when there are no collection arguments.
234 """
236 def __init__(
237 self,
238 config: Union[Config, str, None] = None,
239 *,
240 butler: Optional[Butler] = None,
241 collections: Any = None,
242 run: Optional[str] = None,
243 searchPaths: Optional[List[str]] = None,
244 writeable: Optional[bool] = None,
245 inferDefaults: bool = True,
246 **kwargs: str,
247 ):
248 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
249 # Load registry, datastore, etc. from config or existing butler.
250 if butler is not None:
251 if config is not None or searchPaths is not None or writeable is not None:
252 raise TypeError(
253 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
254 )
255 self.registry = butler.registry.copy(defaults)
256 self.datastore = butler.datastore
257 self.storageClasses = butler.storageClasses
258 self._config: ButlerConfig = butler._config
259 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
260 else:
261 self._config = ButlerConfig(config, searchPaths=searchPaths)
262 try:
263 if "root" in self._config:
264 butlerRoot = self._config["root"]
265 else:
266 butlerRoot = self._config.configDir
267 if writeable is None:
268 writeable = run is not None
269 self.registry = Registry.fromConfig(
270 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
271 )
272 self.datastore = Datastore.fromConfig(
273 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
274 )
275 self.storageClasses = StorageClassFactory()
276 self.storageClasses.addFromConfig(self._config)
277 self._allow_put_of_predefined_dataset = self._config.get(
278 "allow_put_of_predefined_dataset", False
279 )
280 except Exception:
281 # Failures here usually mean that configuration is incomplete,
282 # just issue an error message which includes config file URI.
283 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
284 raise
286 if "run" in self._config or "collection" in self._config:
287 raise ValueError("Passing a run or collection via configuration is no longer supported.")
289 GENERATION: ClassVar[int] = 3
290 """This is a Generation 3 Butler.
292 This attribute may be removed in the future, once the Generation 2 Butler
293 interface has been fully retired; it should only be used in transitional
294 code.
295 """
297 @classmethod
298 def get_repo_uri(cls, label: str) -> ButlerURI:
299 """Look up the label in a butler repository index.
301 Parameters
302 ----------
303 label : `str`
304 Label of the Butler repository to look up.
306 Returns
307 -------
308 uri : `ButlerURI`
309 URI to the Butler repository associated with the given label.
311 Raises
312 ------
313 KeyError
314 Raised if the label is not found in the index, or if an index
315 can not be found at all.
317 Notes
318 -----
319 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
320 information is discovered.
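        Examples
        --------
        A minimal sketch, assuming the index defines a label ``"main"`` (the
        label is illustrative; use `Butler.get_known_repos` to list the real
        ones)::
            uri = Butler.get_repo_uri("main")
            butler = Butler(str(uri))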
321 """
322 return ButlerRepoIndex.get_repo_uri(label)
324 @classmethod
325 def get_known_repos(cls) -> Set[str]:
326 """Retrieve the list of known repository labels.
328 Returns
329 -------
330 repos : `set` of `str`
331 All the known labels. Can be empty if no index can be found.
333 Notes
334 -----
335 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
336 information is discovered.
337 """
338 return ButlerRepoIndex.get_known_repos()
340 @staticmethod
341 def makeRepo(
342 root: str,
343 config: Union[Config, str, None] = None,
344 dimensionConfig: Union[Config, str, None] = None,
345 standalone: bool = False,
346 searchPaths: Optional[List[str]] = None,
347 forceConfigRoot: bool = True,
348 outfile: Optional[str] = None,
349 overwrite: bool = False,
350 ) -> Config:
351 """Create an empty data repository by adding a butler.yaml config
352 to a repository root directory.
354 Parameters
355 ----------
356 root : `str` or `ButlerURI`
357 Path or URI to the root location of the new repository. Will be
358 created if it does not exist.
359 config : `Config` or `str`, optional
360 Configuration to write to the repository, after setting any
361 root-dependent Registry or Datastore config options. Can not
362 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
363 configuration will be used. Root-dependent config options
364 specified in this config are overwritten if ``forceConfigRoot``
365 is `True`.
366 dimensionConfig : `Config` or `str`, optional
367 Configuration for dimensions, will be used to initialize registry
368 database.
369 standalone : `bool`
370 If True, write all expanded defaults, not just customized or
371 repository-specific settings.
372 This (mostly) decouples the repository from the default
373 configuration, insulating it from changes to the defaults (which
374 may be good or bad, depending on the nature of the changes).
375 Future *additions* to the defaults will still be picked up when
376 initializing `Butlers` to repos created with ``standalone=True``.
377 searchPaths : `list` of `str`, optional
378 Directory paths to search when calculating the full butler
379 configuration.
380 forceConfigRoot : `bool`, optional
381 If `False`, any values present in the supplied ``config`` that
382 would normally be reset are not overridden and will appear
383 directly in the output config. This allows non-standard overrides
384 of the root directory for a datastore or registry to be given.
385 If this parameter is `True` the values for ``root`` will be
386 forced into the resulting config if appropriate.
387 outfile : `str`, optional
388 If not-`None`, the output configuration will be written to this
389 location rather than into the repository itself. Can be a URI
390 string. Can refer to a directory that will be used to write
391 ``butler.yaml``.
392 overwrite : `bool`, optional
393 Create a new configuration file even if one already exists
394 in the specified output location. Default is to raise
395 an exception.
397 Returns
398 -------
399 config : `Config`
400 The updated `Config` instance written to the repo.
402 Raises
403 ------
404 ValueError
405 Raised if a ButlerConfig or ConfigSubset is passed instead of a
406 regular Config (as these subclasses would make it impossible to
407 support ``standalone=False``).
408 FileExistsError
409 Raised if the output config file already exists.
410 os.error
411 Raised if the directory does not exist, exists but is not a
412 directory, or cannot be created.
414 Notes
415 -----
416 Note that when ``standalone=False`` (the default), the configuration
417 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
418 construct the repository should also be used to construct any Butlers
419 to avoid configuration inconsistencies.
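        Examples
        --------
        A minimal sketch that creates a repository with the default
        configuration and then opens it (the path is illustrative)::
            config = Butler.makeRepo("/path/to/new/repo")
            butler = Butler("/path/to/new/repo", writeable=True)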
420 """
421 if isinstance(config, (ButlerConfig, ConfigSubset)):
422 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
424 # Ensure that the root of the repository exists or can be made
425 uri = ButlerURI(root, forceDirectory=True)
426 uri.mkdir()
428 config = Config(config)
430 # If we are creating a new repo from scratch with relative roots,
431 # do not propagate an explicit root from the config file
432 if "root" in config:
433 del config["root"]
435 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
436 imported_class = doImportType(full["datastore", "cls"])
437 if not issubclass(imported_class, Datastore):
438 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
439 datastoreClass: Type[Datastore] = imported_class
440 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
442 # if key exists in given config, parse it, otherwise parse the defaults
443 # in the expanded config
444 if config.get(("registry", "db")):
445 registryConfig = RegistryConfig(config)
446 else:
447 registryConfig = RegistryConfig(full)
448 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
449 if defaultDatabaseUri is not None:
450 Config.updateParameters(
451 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
452 )
453 else:
454 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
456 if standalone:
457 config.merge(full)
458 else:
459 # Always expand the registry.managers section into the per-repo
460 # config, because after the database schema is created, it's not
461 # allowed to change anymore. Note that in the standalone=True
462 # branch, _everything_ in the config is expanded, so there's no
463 # need to special case this.
464 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
465 configURI: Union[str, ButlerURI]
466 if outfile is not None:
467 # When writing to a separate location we must include
468 # the root of the butler repo in the config else it won't know
469 # where to look.
470 config["root"] = uri.geturl()
471 configURI = outfile
472 else:
473 configURI = uri
474 config.dumpToUri(configURI, overwrite=overwrite)
476 # Create Registry and populate tables
477 registryConfig = RegistryConfig(config.get("registry"))
478 dimensionConfig = DimensionConfig(dimensionConfig)
479 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
481 log.verbose("Wrote new Butler configuration file to %s", configURI)
483 return config
485 @classmethod
486 def _unpickle(
487 cls,
488 config: ButlerConfig,
489 collections: Optional[CollectionSearch],
490 run: Optional[str],
491 defaultDataId: Dict[str, str],
492 writeable: bool,
493 ) -> Butler:
494 """Callable used to unpickle a Butler.
496 We prefer not to use ``Butler.__init__`` directly so we can force some
497 of its many arguments to be keyword-only (note that ``__reduce__``
498 can only invoke callables with positional arguments).
500 Parameters
501 ----------
502 config : `ButlerConfig`
503 Butler configuration, already coerced into a true `ButlerConfig`
504 instance (and hence after any search paths for overrides have been
505 utilized).
506 collections : `CollectionSearch`
507 Names of the default collections to read from.
508 run : `str`, optional
509 Name of the default `~CollectionType.RUN` collection to write to.
510 defaultDataId : `dict` [ `str`, `str` ]
511 Default data ID values.
512 writeable : `bool`
513 Whether the Butler should support write operations.
515 Returns
516 -------
517 butler : `Butler`
518 A new `Butler` instance.
519 """
520 # MyPy doesn't recognize that the kwargs below are totally valid; it
521 # seems to think ``**defaultDataId`` is a _positional_ argument!
522 return cls(
523 config=config,
524 collections=collections,
525 run=run,
526 writeable=writeable,
527 **defaultDataId, # type: ignore
528 )
530 def __reduce__(self) -> tuple:
531 """Support pickling."""
532 return (
533 Butler._unpickle,
534 (
535 self._config,
536 self.collections,
537 self.run,
538 self.registry.defaults.dataId.byName(),
539 self.registry.isWriteable(),
540 ),
541 )
543 def __str__(self) -> str:
544 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
545 self.collections, self.run, self.datastore, self.registry
546 )
548 def isWriteable(self) -> bool:
549 """Return `True` if this `Butler` supports write operations."""
550 return self.registry.isWriteable()
552 @contextlib.contextmanager
553 def transaction(self) -> Iterator[None]:
554 """Context manager supporting `Butler` transactions.
556 Transactions can be nested.
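        For example, both `put` calls below are rolled back together if the
        second one raises (the dataset types and data ID are illustrative)::
            with butler.transaction():
                butler.put(calexp, "calexp", dataId)
                with butler.transaction():
                    butler.put(background, "calexpBackground", dataId)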
557 """
558 with self.registry.transaction():
559 with self.datastore.transaction():
560 yield
562 def _standardizeArgs(
563 self,
564 datasetRefOrType: Union[DatasetRef, DatasetType, str],
565 dataId: Optional[DataId] = None,
566 **kwargs: Any,
567 ) -> Tuple[DatasetType, Optional[DataId]]:
568 """Standardize the arguments passed to several Butler APIs.
570 Parameters
571 ----------
572 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
573 When `DatasetRef` the `dataId` should be `None`.
574 Otherwise the `DatasetType` or name thereof.
575 dataId : `dict` or `DataCoordinate`
576 A `dict` of `Dimension` link name, value pairs that label the
577 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
578 should be provided as the second argument.
579 **kwargs
580 Additional keyword arguments used to augment or construct a
581 `DataCoordinate`. See `DataCoordinate.standardize`
582 parameters.
584 Returns
585 -------
586 datasetType : `DatasetType`
587 A `DatasetType` instance extracted from ``datasetRefOrType``.
588 dataId : `dict` or `DataId`, optional
589 Argument that can be used (along with ``kwargs``) to construct a
590 `DataId`.
592 Notes
593 -----
594 Butler APIs that conceptually need a DatasetRef also allow passing a
595 `DatasetType` (or the name of one) and a `DataId` (or a dict and
596 keyword arguments that can be used to construct one) separately. This
597 method accepts those arguments and always returns a true `DatasetType`
598 and a `DataId` or `dict`.
600 Standardization of `dict` vs `DataId` is best handled by passing the
601 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
602 generally similarly flexible.
603 """
604 externalDatasetType: Optional[DatasetType] = None
605 internalDatasetType: Optional[DatasetType] = None
606 if isinstance(datasetRefOrType, DatasetRef):
607 if dataId is not None or kwargs:
608 raise ValueError("DatasetRef given, cannot use dataId as well")
609 externalDatasetType = datasetRefOrType.datasetType
610 dataId = datasetRefOrType.dataId
611 else:
612 # Don't check whether DataId is provided, because Registry APIs
613 # can usually construct a better error message when it wasn't.
614 if isinstance(datasetRefOrType, DatasetType):
615 externalDatasetType = datasetRefOrType
616 else:
617 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
619 # Check that they are self-consistent
620 if externalDatasetType is not None:
621 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
622 if externalDatasetType != internalDatasetType:
623 raise ValueError(
624 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
625 f"registry definition ({internalDatasetType})"
626 )
628 assert internalDatasetType is not None
629 return internalDatasetType, dataId
631 def _rewrite_data_id(
632 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
633 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
634 """Rewrite a data ID taking into account dimension records.
636 Take a Data ID and keyword args and rewrite it if necessary to
637 allow the user to specify dimension records rather than dimension
638 primary values.
640 This allows a user to include a dataId dict with keys of
641 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
642 the integer exposure ID. It also allows a string to be given
643 for a dimension value rather than the integer ID if that is more
644 convenient. For example, rather than having to specify the
645 detector with ``detector.full_name``, a string given for ``detector``
646 will be interpreted as the full name and converted to the integer
647 value.
649 Keyword arguments can also use strings for dimensions like detector
650 and exposure, but Python does not allow them to include ``.``, so
651 the ``exposure.day_obs`` syntax cannot be used in a keyword
652 argument.
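        As an illustrative sketch (the values are invented), a data ID given
        as::
            {"instrument": "HSC", "exposure.day_obs": 20210405, "exposure.seq_num": 42}
        would be rewritten to the equivalent of::
            {"instrument": "HSC", "exposure": <matching exposure ID>}
        by querying the exposure dimension records.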
654 Parameters
655 ----------
656 dataId : `dict` or `DataCoordinate`
657 A `dict` of `Dimension` link name, value pairs that will label the
658 `DatasetRef` within a Collection.
659 datasetType : `DatasetType`
660 The dataset type associated with this dataId. Required to
661 determine the relevant dimensions.
662 **kwargs
663 Additional keyword arguments used to augment or construct a
664 `DataId`. See `DataId` parameters.
666 Returns
667 -------
668 dataId : `dict` or `DataCoordinate`
669 The possibly rewritten dataId. If given a `DataCoordinate` and
670 no keyword arguments, the original dataId will be returned
671 unchanged.
672 **kwargs : `dict`
673 Any unused keyword arguments.
674 """
675 # Do nothing if we have a standalone DataCoordinate.
676 if isinstance(dataId, DataCoordinate) and not kwargs:
677 return dataId, kwargs
679 # Process dimension records that are using record information
680 # rather than ids
681 newDataId: Dict[str, DataIdValue] = {}
682 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
684 # If all of the dataId comes from keyword parameters we do not need
685 # to do anything here because they cannot be of the form
686 # exposure.obs_id, since a "." is not allowed in a keyword parameter.
687 if dataId:
688 for k, v in dataId.items():
689 # If we have a Dimension we do not need to do anything
690 # because it cannot be a compound key.
691 if isinstance(k, str) and "." in k:
692 # Someone is using a more human-readable dataId
693 dimensionName, record = k.split(".", 1)
694 byRecord[dimensionName][record] = v
695 elif isinstance(k, Dimension):
696 newDataId[k.name] = v
697 else:
698 newDataId[k] = v
700 # Go through the updated dataId and check the type in case someone is
701 # using an alternate key. We have already filtered out the compound
702 # keys in dimension.record format.
703 not_dimensions = {}
705 # Will need to look in the dataId and the keyword arguments
706 # and will remove them if they need to be fixed or are unrecognized.
707 for dataIdDict in (newDataId, kwargs):
708 # Use a list so we can adjust the dict safely in the loop
709 for dimensionName in list(dataIdDict):
710 value = dataIdDict[dimensionName]
711 try:
712 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
713 except KeyError:
714 # This is not a real dimension
715 not_dimensions[dimensionName] = value
716 del dataIdDict[dimensionName]
717 continue
719 # Convert an integral type to an explicit int to simplify
720 # comparisons here
721 if isinstance(value, numbers.Integral):
722 value = int(value)
724 if not isinstance(value, dimension.primaryKey.getPythonType()):
725 for alternate in dimension.alternateKeys:
726 if isinstance(value, alternate.getPythonType()):
727 byRecord[dimensionName][alternate.name] = value
728 del dataIdDict[dimensionName]
729 log.debug(
730 "Converting dimension %s to %s.%s=%s",
731 dimensionName,
732 dimensionName,
733 alternate.name,
734 value,
735 )
736 break
737 else:
738 log.warning(
739 "Type mismatch found for value '%r' provided for dimension %s. "
740 "Could not find matching alternative (primary key has type %s) "
741 "so attempting to use as-is.",
742 value,
743 dimensionName,
744 dimension.primaryKey.getPythonType(),
745 )
747 # If we have some unrecognized dimensions we have to try to connect
748 # them to records in other dimensions. This is made more complicated
749 # by some dimensions having records with clashing names. A mitigation
750 # is that we can tell by this point which dimensions are missing
751 # for the DatasetType but this does not work for calibrations
752 # where additional dimensions can be used to constrain the temporal
753 # axis.
754 if not_dimensions:
755 # Calculate missing dimensions
756 provided = set(newDataId) | set(kwargs) | set(byRecord)
757 missingDimensions = datasetType.dimensions.names - provided
759 # For calibrations we may well need temporal dimensions, so
760 # rather than always including all dimensions in the scan,
761 # restrict things a little. It is still possible for there
762 # to be confusion over day_obs in visit vs exposure for example.
763 # If we are not searching calibration collections things may
764 # fail but they are going to fail anyway because of the
765 # ambiguity of the dataId...
766 candidateDimensions: Set[str] = set()
767 candidateDimensions.update(missingDimensions)
768 if datasetType.isCalibration():
769 for dim in self.registry.dimensions.getStaticDimensions():
770 if dim.temporal:
771 candidateDimensions.add(str(dim))
773 # Look up table for the first association with a dimension
774 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
776 # Keep track of whether an item is associated with multiple
777 # dimensions.
778 counter: Counter[str] = Counter()
779 assigned: Dict[str, Set[str]] = defaultdict(set)
781 # Go through the missing dimensions and associate the
782 # given names with records within those dimensions
783 matched_dims = set()
784 for dimensionName in candidateDimensions:
785 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
786 fields = dimension.metadata.names | dimension.uniqueKeys.names
787 for field in not_dimensions:
788 if field in fields:
789 guessedAssociation[dimensionName][field] = not_dimensions[field]
790 counter[dimensionName] += 1
791 assigned[field].add(dimensionName)
792 matched_dims.add(field)
794 # Calculate the fields that matched nothing.
795 never_found = set(not_dimensions) - matched_dims
797 if never_found:
798 raise ValueError(f"Unrecognized keyword args given: {never_found}")
800 # There is a chance we have allocated a single dataId item
801 # to multiple dimensions. Need to decide which should be retained.
802 # For now assume that the most popular alternative wins.
803 # This means that day_obs with seq_num will result in
804 # exposure.day_obs and not visit.day_obs.
805 # Also prefer an explicitly missing dimension over an inferred
806 # temporal dimension.
807 for fieldName, assignedDimensions in assigned.items():
808 if len(assignedDimensions) > 1:
809 # Pick the most popular (preferring mandatory dimensions)
810 requiredButMissing = assignedDimensions.intersection(missingDimensions)
811 if requiredButMissing:
812 candidateDimensions = requiredButMissing
813 else:
814 candidateDimensions = assignedDimensions
816 # Select the relevant items and get a new restricted
817 # counter.
818 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
819 duplicatesCounter: Counter[str] = Counter()
820 duplicatesCounter.update(theseCounts)
822 # Choose the most common. If they are equally common
823 # we will pick the one that was found first.
824 # Returns a list of tuples
825 selected = duplicatesCounter.most_common(1)[0][0]
827 log.debug(
828 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
829 " Removed ambiguity by choosing dimension %s.",
830 fieldName,
831 ", ".join(assignedDimensions),
832 selected,
833 )
835 for candidateDimension in assignedDimensions:
836 if candidateDimension != selected:
837 del guessedAssociation[candidateDimension][fieldName]
839 # Update the record look up dict with the new associations
840 for dimensionName, values in guessedAssociation.items():
841 if values: # A dict might now be empty
842 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
843 byRecord[dimensionName].update(values)
845 if byRecord:
846 # Some record specifiers were found so we need to convert
847 # them to the Id form
848 for dimensionName, values in byRecord.items():
849 if dimensionName in newDataId:
850 log.warning(
851 "DataId specified explicit %s dimension value of %s in addition to"
852 " general record specifiers for it of %s. Ignoring record information.",
853 dimensionName,
854 newDataId[dimensionName],
855 str(values),
856 )
857 continue
859 # Build up a WHERE expression
860 bind = {k: v for k, v in values.items()}
861 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
863 # Hopefully we get a single record that matches
864 records = set(
865 self.registry.queryDimensionRecords(
866 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
867 )
868 )
870 if len(records) != 1:
871 if len(records) > 1:
872 log.debug("Received %d records from constraints of %s", len(records), str(values))
873 for r in records:
874 log.debug("- %s", str(r))
875 raise RuntimeError(
876 f"DataId specification for dimension {dimensionName} is not"
877 f" uniquely constrained to a single dataset by {values}."
878 f" Got {len(records)} results."
879 )
880 raise RuntimeError(
881 f"DataId specification for dimension {dimensionName} matched no"
882 f" records when constrained by {values}"
883 )
885 # Get the primary key from the real dimension object
886 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
887 if not isinstance(dimension, Dimension):
888 raise RuntimeError(
889 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
890 )
891 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
893 # We have modified the dataId so need to switch to it
894 dataId = newDataId
896 return dataId, kwargs
898 def _findDatasetRef(
899 self,
900 datasetRefOrType: Union[DatasetRef, DatasetType, str],
901 dataId: Optional[DataId] = None,
902 *,
903 collections: Any = None,
904 allowUnresolved: bool = False,
905 **kwargs: Any,
906 ) -> DatasetRef:
907 """Shared logic for methods that start with a search for a dataset in
908 the registry.
910 Parameters
911 ----------
912 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
913 When `DatasetRef` the `dataId` should be `None`.
914 Otherwise the `DatasetType` or name thereof.
915 dataId : `dict` or `DataCoordinate`, optional
916 A `dict` of `Dimension` link name, value pairs that label the
917 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
918 should be provided as the first argument.
919 collections : Any, optional
920 Collections to be searched, overriding ``self.collections``.
921 Can be any of the types supported by the ``collections`` argument
922 to butler construction.
923 allowUnresolved : `bool`, optional
924 If `True`, return an unresolved `DatasetRef` if finding a resolved
925 one in the `Registry` fails. Defaults to `False`.
926 **kwargs
927 Additional keyword arguments used to augment or construct a
928 `DataId`. See `DataId` parameters.
930 Returns
931 -------
932 ref : `DatasetRef`
933 A reference to the dataset identified by the given arguments.
935 Raises
936 ------
937 LookupError
938 Raised if no matching dataset exists in the `Registry` (and
939 ``allowUnresolved is False``).
940 ValueError
941 Raised if a resolved `DatasetRef` was passed as an input, but it
942 differs from the one found in the registry.
943 TypeError
944 Raised if no collections were provided.
945 """
946 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
947 if isinstance(datasetRefOrType, DatasetRef):
948 idNumber = datasetRefOrType.id
949 else:
950 idNumber = None
951 timespan: Optional[Timespan] = None
953 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
955 if datasetType.isCalibration():
956 # Because this is a calibration dataset, first try to
957 # standardize the data ID without restricting the dimensions to
958 # those of the dataset type requested, because there may be extra
959 # dimensions that provide temporal information for a validity-range
960 # lookup.
961 dataId = DataCoordinate.standardize(
962 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
963 )
964 if dataId.graph.temporal:
965 dataId = self.registry.expandDataId(dataId)
966 timespan = dataId.timespan
967 else:
968 # Standardize the data ID to just the dimensions of the dataset
969 # type instead of letting registry.findDataset do it, so we get the
970 # result even if no dataset is found.
971 dataId = DataCoordinate.standardize(
972 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
973 )
974 # Always look up the DatasetRef, even if one is given, to ensure it is
975 # present in the current collection.
976 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
977 if ref is None:
978 if allowUnresolved:
979 return DatasetRef(datasetType, dataId)
980 else:
981 if collections is None:
982 collections = self.registry.defaults.collections
983 raise LookupError(
984 f"Dataset {datasetType.name} with data ID {dataId} "
985 f"could not be found in collections {collections}."
986 )
987 if idNumber is not None and idNumber != ref.id:
988 if collections is None:
989 collections = self.registry.defaults.collections
990 raise ValueError(
991 f"DatasetRef.id provided ({idNumber}) does not match "
992 f"id ({ref.id}) in registry in collections {collections}."
993 )
994 return ref
996 @transactional
997 def put(
998 self,
999 obj: Any,
1000 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1001 dataId: Optional[DataId] = None,
1002 *,
1003 run: Optional[str] = None,
1004 **kwargs: Any,
1005 ) -> DatasetRef:
1006 """Store and register a dataset.
1008 Parameters
1009 ----------
1010 obj : `object`
1011 The dataset.
1012 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1013 When `DatasetRef` is provided, ``dataId`` should be `None`.
1014 Otherwise the `DatasetType` or name thereof.
1015 dataId : `dict` or `DataCoordinate`
1016 A `dict` of `Dimension` link name, value pairs that label the
1017 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1018 should be provided as the second argument.
1019 run : `str`, optional
1020 The name of the run the dataset should be added to, overriding
1021 ``self.run``.
1022 **kwargs
1023 Additional keyword arguments used to augment or construct a
1024 `DataCoordinate`. See `DataCoordinate.standardize`
1025 parameters.
1027 Returns
1028 -------
1029 ref : `DatasetRef`
1030 A reference to the stored dataset, updated with the correct id if
1031 given.
1033 Raises
1034 ------
1035 TypeError
1036 Raised if the butler is read-only or if no run has been provided.
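        Examples
        --------
        A minimal sketch of typical use, assuming a butler constructed with a
        ``run`` and an already-registered dataset type (all names and data ID
        values are illustrative)::
            butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
            ref = butler.put(catalog, "sourceTable", instrument="HSC", visit=12345)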
1037 """
1038 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1039 if not self.isWriteable():
1040 raise TypeError("Butler is read-only.")
1041 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1042 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1043 raise ValueError("DatasetRef must not be in registry, must have None id")
1045 # Handle dimension records in dataId
1046 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1048 # Add Registry Dataset entry.
1049 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1051 # For an execution butler the datasets will be pre-defined.
1052 # If the butler is configured that way, datasets should only be inserted
1053 # if they do not already exist in registry. Trying and catching
1054 # ConflictingDefinitionError will not work because the transaction
1055 # will be corrupted. Instead, in this mode always check first.
1056 ref = None
1057 ref_is_predefined = False
1058 if self._allow_put_of_predefined_dataset:
1059 # Get the matching ref for this run.
1060 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1062 if ref:
1063 # Must be expanded form for datastore templating
1064 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1065 ref = ref.expanded(dataId)
1066 ref_is_predefined = True
1068 if not ref:
1069 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1071 # If the ref is predefined it is possible that the datastore also
1072 # has the record. Asking datastore to put it again will result in
1073 # the artifact being recreated, overwriting the previous one, and then
1074 # a failure in writing the record, which will cause the artifact
1075 # to be removed. Much safer to ask first before attempting to
1076 # overwrite. Race conditions should not be an issue for the
1077 # execution butler environment.
1078 if ref_is_predefined:
1079 if self.datastore.knows(ref):
1080 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
1082 self.datastore.put(obj, ref)
1084 return ref
1086 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1087 """Retrieve a stored dataset.
1089 Unlike `Butler.get`, this method allows datasets outside the Butler's
1090 collection to be read as long as the `DatasetRef` that identifies them
1091 can be obtained separately.
1093 Parameters
1094 ----------
1095 ref : `DatasetRef`
1096 Resolved reference to an already stored dataset.
1097 parameters : `dict`
1098 Additional StorageClass-defined options to control reading,
1099 typically used to efficiently read only a subset of the dataset.
1101 Returns
1102 -------
1103 obj : `object`
1104 The dataset.
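        Notes
        -----
        For example, resolved references obtained from a registry query can be
        read directly, bypassing the default collection search (the query
        arguments are illustrative)::
            refs = butler.registry.queryDatasets(
                "calexp", collections="u/alice/DM-50000/a", instrument="HSC", visit=12345
            )
            for ref in refs:
                calexp = butler.getDirect(ref)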
1105 """
1106 return self.datastore.get(ref, parameters=parameters)
1108 def getDirectDeferred(
1109 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
1110 ) -> DeferredDatasetHandle:
1111 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1112 from a resolved `DatasetRef`.
1114 Parameters
1115 ----------
1116 ref : `DatasetRef`
1117 Resolved reference to an already stored dataset.
1118 parameters : `dict`
1119 Additional StorageClass-defined options to control reading,
1120 typically used to efficiently read only a subset of the dataset.
1122 Returns
1123 -------
1124 obj : `DeferredDatasetHandle`
1125 A handle which can be used to retrieve a dataset at a later time.
1127 Raises
1128 ------
1129 AmbiguousDatasetError
1130 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1131 """
1132 if ref.id is None:
1133 raise AmbiguousDatasetError(
1134 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1135 )
1136 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1138 def getDeferred(
1139 self,
1140 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1141 dataId: Optional[DataId] = None,
1142 *,
1143 parameters: Union[dict, None] = None,
1144 collections: Any = None,
1145 **kwargs: Any,
1146 ) -> DeferredDatasetHandle:
1147 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1148 after an immediate registry lookup.
1150 Parameters
1151 ----------
1152 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1153 When `DatasetRef` the `dataId` should be `None`.
1154 Otherwise the `DatasetType` or name thereof.
1155 dataId : `dict` or `DataCoordinate`, optional
1156 A `dict` of `Dimension` link name, value pairs that label the
1157 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1158 should be provided as the first argument.
1159 parameters : `dict`
1160 Additional StorageClass-defined options to control reading,
1161 typically used to efficiently read only a subset of the dataset.
1162 collections : Any, optional
1163 Collections to be searched, overriding ``self.collections``.
1164 Can be any of the types supported by the ``collections`` argument
1165 to butler construction.
1166 **kwargs
1167 Additional keyword arguments used to augment or construct a
1168 `DataId`. See `DataId` parameters.
1170 Returns
1171 -------
1172 obj : `DeferredDatasetHandle`
1173 A handle which can be used to retrieve a dataset at a later time.
1175 Raises
1176 ------
1177 LookupError
1178 Raised if no matching dataset exists in the `Registry` (and
1179 ``allowUnresolved is False``).
1180 ValueError
1181 Raised if a resolved `DatasetRef` was passed as an input, but it
1182 differs from the one found in the registry.
1183 TypeError
1184 Raised if no collections were provided.
1185 """
1186 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1187 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1189 def get(
1190 self,
1191 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1192 dataId: Optional[DataId] = None,
1193 *,
1194 parameters: Optional[Dict[str, Any]] = None,
1195 collections: Any = None,
1196 **kwargs: Any,
1197 ) -> Any:
1198 """Retrieve a stored dataset.
1200 Parameters
1201 ----------
1202 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1203 When `DatasetRef` the `dataId` should be `None`.
1204 Otherwise the `DatasetType` or name thereof.
1205 dataId : `dict` or `DataCoordinate`
1206 A `dict` of `Dimension` link name, value pairs that label the
1207 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1208 should be provided as the first argument.
1209 parameters : `dict`
1210 Additional StorageClass-defined options to control reading,
1211 typically used to efficiently read only a subset of the dataset.
1212 collections : Any, optional
1213 Collections to be searched, overriding ``self.collections``.
1214 Can be any of the types supported by the ``collections`` argument
1215 to butler construction.
1216 **kwargs
1217 Additional keyword arguments used to augment or construct a
1218 `DataCoordinate`. See `DataCoordinate.standardize`
1219 parameters.
1221 Returns
1222 -------
1223 obj : `object`
1224 The dataset.
1226 Raises
1227 ------
1228 ValueError
1229 Raised if a resolved `DatasetRef` was passed as an input, but it
1230 differs from the one found in the registry.
1231 LookupError
1232 Raised if no matching dataset exists in the `Registry`.
1233 TypeError
1234 Raised if no collections were provided.
1236 Notes
1237 -----
1238 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1239 this method requires that the given data ID include temporal dimensions
1240 beyond the dimensions of the dataset type itself, in order to find the
1241 dataset with the appropriate validity range. For example, a "bias"
1242 dataset with native dimensions ``{instrument, detector}`` could be
1243 fetched with a ``{instrument, detector, exposure}`` data ID, because
1244 ``exposure`` is a temporal dimension.
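        For example (the data ID values are illustrative)::
            bias = butler.get("bias", instrument="HSC", detector=50, exposure=12345)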
1245 """
1246 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1247 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1248 return self.getDirect(ref, parameters=parameters)
1250 def getURIs(
1251 self,
1252 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1253 dataId: Optional[DataId] = None,
1254 *,
1255 predict: bool = False,
1256 collections: Any = None,
1257 run: Optional[str] = None,
1258 **kwargs: Any,
1259 ) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1260 """Returns the URIs associated with the dataset.
1262 Parameters
1263 ----------
1264 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1265 When `DatasetRef` the `dataId` should be `None`.
1266 Otherwise the `DatasetType` or name thereof.
1267 dataId : `dict` or `DataCoordinate`
1268 A `dict` of `Dimension` link name, value pairs that label the
1269 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1270 should be provided as the first argument.
1271 predict : `bool`
1272 If `True`, allow URIs to be returned for datasets that have not
1273 been written.
1274 collections : Any, optional
1275 Collections to be searched, overriding ``self.collections``.
1276 Can be any of the types supported by the ``collections`` argument
1277 to butler construction.
1278 run : `str`, optional
1279 Run to use for predictions, overriding ``self.run``.
1280 **kwargs
1281 Additional keyword arguments used to augment or construct a
1282 `DataCoordinate`. See `DataCoordinate.standardize`
1283 parameters.
1285 Returns
1286 -------
1287 primary : `ButlerURI`
1288 The URI to the primary artifact associated with this dataset.
1289 If the dataset was disassembled within the datastore this
1290 may be `None`.
1291 components : `dict`
1292 URIs to any components associated with the dataset artifact.
1293 Can be empty if there are no components.
1294 """
1295 ref = self._findDatasetRef(
1296 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1297 )
1298 if ref.id is None: # only possible if predict is True
1299 if run is None:
1300 run = self.run
1301 if run is None:
1302 raise TypeError("Cannot predict location with run=None.")
1303 # Lie about ID, because we can't guess it, and only
1304 # Datastore.getURIs() will ever see it (and it doesn't use it).
1305 ref = ref.resolved(id=0, run=run)
1306 return self.datastore.getURIs(ref, predict)
1308 def getURI(
1309 self,
1310 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1311 dataId: Optional[DataId] = None,
1312 *,
1313 predict: bool = False,
1314 collections: Any = None,
1315 run: Optional[str] = None,
1316 **kwargs: Any,
1317 ) -> ButlerURI:
1318 """Return the URI to the Dataset.
1320 Parameters
1321 ----------
1322 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1323 When `DatasetRef` the `dataId` should be `None`.
1324 Otherwise the `DatasetType` or name thereof.
1325 dataId : `dict` or `DataCoordinate`
1326 A `dict` of `Dimension` link name, value pairs that label the
1327 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1328 should be provided as the first argument.
1329 predict : `bool`
1330 If `True`, allow URIs to be returned for datasets that have not
1331 been written.
1332 collections : Any, optional
1333 Collections to be searched, overriding ``self.collections``.
1334 Can be any of the types supported by the ``collections`` argument
1335 to butler construction.
1336 run : `str`, optional
1337 Run to use for predictions, overriding ``self.run``.
1338 **kwargs
1339 Additional keyword arguments used to augment or construct a
1340 `DataCoordinate`. See `DataCoordinate.standardize`
1341 parameters.
1343 Returns
1344 -------
1345 uri : `ButlerURI`
1346 URI pointing to the Dataset within the datastore. If the
1347 Dataset does not exist in the datastore, and if ``predict`` is
1348 `True`, the URI will be a prediction and will include a URI
1349 fragment "#predicted".
1350 If the datastore does not have entities that relate well
1351 to the concept of a URI, the returned URI string will be
1352 descriptive. The returned URI is not guaranteed to be obtainable.
1354 Raises
1355 ------
1356 LookupError
1357 Raised if a URI has been requested for a dataset that does not
1358 exist and guessing is not allowed.
1359 ValueError
1360 Raised if a resolved `DatasetRef` was passed as an input, but it
1361 differs from the one found in the registry.
1362 TypeError
1363 Raised if no collections were provided.
1364 RuntimeError
1365 Raised if a URI is requested for a dataset that consists of
1366 multiple artifacts.
1367 """
1368 primary, components = self.getURIs(
1369 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1370 )
1372 if primary is None or components:
1373 raise RuntimeError(
1374 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1375 "Use Butler.getURIs() instead."
1376 )
1377 return primary
1379 def retrieveArtifacts(
1380 self,
1381 refs: Iterable[DatasetRef],
1382 destination: Union[str, ButlerURI],
1383 transfer: str = "auto",
1384 preserve_path: bool = True,
1385 overwrite: bool = False,
1386 ) -> List[ButlerURI]:
1387 """Retrieve the artifacts associated with the supplied refs.
1389 Parameters
1390 ----------
1391 refs : iterable of `DatasetRef`
1392 The datasets for which artifacts are to be retrieved.
1393 A single ref can result in multiple artifacts. The refs must
1394 be resolved.
1395 destination : `ButlerURI` or `str`
1396 Location to write the artifacts.
1397 transfer : `str`, optional
1398 Method to use to transfer the artifacts. Must be one of the options
1399 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1400 preserve_path : `bool`, optional
1401 If `True` the full path of the artifact within the datastore
1402 is preserved. If `False` the final file component of the path
1403 is used.
1404 overwrite : `bool`, optional
1405 If `True` allow transfers to overwrite existing files at the
1406 destination.
1408 Returns
1409 -------
1410 targets : `list` of `ButlerURI`
1411 URIs of file artifacts in destination location. Order is not
1412 preserved.
1414 Notes
1415 -----
1416 For non-file datastores the artifacts written to the destination
1417 may not match the representation inside the datastore. For example
1418 a hierarchical data structure in a NoSQL database may well be stored
1419 as a JSON file.
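        A minimal sketch of typical use (collection, data ID, and destination
        are illustrative)::
            refs = butler.registry.queryDatasets(
                "raw", collections="HSC/raw/all", instrument="HSC", exposure=12345
            )
            butler.retrieveArtifacts(refs, destination="/tmp/export", transfer="copy")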
1420 """
1421 return self.datastore.retrieveArtifacts(
1422 refs, ButlerURI(destination), transfer=transfer, preserve_path=preserve_path, overwrite=overwrite
1423 )
1425 def datasetExists(
1426 self,
1427 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1428 dataId: Optional[DataId] = None,
1429 *,
1430 collections: Any = None,
1431 **kwargs: Any,
1432 ) -> bool:
1433 """Return True if the Dataset is actually present in the Datastore.
1435 Parameters
1436 ----------
1437 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1438 When `DatasetRef` the `dataId` should be `None`.
1439 Otherwise the `DatasetType` or name thereof.
1440 dataId : `dict` or `DataCoordinate`
1441 A `dict` of `Dimension` link name, value pairs that label the
1442 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1443 should be provided as the first argument.
1444 collections : Any, optional
1445 Collections to be searched, overriding ``self.collections``.
1446 Can be any of the types supported by the ``collections`` argument
1447 to butler construction.
1448 **kwargs
1449 Additional keyword arguments used to augment or construct a
1450 `DataCoordinate`. See `DataCoordinate.standardize`
1451 parameters.
1453 Raises
1454 ------
1455 LookupError
1456 Raised if the dataset is not even present in the Registry.
1457 ValueError
1458 Raised if a resolved `DatasetRef` was passed as an input, but it
1459 differs from the one found in the registry.
1460 TypeError
1461 Raised if no collections were provided.
1462 """
1463 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1464 return self.datastore.exists(ref)
1466 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1467 """Remove one or more `~CollectionType.RUN` collections and the
1468 datasets within them.
1470 Parameters
1471 ----------
1472 names : `Iterable` [ `str` ]
1473 The names of the collections to remove.
1474 unstore : `bool`, optional
1475 If `True` (default), delete datasets from all datastores in which
1476 they are present, and attempt to roll back the registry deletions if
1477 datastore deletions fail (which may not always be possible). If
1478 `False`, datastore records for these datasets are still removed,
1479 but any artifacts (e.g. files) will not be.
1481 Raises
1482 ------
1483 TypeError
1484 Raised if one or more collections are not of type
1485 `~CollectionType.RUN`.
1486 """
1487 if not self.isWriteable():
1488 raise TypeError("Butler is read-only.")
1489 names = list(names)
1490 refs: List[DatasetRef] = []
1491 for name in names:
1492 collectionType = self.registry.getCollectionType(name)
1493 if collectionType is not CollectionType.RUN:
1494 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1495 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1496 with self.registry.transaction():
1497 if unstore:
1498 self.datastore.trash(refs)
1499 else:
1500 self.datastore.forget(refs)
1501 for name in names:
1502 self.registry.removeCollection(name)
1503 if unstore:
1504 # Point of no return for removing artifacts
1505 self.datastore.emptyTrash()
1507 def pruneCollection(
1508 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1509 ) -> None:
1510 """Remove a collection and possibly prune datasets within it.
1512 Parameters
1513 ----------
1514 name : `str`
1515 Name of the collection to remove. If this is a
1516 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1517 datasets within the collection are not modified unless ``unstore``
1518 is `True`. If this is a `~CollectionType.RUN` collection,
1519 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1520 are fully removed from the data repository.
1521 purge : `bool`, optional
1522 If `True`, permit `~CollectionType.RUN` collections to be removed,
1523 fully removing datasets within them. Requires ``unstore=True`` as
1524 well as an added precaution against accidental deletion. Must be
1525 `False` (default) if the collection is not a ``RUN``.
1526 unstore : `bool`, optional
1527 If `True`, remove all datasets in the collection from all
1528 datastores in which they appear.
1529 unlink : `list` [`str`], optional
1530 Before removing the given collection, unlink it from these
1531 parent collections.
1533 Raises
1534 ------
1535 TypeError
1536 Raised if the butler is read-only or arguments are mutually
1537 inconsistent.
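Examples
--------
Minimal sketches with hypothetical collection names. Removing a RUN
collection and its datasets requires both ``purge`` and ``unstore``;
a CHAINED or TAGGED collection can be removed without them::

    butler.pruneCollection("u/alice/run1", purge=True, unstore=True)
    butler.pruneCollection("u/alice/chain", unlink=["u/alice/parent"])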
1538 """
1539 # See pruneDatasets comments for more information about the logic here;
1540 # the cases are almost the same, but here we can rely on Registry to
1541 # take care of everything but Datastore deletion when we remove the
1542 # collection.
1543 if not self.isWriteable():
1544 raise TypeError("Butler is read-only.")
1545 collectionType = self.registry.getCollectionType(name)
1546 if purge and not unstore:
1547 raise PurgeWithoutUnstorePruneCollectionsError()
1548 if collectionType is CollectionType.RUN and not purge:
1549 raise RunWithoutPurgePruneCollectionsError(collectionType)
1550 if collectionType is not CollectionType.RUN and purge:
1551 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1553 def remove(child: str, parent: str) -> None:
1554 """Remove a child collection from a parent collection."""
1555 # Remove child from parent.
1556 chain = list(self.registry.getCollectionChain(parent))
1557 try:
1558 chain.remove(child)
1559 except ValueError as e:
1560 raise RuntimeError(f"{child} is not a child of {parent}") from e
1561 self.registry.setCollectionChain(parent, chain)
1563 with self.registry.transaction():
1564 if unlink:
1565 for parent in unlink:
1566 remove(name, parent)
1567 if unstore:
1568 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1569 self.datastore.trash(refs)
1570 self.registry.removeCollection(name)
1572 if unstore:
1573 # Point of no return for removing artifacts
1574 self.datastore.emptyTrash()
1576 def pruneDatasets(
1577 self,
1578 refs: Iterable[DatasetRef],
1579 *,
1580 disassociate: bool = True,
1581 unstore: bool = False,
1582 tags: Iterable[str] = (),
1583 purge: bool = False,
1584 run: Optional[str] = None,
1585 ) -> None:
1586 """Remove one or more datasets from a collection and/or storage.
1588 Parameters
1589 ----------
1590 refs : `~collections.abc.Iterable` of `DatasetRef`
1591 Datasets to prune. These must be "resolved" references (not just
1592 a `DatasetType` and data ID).
1593 disassociate : `bool`, optional
1594 Disassociate pruned datasets from ``tags``, or from all collections
1595 if ``purge=True``.
1596 unstore : `bool`, optional
1597 If `True` (`False` is default) remove these datasets from all
1598 datastores known to this butler. Note that this will make it
1599 impossible to retrieve these datasets even via other collections.
1600 Datasets that are already not stored are ignored by this option.
1601 tags : `Iterable` [ `str` ], optional
1602 `~CollectionType.TAGGED` collections to disassociate the datasets
1603 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1604 `True`.
1605 purge : `bool`, optional
1606 If `True` (`False` is default), completely remove the dataset from
1607 the `Registry`. To prevent accidental deletions, ``purge`` may
1608 only be `True` if all of the following conditions are met:
1610 - All given datasets are in the given run;
1611 - ``disassociate`` is `True`;
1612 - ``unstore`` is `True`.
1614 This mode may remove provenance information from datasets other
1615 than those provided, and should be used with extreme care.
1617 Raises
1618 ------
1619 TypeError
1620 Raised if the butler is read-only, if no collection was provided,
1621 or the conditions for ``purge=True`` were not met.
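Examples
--------
A minimal sketch, assuming a writeable butler and hypothetical dataset
type and TAGGED collection names; the datasets are removed from all
datastores and disassociated from the tagged collection::

    refs = butler.registry.queryDatasets("calexp",
                                         collections="u/alice/tagged")
    butler.pruneDatasets(refs, unstore=True, disassociate=True,
                         tags=["u/alice/tagged"])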
1622 """
1623 if not self.isWriteable():
1624 raise TypeError("Butler is read-only.")
1625 if purge:
1626 if not disassociate:
1627 raise TypeError("Cannot pass purge=True without disassociate=True.")
1628 if not unstore:
1629 raise TypeError("Cannot pass purge=True without unstore=True.")
1630 elif disassociate:
1631 tags = tuple(tags)
1632 if not tags:
1633 raise TypeError("No tags provided but disassociate=True.")
1634 for tag in tags:
1635 collectionType = self.registry.getCollectionType(tag)
1636 if collectionType is not CollectionType.TAGGED:
1637 raise TypeError(
1638 f"Cannot disassociate from collection '{tag}' "
1639 f"of non-TAGGED type {collectionType.name}."
1640 )
1641 # Transform possibly-single-pass iterable into something we can iterate
1642 # over multiple times.
1643 refs = list(refs)
1644 # Pruning a component of a DatasetRef makes no sense since registry
1645 # doesn't know about components and datastore might not store
1646 # components in a separate file
1647 for ref in refs:
1648 if ref.datasetType.component():
1649 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1650 # We don't need an unreliable Datastore transaction for this, because
1651 # we've been extra careful to ensure that Datastore.trash only involves
1652 # mutating the Registry (it can _look_ at Datastore-specific things,
1653 # but shouldn't change them), and hence all operations here are
1654 # Registry operations.
1655 with self.registry.transaction():
1656 if unstore:
1657 self.datastore.trash(refs)
1658 if purge:
1659 self.registry.removeDatasets(refs)
1660 elif disassociate:
1661 assert tags, "Guaranteed by earlier logic in this function."
1662 for tag in tags:
1663 self.registry.disassociate(tag, refs)
1664 # We've exited the Registry transaction, and apparently committed.
1665 # (if there was an exception, everything rolled back, and it's as if
1666 # nothing happened - and we never get here).
1667 # Datastore artifacts are not yet gone, but they're clearly marked
1668 # as trash, so if we fail to delete now because of (e.g.) filesystem
1669 # problems we can try again later, and if manual administrative
1670 # intervention is required, it's pretty clear what that should entail:
1671 # deleting everything on disk and in private Datastore tables that is
1672 # in the dataset_location_trash table.
1673 if unstore:
1674 # Point of no return for removing artifacts
1675 self.datastore.emptyTrash()
1677 @transactional
1678 def ingest(
1679 self,
1680 *datasets: FileDataset,
1681 transfer: Optional[str] = "auto",
1682 run: Optional[str] = None,
1683 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1684 ) -> None:
1685 """Store and register one or more datasets that already exist on disk.
1687 Parameters
1688 ----------
1689 datasets : `FileDataset`
1690 Each positional argument is a struct containing information about
1691 a file to be ingested, including its URI (either absolute or
1692 relative to the datastore root, if applicable), a `DatasetRef`,
1693 and optionally a formatter class or its fully-qualified string
1694 name. If a formatter is not provided, the formatter that would be
1695 used for `put` is assumed. On successful return, all
1696 `FileDataset.ref` attributes will have their `DatasetRef.id`
1697 attribute populated and all `FileDataset.formatter` attributes will
1698 be set to the formatter class used. `FileDataset.path` attributes
1699 may be modified to put paths in whatever the datastore considers a
1700 standardized form.
1701 transfer : `str`, optional
1702 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1703 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1704 transfer the file.
1705 run : `str`, optional
1706 The name of the run ingested datasets should be added to,
1707 overriding ``self.run``.
1708 idGenerationMode : `DatasetIdGenEnum`, optional
1709 Specifies option for generating dataset IDs. By default unique IDs
1710 are generated for each inserted dataset.
1712 Raises
1713 ------
1714 TypeError
1715 Raised if the butler is read-only or if no run was provided.
1716 NotImplementedError
1717 Raised if the `Datastore` does not support the given transfer mode.
1718 DatasetTypeNotSupportedError
1719 Raised if one or more files to be ingested have a dataset type that
1720 is not supported by the `Datastore`.
1721 FileNotFoundError
1722 Raised if one of the given files does not exist.
1723 FileExistsError
1724 Raised if transfer is not `None` but the (internal) location the
1725 file would be moved to is already occupied.
1727 Notes
1728 -----
1729 This operation is not fully exception safe: if a database operation
1730 fails, the given `FileDataset` instances may be only partially updated.
1732 It is atomic in terms of database operations (they will either all
1733 succeed or all fail) provided the database engine implements
1734 transactions correctly. It will attempt to be atomic in terms of
1735 filesystem operations as well, but this cannot be implemented
1736 rigorously for most datastores.
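Examples
--------
A minimal sketch, assuming ``datasetType`` is a registered
`DatasetType`, ``dataId`` is a valid data ID for it, and the file path
and run name are hypothetical::

    dataset = FileDataset(path="/data/raw_0001.fits",
                          refs=DatasetRef(datasetType, dataId))
    butler.ingest(dataset, transfer="copy", run="u/alice/ingest")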
1737 """
1738 if not self.isWriteable():
1739 raise TypeError("Butler is read-only.")
1740 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1741 # Reorganize the inputs so they're grouped by DatasetType and then
1742 # data ID. We also include a list of DatasetRefs for each FileDataset
1743 # to hold the resolved DatasetRefs returned by the Registry, before
1744 # it's safe to swap them into FileDataset.refs.
1745 # Some type annotation aliases to make that clearer:
1746 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1747 GroupedData = MutableMapping[DatasetType, GroupForType]
1748 # The actual data structure:
1749 groupedData: GroupedData = defaultdict(dict)
1750 # And the nested loop that populates it:
1751 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1752 # This list is intentionally shared across the inner loop, since it's
1753 # associated with `dataset`.
1754 resolvedRefs: List[DatasetRef] = []
1756 # Somewhere to store pre-existing refs if we have an
1757 # execution butler.
1758 existingRefs: List[DatasetRef] = []
1760 for ref in dataset.refs:
1761 if ref.dataId in groupedData[ref.datasetType]:
1762 raise ConflictingDefinitionError(
1763 f"Ingest conflict. Dataset {dataset.path} has same"
1764 " DataId as other ingest dataset"
1765 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1766 f" ({ref.dataId})"
1767 )
1768 if self._allow_put_of_predefined_dataset:
1769 existing_ref = self.registry.findDataset(
1770 ref.datasetType, dataId=ref.dataId, collections=run
1771 )
1772 if existing_ref:
1773 if self.datastore.knows(existing_ref):
1774 raise ConflictingDefinitionError(
1775 f"Dataset associated with path {dataset.path}"
1776 f" already exists as {existing_ref}."
1777 )
1778 # Store this ref elsewhere since it already exists
1779 # and we do not want to remake it but we do want
1780 # to store it in the datastore.
1781 existingRefs.append(existing_ref)
1783 # Nothing else to do until we have finished
1784 # iterating.
1785 continue
1787 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1789 if existingRefs:
1791 if len(dataset.refs) != len(existingRefs):
1792 # Keeping track of partially pre-existing datasets is hard
1793 # and should generally never happen. For now don't allow
1794 # it.
1795 raise ConflictingDefinitionError(
1796 f"For dataset {dataset.path} some dataIds already exist"
1797 " in registry but others do not. This is not supported."
1798 )
1800 # Attach the resolved refs if we found them.
1801 dataset.refs = existingRefs
1803 # Now we can bulk-insert into Registry for each DatasetType.
1804 for datasetType, groupForType in progress.iter_item_chunks(
1805 groupedData.items(), desc="Bulk-inserting datasets by type"
1806 ):
1807 refs = self.registry.insertDatasets(
1808 datasetType,
1809 dataIds=groupForType.keys(),
1810 run=run,
1811 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1812 idGenerationMode=idGenerationMode,
1813 )
1814 # Append those resolved DatasetRefs to the new lists we set up for
1815 # them.
1816 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1817 resolvedRefs.append(ref)
1819 # Go back to the original FileDatasets to replace their refs with the
1820 # new resolved ones.
1821 for groupForType in progress.iter_chunks(
1822 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1823 ):
1824 for dataset, resolvedRefs in groupForType.values():
1825 dataset.refs = resolvedRefs
1827 # Bulk-insert everything into Datastore.
1828 self.datastore.ingest(*datasets, transfer=transfer)
1830 @contextlib.contextmanager
1831 def export(
1832 self,
1833 *,
1834 directory: Optional[str] = None,
1835 filename: Optional[str] = None,
1836 format: Optional[str] = None,
1837 transfer: Optional[str] = None,
1838 ) -> Iterator[RepoExportContext]:
1839 """Export datasets from the repository represented by this `Butler`.
1841 This method is a context manager that returns a helper object
1842 (`RepoExportContext`) that is used to indicate what information from
1843 the repository should be exported.
1845 Parameters
1846 ----------
1847 directory : `str`, optional
1848 Directory dataset files should be written to if ``transfer`` is not
1849 `None`.
1850 filename : `str`, optional
1851 Name for the file that will include database information associated
1852 with the exported datasets. If this is not an absolute path and
1853 ``directory`` is not `None`, it will be written to ``directory``
1854 instead of the current working directory. Defaults to
1855 "export.{format}".
1856 format : `str`, optional
1857 File format for the database information file. If `None`, the
1858 extension of ``filename`` will be used.
1859 transfer : `str`, optional
1860 Transfer mode passed to `Datastore.export`.
1862 Raises
1863 ------
1864 TypeError
1865 Raised if the set of arguments passed is inconsistent.
1867 Examples
1868 --------
1869 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1870 methods are used to provide the iterables over data IDs and/or datasets
1871 to be exported::
1873 with butler.export(filename="exports.yaml") as export:
1874 # Export all flats, but none of the dimension element rows
1875 # (i.e. data ID information) associated with them.
1876 export.saveDatasets(butler.registry.queryDatasets("flat"),
1877 elements=())
1878 # Export all datasets that start with "deepCoadd_" and all of
1879 # their associated data ID information.
1880 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1881 """
1882 if directory is None and transfer is not None:
1883 raise TypeError("Cannot transfer without providing a directory.")
1884 if transfer == "move":
1885 raise TypeError("Transfer may not be 'move': export is read-only")
1886 if format is None:
1887 if filename is None:
1888 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1889 else:
1890 _, format = os.path.splitext(filename)
1891 elif filename is None:
1892 filename = f"export.{format}"
1893 if directory is not None:
1894 filename = os.path.join(directory, filename)
1895 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
1896 with open(filename, "w") as stream:
1897 backend = BackendClass(stream)
1898 try:
1899 helper = RepoExportContext(
1900 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
1901 )
1902 yield helper
1903 except BaseException:
1904 raise
1905 else:
1906 helper._finish()
1908 def import_(
1909 self,
1910 *,
1911 directory: Optional[str] = None,
1912 filename: Union[str, TextIO, None] = None,
1913 format: Optional[str] = None,
1914 transfer: Optional[str] = None,
1915 skip_dimensions: Optional[Set] = None,
1916 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1917 reuseIds: bool = False,
1918 ) -> None:
1919 """Import datasets into this repository that were exported from a
1920 different butler repository via `~lsst.daf.butler.Butler.export`.
1922 Parameters
1923 ----------
1924 directory : `str`, optional
1925 Directory containing dataset files to import from. If `None`,
1926 ``filename`` and all dataset file paths specified therein must
1927 be absolute.
1928 filename : `str` or `TextIO`, optional
1929 A stream or name of file that contains database information
1930 associated with the exported datasets, typically generated by
1931 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1932 is not an absolute path, does not exist in the current working
1933 directory, and ``directory`` is not `None`, it is assumed to be in
1934 ``directory``. Defaults to "export.{format}".
1935 format : `str`, optional
1936 File format for ``filename``. If `None`, the extension of
1937 ``filename`` will be used.
1938 transfer : `str`, optional
1939 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1940 skip_dimensions : `set`, optional
1941 Names of dimensions that should be skipped and not imported.
1942 idGenerationMode : `DatasetIdGenEnum`, optional
1943 Specifies option for generating dataset IDs when IDs are not
1944 provided or their type does not match backend type. By default
1945 unique IDs are generated for each inserted dataset.
1946 reuseIds : `bool`, optional
1947 If `True`, force re-use of imported dataset IDs for integer
1948 IDs, which are normally generated as auto-incremented; an exception
1949 will be raised if imported IDs clash with existing ones. This
1950 option has no effect on globally-unique IDs, which are
1951 always re-used (or generated if integer IDs are being imported).
1953 Raises
1954 ------
1955 TypeError
1956 Raised if the set of arguments passed is inconsistent, or if the
1957 butler is read-only.
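Examples
--------
A minimal sketch, assuming an export file and directory produced by a
previous call to `~lsst.daf.butler.Butler.export` (the paths are
hypothetical)::

    butler.import_(directory="exports", filename="export.yaml",
                   transfer="symlink")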
1958 """
1959 if not self.isWriteable():
1960 raise TypeError("Butler is read-only.")
1961 if format is None:
1962 if filename is None:
1963 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1964 else:
1965 _, format = os.path.splitext(filename) # type: ignore
1966 elif filename is None:
1967 filename = f"export.{format}"
1968 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1969 filename = os.path.join(directory, filename)
1970 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
1972 def doImport(importStream: TextIO) -> None:
1973 backend = BackendClass(importStream, self.registry)
1974 backend.register()
1975 with self.transaction():
1976 backend.load(
1977 self.datastore,
1978 directory=directory,
1979 transfer=transfer,
1980 skip_dimensions=skip_dimensions,
1981 idGenerationMode=idGenerationMode,
1982 reuseIds=reuseIds,
1983 )
1985 if isinstance(filename, str):
1986 with open(filename, "r") as stream:
1987 doImport(stream)
1988 else:
1989 doImport(filename)
1991 def transfer_from(
1992 self,
1993 source_butler: Butler,
1994 source_refs: Iterable[DatasetRef],
1995 transfer: str = "auto",
1996 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
1997 skip_missing: bool = True,
1998 register_dataset_types: bool = False,
1999 ) -> List[DatasetRef]:
2000 """Transfer datasets to this Butler from a run in another Butler.
2002 Parameters
2003 ----------
2004 source_butler : `Butler`
2005 Butler from which the datasets are to be transferred.
2006 source_refs : iterable of `DatasetRef`
2007 Datasets defined in the source butler that should be transferred to
2008 this butler.
2009 transfer : `str`, optional
2010 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2011 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
2012 A mapping of dataset type to ID generation mode. Only used if
2013 the source butler is using integer IDs. Should not be used
2014 if this receiving butler uses integer IDs. If not provided,
2015 unique IDs are always generated on import.
2016 skip_missing : `bool`
2017 If `True`, datasets with no datastore artifact associated with
2018 them are not transferred. If `False`, a registry entry will be
2019 created even if no datastore record is created (and so will
2020 look equivalent to the dataset being unstored).
2021 register_dataset_types : `bool`
2022 If `True`, any missing dataset types are registered. Otherwise
2023 an exception is raised.
2025 Returns
2026 -------
2027 refs : `list` of `DatasetRef`
2028 The refs added to this Butler.
2030 Notes
2031 -----
2032 Requires that any dimension definitions are already present in the
2033 receiving Butler. The datastore artifact has to exist for a transfer
2034 to be made but non-existence is not an error.
2036 Datasets that already exist in this run will be skipped.
2038 The datasets are imported as part of a transaction, although
2039 dataset types are registered before the transaction is started.
2040 This means that it is possible for a dataset type to be registered
2041 even though transfer has failed.
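Examples
--------
A minimal sketch with a hypothetical source repository, dataset type,
and collection; missing dataset types are registered in this butler::

    source = Butler("/path/to/source/repo")
    refs = source.registry.queryDatasets("calexp",
                                         collections="HSC/runs/example")
    butler.transfer_from(source, refs, transfer="copy",
                         register_dataset_types=True)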
2042 """
2043 if not self.isWriteable():
2044 raise TypeError("Butler is read-only.")
2045 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2047 # Will iterate through the refs multiple times so need to convert
2048 # to a list if this isn't a collection.
2049 if not isinstance(source_refs, collections.abc.Collection):
2050 source_refs = list(source_refs)
2052 original_count = len(source_refs)
2053 log.info("Transferring %d datasets into %s", original_count, str(self))
2055 if id_gen_map is None:
2056 id_gen_map = {}
2058 # In some situations the datastore artifact may be missing
2059 # and we do not want that registry entry to be imported.
2060 # Asking the datastore is not sufficient: the records may have been
2061 # purged, so we have to ask for the (predicted) URI and check
2062 # existence explicitly. Execution butler is set up exactly like
2063 # this with no datastore records.
2064 artifact_existence: Dict[ButlerURI, bool] = {}
2065 if skip_missing:
2066 dataset_existence = source_butler.datastore.mexists(
2067 source_refs, artifact_existence=artifact_existence
2068 )
2069 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2070 filtered_count = len(source_refs)
2071 log.verbose(
2072 "%d datasets removed because the artifact does not exist. Now have %d.",
2073 original_count - filtered_count,
2074 filtered_count,
2075 )
2077 # Importing requires that we group the refs by dataset type and run
2078 # before doing the import.
2079 source_dataset_types = set()
2080 grouped_refs = defaultdict(list)
2081 grouped_indices = defaultdict(list)
2082 for i, ref in enumerate(source_refs):
2083 grouped_refs[ref.datasetType, ref.run].append(ref)
2084 grouped_indices[ref.datasetType, ref.run].append(i)
2085 source_dataset_types.add(ref.datasetType)
2087 # Check to see if the dataset type in the source butler has
2088 # the same definition in the target butler and register missing
2089 # ones if requested. Registration must happen outside a transaction.
2090 newly_registered_dataset_types = set()
2091 for datasetType in source_dataset_types:
2092 if register_dataset_types:
2093 # Let this raise immediately if inconsistent. Continuing
2094 # on to find additional inconsistent dataset types
2095 # might result in additional unwanted dataset types being
2096 # registered.
2097 if self.registry.registerDatasetType(datasetType):
2098 newly_registered_dataset_types.add(datasetType)
2099 else:
2100 # If the dataset type is missing, let it fail immediately.
2101 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2102 if target_dataset_type != datasetType:
2103 raise ConflictingDefinitionError(
2104 "Source butler dataset type differs from definition"
2105 f" in target butler: {datasetType} !="
2106 f" {target_dataset_type}"
2107 )
2108 if newly_registered_dataset_types:
2109 # We may have registered some even if there were inconsistencies
2110 # but should let people know (or else remove them again).
2111 log.log(
2112 VERBOSE,
2113 "Registered the following dataset types in the target Butler: %s",
2114 ", ".join(d.name for d in newly_registered_dataset_types),
2115 )
2116 else:
2117 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2119 # The returned refs should be identical for UUIDs.
2120 # For now must also support integers and so need to retain the
2121 # newly-created refs from this registry.
2122 # Pre-size it so we can assign refs into the correct slots
2123 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2124 default_id_gen = DatasetIdGenEnum.UNIQUE
2126 handled_collections: Set[str] = set()
2128 # Do all the importing in a single transaction.
2129 with self.transaction():
2130 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2131 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2132 ):
2133 if run not in handled_collections:
2134 run_doc = source_butler.registry.getCollectionDocumentation(run)
2135 registered = self.registry.registerRun(run, doc=run_doc)
2136 handled_collections.add(run)
2137 if registered:
2138 log.log(VERBOSE, "Creating output run %s", run)
2140 id_generation_mode = default_id_gen
2141 if isinstance(refs_to_import[0].id, int):
2142 # ID generation mode might need to be overridden when
2143 # targeting UUID
2144 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2146 n_refs = len(refs_to_import)
2147 log.verbose(
2148 "Importing %d ref%s of dataset type %s into run %s",
2149 n_refs,
2150 "" if n_refs == 1 else "s",
2151 datasetType.name,
2152 run,
2153 )
2155 # No way to know if this butler's registry uses UUID.
2156 # We have to trust the caller on this. If it fails they will
2157 # have to change their approach. We can't catch the exception
2158 # and retry with unique because that will mess up the
2159 # transaction handling. We aren't allowed to ask the registry
2160 # manager what type of ID it is using.
2161 imported_refs = self.registry._importDatasets(
2162 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2163 )
2165 # Map them into the correct slots to match the initial order
2166 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2167 transferred_refs_tmp[i] = ref
2169 # Mypy insists that we might have None in here so we have to make
2170 # that explicit by assigning to a new variable and filtering out
2171 # something that won't be there.
2172 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2174 # Check consistency
2175 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2177 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2179 # The transferred refs were mapped back into the caller's original
2180 # ordering above; the datastore transfer below relies on the source
2181 # and local refs being in the same order.
2183 # Ask the datastore to transfer. The datastore has to check that
2184 # the source datastore is compatible with the target datastore.
2185 self.datastore.transfer_from(
2186 source_butler.datastore,
2187 source_refs,
2188 local_refs=transferred_refs,
2189 transfer=transfer,
2190 artifact_existence=artifact_existence,
2191 )
2193 return transferred_refs
2195 def validateConfiguration(
2196 self,
2197 logFailures: bool = False,
2198 datasetTypeNames: Optional[Iterable[str]] = None,
2199 ignore: Optional[Iterable[str]] = None,
2200 ) -> None:
2201 """Validate butler configuration.
2203 Checks that each `DatasetType` can be stored in the `Datastore`.
2205 Parameters
2206 ----------
2207 logFailures : `bool`, optional
2208 If `True`, output a log message for every validation error
2209 detected.
2210 datasetTypeNames : iterable of `str`, optional
2211 The `DatasetType` names that should be checked. This allows
2212 only a subset to be selected.
2213 ignore : iterable of `str`, optional
2214 Names of DatasetTypes to skip over. This can be used to skip
2215 known problems. If a named `DatasetType` corresponds to a
2216 composite, all components of that `DatasetType` will also be
2217 ignored.
2219 Raises
2220 ------
2221 ButlerValidationError
2222 Raised if there is some inconsistency with how this Butler
2223 is configured.
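Examples
--------
A minimal sketch that checks a subset of dataset types and skips a
known-problematic one (the names are hypothetical)::

    butler.validateConfiguration(logFailures=True,
                                 datasetTypeNames=["calexp", "src"],
                                 ignore=["raw"])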
2224 """
2225 if datasetTypeNames:
2226 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2227 else:
2228 datasetTypes = list(self.registry.queryDatasetTypes())
2230 # filter out anything from the ignore list
2231 if ignore:
2232 ignore = set(ignore)
2233 datasetTypes = [
2234 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2235 ]
2236 else:
2237 ignore = set()
2239 # Find all the registered instruments
2240 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2242 # For each datasetType that has an instrument dimension, create
2243 # a DatasetRef for each defined instrument
2244 datasetRefs = []
2246 for datasetType in datasetTypes:
2247 if "instrument" in datasetType.dimensions:
2248 for instrument in instruments:
2249 datasetRef = DatasetRef(
2250 datasetType, {"instrument": instrument}, conform=False # type: ignore
2251 )
2252 datasetRefs.append(datasetRef)
2254 entities: List[Union[DatasetType, DatasetRef]] = []
2255 entities.extend(datasetTypes)
2256 entities.extend(datasetRefs)
2258 datastoreErrorStr = None
2259 try:
2260 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2261 except ValidationError as e:
2262 datastoreErrorStr = str(e)
2264 # Also check that the LookupKeys used by the datastores match
2265 # registry and storage class definitions
2266 keys = self.datastore.getLookupKeys()
2268 failedNames = set()
2269 failedDataId = set()
2270 for key in keys:
2271 if key.name is not None:
2272 if key.name in ignore:
2273 continue
2275 # skip if specific datasetType names were requested and this
2276 # name does not match
2277 if datasetTypeNames and key.name not in datasetTypeNames:
2278 continue
2280 # See if it is a StorageClass or a DatasetType
2281 if key.name in self.storageClasses:
2282 pass
2283 else:
2284 try:
2285 self.registry.getDatasetType(key.name)
2286 except KeyError:
2287 if logFailures:
2288 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2289 failedNames.add(key)
2290 else:
2291 # Dimensions are checked for consistency when the Butler
2292 # is created and rendezvoused with a universe.
2293 pass
2295 # Check that the instrument is a valid instrument.
2296 # Currently only the instrument dimension is supported here.
2297 if key.dataId:
2298 dataIdKeys = set(key.dataId)
2299 if set(["instrument"]) != dataIdKeys:
2300 if logFailures:
2301 log.critical("Key '%s' has unsupported DataId override", key)
2302 failedDataId.add(key)
2303 elif key.dataId["instrument"] not in instruments:
2304 if logFailures:
2305 log.critical("Key '%s' has unknown instrument", key)
2306 failedDataId.add(key)
2308 messages = []
2310 if datastoreErrorStr:
2311 messages.append(datastoreErrorStr)
2313 for failed, msg in (
2314 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2315 (failedDataId, "Keys with bad DataId entries: "),
2316 ):
2317 if failed:
2318 msg += ", ".join(str(k) for k in failed)
2319 messages.append(msg)
2321 if messages:
2322 raise ValidationError(";\n".join(messages))
2324 @property
2325 def collections(self) -> CollectionSearch:
2326 """The collections to search by default, in order (`CollectionSearch`).
2328 This is an alias for ``self.registry.defaults.collections``. It cannot
2329 be set directly in isolation, but all defaults may be changed together
2330 by assigning a new `RegistryDefaults` instance to
2331 ``self.registry.defaults``.
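Examples
--------
A minimal sketch of replacing the defaults (the collection and run
names are hypothetical)::

    from lsst.daf.butler.registry import RegistryDefaults
    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/alice/run1"
    )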
2332 """
2333 return self.registry.defaults.collections
2335 @property
2336 def run(self) -> Optional[str]:
2337 """Name of the run this butler writes outputs to by default (`str` or
2338 `None`).
2340 This is an alias for ``self.registry.defaults.run``. It cannot be set
2341 directly in isolation, but all defaults may be changed together by
2342 assigning a new `RegistryDefaults` instance to
2343 ``self.registry.defaults``.
2344 """
2345 return self.registry.defaults.run
2347 registry: Registry
2348 """The object that manages dataset metadata and relationships (`Registry`).
2350 Most operations that don't involve reading or writing butler datasets are
2351 accessible only via `Registry` methods.
2352 """
2354 datastore: Datastore
2355 """The object that manages actual dataset storage (`Datastore`).
2357 Direct user access to the datastore should rarely be necessary; the primary
2358 exception is the case where a `Datastore` implementation provides extra
2359 functionality beyond what the base class defines.
2360 """
2362 storageClasses: StorageClassFactory
2363 """An object that maps known storage class names to objects that fully
2364 describe them (`StorageClassFactory`).
2365 """
2367 _allow_put_of_predefined_dataset: bool
2368 """Allow a put to succeed even if there is already a registry entry for it
2369 but not a datastore record (`bool`)."""