Coverage for python/lsst/daf/butler/_butler.py: 10%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImportType
65from lsst.utils.introspection import get_class_of
66from lsst.utils.logging import getLogger, VERBOSE
67from .core import (
68 AmbiguousDatasetError,
69 ButlerURI,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetType,
77 Datastore,
78 Dimension,
79 DimensionConfig,
80 FileDataset,
81 Progress,
82 StorageClassFactory,
83 Timespan,
84 ValidationError,
85)
86from .core.repoRelocation import BUTLER_ROOT_TAG
87from .core.utils import transactional
88from ._deferredDatasetHandle import DeferredDatasetHandle
89from ._butlerConfig import ButlerConfig
90from ._butlerRepoIndex import ButlerRepoIndex
91from .registry import (
92 Registry,
93 RegistryConfig,
94 RegistryDefaults,
95 CollectionSearch,
96 CollectionType,
97 ConflictingDefinitionError,
98 DatasetIdGenEnum,
99)
100from .transfers import RepoExportContext
102log = getLogger(__name__)
105class ButlerValidationError(ValidationError):
106 """There is a problem with the Butler configuration."""
107 pass
110class PruneCollectionsArgsError(TypeError):
111 """Base class for errors relating to Butler.pruneCollections input
112 arguments.
113 """
114 pass
117class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
118 """Raised when purge and unstore are both required to be True, and
119 purge is True but unstore is False.
120 """
122 def __init__(self) -> None:
123 super().__init__("Cannot pass purge=True without unstore=True.")
126class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
127 """Raised when pruning a RUN collection but purge is False."""
129 def __init__(self, collectionType: CollectionType):
130 self.collectionType = collectionType
131 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
134class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
135 """Raised when purge is True but is not supported for the given
136 collection."""
138 def __init__(self, collectionType: CollectionType):
139 self.collectionType = collectionType
140 super().__init__(
141 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
144class Butler:
145 """Main entry point for the data access system.
147 Parameters
148 ----------
149 config : `ButlerConfig`, `Config` or `str`, optional.
150 Configuration. Anything acceptable to the
151 `ButlerConfig` constructor. If a directory path
152 is given the configuration will be read from a ``butler.yaml`` file in
153 that location. If `None` is given default values will be used.
154 butler : `Butler`, optional.
155 If provided, construct a new Butler that uses the same registry and
156 datastore as the given one, but with the given collection and run.
157 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
158 arguments.
159 collections : `str` or `Iterable` [ `str` ], optional
160 An expression specifying the collections to be searched (in order) when
161 reading datasets.
162 This may be a `str` collection name or an iterable thereof.
163 See :ref:`daf_butler_collection_expressions` for more information.
164 These collections are not registered automatically and must be
165 manually registered before they are used by any method, but they may be
166 manually registered after the `Butler` is initialized.
167 run : `str`, optional
168 Name of the `~CollectionType.RUN` collection new datasets should be
169 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
170 ``collections`` will be set to ``[run]``. If not `None`, this
171 collection will automatically be registered. If this is not set (and
172 ``writeable`` is not set either), a read-only butler will be created.
173 searchPaths : `list` of `str`, optional
174 Directory paths to search when calculating the full Butler
175 configuration. Not used if the supplied config is already a
176 `ButlerConfig`.
177 writeable : `bool`, optional
178 Explicitly sets whether the butler supports write operations. If not
 179 provided, a read-write butler is created if ``run`` is non-empty;
 180 otherwise a read-only butler is created.
181 inferDefaults : `bool`, optional
182 If `True` (default) infer default data ID values from the values
183 present in the datasets in ``collections``: if all collections have the
184 same value (or no value) for a governor dimension, that value will be
185 the default for that dimension. Nonexistent collections are ignored.
186 If a default value is provided explicitly for a governor dimension via
187 ``**kwargs``, no default will be inferred for that dimension.
188 **kwargs : `str`
189 Default data ID key-value pairs. These may only identify "governor"
190 dimensions like ``instrument`` and ``skymap``.
192 Examples
193 --------
194 While there are many ways to control exactly how a `Butler` interacts with
195 the collections in its `Registry`, the most common cases are still simple.
197 For a read-only `Butler` that searches one collection, do::
199 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
201 For a read-write `Butler` that writes to and reads from a
202 `~CollectionType.RUN` collection::
204 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
206 The `Butler` passed to a ``PipelineTask`` is often much more complex,
207 because we want to write to one `~CollectionType.RUN` collection but read
208 from several others (as well)::
210 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
211 collections=["u/alice/DM-50000/a",
212 "u/bob/DM-49998",
213 "HSC/defaults"])
215 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
216 Datasets will be read first from that run (since it appears first in the
217 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
219 Finally, one can always create a `Butler` with no collections::
221 butler = Butler("/path/to/repo", writeable=True)
223 This can be extremely useful when you just want to use ``butler.registry``,
224 e.g. for inserting dimension data or managing collections, or when the
225 collections you want to use with the butler are not consistent.
226 Passing ``writeable`` explicitly here is only necessary if you want to be
227 able to make changes to the repo - usually the value for ``writeable`` can
228 be guessed from the collection arguments provided, but it defaults to
 229 `False` when there are no collection arguments.
230 """
231 def __init__(self, config: Union[Config, str, None] = None, *,
232 butler: Optional[Butler] = None,
233 collections: Any = None,
234 run: Optional[str] = None,
235 searchPaths: Optional[List[str]] = None,
236 writeable: Optional[bool] = None,
237 inferDefaults: bool = True,
238 **kwargs: str,
239 ):
240 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
241 # Load registry, datastore, etc. from config or existing butler.
242 if butler is not None:
243 if config is not None or searchPaths is not None or writeable is not None:
244 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
245 "arguments with 'butler' argument.")
246 self.registry = butler.registry.copy(defaults)
247 self.datastore = butler.datastore
248 self.storageClasses = butler.storageClasses
249 self._config: ButlerConfig = butler._config
250 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
251 else:
252 self._config = ButlerConfig(config, searchPaths=searchPaths)
253 try:
254 if "root" in self._config:
255 butlerRoot = self._config["root"]
256 else:
257 butlerRoot = self._config.configDir
258 if writeable is None:
259 writeable = run is not None
260 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
261 defaults=defaults)
262 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
263 butlerRoot=butlerRoot)
264 self.storageClasses = StorageClassFactory()
265 self.storageClasses.addFromConfig(self._config)
266 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset",
267 False)
268 except Exception:
 269 # Failures here usually mean that the configuration is incomplete;
 270 # just issue an error message that includes the config file URI.
271 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
272 raise
274 if "run" in self._config or "collection" in self._config:
275 raise ValueError("Passing a run or collection via configuration is no longer supported.")
277 GENERATION: ClassVar[int] = 3
278 """This is a Generation 3 Butler.
280 This attribute may be removed in the future, once the Generation 2 Butler
281 interface has been fully retired; it should only be used in transitional
282 code.
283 """
285 @classmethod
286 def get_repo_uri(cls, label: str) -> ButlerURI:
287 """Look up the label in a butler repository index.
289 Parameters
290 ----------
291 label : `str`
292 Label of the Butler repository to look up.
294 Returns
295 -------
296 uri : `ButlerURI`
297 URI to the Butler repository associated with the given label.
299 Raises
300 ------
301 KeyError
302 Raised if the label is not found in the index, or if an index
303 can not be found at all.
305 Notes
306 -----
307 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
308 information is discovered.
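
 Examples
 --------
 A minimal illustrative sketch; the ``"main"`` label is hypothetical and
 must exist in the repository index for the lookup to succeed::

     uri = Butler.get_repo_uri("main")
     print(Butler.get_known_repos())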
309 """
310 return ButlerRepoIndex.get_repo_uri(label)
312 @classmethod
313 def get_known_repos(cls) -> Set[str]:
314 """Retrieve the list of known repository labels.
316 Returns
317 -------
318 repos : `set` of `str`
319 All the known labels. Can be empty if no index can be found.
321 Notes
322 -----
323 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
324 information is discovered.
325 """
326 return ButlerRepoIndex.get_known_repos()
328 @staticmethod
329 def makeRepo(root: str, config: Union[Config, str, None] = None,
330 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
331 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
332 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
333 """Create an empty data repository by adding a butler.yaml config
334 to a repository root directory.
336 Parameters
337 ----------
338 root : `str` or `ButlerURI`
339 Path or URI to the root location of the new repository. Will be
340 created if it does not exist.
341 config : `Config` or `str`, optional
342 Configuration to write to the repository, after setting any
343 root-dependent Registry or Datastore config options. Can not
344 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
345 configuration will be used. Root-dependent config options
346 specified in this config are overwritten if ``forceConfigRoot``
347 is `True`.
348 dimensionConfig : `Config` or `str`, optional
349 Configuration for dimensions, will be used to initialize registry
350 database.
351 standalone : `bool`
352 If True, write all expanded defaults, not just customized or
353 repository-specific settings.
354 This (mostly) decouples the repository from the default
355 configuration, insulating it from changes to the defaults (which
356 may be good or bad, depending on the nature of the changes).
357 Future *additions* to the defaults will still be picked up when
358 initializing `Butlers` to repos created with ``standalone=True``.
359 searchPaths : `list` of `str`, optional
360 Directory paths to search when calculating the full butler
361 configuration.
362 forceConfigRoot : `bool`, optional
363 If `False`, any values present in the supplied ``config`` that
364 would normally be reset are not overridden and will appear
365 directly in the output config. This allows non-standard overrides
366 of the root directory for a datastore or registry to be given.
367 If this parameter is `True` the values for ``root`` will be
368 forced into the resulting config if appropriate.
369 outfile : `str`, optional
370 If not-`None`, the output configuration will be written to this
371 location rather than into the repository itself. Can be a URI
372 string. Can refer to a directory that will be used to write
373 ``butler.yaml``.
374 overwrite : `bool`, optional
375 Create a new configuration file even if one already exists
376 in the specified output location. Default is to raise
377 an exception.
379 Returns
380 -------
381 config : `Config`
382 The updated `Config` instance written to the repo.
384 Raises
385 ------
386 ValueError
387 Raised if a ButlerConfig or ConfigSubset is passed instead of a
388 regular Config (as these subclasses would make it impossible to
389 support ``standalone=False``).
390 FileExistsError
391 Raised if the output config file already exists.
392 os.error
393 Raised if the directory does not exist, exists but is not a
394 directory, or cannot be created.
396 Notes
397 -----
398 Note that when ``standalone=False`` (the default), the configuration
399 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
400 construct the repository should also be used to construct any Butlers
401 to avoid configuration inconsistencies.
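
 Examples
 --------
 A minimal illustrative sketch of creating and then opening a new
 repository; the path is hypothetical and the default configuration is
 assumed::

     Butler.makeRepo("/data/my_repo")
     butler = Butler("/data/my_repo", writeable=True)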
402 """
403 if isinstance(config, (ButlerConfig, ConfigSubset)):
404 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
406 # Ensure that the root of the repository exists or can be made
407 uri = ButlerURI(root, forceDirectory=True)
408 uri.mkdir()
410 config = Config(config)
412 # If we are creating a new repo from scratch with relative roots,
413 # do not propagate an explicit root from the config file
414 if "root" in config:
415 del config["root"]
417 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
418 imported_class = doImportType(full["datastore", "cls"])
419 if not issubclass(imported_class, Datastore):
420 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
421 datastoreClass: Type[Datastore] = imported_class
422 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
 424 # If the key exists in the given config, parse it; otherwise parse
 425 # the defaults in the expanded config.
426 if config.get(("registry", "db")):
427 registryConfig = RegistryConfig(config)
428 else:
429 registryConfig = RegistryConfig(full)
430 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
431 if defaultDatabaseUri is not None:
432 Config.updateParameters(RegistryConfig, config, full,
433 toUpdate={"db": defaultDatabaseUri},
434 overwrite=forceConfigRoot)
435 else:
436 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
437 overwrite=forceConfigRoot)
439 if standalone:
440 config.merge(full)
441 else:
442 # Always expand the registry.managers section into the per-repo
443 # config, because after the database schema is created, it's not
444 # allowed to change anymore. Note that in the standalone=True
445 # branch, _everything_ in the config is expanded, so there's no
446 # need to special case this.
447 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
448 configURI: Union[str, ButlerURI]
449 if outfile is not None:
450 # When writing to a separate location we must include
451 # the root of the butler repo in the config else it won't know
452 # where to look.
453 config["root"] = uri.geturl()
454 configURI = outfile
455 else:
456 configURI = uri
457 config.dumpToUri(configURI, overwrite=overwrite)
459 # Create Registry and populate tables
460 registryConfig = RegistryConfig(config.get("registry"))
461 dimensionConfig = DimensionConfig(dimensionConfig)
462 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
464 log.verbose("Wrote new Butler configuration file to %s", configURI)
466 return config
468 @classmethod
469 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
470 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
471 """Callable used to unpickle a Butler.
473 We prefer not to use ``Butler.__init__`` directly so we can force some
474 of its many arguments to be keyword-only (note that ``__reduce__``
475 can only invoke callables with positional arguments).
477 Parameters
478 ----------
479 config : `ButlerConfig`
480 Butler configuration, already coerced into a true `ButlerConfig`
481 instance (and hence after any search paths for overrides have been
482 utilized).
483 collections : `CollectionSearch`
484 Names of the default collections to read from.
485 run : `str`, optional
486 Name of the default `~CollectionType.RUN` collection to write to.
487 defaultDataId : `dict` [ `str`, `str` ]
488 Default data ID values.
489 writeable : `bool`
490 Whether the Butler should support write operations.
492 Returns
493 -------
494 butler : `Butler`
495 A new `Butler` instance.
496 """
497 # MyPy doesn't recognize that the kwargs below are totally valid; it
 498 # seems to think ``**defaultDataId`` is a _positional_ argument!
499 return cls(config=config, collections=collections, run=run, writeable=writeable,
500 **defaultDataId) # type: ignore
502 def __reduce__(self) -> tuple:
503 """Support pickling.
504 """
505 return (Butler._unpickle, (self._config, self.collections, self.run,
506 self.registry.defaults.dataId.byName(),
507 self.registry.isWriteable()))
509 def __str__(self) -> str:
510 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
511 self.collections, self.run, self.datastore, self.registry)
513 def isWriteable(self) -> bool:
514 """Return `True` if this `Butler` supports write operations.
515 """
516 return self.registry.isWriteable()
518 @contextlib.contextmanager
519 def transaction(self) -> Iterator[None]:
520 """Context manager supporting `Butler` transactions.
522 Transactions can be nested.
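
 Examples
 --------
 An illustrative sketch; the run, dataset type names, and data ID
 values are hypothetical::

     butler = Butler("/path/to/repo", run="u/alice/scratch")
     with butler.transaction():
         butler.put(catalog, "src", instrument="HSC",
                    visit=903334, detector=42)
         butler.put(table, "srcMatch", instrument="HSC",
                    visit=903334, detector=42)
     # If either put raises, both registry and datastore changes are
     # rolled back.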
523 """
524 with self.registry.transaction():
525 with self.datastore.transaction():
526 yield
528 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
529 dataId: Optional[DataId] = None, **kwargs: Any
530 ) -> Tuple[DatasetType, Optional[DataId]]:
531 """Standardize the arguments passed to several Butler APIs.
533 Parameters
534 ----------
535 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
536 When `DatasetRef` the `dataId` should be `None`.
537 Otherwise the `DatasetType` or name thereof.
538 dataId : `dict` or `DataCoordinate`
539 A `dict` of `Dimension` link name, value pairs that label the
540 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
 541 should be provided as the first argument.
542 **kwargs
543 Additional keyword arguments used to augment or construct a
544 `DataCoordinate`. See `DataCoordinate.standardize`
545 parameters.
547 Returns
548 -------
549 datasetType : `DatasetType`
550 A `DatasetType` instance extracted from ``datasetRefOrType``.
551 dataId : `dict` or `DataId`, optional
552 Argument that can be used (along with ``kwargs``) to construct a
553 `DataId`.
555 Notes
556 -----
557 Butler APIs that conceptually need a DatasetRef also allow passing a
558 `DatasetType` (or the name of one) and a `DataId` (or a dict and
559 keyword arguments that can be used to construct one) separately. This
560 method accepts those arguments and always returns a true `DatasetType`
561 and a `DataId` or `dict`.
563 Standardization of `dict` vs `DataId` is best handled by passing the
564 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
565 generally similarly flexible.
566 """
567 externalDatasetType: Optional[DatasetType] = None
568 internalDatasetType: Optional[DatasetType] = None
569 if isinstance(datasetRefOrType, DatasetRef):
570 if dataId is not None or kwargs:
571 raise ValueError("DatasetRef given, cannot use dataId as well")
572 externalDatasetType = datasetRefOrType.datasetType
573 dataId = datasetRefOrType.dataId
574 else:
575 # Don't check whether DataId is provided, because Registry APIs
 576 # can usually construct a better error message when it isn't.
577 if isinstance(datasetRefOrType, DatasetType):
578 externalDatasetType = datasetRefOrType
579 else:
580 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
582 # Check that they are self-consistent
583 if externalDatasetType is not None:
584 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
585 if externalDatasetType != internalDatasetType:
586 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
587 f"registry definition ({internalDatasetType})")
589 assert internalDatasetType is not None
590 return internalDatasetType, dataId
592 def _rewrite_data_id(self, dataId: Optional[DataId], datasetType: DatasetType,
593 **kwargs: Any) -> Tuple[Optional[DataId], Dict[str, Any]]:
594 """Rewrite a data ID taking into account dimension records.
596 Take a Data ID and keyword args and rewrite it if necessary to
597 allow the user to specify dimension records rather than dimension
598 primary values.
600 This allows a user to include a dataId dict with keys of
601 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
602 the integer exposure ID. It also allows a string to be given
603 for a dimension value rather than the integer ID if that is more
 604 convenient. For example, rather than having to specify the
605 detector with ``detector.full_name``, a string given for ``detector``
606 will be interpreted as the full name and converted to the integer
607 value.
609 Keyword arguments can also use strings for dimensions like detector
 610 and exposure, but Python does not allow them to include ``.`` and
611 so the ``exposure.day_obs`` syntax can not be used in a keyword
612 argument.
614 Parameters
615 ----------
616 dataId : `dict` or `DataCoordinate`
617 A `dict` of `Dimension` link name, value pairs that will label the
618 `DatasetRef` within a Collection.
619 datasetType : `DatasetType`
620 The dataset type associated with this dataId. Required to
621 determine the relevant dimensions.
622 **kwargs
623 Additional keyword arguments used to augment or construct a
624 `DataId`. See `DataId` parameters.
626 Returns
627 -------
628 dataId : `dict` or `DataCoordinate`
 629 The possibly rewritten dataId. If given a `DataCoordinate` and
630 no keyword arguments, the original dataId will be returned
631 unchanged.
632 **kwargs : `dict`
633 Any unused keyword arguments.
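
 Examples
 --------
 An illustrative sketch of the calls this rewriting enables; the
 dataset type and all data ID values are hypothetical::

     # Compound keys name a dimension record field directly.
     butler.get("raw", dataId={"exposure.day_obs": 20130617,
                               "exposure.seq_num": 961},
                instrument="HSC", detector=50)
     # A string given for ``detector`` is treated as its full name.
     butler.get("raw", exposure=903334, instrument="HSC",
                detector="1_53")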
634 """
635 # Do nothing if we have a standalone DataCoordinate.
636 if isinstance(dataId, DataCoordinate) and not kwargs:
637 return dataId, kwargs
639 # Process dimension records that are using record information
640 # rather than ids
641 newDataId: Dict[str, DataIdValue] = {}
642 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
 644 # If the entire dataId comes from keyword parameters we do not need
 645 # to do anything here because they can't be of the form
 646 # exposure.obs_id, as a "." is not allowed in a keyword parameter.
647 if dataId:
648 for k, v in dataId.items():
649 # If we have a Dimension we do not need to do anything
650 # because it cannot be a compound key.
651 if isinstance(k, str) and "." in k:
652 # Someone is using a more human-readable dataId
653 dimensionName, record = k.split(".", 1)
654 byRecord[dimensionName][record] = v
655 elif isinstance(k, Dimension):
656 newDataId[k.name] = v
657 else:
658 newDataId[k] = v
660 # Go through the updated dataId and check the type in case someone is
 661 # using an alternate key. We have already filtered out the
 662 # compound-key ``dimension.record`` format.
663 not_dimensions = {}
665 # Will need to look in the dataId and the keyword arguments
666 # and will remove them if they need to be fixed or are unrecognized.
667 for dataIdDict in (newDataId, kwargs):
668 # Use a list so we can adjust the dict safely in the loop
669 for dimensionName in list(dataIdDict):
670 value = dataIdDict[dimensionName]
671 try:
672 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
673 except KeyError:
674 # This is not a real dimension
675 not_dimensions[dimensionName] = value
676 del dataIdDict[dimensionName]
677 continue
679 # Convert an integral type to an explicit int to simplify
680 # comparisons here
681 if isinstance(value, numbers.Integral):
682 value = int(value)
684 if not isinstance(value, dimension.primaryKey.getPythonType()):
685 for alternate in dimension.alternateKeys:
686 if isinstance(value, alternate.getPythonType()):
687 byRecord[dimensionName][alternate.name] = value
688 del dataIdDict[dimensionName]
689 log.debug("Converting dimension %s to %s.%s=%s",
690 dimensionName, dimensionName, alternate.name, value)
691 break
692 else:
693 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
694 "Could not find matching alternative (primary key has type %s) "
695 "so attempting to use as-is.",
696 value, dimensionName, dimension.primaryKey.getPythonType())
698 # If we have some unrecognized dimensions we have to try to connect
699 # them to records in other dimensions. This is made more complicated
700 # by some dimensions having records with clashing names. A mitigation
701 # is that we can tell by this point which dimensions are missing
702 # for the DatasetType but this does not work for calibrations
703 # where additional dimensions can be used to constrain the temporal
704 # axis.
705 if not_dimensions:
706 # Calculate missing dimensions
707 provided = set(newDataId) | set(kwargs) | set(byRecord)
708 missingDimensions = datasetType.dimensions.names - provided
 710 # For calibrations we may well need temporal dimensions
711 # so rather than always including all dimensions in the scan
712 # restrict things a little. It is still possible for there
713 # to be confusion over day_obs in visit vs exposure for example.
714 # If we are not searching calibration collections things may
715 # fail but they are going to fail anyway because of the
 716 # ambiguity of the dataId...
717 candidateDimensions: Set[str] = set()
718 candidateDimensions.update(missingDimensions)
719 if datasetType.isCalibration():
720 for dim in self.registry.dimensions.getStaticDimensions():
721 if dim.temporal:
722 candidateDimensions.add(str(dim))
724 # Look up table for the first association with a dimension
725 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
727 # Keep track of whether an item is associated with multiple
728 # dimensions.
729 counter: Counter[str] = Counter()
730 assigned: Dict[str, Set[str]] = defaultdict(set)
732 # Go through the missing dimensions and associate the
733 # given names with records within those dimensions
734 matched_dims = set()
735 for dimensionName in candidateDimensions:
736 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
737 fields = dimension.metadata.names | dimension.uniqueKeys.names
738 for field in not_dimensions:
739 if field in fields:
740 guessedAssociation[dimensionName][field] = not_dimensions[field]
741 counter[dimensionName] += 1
742 assigned[field].add(dimensionName)
743 matched_dims.add(field)
745 # Calculate the fields that matched nothing.
746 never_found = set(not_dimensions) - matched_dims
748 if never_found:
749 raise ValueError(f"Unrecognized keyword args given: {never_found}")
751 # There is a chance we have allocated a single dataId item
752 # to multiple dimensions. Need to decide which should be retained.
753 # For now assume that the most popular alternative wins.
754 # This means that day_obs with seq_num will result in
755 # exposure.day_obs and not visit.day_obs
756 # Also prefer an explicitly missing dimension over an inferred
757 # temporal dimension.
758 for fieldName, assignedDimensions in assigned.items():
759 if len(assignedDimensions) > 1:
760 # Pick the most popular (preferring mandatory dimensions)
761 requiredButMissing = assignedDimensions.intersection(missingDimensions)
762 if requiredButMissing:
763 candidateDimensions = requiredButMissing
764 else:
765 candidateDimensions = assignedDimensions
767 # Select the relevant items and get a new restricted
768 # counter.
769 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
770 duplicatesCounter: Counter[str] = Counter()
771 duplicatesCounter.update(theseCounts)
773 # Choose the most common. If they are equally common
774 # we will pick the one that was found first.
775 # Returns a list of tuples
776 selected = duplicatesCounter.most_common(1)[0][0]
778 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
779 " Removed ambiguity by choosing dimension %s.",
780 fieldName, ", ".join(assignedDimensions), selected)
782 for candidateDimension in assignedDimensions:
783 if candidateDimension != selected:
784 del guessedAssociation[candidateDimension][fieldName]
786 # Update the record look up dict with the new associations
787 for dimensionName, values in guessedAssociation.items():
788 if values: # A dict might now be empty
789 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
790 dimensionName, values)
791 byRecord[dimensionName].update(values)
793 if byRecord:
794 # Some record specifiers were found so we need to convert
795 # them to the Id form
796 for dimensionName, values in byRecord.items():
797 if dimensionName in newDataId:
798 log.warning("DataId specified explicit %s dimension value of %s in addition to"
799 " general record specifiers for it of %s. Ignoring record information.",
800 dimensionName, newDataId[dimensionName], str(values))
801 continue
803 # Build up a WHERE expression
804 bind = {k: v for k, v in values.items()}
805 where = " AND ".join(f"{dimensionName}.{k} = {k}"
806 for k in bind)
808 # Hopefully we get a single record that matches
809 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
810 where=where, bind=bind, **kwargs))
812 if len(records) != 1:
813 if len(records) > 1:
814 log.debug("Received %d records from constraints of %s", len(records), str(values))
815 for r in records:
816 log.debug("- %s", str(r))
817 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
818 f" uniquely constrained to a single dataset by {values}."
819 f" Got {len(records)} results.")
820 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
821 f" records when constrained by {values}")
823 # Get the primary key from the real dimension object
824 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
825 if not isinstance(dimension, Dimension):
826 raise RuntimeError(
827 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
828 )
829 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
 831 # We have modified the dataId so we need to switch to it
832 dataId = newDataId
834 return dataId, kwargs
836 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
837 dataId: Optional[DataId] = None, *,
838 collections: Any = None,
839 allowUnresolved: bool = False,
840 **kwargs: Any) -> DatasetRef:
841 """Shared logic for methods that start with a search for a dataset in
842 the registry.
844 Parameters
845 ----------
846 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
847 When `DatasetRef` the `dataId` should be `None`.
848 Otherwise the `DatasetType` or name thereof.
849 dataId : `dict` or `DataCoordinate`, optional
850 A `dict` of `Dimension` link name, value pairs that label the
851 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
852 should be provided as the first argument.
853 collections : Any, optional
854 Collections to be searched, overriding ``self.collections``.
855 Can be any of the types supported by the ``collections`` argument
856 to butler construction.
857 allowUnresolved : `bool`, optional
858 If `True`, return an unresolved `DatasetRef` if finding a resolved
859 one in the `Registry` fails. Defaults to `False`.
860 **kwargs
861 Additional keyword arguments used to augment or construct a
862 `DataId`. See `DataId` parameters.
864 Returns
865 -------
866 ref : `DatasetRef`
867 A reference to the dataset identified by the given arguments.
869 Raises
870 ------
871 LookupError
872 Raised if no matching dataset exists in the `Registry` (and
873 ``allowUnresolved is False``).
874 ValueError
875 Raised if a resolved `DatasetRef` was passed as an input, but it
876 differs from the one found in the registry.
877 TypeError
878 Raised if no collections were provided.
879 """
880 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
881 if isinstance(datasetRefOrType, DatasetRef):
882 idNumber = datasetRefOrType.id
883 else:
884 idNumber = None
885 timespan: Optional[Timespan] = None
887 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
889 if datasetType.isCalibration():
 890 # Because this is a calibration dataset, first try to
 891 # standardize the data ID without restricting the dimensions to
892 # those of the dataset type requested, because there may be extra
893 # dimensions that provide temporal information for a validity-range
894 # lookup.
895 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
896 defaults=self.registry.defaults.dataId, **kwargs)
897 if dataId.graph.temporal:
898 dataId = self.registry.expandDataId(dataId)
899 timespan = dataId.timespan
900 else:
901 # Standardize the data ID to just the dimensions of the dataset
902 # type instead of letting registry.findDataset do it, so we get the
903 # result even if no dataset is found.
904 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
905 defaults=self.registry.defaults.dataId, **kwargs)
 906 # Always look up the DatasetRef, even if one is given, to ensure it is
907 # present in the current collection.
908 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
909 if ref is None:
910 if allowUnresolved:
911 return DatasetRef(datasetType, dataId)
912 else:
913 if collections is None:
914 collections = self.registry.defaults.collections
915 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
916 f"could not be found in collections {collections}.")
917 if idNumber is not None and idNumber != ref.id:
918 if collections is None:
919 collections = self.registry.defaults.collections
920 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
921 f"id ({ref.id}) in registry in collections {collections}.")
922 return ref
924 @transactional
925 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
926 dataId: Optional[DataId] = None, *,
927 run: Optional[str] = None,
928 **kwargs: Any) -> DatasetRef:
929 """Store and register a dataset.
931 Parameters
932 ----------
933 obj : `object`
934 The dataset.
935 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
936 When `DatasetRef` is provided, ``dataId`` should be `None`.
937 Otherwise the `DatasetType` or name thereof.
938 dataId : `dict` or `DataCoordinate`
939 A `dict` of `Dimension` link name, value pairs that label the
940 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
941 should be provided as the second argument.
942 run : `str`, optional
943 The name of the run the dataset should be added to, overriding
944 ``self.run``.
945 **kwargs
946 Additional keyword arguments used to augment or construct a
947 `DataCoordinate`. See `DataCoordinate.standardize`
948 parameters.
950 Returns
951 -------
952 ref : `DatasetRef`
953 A reference to the stored dataset, updated with the correct id if
954 given.
956 Raises
957 ------
958 TypeError
959 Raised if the butler is read-only or if no run has been provided.
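
 Examples
 --------
 An illustrative sketch; the run, dataset type, and data ID values are
 hypothetical, and the dataset type is assumed to be registered
 already::

     butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
     ref = butler.put(catalog, "src", instrument="HSC",
                      visit=903334, detector=42)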
960 """
961 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
962 if not self.isWriteable():
963 raise TypeError("Butler is read-only.")
964 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
965 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
966 raise ValueError("DatasetRef must not be in registry, must have None id")
968 # Handle dimension records in dataId
969 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
971 # Add Registry Dataset entry.
972 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
974 # For an execution butler the datasets will be pre-defined.
975 # If the butler is configured that way datasets should only be inserted
976 # if they do not already exist in registry. Trying and catching
977 # ConflictingDefinitionError will not work because the transaction
978 # will be corrupted. Instead, in this mode always check first.
979 ref = None
980 ref_is_predefined = False
981 if self._allow_put_of_predefined_dataset:
982 # Get the matching ref for this run.
983 ref = self.registry.findDataset(datasetType, collections=run,
984 dataId=dataId)
986 if ref:
987 # Must be expanded form for datastore templating
988 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
989 ref = ref.expanded(dataId)
990 ref_is_predefined = True
992 if not ref:
993 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
995 # If the ref is predefined it is possible that the datastore also
996 # has the record. Asking datastore to put it again will result in
 997 # the artifact being recreated, overwriting the previous one; the
 998 # subsequent failure to write the record would then cause the artifact
 999 # to be removed. Much safer to ask first before attempting to
1000 # overwrite. Race conditions should not be an issue for the
1001 # execution butler environment.
1002 if ref_is_predefined:
1003 if self.datastore.knows(ref):
 1004 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
1006 self.datastore.put(obj, ref)
1008 return ref
1010 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1011 """Retrieve a stored dataset.
1013 Unlike `Butler.get`, this method allows datasets outside the Butler's
1014 collection to be read as long as the `DatasetRef` that identifies them
1015 can be obtained separately.
1017 Parameters
1018 ----------
1019 ref : `DatasetRef`
1020 Resolved reference to an already stored dataset.
1021 parameters : `dict`
1022 Additional StorageClass-defined options to control reading,
1023 typically used to efficiently read only a subset of the dataset.
1025 Returns
1026 -------
1027 obj : `object`
1028 The dataset.
1029 """
1030 return self.datastore.get(ref, parameters=parameters)
1032 def getDirectDeferred(self, ref: DatasetRef, *,
1033 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
1034 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1035 from a resolved `DatasetRef`.
1037 Parameters
1038 ----------
1039 ref : `DatasetRef`
1040 Resolved reference to an already stored dataset.
1041 parameters : `dict`
1042 Additional StorageClass-defined options to control reading,
1043 typically used to efficiently read only a subset of the dataset.
1045 Returns
1046 -------
1047 obj : `DeferredDatasetHandle`
1048 A handle which can be used to retrieve a dataset at a later time.
1050 Raises
1051 ------
1052 AmbiguousDatasetError
1053 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1054 """
1055 if ref.id is None:
1056 raise AmbiguousDatasetError(
1057 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1058 )
1059 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1061 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1062 dataId: Optional[DataId] = None, *,
1063 parameters: Union[dict, None] = None,
1064 collections: Any = None,
1065 **kwargs: Any) -> DeferredDatasetHandle:
1066 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1067 after an immediate registry lookup.
1069 Parameters
1070 ----------
1071 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1072 When `DatasetRef` the `dataId` should be `None`.
1073 Otherwise the `DatasetType` or name thereof.
1074 dataId : `dict` or `DataCoordinate`, optional
1075 A `dict` of `Dimension` link name, value pairs that label the
1076 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1077 should be provided as the first argument.
1078 parameters : `dict`
1079 Additional StorageClass-defined options to control reading,
1080 typically used to efficiently read only a subset of the dataset.
1081 collections : Any, optional
1082 Collections to be searched, overriding ``self.collections``.
1083 Can be any of the types supported by the ``collections`` argument
1084 to butler construction.
1085 **kwargs
1086 Additional keyword arguments used to augment or construct a
1087 `DataId`. See `DataId` parameters.
1089 Returns
1090 -------
1091 obj : `DeferredDatasetHandle`
1092 A handle which can be used to retrieve a dataset at a later time.
1094 Raises
1095 ------
1096 LookupError
 1097 Raised if no matching dataset exists in the `Registry`.
1099 ValueError
1100 Raised if a resolved `DatasetRef` was passed as an input, but it
1101 differs from the one found in the registry.
1102 TypeError
1103 Raised if no collections were provided.
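
 Examples
 --------
 An illustrative sketch; the dataset type and data ID values are
 hypothetical. The dataset is only read when the handle's ``get``
 method is called::

     handle = butler.getDeferred("calexp", instrument="HSC",
                                 visit=903334, detector=42)
     calexp = handle.get()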
1104 """
1105 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1106 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1108 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1109 dataId: Optional[DataId] = None, *,
1110 parameters: Optional[Dict[str, Any]] = None,
1111 collections: Any = None,
1112 **kwargs: Any) -> Any:
1113 """Retrieve a stored dataset.
1115 Parameters
1116 ----------
1117 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1118 When `DatasetRef` the `dataId` should be `None`.
1119 Otherwise the `DatasetType` or name thereof.
1120 dataId : `dict` or `DataCoordinate`
1121 A `dict` of `Dimension` link name, value pairs that label the
1122 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1123 should be provided as the first argument.
1124 parameters : `dict`
1125 Additional StorageClass-defined options to control reading,
1126 typically used to efficiently read only a subset of the dataset.
1127 collections : Any, optional
1128 Collections to be searched, overriding ``self.collections``.
1129 Can be any of the types supported by the ``collections`` argument
1130 to butler construction.
1131 **kwargs
1132 Additional keyword arguments used to augment or construct a
1133 `DataCoordinate`. See `DataCoordinate.standardize`
1134 parameters.
1136 Returns
1137 -------
1138 obj : `object`
1139 The dataset.
1141 Raises
1142 ------
1143 ValueError
1144 Raised if a resolved `DatasetRef` was passed as an input, but it
1145 differs from the one found in the registry.
1146 LookupError
1147 Raised if no matching dataset exists in the `Registry`.
1148 TypeError
1149 Raised if no collections were provided.
1151 Notes
1152 -----
1153 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1154 this method requires that the given data ID include temporal dimensions
1155 beyond the dimensions of the dataset type itself, in order to find the
1156 dataset with the appropriate validity range. For example, a "bias"
1157 dataset with native dimensions ``{instrument, detector}`` could be
1158 fetched with a ``{instrument, detector, exposure}`` data ID, because
1159 ``exposure`` is a temporal dimension.
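
 Examples
 --------
 An illustrative sketch; the collection, dataset type, and data ID
 values are hypothetical::

     butler = Butler("/path/to/repo", collections=["HSC/defaults"])
     calexp = butler.get("calexp", instrument="HSC",
                         visit=903334, detector=42)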
1160 """
1161 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1162 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1163 return self.getDirect(ref, parameters=parameters)
1165 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1166 dataId: Optional[DataId] = None, *,
1167 predict: bool = False,
1168 collections: Any = None,
1169 run: Optional[str] = None,
1170 **kwargs: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1171 """Returns the URIs associated with the dataset.
1173 Parameters
1174 ----------
1175 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1176 When `DatasetRef` the `dataId` should be `None`.
1177 Otherwise the `DatasetType` or name thereof.
1178 dataId : `dict` or `DataCoordinate`
1179 A `dict` of `Dimension` link name, value pairs that label the
1180 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1181 should be provided as the first argument.
1182 predict : `bool`
 1183 If `True`, allow URIs to be returned for datasets that have not
1184 been written.
1185 collections : Any, optional
1186 Collections to be searched, overriding ``self.collections``.
1187 Can be any of the types supported by the ``collections`` argument
1188 to butler construction.
1189 run : `str`, optional
1190 Run to use for predictions, overriding ``self.run``.
1191 **kwargs
1192 Additional keyword arguments used to augment or construct a
1193 `DataCoordinate`. See `DataCoordinate.standardize`
1194 parameters.
1196 Returns
1197 -------
1198 primary : `ButlerURI`
1199 The URI to the primary artifact associated with this dataset.
1200 If the dataset was disassembled within the datastore this
1201 may be `None`.
1202 components : `dict`
1203 URIs to any components associated with the dataset artifact.
1204 Can be empty if there are no components.
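
 Examples
 --------
 An illustrative sketch; the dataset type and data ID values are
 hypothetical. Whether ``primary`` is `None` depends on how the
 datastore stored the dataset::

     primary, components = butler.getURIs("calexp", instrument="HSC",
                                          visit=903334, detector=42)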
1205 """
1206 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1207 collections=collections, **kwargs)
1208 if ref.id is None: # only possible if predict is True
1209 if run is None:
1210 run = self.run
1211 if run is None:
1212 raise TypeError("Cannot predict location with run=None.")
1213 # Lie about ID, because we can't guess it, and only
1214 # Datastore.getURIs() will ever see it (and it doesn't use it).
1215 ref = ref.resolved(id=0, run=run)
1216 return self.datastore.getURIs(ref, predict)
1218 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1219 dataId: Optional[DataId] = None, *,
1220 predict: bool = False,
1221 collections: Any = None,
1222 run: Optional[str] = None,
1223 **kwargs: Any) -> ButlerURI:
1224 """Return the URI to the Dataset.
1226 Parameters
1227 ----------
1228 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1229 When `DatasetRef` the `dataId` should be `None`.
1230 Otherwise the `DatasetType` or name thereof.
1231 dataId : `dict` or `DataCoordinate`
1232 A `dict` of `Dimension` link name, value pairs that label the
1233 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1234 should be provided as the first argument.
1235 predict : `bool`
 1236 If `True`, allow URIs to be returned for datasets that have not
1237 been written.
1238 collections : Any, optional
1239 Collections to be searched, overriding ``self.collections``.
1240 Can be any of the types supported by the ``collections`` argument
1241 to butler construction.
1242 run : `str`, optional
1243 Run to use for predictions, overriding ``self.run``.
1244 **kwargs
1245 Additional keyword arguments used to augment or construct a
1246 `DataCoordinate`. See `DataCoordinate.standardize`
1247 parameters.
1249 Returns
1250 -------
1251 uri : `ButlerURI`
1252 URI pointing to the Dataset within the datastore. If the
1253 Dataset does not exist in the datastore, and if ``predict`` is
1254 `True`, the URI will be a prediction and will include a URI
1255 fragment "#predicted".
1256 If the datastore does not have entities that relate well
1257 to the concept of a URI the returned URI string will be
1258 descriptive. The returned URI is not guaranteed to be obtainable.
1260 Raises
1261 ------
1262 LookupError
 1263 Raised if a URI has been requested for a dataset that does not
 1264 exist and guessing is not allowed.
1265 ValueError
1266 Raised if a resolved `DatasetRef` was passed as an input, but it
1267 differs from the one found in the registry.
1268 TypeError
1269 Raised if no collections were provided.
1270 RuntimeError
1271 Raised if a URI is requested for a dataset that consists of
1272 multiple artifacts.
1273 """
1274 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1275 collections=collections, run=run, **kwargs)
1277 if primary is None or components:
1278 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1279 "Use Butler.getURIs() instead.")
1280 return primary
1282 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1283 destination: Union[str, ButlerURI], transfer: str = "auto",
1284 preserve_path: bool = True,
1285 overwrite: bool = False) -> List[ButlerURI]:
1286 """Retrieve the artifacts associated with the supplied refs.
1288 Parameters
1289 ----------
1290 refs : iterable of `DatasetRef`
1291 The datasets for which artifacts are to be retrieved.
1292 A single ref can result in multiple artifacts. The refs must
1293 be resolved.
1294 destination : `ButlerURI` or `str`
1295 Location to write the artifacts.
1296 transfer : `str`, optional
1297 Method to use to transfer the artifacts. Must be one of the options
1298 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1299 preserve_path : `bool`, optional
1300 If `True` the full path of the artifact within the datastore
1301 is preserved. If `False` the final file component of the path
1302 is used.
1303 overwrite : `bool`, optional
1304 If `True` allow transfers to overwrite existing files at the
1305 destination.
1307 Returns
1308 -------
1309 targets : `list` of `ButlerURI`
1310 URIs of file artifacts in destination location. Order is not
1311 preserved.
1313 Notes
1314 -----
1315 For non-file datastores the artifacts written to the destination
1316 may not match the representation inside the datastore. For example
1317 a hierarchical data structure in a NoSQL database may well be stored
1318 as a JSON file.
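
 Examples
 --------
 An illustrative sketch of exporting the file artifacts selected by a
 registry query; the dataset type, collection, query, and destination
 path are hypothetical::

     refs = butler.registry.queryDatasets("raw",
                                          collections="HSC/raw/all",
                                          where="exposure = 903334")
     butler.retrieveArtifacts(refs, "/tmp/raw_export", transfer="copy")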
1319 """
1320 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer,
1321 preserve_path=preserve_path, overwrite=overwrite)
1323 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1324 dataId: Optional[DataId] = None, *,
1325 collections: Any = None,
1326 **kwargs: Any) -> bool:
1327 """Return True if the Dataset is actually present in the Datastore.
1329 Parameters
1330 ----------
1331 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1332 When `DatasetRef` the `dataId` should be `None`.
1333 Otherwise the `DatasetType` or name thereof.
1334 dataId : `dict` or `DataCoordinate`
1335 A `dict` of `Dimension` link name, value pairs that label the
1336 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1337 should be provided as the first argument.
1338 collections : Any, optional
1339 Collections to be searched, overriding ``self.collections``.
1340 Can be any of the types supported by the ``collections`` argument
1341 to butler construction.
1342 **kwargs
1343 Additional keyword arguments used to augment or construct a
1344 `DataCoordinate`. See `DataCoordinate.standardize`
1345 parameters.
1347 Raises
1348 ------
1349 LookupError
1350 Raised if the dataset is not even present in the Registry.
1351 ValueError
1352 Raised if a resolved `DatasetRef` was passed as an input, but it
1353 differs from the one found in the registry.
1354 TypeError
1355 Raised if no collections were provided.
1356 """
1357 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1358 return self.datastore.exists(ref)
1360 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1361 """Remove one or more `~CollectionType.RUN` collections and the
1362 datasets within them.
1364 Parameters
1365 ----------
1366 names : `Iterable` [ `str` ]
1367 The names of the collections to remove.
1368 unstore : `bool`, optional
1369 If `True` (default), delete datasets from all datastores in which
 1370 they are present, and attempt to roll back the registry deletions if
1371 datastore deletions fail (which may not always be possible). If
1372 `False`, datastore records for these datasets are still removed,
1373 but any artifacts (e.g. files) will not be.
1375 Raises
1376 ------
1377 TypeError
1378 Raised if one or more collections are not of type
1379 `~CollectionType.RUN`.
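
 Examples
 --------
 An illustrative sketch; the run collection names are hypothetical::

     butler = Butler("/path/to/repo", writeable=True)
     butler.removeRuns(["u/alice/scratch/run1", "u/alice/scratch/run2"])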
1380 """
1381 if not self.isWriteable():
1382 raise TypeError("Butler is read-only.")
1383 names = list(names)
1384 refs: List[DatasetRef] = []
1385 for name in names:
1386 collectionType = self.registry.getCollectionType(name)
1387 if collectionType is not CollectionType.RUN:
1388 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1389 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1390 with self.registry.transaction():
1391 if unstore:
1392 self.datastore.trash(refs)
1393 else:
1394 self.datastore.forget(refs)
1395 for name in names:
1396 self.registry.removeCollection(name)
1397 if unstore:
1398 # Point of no return for removing artifacts
1399 self.datastore.emptyTrash()
1401 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
1402 unlink: Optional[List[str]] = None) -> None:
1403 """Remove a collection and possibly prune datasets within it.
1405 Parameters
1406 ----------
1407 name : `str`
1408 Name of the collection to remove. If this is a
1409 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1410 datasets within the collection are not modified unless ``unstore``
1411 is `True`. If this is a `~CollectionType.RUN` collection,
1412 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1413 are fully removed from the data repository.
1414 purge : `bool`, optional
1415 If `True`, permit `~CollectionType.RUN` collections to be removed,
1416 fully removing datasets within them. Requires ``unstore=True`` as
1417 well as an added precaution against accidental deletion. Must be
1418 `False` (default) if the collection is not a ``RUN``.
 1419 unstore : `bool`, optional
 1420 If `True`, remove all datasets in the collection from all
 1421 datastores in which they appear.
 1422 unlink : `list` [`str`], optional
 1423 Before removing the given collection, unlink it from these
1424 parent collections.
1426 Raises
1427 ------
1428 TypeError
1429 Raised if the butler is read-only or arguments are mutually
1430 inconsistent.
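
 Examples
 --------
 An illustrative sketch of fully deleting a `~CollectionType.RUN`
 collection and the artifacts of its datasets; the collection name is
 hypothetical::

     butler.pruneCollection("u/alice/scratch/run1", purge=True,
                            unstore=True)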
1431 """
1432 # See pruneDatasets comments for more information about the logic here;
1433 # the cases are almost the same, but here we can rely on Registry to
 1434 # take care of everything but Datastore deletion when we remove the
1435 # collection.
1436 if not self.isWriteable():
1437 raise TypeError("Butler is read-only.")
1438 collectionType = self.registry.getCollectionType(name)
1439 if purge and not unstore:
1440 raise PurgeWithoutUnstorePruneCollectionsError()
1441 if collectionType is CollectionType.RUN and not purge:
1442 raise RunWithoutPurgePruneCollectionsError(collectionType)
1443 if collectionType is not CollectionType.RUN and purge:
1444 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1446 def remove(child: str, parent: str) -> None:
1447 """Remove a child collection from a parent collection."""
1448 # Remove child from parent.
1449 chain = list(self.registry.getCollectionChain(parent))
1450 try:
1451 chain.remove(child)
1452 except ValueError as e:
1453 raise RuntimeError(f"{child} is not a child of {parent}") from e
1454 self.registry.setCollectionChain(parent, chain)
1456 with self.registry.transaction():
1457 if unlink:
1458 for parent in unlink:
1459 remove(name, parent)
1460 if unstore:
1461 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1462 self.datastore.trash(refs)
1463 self.registry.removeCollection(name)
1465 if unstore:
1466 # Point of no return for removing artifacts
1467 self.datastore.emptyTrash()
1469 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1470 disassociate: bool = True,
1471 unstore: bool = False,
1472 tags: Iterable[str] = (),
1473 purge: bool = False,
1474 run: Optional[str] = None) -> None:
1475 """Remove one or more datasets from a collection and/or storage.
1477 Parameters
1478 ----------
1479 refs : `~collections.abc.Iterable` of `DatasetRef`
1480 Datasets to prune. These must be "resolved" references (not just
1481 a `DatasetType` and data ID).
1482 disassociate : `bool`, optional
1483 Disassociate pruned datasets from ``tags``, or from all collections
1484 if ``purge=True``.
1485 unstore : `bool`, optional
1486 If `True` (`False` is default), remove these datasets from all
1487 datastores known to this butler. Note that this will make it
1488 impossible to retrieve these datasets even via other collections.
1489 Datasets that are already not stored are ignored by this option.
1490 tags : `Iterable` [ `str` ], optional
1491 `~CollectionType.TAGGED` collections to disassociate the datasets
1492 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1493 `True`.
1494 purge : `bool`, optional
1495 If `True` (`False` is default), completely remove the dataset from
1496 the `Registry`. To prevent accidental deletions, ``purge`` may
1497 only be `True` if all of the following conditions are met:
1499 - All given datasets are in the given run;
1500 - ``disassociate`` is `True`;
1501 - ``unstore`` is `True`.
1503 This mode may remove provenance information from datasets other
1504 than those provided, and should be used with extreme care.
1506 Raises
1507 ------
1508 TypeError
1509 Raised if the butler is read-only, if ``disassociate`` is `True` but
1510 no tags are provided, or if the conditions for ``purge=True`` were not met.
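Examples
--------
A minimal sketch, assuming ``butler`` is writeable; the dataset type and
collection names are hypothetical::

    # Resolved refs for the datasets to remove.
    refs = butler.registry.queryDatasets("calexp", collections="u/alice/run1")
    # Remove them from the registry and delete the datastore artifacts.
    butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)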
1511 """
1512 if not self.isWriteable():
1513 raise TypeError("Butler is read-only.")
1514 if purge:
1515 if not disassociate:
1516 raise TypeError("Cannot pass purge=True without disassociate=True.")
1517 if not unstore:
1518 raise TypeError("Cannot pass purge=True without unstore=True.")
1519 elif disassociate:
1520 tags = tuple(tags)
1521 if not tags:
1522 raise TypeError("No tags provided but disassociate=True.")
1523 for tag in tags:
1524 collectionType = self.registry.getCollectionType(tag)
1525 if collectionType is not CollectionType.TAGGED:
1526 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1527 f"of non-TAGGED type {collectionType.name}.")
1528 # Transform possibly-single-pass iterable into something we can iterate
1529 # over multiple times.
1530 refs = list(refs)
1531 # Pruning a component of a DatasetRef makes no sense since registry
1532 # doesn't know about components and datastore might not store
1533 # components in a separate file
1534 for ref in refs:
1535 if ref.datasetType.component():
1536 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")
1537 # We don't need an unreliable Datastore transaction for this, because
1538 # we've been extra careful to ensure that Datastore.trash only involves
1539 # mutating the Registry (it can _look_ at Datastore-specific things,
1540 # but shouldn't change them), and hence all operations here are
1541 # Registry operations.
1542 with self.registry.transaction():
1543 if unstore:
1544 self.datastore.trash(refs)
1545 if purge:
1546 self.registry.removeDatasets(refs)
1547 elif disassociate:
1548 assert tags, "Guaranteed by earlier logic in this function."
1549 for tag in tags:
1550 self.registry.disassociate(tag, refs)
1551 # We've exited the Registry transaction, and apparently committed.
1552 # (if there was an exception, everything rolled back, and it's as if
1553 # nothing happened - and we never get here).
1554 # Datastore artifacts are not yet gone, but they're clearly marked
1555 # as trash, so if we fail to delete now because of (e.g.) filesystem
1556 # problems we can try again later, and if manual administrative
1557 # intervention is required, it's pretty clear what that should entail:
1558 # deleting everything on disk and in private Datastore tables that is
1559 # in the dataset_location_trash table.
1560 if unstore:
1561 # Point of no return for removing artifacts
1562 self.datastore.emptyTrash()
1564 @transactional
1565 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1566 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1567 ) -> None:
1568 """Store and register one or more datasets that already exist on disk.
1570 Parameters
1571 ----------
1572 datasets : `FileDataset`
1573 Each positional argument is a struct containing information about
1574 a file to be ingested, including its URI (either absolute or
1575 relative to the datastore root, if applicable), a `DatasetRef`,
1576 and optionally a formatter class or its fully-qualified string
1577 name. If a formatter is not provided, the formatter that would be
1578 used for `put` is assumed. On successful return, all
1579 `FileDataset.ref` attributes will have their `DatasetRef.id`
1580 attribute populated and all `FileDataset.formatter` attributes will
1581 be set to the formatter class used. `FileDataset.path` attributes
1582 may be modified to put paths in whatever the datastore considers a
1583 standardized form.
1584 transfer : `str`, optional
1585 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1586 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1587 transfer the file.
1588 run : `str`, optional
1589 The name of the run ingested datasets should be added to,
1590 overriding ``self.run``.
1591 idGenerationMode : `DatasetIdGenEnum`, optional
1592 Specifies option for generating dataset IDs. By default unique IDs
1593 are generated for each inserted dataset.
1595 Raises
1596 ------
1597 TypeError
1598 Raised if the butler is read-only or if no run was provided.
1599 NotImplementedError
1600 Raised if the `Datastore` does not support the given transfer mode.
1601 DatasetTypeNotSupportedError
1602 Raised if one or more files to be ingested have a dataset type that
1603 is not supported by the `Datastore`.
1604 FileNotFoundError
1605 Raised if one of the given files does not exist.
1606 FileExistsError
1607 Raised if transfer is not `None` but the (internal) location the
1608 file would be moved to is already occupied.
1610 Notes
1611 -----
1612 This operation is not fully exception safe: if a database operation
1613 fails, the given `FileDataset` instances may be only partially updated.
1615 It is atomic in terms of database operations (they will either all
1616 succeed or all fail), provided the database engine implements
1617 transactions correctly. It will attempt to be atomic in terms of
1618 filesystem operations as well, but this cannot be implemented
1619 rigorously for most datastores.
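Examples
--------
A hedged sketch; the dataset type, data ID, file path, and run name are
all hypothetical and must match the receiving repository::

    from lsst.daf.butler import DatasetRef, FileDataset

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType,
                     {"instrument": "HypotheticalCam", "exposure": 1, "detector": 0})
    dataset = FileDataset(path="/data/HypotheticalCam/exposure1.fits", refs=ref)
    # Copy the file into the datastore and register the dataset.
    butler.ingest(dataset, transfer="copy", run="HypotheticalCam/raw/all")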
1620 """
1621 if not self.isWriteable():
1622 raise TypeError("Butler is read-only.")
1623 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1624 # Reorganize the inputs so they're grouped by DatasetType and then
1625 # data ID. We also include a list of DatasetRefs for each FileDataset
1626 # to hold the resolved DatasetRefs returned by the Registry, before
1627 # it's safe to swap them into FileDataset.refs.
1628 # Some type annotation aliases to make that clearer:
1629 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1630 GroupedData = MutableMapping[DatasetType, GroupForType]
1631 # The actual data structure:
1632 groupedData: GroupedData = defaultdict(dict)
1633 # And the nested loop that populates it:
1634 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1635 # This list intentionally shared across the inner loop, since it's
1636 # associated with `dataset`.
1637 resolvedRefs: List[DatasetRef] = []
1639 # Somewhere to store pre-existing refs if we have an
1640 # execution butler.
1641 existingRefs: List[DatasetRef] = []
1643 for ref in dataset.refs:
1644 if ref.dataId in groupedData[ref.datasetType]:
1645 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same"
1646 " dataId as other ingest dataset"
1647 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1648 f" ({ref.dataId})")
1649 if self._allow_put_of_predefined_dataset:
1650 existing_ref = self.registry.findDataset(ref.datasetType,
1651 dataId=ref.dataId,
1652 collections=run)
1653 if existing_ref:
1654 if self.datastore.knows(existing_ref):
1655 raise ConflictingDefinitionError(f"Dataset associated with path {dataset.path}"
1656 f" already exists as {existing_ref}.")
1657 # Store this ref elsewhere since it already exists
1658 # and we do not want to remake it but we do want
1659 # to store it in the datastore.
1660 existingRefs.append(existing_ref)
1662 # Nothing else to do until we have finished
1663 # iterating.
1664 continue
1666 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1668 if existingRefs:
1670 if len(dataset.refs) != len(existingRefs):
1671 # Keeping track of partially pre-existing datasets is hard
1672 # and should generally never happen. For now don't allow
1673 # it.
1674 raise ConflictingDefinitionError(f"For dataset {dataset.path} some dataIds already exist"
1675 " in registry but others do not. This is not supported.")
1677 # Attach the resolved refs if we found them.
1678 dataset.refs = existingRefs
1680 # Now we can bulk-insert into Registry for each DatasetType.
1681 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(),
1682 desc="Bulk-inserting datasets by type"):
1683 refs = self.registry.insertDatasets(
1684 datasetType,
1685 dataIds=groupForType.keys(),
1686 run=run,
1687 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1688 idGenerationMode=idGenerationMode,
1689 )
1690 # Append those resolved DatasetRefs to the new lists we set up for
1691 # them.
1692 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1693 resolvedRefs.append(ref)
1695 # Go back to the original FileDatasets to replace their refs with the
1696 # new resolved ones.
1697 for groupForType in progress.iter_chunks(groupedData.values(),
1698 desc="Reassociating resolved dataset refs with files"):
1699 for dataset, resolvedRefs in groupForType.values():
1700 dataset.refs = resolvedRefs
1702 # Bulk-insert everything into Datastore.
1703 self.datastore.ingest(*datasets, transfer=transfer)
1705 @contextlib.contextmanager
1706 def export(self, *, directory: Optional[str] = None,
1707 filename: Optional[str] = None,
1708 format: Optional[str] = None,
1709 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1710 """Export datasets from the repository represented by this `Butler`.
1712 This method is a context manager that returns a helper object
1713 (`RepoExportContext`) that is used to indicate what information from
1714 the repository should be exported.
1716 Parameters
1717 ----------
1718 directory : `str`, optional
1719 Directory dataset files should be written to if ``transfer`` is not
1720 `None`.
1721 filename : `str`, optional
1722 Name for the file that will include database information associated
1723 with the exported datasets. If this is not an absolute path and
1724 ``directory`` is not `None`, it will be written to ``directory``
1725 instead of the current working directory. Defaults to
1726 "export.{format}".
1727 format : `str`, optional
1728 File format for the database information file. If `None`, the
1729 extension of ``filename`` will be used.
1730 transfer : `str`, optional
1731 Transfer mode passed to `Datastore.export`.
1733 Raises
1734 ------
1735 TypeError
1736 Raised if the set of arguments passed is inconsistent.
1738 Examples
1739 --------
1740 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1741 methods are used to provide the iterables over data IDs and/or datasets
1742 to be exported::
1744 with butler.export(filename="exports.yaml") as export:
1745 # Export all flats, but none of the dimension element rows
1746 # (i.e. data ID information) associated with them.
1747 export.saveDatasets(butler.registry.queryDatasets("flat"),
1748 elements=())
1749 # Export all datasets that start with "deepCoadd_" and all of
1750 # their associated data ID information.
1751 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1752 """
1753 if directory is None and transfer is not None:
1754 raise TypeError("Cannot transfer without providing a directory.")
1755 if transfer == "move":
1756 raise TypeError("Transfer may not be 'move': export is read-only")
1757 if format is None:
1758 if filename is None:
1759 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1760 else:
1761 _, format = os.path.splitext(filename)
1762 elif filename is None:
1763 filename = f"export.{format}"
1764 if directory is not None:
1765 filename = os.path.join(directory, filename)
1766 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
1767 with open(filename, 'w') as stream:
1768 backend = BackendClass(stream)
1769 try:
1770 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1771 directory=directory, transfer=transfer)
1772 yield helper
1773 except BaseException:
1774 raise
1775 else:
1776 helper._finish()
1778 def import_(self, *, directory: Optional[str] = None,
1779 filename: Union[str, TextIO, None] = None,
1780 format: Optional[str] = None,
1781 transfer: Optional[str] = None,
1782 skip_dimensions: Optional[Set] = None,
1783 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1784 reuseIds: bool = False) -> None:
1785 """Import datasets into this repository that were exported from a
1786 different butler repository via `~lsst.daf.butler.Butler.export`.
1788 Parameters
1789 ----------
1790 directory : `str`, optional
1791 Directory containing dataset files to import from. If `None`,
1792 ``filename`` and all dataset file paths specified therein must
1793 be absolute.
1794 filename : `str` or `TextIO`, optional
1795 A stream or name of file that contains database information
1796 associated with the exported datasets, typically generated by
1797 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1798 is not an absolute path, does not exist in the current working
1799 directory, and ``directory`` is not `None`, it is assumed to be in
1800 ``directory``. Defaults to "export.{format}".
1801 format : `str`, optional
1802 File format for ``filename``. If `None`, the extension of
1803 ``filename`` will be used.
1804 transfer : `str`, optional
1805 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1806 skip_dimensions : `set`, optional
1807 Names of dimensions that should be skipped and not imported.
1808 idGenerationMode : `DatasetIdGenEnum`, optional
1809 Specifies option for generating dataset IDs when IDs are not
1810 provided or their type does not match backend type. By default
1811 unique IDs are generated for each inserted dataset.
1812 reuseIds : `bool`, optional
1813 If `True`, force re-use of imported dataset IDs for integer
1814 IDs, which are normally generated as auto-incremented; an exception
1815 will be raised if imported IDs clash with existing ones. This
1816 option has no effect on the use of globally-unique IDs which are
1817 always re-used (or generated if integer IDs are being imported).
1819 Raises
1820 ------
1821 TypeError
1822 Raised if the set of arguments passed is inconsistent, or if the
1823 butler is read-only.
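Examples
--------
A minimal sketch, assuming the export file was produced by
`~lsst.daf.butler.Butler.export`; the directory and file names are
hypothetical::

    butler.import_(directory="/path/to/exported/data",
                   filename="export.yaml", format="yaml",
                   transfer="symlink")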
1824 """
1825 if not self.isWriteable():
1826 raise TypeError("Butler is read-only.")
1827 if format is None:
1828 if filename is None:
1829 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1830 else:
1831 _, format = os.path.splitext(filename) # type: ignore
1832 elif filename is None:
1833 filename = f"export.{format}"
1834 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1835 filename = os.path.join(directory, filename)
1836 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
1838 def doImport(importStream: TextIO) -> None:
1839 backend = BackendClass(importStream, self.registry)
1840 backend.register()
1841 with self.transaction():
1842 backend.load(self.datastore, directory=directory, transfer=transfer,
1843 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode,
1844 reuseIds=reuseIds)
1846 if isinstance(filename, str):
1847 with open(filename, "r") as stream:
1848 doImport(stream)
1849 else:
1850 doImport(filename)
1852 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef],
1853 transfer: str = "auto",
1854 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
1855 skip_missing: bool = True,
1856 register_dataset_types: bool = False) -> List[DatasetRef]:
1857 """Transfer datasets to this Butler from a run in another Butler.
1859 Parameters
1860 ----------
1861 source_butler : `Butler`
1862 Butler from which the datasets are to be transferred.
1863 source_refs : iterable of `DatasetRef`
1864 Datasets defined in the source butler that should be transferred to
1865 this butler.
1866 transfer : `str`, optional
1867 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1868 id_gen_map : `dict` [`str`, `DatasetIdGenEnum`], optional
1869 A mapping of dataset type name to ID generation mode. Only used if
1870 the source butler is using integer IDs. Should not be used
1871 if this receiving butler uses integer IDs. Without this mapping,
1872 dataset import always uses `DatasetIdGenEnum.UNIQUE`.
1873 skip_missing : `bool`
1874 If `True`, datasets with no datastore artifact associated with
1875 them are not transferred. If `False` a registry entry will be
1876 created even if no datastore record is created (and so will
1877 look equivalent to the dataset being unstored).
1878 register_dataset_types : `bool`
1879 If `True` any missing dataset types are registered. Otherwise
1880 an exception is raised.
1882 Returns
1883 -------
1884 refs : `list` of `DatasetRef`
1885 The refs added to this Butler.
1887 Notes
1888 -----
1889 Requires that any dimension definitions are already present in the
1890 receiving Butler. A datastore artifact has to exist for its dataset
1891 to be transferred, but non-existence is not an error.
1893 Datasets that already exist in this run will be skipped.
1895 The datasets are imported as part of a transaction, although
1896 dataset types are registered before the transaction is started.
1897 This means that it is possible for a dataset type to be registered
1898 even though transfer has failed.
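Examples
--------
A hedged sketch, assuming both repositories use UUID dataset IDs; the
repository path, dataset type, and collection names are hypothetical::

    source_butler = Butler("/path/to/source/repo")
    refs = source_butler.registry.queryDatasets(
        "calexp", collections="HypotheticalCam/runs/1")
    # Register missing dataset types and copy the artifacts across.
    transferred = butler.transfer_from(source_butler, refs, transfer="copy",
                                       register_dataset_types=True)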
1899 """
1900 if not self.isWriteable():
1901 raise TypeError("Butler is read-only.")
1902 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1904 # Will iterate through the refs multiple times so need to convert
1905 # to a list if this isn't a collection.
1906 if not isinstance(source_refs, collections.abc.Collection):
1907 source_refs = list(source_refs)
1909 original_count = len(source_refs)
1910 log.info("Transferring %d datasets into %s", original_count, str(self))
1912 if id_gen_map is None:
1913 id_gen_map = {}
1915 # In some situations the datastore artifact may be missing
1916 # and we do not want that registry entry to be imported.
1917 # Asking datastore is not sufficient, the records may have been
1918 # purged, we have to ask for the (predicted) URI and check
1919 # existence explicitly. Execution butler is set up exactly like
1920 # this with no datastore records.
1921 artifact_existence: Dict[ButlerURI, bool] = {}
1922 if skip_missing:
1923 dataset_existence = source_butler.datastore.mexists(source_refs,
1924 artifact_existence=artifact_existence)
1925 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1926 filtered_count = len(source_refs)
1927 log.verbose("%d datasets removed because the artifact does not exist. Now have %d.",
1928 original_count - filtered_count, filtered_count)
1930 # Importing requires that we group the refs by dataset type and run
1931 # before doing the import.
1932 source_dataset_types = set()
1933 grouped_refs = defaultdict(list)
1934 grouped_indices = defaultdict(list)
1935 for i, ref in enumerate(source_refs):
1936 grouped_refs[ref.datasetType, ref.run].append(ref)
1937 grouped_indices[ref.datasetType, ref.run].append(i)
1938 source_dataset_types.add(ref.datasetType)
1940 # Check to see if the dataset type in the source butler has
1941 # the same definition in the target butler and register missing
1942 # ones if requested. Registration must happen outside a transaction.
1943 newly_registered_dataset_types = set()
1944 for datasetType in source_dataset_types:
1945 if register_dataset_types:
1946 # Let this raise immediately if inconsistent. Continuing
1947 # on to find additional inconsistent dataset types
1948 # might result in additional unwanted dataset types being
1949 # registered.
1950 if self.registry.registerDatasetType(datasetType):
1951 newly_registered_dataset_types.add(datasetType)
1952 else:
1953 # If the dataset type is missing, let it fail immediately.
1954 target_dataset_type = self.registry.getDatasetType(datasetType.name)
1955 if target_dataset_type != datasetType:
1956 raise ConflictingDefinitionError("Source butler dataset type differs from definition"
1957 f" in target butler: {datasetType} !="
1958 f" {target_dataset_type}")
1959 if newly_registered_dataset_types:
1960 # We may have registered some even if there were inconsistencies
1961 # but should let people know (or else remove them again).
1962 log.log(VERBOSE, "Registered the following dataset types in the target Butler: %s",
1963 ", ".join(d.name for d in newly_registered_dataset_types))
1964 else:
1965 log.log(VERBOSE, "All required dataset types are known to the target Butler")
1967 # The returned refs should be identical for UUIDs.
1968 # For now must also support integers and so need to retain the
1969 # newly-created refs from this registry.
1970 # Pre-size it so we can assign refs into the correct slots
1971 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
1972 default_id_gen = DatasetIdGenEnum.UNIQUE
1974 handled_collections: Set[str] = set()
1976 # Do all the importing in a single transaction.
1977 with self.transaction():
1978 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(),
1979 desc="Importing to registry"
1980 " by run and dataset type"):
1981 if run not in handled_collections:
1982 run_doc = source_butler.registry.getCollectionDocumentation(run)
1983 registered = self.registry.registerRun(run, doc=run_doc)
1984 handled_collections.add(run)
1985 if registered:
1986 log.log(VERBOSE, "Creating output run %s", run)
1988 id_generation_mode = default_id_gen
1989 if isinstance(refs_to_import[0].id, int):
1990 # ID generation mode might need to be overridden when
1991 # targeting UUID
1992 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
1994 n_refs = len(refs_to_import)
1995 log.verbose("Importing %d ref%s of dataset type %s into run %s",
1996 n_refs, "" if n_refs == 1 else "s", datasetType.name, run)
1998 # No way to know if this butler's registry uses UUID.
1999 # We have to trust the caller on this. If it fails they will
2000 # have to change their approach. We can't catch the exception
2001 # and retry with unique because that will mess up the
2002 # transaction handling. We aren't allowed to ask the registry
2003 # manager what type of ID it is using.
2004 imported_refs = self.registry._importDatasets(refs_to_import,
2005 idGenerationMode=id_generation_mode,
2006 expand=False)
2008 # Map them into the correct slots to match the initial order
2009 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2010 transferred_refs_tmp[i] = ref
2012 # Mypy insists that we might have None in here so we have to make
2013 # that explicit by assigning to a new variable and filtering out
2014 # something that won't be there.
2015 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2017 # Check consistency
2018 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2020 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2022 # The transferred refs need to be reordered to match the original
2023 # ordering given by the caller. Without this the datastore transfer
2024 # will be broken.
2026 # Ask the datastore to transfer. The datastore has to check that
2027 # the source datastore is compatible with the target datastore.
2028 self.datastore.transfer_from(source_butler.datastore, source_refs,
2029 local_refs=transferred_refs, transfer=transfer,
2030 artifact_existence=artifact_existence)
2032 return transferred_refs
2034 def validateConfiguration(self, logFailures: bool = False,
2035 datasetTypeNames: Optional[Iterable[str]] = None,
2036 ignore: Optional[Iterable[str]] = None) -> None:
2037 """Validate butler configuration.
2039 Checks that each `DatasetType` can be stored in the `Datastore`.
2041 Parameters
2042 ----------
2043 logFailures : `bool`, optional
2044 If `True`, output a log message for every validation error
2045 detected.
2046 datasetTypeNames : iterable of `str`, optional
2047 The `DatasetType` names that should be checked. This allows
2048 only a subset to be selected.
2049 ignore : iterable of `str`, optional
2050 Names of DatasetTypes to skip over. This can be used to skip
2051 known problems. If a named `DatasetType` corresponds to a
2052 composite, all components of that `DatasetType` will also be
2053 ignored.
2055 Raises
2056 ------
2057 ButlerValidationError
2058 Raised if there is some inconsistency with how this Butler
2059 is configured.
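Examples
--------
A minimal sketch; the dataset type names are hypothetical::

    # Validate every registered dataset type, logging each failure.
    butler.validateConfiguration(logFailures=True)
    # Restrict the check to a subset, skipping a known problem.
    butler.validateConfiguration(datasetTypeNames=["raw", "calexp"],
                                 ignore=["brokenType"])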
2060 """
2061 if datasetTypeNames:
2062 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2063 else:
2064 datasetTypes = list(self.registry.queryDatasetTypes())
2066 # filter out anything from the ignore list
2067 if ignore:
2068 ignore = set(ignore)
2069 datasetTypes = [e for e in datasetTypes
2070 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
2071 else:
2072 ignore = set()
2074 # Find all the registered instruments
2075 instruments = set(
2076 record.name for record in self.registry.queryDimensionRecords("instrument")
2077 )
2079 # For each datasetType that has an instrument dimension, create
2080 # a DatasetRef for each defined instrument
2081 datasetRefs = []
2083 for datasetType in datasetTypes:
2084 if "instrument" in datasetType.dimensions:
2085 for instrument in instruments:
2086 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
2087 conform=False)
2088 datasetRefs.append(datasetRef)
2090 entities: List[Union[DatasetType, DatasetRef]] = []
2091 entities.extend(datasetTypes)
2092 entities.extend(datasetRefs)
2094 datastoreErrorStr = None
2095 try:
2096 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2097 except ValidationError as e:
2098 datastoreErrorStr = str(e)
2100 # Also check that the LookupKeys used by the datastores match
2101 # registry and storage class definitions
2102 keys = self.datastore.getLookupKeys()
2104 failedNames = set()
2105 failedDataId = set()
2106 for key in keys:
2107 if key.name is not None:
2108 if key.name in ignore:
2109 continue
2111 # skip if specific datasetType names were requested and this
2112 # name does not match
2113 if datasetTypeNames and key.name not in datasetTypeNames:
2114 continue
2116 # See if it is a StorageClass or a DatasetType
2117 if key.name in self.storageClasses:
2118 pass
2119 else:
2120 try:
2121 self.registry.getDatasetType(key.name)
2122 except KeyError:
2123 if logFailures:
2124 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2125 failedNames.add(key)
2126 else:
2127 # Dimensions are checked for consistency when the Butler
2128 # is created and rendezvoused with a universe.
2129 pass
2131 # Check that the instrument is a valid instrument.
2132 # Currently only the instrument dimension is supported, so check for that.
2133 if key.dataId:
2134 dataIdKeys = set(key.dataId)
2135 if dataIdKeys != {"instrument"}:
2136 if logFailures:
2137 log.critical("Key '%s' has unsupported DataId override", key)
2138 failedDataId.add(key)
2139 elif key.dataId["instrument"] not in instruments:
2140 if logFailures:
2141 log.critical("Key '%s' has unknown instrument", key)
2142 failedDataId.add(key)
2144 messages = []
2146 if datastoreErrorStr:
2147 messages.append(datastoreErrorStr)
2149 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2150 (failedDataId, "Keys with bad DataId entries: ")):
2151 if failed:
2152 msg += ", ".join(str(k) for k in failed)
2153 messages.append(msg)
2155 if messages:
2156 raise ValidationError(";\n".join(messages))
2158 @property
2159 def collections(self) -> CollectionSearch:
2160 """The collections to search by default, in order (`CollectionSearch`).
2162 This is an alias for ``self.registry.defaults.collections``. It cannot
2163 be set directly in isolation, but all defaults may be changed together
2164 by assigning a new `RegistryDefaults` instance to
2165 ``self.registry.defaults``.
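For example, a sketch of changing all defaults together (the collection
and run names are hypothetical)::

    from lsst.daf.butler.registry import RegistryDefaults

    butler.registry.defaults = RegistryDefaults(
        collections=["HypotheticalCam/defaults"], run="u/alice/run1")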
2166 """
2167 return self.registry.defaults.collections
2169 @property
2170 def run(self) -> Optional[str]:
2171 """Name of the run this butler writes outputs to by default (`str` or
2172 `None`).
2174 This is an alias for ``self.registry.defaults.run``. It cannot be set
2175 directly in isolation, but all defaults may be changed together by
2176 assigning a new `RegistryDefaults` instance to
2177 ``self.registry.defaults``.
2178 """
2179 return self.registry.defaults.run
2181 registry: Registry
2182 """The object that manages dataset metadata and relationships (`Registry`).
2184 Most operations that don't involve reading or writing butler datasets are
2185 accessible only via `Registry` methods.
2186 """
2188 datastore: Datastore
2189 """The object that manages actual dataset storage (`Datastore`).
2191 Direct user access to the datastore should rarely be necessary; the primary
2192 exception is the case where a `Datastore` implementation provides extra
2193 functionality beyond what the base class defines.
2194 """
2196 storageClasses: StorageClassFactory
2197 """An object that maps known storage class names to objects that fully
2198 describe them (`StorageClassFactory`).
2199 """
2201 _allow_put_of_predefined_dataset: bool
2202 """Allow a put to succeed even if there is already a registry entry for it
2203 but not a datastore record. (`bool`)."""