Coverage for python/lsst/daf/butler/_butler.py: 10%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImportType
65from lsst.utils.introspection import get_class_of
66from lsst.utils.logging import getLogger, VERBOSE
67from .core import (
68 AmbiguousDatasetError,
69 ButlerURI,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetType,
77 Datastore,
78 Dimension,
79 DimensionConfig,
80 FileDataset,
81 Progress,
82 StorageClassFactory,
83 Timespan,
84 ValidationError,
85)
86from .core.repoRelocation import BUTLER_ROOT_TAG
87from .core.utils import transactional
88from ._deferredDatasetHandle import DeferredDatasetHandle
89from ._butlerConfig import ButlerConfig
90from ._butlerRepoIndex import ButlerRepoIndex
91from .registry import (
92 Registry,
93 RegistryConfig,
94 RegistryDefaults,
95 CollectionSearch,
96 CollectionType,
97 ConflictingDefinitionError,
98 DatasetIdGenEnum,
99)
100from .transfers import RepoExportContext
102log = getLogger(__name__)
105class ButlerValidationError(ValidationError):
106 """There is a problem with the Butler configuration."""
107 pass
110class PruneCollectionsArgsError(TypeError):
111 """Base class for errors relating to Butler.pruneCollections input
112 arguments.
113 """
114 pass
117class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
118 """Raised when purge and unstore are both required to be True, and
119 purge is True but unstore is False.
120 """
122 def __init__(self) -> None:
123 super().__init__("Cannot pass purge=True without unstore=True.")
126class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
127 """Raised when pruning a RUN collection but purge is False."""
129 def __init__(self, collectionType: CollectionType):
130 self.collectionType = collectionType
131 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
134class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
135 """Raised when purge is True but is not supported for the given
136 collection."""
138 def __init__(self, collectionType: CollectionType):
139 self.collectionType = collectionType
140 super().__init__(
141 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
144class Butler:
145 """Main entry point for the data access system.
147 Parameters
148 ----------
149 config : `ButlerConfig`, `Config` or `str`, optional.
150 Configuration. Anything acceptable to the
151 `ButlerConfig` constructor. If a directory path
152 is given the configuration will be read from a ``butler.yaml`` file in
153 that location. If `None` is given default values will be used.
154 butler : `Butler`, optional.
155 If provided, construct a new Butler that uses the same registry and
156 datastore as the given one, but with the given collection and run.
157 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
158 arguments.
159 collections : `str` or `Iterable` [ `str` ], optional
160 An expression specifying the collections to be searched (in order) when
161 reading datasets.
162 This may be a `str` collection name or an iterable thereof.
163 See :ref:`daf_butler_collection_expressions` for more information.
164 These collections are not registered automatically and must be
165 manually registered before they are used by any method, but they may be
166 manually registered after the `Butler` is initialized.
167 run : `str`, optional
168 Name of the `~CollectionType.RUN` collection new datasets should be
169 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
170 ``collections`` will be set to ``[run]``. If not `None`, this
171 collection will automatically be registered. If this is not set (and
172 ``writeable`` is not set either), a read-only butler will be created.
173 searchPaths : `list` of `str`, optional
174 Directory paths to search when calculating the full Butler
175 configuration. Not used if the supplied config is already a
176 `ButlerConfig`.
177 writeable : `bool`, optional
178 Explicitly sets whether the butler supports write operations. If not
179 provided, a read-write butler is created if ``run`` is not `None`.
181 inferDefaults : `bool`, optional
182 If `True` (default) infer default data ID values from the values
183 present in the datasets in ``collections``: if all collections have the
184 same value (or no value) for a governor dimension, that value will be
185 the default for that dimension. Nonexistent collections are ignored.
186 If a default value is provided explicitly for a governor dimension via
187 ``**kwargs``, no default will be inferred for that dimension.
188 **kwargs : `str`
189 Default data ID key-value pairs. These may only identify "governor"
190 dimensions like ``instrument`` and ``skymap``.
192 Examples
193 --------
194 While there are many ways to control exactly how a `Butler` interacts with
195 the collections in its `Registry`, the most common cases are still simple.
197 For a read-only `Butler` that searches one collection, do::
199 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
201 For a read-write `Butler` that writes to and reads from a
202 `~CollectionType.RUN` collection::
204 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
206 The `Butler` passed to a ``PipelineTask`` is often much more complex,
207 because we want to write to one `~CollectionType.RUN` collection but read
208 from several others (as well)::
210 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
211 collections=["u/alice/DM-50000/a",
212 "u/bob/DM-49998",
213 "HSC/defaults"])
215 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
216 Datasets will be read first from that run (since it appears first in the
217 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
219 Finally, one can always create a `Butler` with no collections::
221 butler = Butler("/path/to/repo", writeable=True)
223 This can be extremely useful when you just want to use ``butler.registry``,
224 e.g. for inserting dimension data or managing collections, or when the
225 collections you want to use with the butler are not consistent.
226 Passing ``writeable`` explicitly here is only necessary if you want to be
227 able to make changes to the repo - usually the value for ``writeable`` can
228 be guessed from the collection arguments provided, but it defaults to
229 `False` when there are no collection arguments.
230 """
231 def __init__(self, config: Union[Config, str, None] = None, *,
232 butler: Optional[Butler] = None,
233 collections: Any = None,
234 run: Optional[str] = None,
235 searchPaths: Optional[List[str]] = None,
236 writeable: Optional[bool] = None,
237 inferDefaults: bool = True,
238 **kwargs: str,
239 ):
240 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
241 # Load registry, datastore, etc. from config or existing butler.
242 if butler is not None:
243 if config is not None or searchPaths is not None or writeable is not None:
244 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
245 "arguments with 'butler' argument.")
246 self.registry = butler.registry.copy(defaults)
247 self.datastore = butler.datastore
248 self.storageClasses = butler.storageClasses
249 self._config: ButlerConfig = butler._config
250 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
251 else:
252 self._config = ButlerConfig(config, searchPaths=searchPaths)
253 try:
254 if "root" in self._config:
255 butlerRoot = self._config["root"]
256 else:
257 butlerRoot = self._config.configDir
258 if writeable is None:
259 writeable = run is not None
260 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
261 defaults=defaults)
262 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
263 butlerRoot=butlerRoot)
264 self.storageClasses = StorageClassFactory()
265 self.storageClasses.addFromConfig(self._config)
266 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset",
267 False)
268 except Exception:
269 # Failures here usually mean that configuration is incomplete,
270 # just issue an error message which includes config file URI.
271 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
272 raise
274 if "run" in self._config or "collection" in self._config:
275 raise ValueError("Passing a run or collection via configuration is no longer supported.")
277 GENERATION: ClassVar[int] = 3
278 """This is a Generation 3 Butler.
280 This attribute may be removed in the future, once the Generation 2 Butler
281 interface has been fully retired; it should only be used in transitional
282 code.
283 """
285 @classmethod
286 def get_repo_uri(cls, label: str) -> ButlerURI:
287 """Look up the label in a butler repository index.
289 Parameters
290 ----------
291 label : `str`
292 Label of the Butler repository to look up.
294 Returns
295 -------
296 uri : `ButlerURI`
297 URI to the Butler repository associated with the given label.
299 Raises
300 ------
301 KeyError
302 Raised if the label is not found in the index, or if an index
303 can not be found at all.
305 Notes
306 -----
307 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
308 information is discovered.
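
Examples
--------
A minimal sketch; the label ``"main"`` is hypothetical and must be defined
in the repository index for the lookup to succeed::

    uri = Butler.get_repo_uri("main")
    butler = Butler(uri, writeable=False)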
309 """
310 return ButlerRepoIndex.get_repo_uri(label)
312 @classmethod
313 def get_known_repos(cls) -> Set[str]:
314 """Retrieve the list of known repository labels.
316 Returns
317 -------
318 repos : `set` of `str`
319 All the known labels. Can be empty if no index can be found.
321 Notes
322 -----
323 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
324 information is discovered.
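
Examples
--------
A small sketch; it assumes at least one labeled repository is defined in
the index (the labels printed are whatever the index provides)::

    for label in sorted(Butler.get_known_repos()):
        print(label, Butler.get_repo_uri(label))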
325 """
326 return ButlerRepoIndex.get_known_repos()
328 @staticmethod
329 def makeRepo(root: str, config: Union[Config, str, None] = None,
330 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
331 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
332 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
333 """Create an empty data repository by adding a butler.yaml config
334 to a repository root directory.
336 Parameters
337 ----------
338 root : `str` or `ButlerURI`
339 Path or URI to the root location of the new repository. Will be
340 created if it does not exist.
341 config : `Config` or `str`, optional
342 Configuration to write to the repository, after setting any
343 root-dependent Registry or Datastore config options. Can not
344 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
345 configuration will be used. Root-dependent config options
346 specified in this config are overwritten if ``forceConfigRoot``
347 is `True`.
348 dimensionConfig : `Config` or `str`, optional
349 Configuration for dimensions; it will be used to initialize the registry
350 database.
351 standalone : `bool`
352 If True, write all expanded defaults, not just customized or
353 repository-specific settings.
354 This (mostly) decouples the repository from the default
355 configuration, insulating it from changes to the defaults (which
356 may be good or bad, depending on the nature of the changes).
357 Future *additions* to the defaults will still be picked up when
358 initializing `Butlers` to repos created with ``standalone=True``.
359 searchPaths : `list` of `str`, optional
360 Directory paths to search when calculating the full butler
361 configuration.
362 forceConfigRoot : `bool`, optional
363 If `False`, any values present in the supplied ``config`` that
364 would normally be reset are not overridden and will appear
365 directly in the output config. This allows non-standard overrides
366 of the root directory for a datastore or registry to be given.
367 If this parameter is `True` the values for ``root`` will be
368 forced into the resulting config if appropriate.
369 outfile : `str`, optional
370 If not-`None`, the output configuration will be written to this
371 location rather than into the repository itself. Can be a URI
372 string. Can refer to a directory that will be used to write
373 ``butler.yaml``.
374 overwrite : `bool`, optional
375 Create a new configuration file even if one already exists
376 in the specified output location. Default is to raise
377 an exception.
379 Returns
380 -------
381 config : `Config`
382 The updated `Config` instance written to the repo.
384 Raises
385 ------
386 ValueError
387 Raised if a ButlerConfig or ConfigSubset is passed instead of a
388 regular Config (as these subclasses would make it impossible to
389 support ``standalone=False``).
390 FileExistsError
391 Raised if the output config file already exists.
392 os.error
393 Raised if the directory does not exist, exists but is not a
394 directory, or cannot be created.
396 Notes
397 -----
398 Note that when ``standalone=False`` (the default), the configuration
399 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
400 construct the repository should also be used to construct any Butlers
401 to avoid configuration inconsistencies.
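
Examples
--------
A minimal sketch with a hypothetical root path; default registry and
datastore configuration is used::

    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)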
402 """
403 if isinstance(config, (ButlerConfig, ConfigSubset)):
404 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
406 # Ensure that the root of the repository exists or can be made
407 uri = ButlerURI(root, forceDirectory=True)
408 uri.mkdir()
410 config = Config(config)
412 # If we are creating a new repo from scratch with relative roots,
413 # do not propagate an explicit root from the config file
414 if "root" in config:
415 del config["root"]
417 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
418 imported_class = doImportType(full["datastore", "cls"])
419 if not issubclass(imported_class, Datastore):
420 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
421 datastoreClass: Type[Datastore] = imported_class
422 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
424 # if key exists in given config, parse it, otherwise parse the defaults
425 # in the expanded config
426 if config.get(("registry", "db")):
427 registryConfig = RegistryConfig(config)
428 else:
429 registryConfig = RegistryConfig(full)
430 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
431 if defaultDatabaseUri is not None:
432 Config.updateParameters(RegistryConfig, config, full,
433 toUpdate={"db": defaultDatabaseUri},
434 overwrite=forceConfigRoot)
435 else:
436 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
437 overwrite=forceConfigRoot)
439 if standalone:
440 config.merge(full)
441 else:
442 # Always expand the registry.managers section into the per-repo
443 # config, because after the database schema is created, it's not
444 # allowed to change anymore. Note that in the standalone=True
445 # branch, _everything_ in the config is expanded, so there's no
446 # need to special case this.
447 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
448 configURI: Union[str, ButlerURI]
449 if outfile is not None:
450 # When writing to a separate location we must include
451 # the root of the butler repo in the config else it won't know
452 # where to look.
453 config["root"] = uri.geturl()
454 configURI = outfile
455 else:
456 configURI = uri
457 config.dumpToUri(configURI, overwrite=overwrite)
459 # Create Registry and populate tables
460 registryConfig = RegistryConfig(config.get("registry"))
461 dimensionConfig = DimensionConfig(dimensionConfig)
462 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
464 log.verbose("Wrote new Butler configuration file to %s", configURI)
466 return config
468 @classmethod
469 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
470 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
471 """Callable used to unpickle a Butler.
473 We prefer not to use ``Butler.__init__`` directly so we can force some
474 of its many arguments to be keyword-only (note that ``__reduce__``
475 can only invoke callables with positional arguments).
477 Parameters
478 ----------
479 config : `ButlerConfig`
480 Butler configuration, already coerced into a true `ButlerConfig`
481 instance (and hence after any search paths for overrides have been
482 utilized).
483 collections : `CollectionSearch`
484 Names of the default collections to read from.
485 run : `str`, optional
486 Name of the default `~CollectionType.RUN` collection to write to.
487 defaultDataId : `dict` [ `str`, `str` ]
488 Default data ID values.
489 writeable : `bool`
490 Whether the Butler should support write operations.
492 Returns
493 -------
494 butler : `Butler`
495 A new `Butler` instance.
496 """
497 # MyPy doesn't recognize that the kwargs below are totally valid; it
498 # seems to think ``**defaultDataId`` is a _positional_ argument!
499 return cls(config=config, collections=collections, run=run, writeable=writeable,
500 **defaultDataId) # type: ignore
502 def __reduce__(self) -> tuple:
503 """Support pickling.
504 """
505 return (Butler._unpickle, (self._config, self.collections, self.run,
506 self.registry.defaults.dataId.byName(),
507 self.registry.isWriteable()))
509 def __str__(self) -> str:
510 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
511 self.collections, self.run, self.datastore, self.registry)
513 def isWriteable(self) -> bool:
514 """Return `True` if this `Butler` supports write operations.
515 """
516 return self.registry.isWriteable()
518 @contextlib.contextmanager
519 def transaction(self) -> Iterator[None]:
520 """Context manager supporting `Butler` transactions.
522 Transactions can be nested.
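
Examples
--------
A sketch, assuming a writeable ``butler`` with a default run and a
hypothetical ``calexp`` dataset type; any exception raised inside the
block rolls back both the registry insert and the datastore write::

    with butler.transaction():
        butler.put(exposure, "calexp", instrument="HSC", visit=12345,
                   detector=42)
        raise RuntimeError("Oops")  # undoes the put above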
523 """
524 with self.registry.transaction():
525 with self.datastore.transaction():
526 yield
528 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
529 dataId: Optional[DataId] = None, **kwargs: Any
530 ) -> Tuple[DatasetType, Optional[DataId]]:
531 """Standardize the arguments passed to several Butler APIs.
533 Parameters
534 ----------
535 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
536 When `DatasetRef` is provided, the `dataId` should be `None`.
537 Otherwise the `DatasetType` or name thereof.
538 dataId : `dict` or `DataCoordinate`
539 A `dict` of `Dimension` link name, value pairs that label the
540 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
541 should be provided as the second argument.
542 **kwargs
543 Additional keyword arguments used to augment or construct a
544 `DataCoordinate`. See `DataCoordinate.standardize`
545 parameters.
547 Returns
548 -------
549 datasetType : `DatasetType`
550 A `DatasetType` instance extracted from ``datasetRefOrType``.
551 dataId : `dict` or `DataId`, optional
552 Argument that can be used (along with ``kwargs``) to construct a
553 `DataId`.
555 Notes
556 -----
557 Butler APIs that conceptually need a DatasetRef also allow passing a
558 `DatasetType` (or the name of one) and a `DataId` (or a dict and
559 keyword arguments that can be used to construct one) separately. This
560 method accepts those arguments and always returns a true `DatasetType`
561 and a `DataId` or `dict`.
563 Standardization of `dict` vs `DataId` is best handled by passing the
564 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
565 generally similarly flexible.
566 """
567 externalDatasetType: Optional[DatasetType] = None
568 internalDatasetType: Optional[DatasetType] = None
569 if isinstance(datasetRefOrType, DatasetRef):
570 if dataId is not None or kwargs:
571 raise ValueError("DatasetRef given, cannot use dataId as well")
572 externalDatasetType = datasetRefOrType.datasetType
573 dataId = datasetRefOrType.dataId
574 else:
575 # Don't check whether DataId is provided, because Registry APIs
576 # can usually construct a better error message when it wasn't.
577 if isinstance(datasetRefOrType, DatasetType):
578 externalDatasetType = datasetRefOrType
579 else:
580 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
582 # Check that they are self-consistent
583 if externalDatasetType is not None:
584 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
585 if externalDatasetType != internalDatasetType:
586 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
587 f"registry definition ({internalDatasetType})")
589 assert internalDatasetType is not None
590 return internalDatasetType, dataId
592 def _rewrite_data_id(self, dataId: Optional[DataId], datasetType: DatasetType,
593 **kwargs: Any) -> Tuple[Optional[DataId], Dict[str, Any]]:
594 """Rewrite a data ID taking into account dimension records.
596 Take a Data ID and keyword args and rewrite it if necessary to
597 allow the user to specify dimension records rather than dimension
598 primary values.
600 This allows a user to include a dataId dict with keys of
601 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
602 the integer exposure ID. It also allows a string to be given
603 for a dimension value rather than the integer ID if that is more
604 convenient. For example, rather than having to specify the
605 detector with ``detector.full_name``, a string given for ``detector``
606 will be interpreted as the full name and converted to the integer
607 value.
609 Keyword arguments can also use strings for dimensions like detector
610 and exposure but python does not allow them to include ``.`` and
611 so the ``exposure.day_obs`` syntax can not be used in a keyword
612 argument.
614 Parameters
615 ----------
616 dataId : `dict` or `DataCoordinate`
617 A `dict` of `Dimension` link name, value pairs that will label the
618 `DatasetRef` within a Collection.
619 datasetType : `DatasetType`
620 The dataset type associated with this dataId. Required to
621 determine the relevant dimensions.
622 **kwargs
623 Additional keyword arguments used to augment or construct a
624 `DataId`. See `DataId` parameters.
626 Returns
627 -------
628 dataId : `dict` or `DataCoordinate`
629 The possibly rewritten dataId. If given a `DataCoordinate` and
630 no keyword arguments, the original dataId will be returned
631 unchanged.
632 **kwargs : `dict`
633 Any unused keyword arguments.
634 """
635 # Do nothing if we have a standalone DataCoordinate.
636 if isinstance(dataId, DataCoordinate) and not kwargs:
637 return dataId, kwargs
639 # Process dimension records that are using record information
640 # rather than ids
641 newDataId: Dict[str, DataIdValue] = {}
642 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
644 # if all the dataId comes from keyword parameters we do not need
645 # to do anything here because they can't be of the form
646 # exposure.obs_id because a "." is not allowed in a keyword parameter.
647 if dataId:
648 for k, v in dataId.items():
649 # If we have a Dimension we do not need to do anything
650 # because it cannot be a compound key.
651 if isinstance(k, str) and "." in k:
652 # Someone is using a more human-readable dataId
653 dimensionName, record = k.split(".", 1)
654 byRecord[dimensionName][record] = v
655 elif isinstance(k, Dimension):
656 newDataId[k.name] = v
657 else:
658 newDataId[k] = v
660 # Go through the updated dataId and check the type in case someone is
661 # using an alternate key. We have already filtered out the compound
662 # keys dimensions.record format.
663 not_dimensions = {}
665 # Will need to look in the dataId and the keyword arguments
666 # and will remove them if they need to be fixed or are unrecognized.
667 for dataIdDict in (newDataId, kwargs):
668 # Use a list so we can adjust the dict safely in the loop
669 for dimensionName in list(dataIdDict):
670 value = dataIdDict[dimensionName]
671 try:
672 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
673 except KeyError:
674 # This is not a real dimension
675 not_dimensions[dimensionName] = value
676 del dataIdDict[dimensionName]
677 continue
679 # Convert an integral type to an explicit int to simplify
680 # comparisons here
681 if isinstance(value, numbers.Integral):
682 value = int(value)
684 if not isinstance(value, dimension.primaryKey.getPythonType()):
685 for alternate in dimension.alternateKeys:
686 if isinstance(value, alternate.getPythonType()):
687 byRecord[dimensionName][alternate.name] = value
688 del dataIdDict[dimensionName]
689 log.debug("Converting dimension %s to %s.%s=%s",
690 dimensionName, dimensionName, alternate.name, value)
691 break
692 else:
693 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
694 "Could not find matching alternative (primary key has type %s) "
695 "so attempting to use as-is.",
696 value, dimensionName, dimension.primaryKey.getPythonType())
698 # If we have some unrecognized dimensions we have to try to connect
699 # them to records in other dimensions. This is made more complicated
700 # by some dimensions having records with clashing names. A mitigation
701 # is that we can tell by this point which dimensions are missing
702 # for the DatasetType but this does not work for calibrations
703 # where additional dimensions can be used to constrain the temporal
704 # axis.
705 if not_dimensions:
706 # Calculate missing dimensions
707 provided = set(newDataId) | set(kwargs) | set(byRecord)
708 missingDimensions = datasetType.dimensions.names - provided
710 # For calibrations we may well be needing temporal dimensions
711 # so rather than always including all dimensions in the scan
712 # restrict things a little. It is still possible for there
713 # to be confusion over day_obs in visit vs exposure for example.
714 # If we are not searching calibration collections things may
715 # fail but they are going to fail anyway because of the
716 # ambiguousness of the dataId...
717 candidateDimensions: Set[str] = set()
718 candidateDimensions.update(missingDimensions)
719 if datasetType.isCalibration():
720 for dim in self.registry.dimensions.getStaticDimensions():
721 if dim.temporal:
722 candidateDimensions.add(str(dim))
724 # Look up table for the first association with a dimension
725 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
727 # Keep track of whether an item is associated with multiple
728 # dimensions.
729 counter: Counter[str] = Counter()
730 assigned: Dict[str, Set[str]] = defaultdict(set)
732 # Go through the missing dimensions and associate the
733 # given names with records within those dimensions
734 for dimensionName in candidateDimensions:
735 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
736 fields = dimension.metadata.names | dimension.uniqueKeys.names
737 for field in not_dimensions:
738 if field in fields:
739 guessedAssociation[dimensionName][field] = not_dimensions[field]
740 counter[dimensionName] += 1
741 assigned[field].add(dimensionName)
743 # There is a chance we have allocated a single dataId item
744 # to multiple dimensions. Need to decide which should be retained.
745 # For now assume that the most popular alternative wins.
746 # This means that day_obs with seq_num will result in
747 # exposure.day_obs and not visit.day_obs
748 # Also prefer an explicitly missing dimension over an inferred
749 # temporal dimension.
750 for fieldName, assignedDimensions in assigned.items():
751 if len(assignedDimensions) > 1:
752 # Pick the most popular (preferring mandatory dimensions)
753 requiredButMissing = assignedDimensions.intersection(missingDimensions)
754 if requiredButMissing:
755 candidateDimensions = requiredButMissing
756 else:
757 candidateDimensions = assignedDimensions
759 # Select the relevant items and get a new restricted
760 # counter.
761 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
762 duplicatesCounter: Counter[str] = Counter()
763 duplicatesCounter.update(theseCounts)
765 # Choose the most common. If they are equally common
766 # we will pick the one that was found first.
767 # Returns a list of tuples
768 selected = duplicatesCounter.most_common(1)[0][0]
770 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
771 " Removed ambiguity by choosing dimension %s.",
772 fieldName, ", ".join(assignedDimensions), selected)
774 for candidateDimension in assignedDimensions:
775 if candidateDimension != selected:
776 del guessedAssociation[candidateDimension][fieldName]
778 # Update the record look up dict with the new associations
779 for dimensionName, values in guessedAssociation.items():
780 if values: # A dict might now be empty
781 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
782 dimensionName, values)
783 byRecord[dimensionName].update(values)
785 if byRecord:
786 # Some record specifiers were found so we need to convert
787 # them to the Id form
788 for dimensionName, values in byRecord.items():
789 if dimensionName in newDataId:
790 log.warning("DataId specified explicit %s dimension value of %s in addition to"
791 " general record specifiers for it of %s. Ignoring record information.",
792 dimensionName, newDataId[dimensionName], str(values))
793 continue
795 # Build up a WHERE expression
796 bind = {k: v for k, v in values.items()}
797 where = " AND ".join(f"{dimensionName}.{k} = {k}"
798 for k in bind)
800 # Hopefully we get a single record that matches
801 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
802 where=where, bind=bind, **kwargs))
804 if len(records) != 1:
805 if len(records) > 1:
806 log.debug("Received %d records from constraints of %s", len(records), str(values))
807 for r in records:
808 log.debug("- %s", str(r))
809 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
810 f" uniquely constrained to a single dataset by {values}."
811 f" Got {len(records)} results.")
812 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
813 f" records when constrained by {values}")
815 # Get the primary key from the real dimension object
816 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
817 if not isinstance(dimension, Dimension):
818 raise RuntimeError(
819 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
820 )
821 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
823 # We have modified the dataId so need to switch to it
824 dataId = newDataId
826 return dataId, kwargs
828 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
829 dataId: Optional[DataId] = None, *,
830 collections: Any = None,
831 allowUnresolved: bool = False,
832 **kwargs: Any) -> DatasetRef:
833 """Shared logic for methods that start with a search for a dataset in
834 the registry.
836 Parameters
837 ----------
838 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
839 When `DatasetRef` is provided, the `dataId` should be `None`.
840 Otherwise the `DatasetType` or name thereof.
841 dataId : `dict` or `DataCoordinate`, optional
842 A `dict` of `Dimension` link name, value pairs that label the
843 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
844 should be provided as the first argument.
845 collections : Any, optional
846 Collections to be searched, overriding ``self.collections``.
847 Can be any of the types supported by the ``collections`` argument
848 to butler construction.
849 allowUnresolved : `bool`, optional
850 If `True`, return an unresolved `DatasetRef` if finding a resolved
851 one in the `Registry` fails. Defaults to `False`.
852 **kwargs
853 Additional keyword arguments used to augment or construct a
854 `DataId`. See `DataId` parameters.
856 Returns
857 -------
858 ref : `DatasetRef`
859 A reference to the dataset identified by the given arguments.
861 Raises
862 ------
863 LookupError
864 Raised if no matching dataset exists in the `Registry` (and
865 ``allowUnresolved is False``).
866 ValueError
867 Raised if a resolved `DatasetRef` was passed as an input, but it
868 differs from the one found in the registry.
869 TypeError
870 Raised if no collections were provided.
871 """
872 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
873 if isinstance(datasetRefOrType, DatasetRef):
874 idNumber = datasetRefOrType.id
875 else:
876 idNumber = None
877 timespan: Optional[Timespan] = None
879 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
881 if datasetType.isCalibration():
882 # Because this is a calibration dataset, first try to
883 # standardize the data ID without restricting the dimensions to
884 # those of the dataset type requested, because there may be extra
885 # dimensions that provide temporal information for a validity-range
886 # lookup.
887 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
888 defaults=self.registry.defaults.dataId, **kwargs)
889 if dataId.graph.temporal:
890 dataId = self.registry.expandDataId(dataId)
891 timespan = dataId.timespan
892 else:
893 # Standardize the data ID to just the dimensions of the dataset
894 # type instead of letting registry.findDataset do it, so we get the
895 # result even if no dataset is found.
896 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
897 defaults=self.registry.defaults.dataId, **kwargs)
898 # Always lookup the DatasetRef, even if one is given, to ensure it is
899 # present in the current collection.
900 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
901 if ref is None:
902 if allowUnresolved:
903 return DatasetRef(datasetType, dataId)
904 else:
905 if collections is None:
906 collections = self.registry.defaults.collections
907 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
908 f"could not be found in collections {collections}.")
909 if idNumber is not None and idNumber != ref.id:
910 if collections is None:
911 collections = self.registry.defaults.collections
912 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
913 f"id ({ref.id}) in registry in collections {collections}.")
914 return ref
916 @transactional
917 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
918 dataId: Optional[DataId] = None, *,
919 run: Optional[str] = None,
920 **kwargs: Any) -> DatasetRef:
921 """Store and register a dataset.
923 Parameters
924 ----------
925 obj : `object`
926 The dataset.
927 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
928 When `DatasetRef` is provided, ``dataId`` should be `None`.
929 Otherwise the `DatasetType` or name thereof.
930 dataId : `dict` or `DataCoordinate`
931 A `dict` of `Dimension` link name, value pairs that label the
932 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
933 should be provided as the second argument.
934 run : `str`, optional
935 The name of the run the dataset should be added to, overriding
936 ``self.run``.
937 **kwargs
938 Additional keyword arguments used to augment or construct a
939 `DataCoordinate`. See `DataCoordinate.standardize`
940 parameters.
942 Returns
943 -------
944 ref : `DatasetRef`
945 A reference to the stored dataset, updated with the correct id if
946 given.
948 Raises
949 ------
950 TypeError
951 Raised if the butler is read-only or if no run has been provided.
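
Examples
--------
A minimal sketch; the repository path, run name, dataset type, and data ID
values are hypothetical, and ``catalog`` stands for any in-memory object
matching the dataset type's storage class::

    butler = Butler("/path/to/repo", run="u/alice/example")
    ref = butler.put(catalog, "src", instrument="HSC", visit=12345,
                     detector=42)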
952 """
953 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
954 if not self.isWriteable():
955 raise TypeError("Butler is read-only.")
956 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
957 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
958 raise ValueError("DatasetRef must not be in registry, must have None id")
960 # Handle dimension records in dataId
961 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
963 # Add Registry Dataset entry.
964 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
966 # For an execution butler the datasets will be pre-defined.
967 # If the butler is configured that way, datasets should only be inserted
968 # if they do not already exist in registry. Trying and catching
969 # ConflictingDefinitionError will not work because the transaction
970 # will be corrupted. Instead, in this mode always check first.
971 ref = None
972 ref_is_predefined = False
973 if self._allow_put_of_predefined_dataset:
974 # Get the matching ref for this run.
975 ref = self.registry.findDataset(datasetType, collections=run,
976 dataId=dataId)
978 if ref:
979 # Must be expanded form for datastore templating
980 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
981 ref = ref.expanded(dataId)
982 ref_is_predefined = True
984 if not ref:
985 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
987 # If the ref is predefined it is possible that the datastore also
988 # has the record. Asking datastore to put it again will result in
989 # the artifact being recreated, overwriting the previous one; the
990 # subsequent failure to write the record will then cause the artifact
991 # to be removed. Much safer to ask first before attempting to
992 # overwrite. Race conditions should not be an issue for the
993 # execution butler environment.
994 if ref_is_predefined:
995 if self.datastore.knows(ref):
996 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
998 self.datastore.put(obj, ref)
1000 return ref
1002 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1003 """Retrieve a stored dataset.
1005 Unlike `Butler.get`, this method allows datasets outside the Butler's
1006 collection to be read as long as the `DatasetRef` that identifies them
1007 can be obtained separately.
1009 Parameters
1010 ----------
1011 ref : `DatasetRef`
1012 Resolved reference to an already stored dataset.
1013 parameters : `dict`
1014 Additional StorageClass-defined options to control reading,
1015 typically used to efficiently read only a subset of the dataset.
1017 Returns
1018 -------
1019 obj : `object`
1020 The dataset.
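
Examples
--------
A sketch retrieving a dataset from a resolved reference obtained through a
registry query; the dataset type and collection names are hypothetical::

    refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/test")
    calexp = butler.getDirect(next(iter(refs)))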
1021 """
1022 return self.datastore.get(ref, parameters=parameters)
1024 def getDirectDeferred(self, ref: DatasetRef, *,
1025 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
1026 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1027 from a resolved `DatasetRef`.
1029 Parameters
1030 ----------
1031 ref : `DatasetRef`
1032 Resolved reference to an already stored dataset.
1033 parameters : `dict`
1034 Additional StorageClass-defined options to control reading,
1035 typically used to efficiently read only a subset of the dataset.
1037 Returns
1038 -------
1039 obj : `DeferredDatasetHandle`
1040 A handle which can be used to retrieve a dataset at a later time.
1042 Raises
1043 ------
1044 AmbiguousDatasetError
1045 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1046 """
1047 if ref.id is None:
1048 raise AmbiguousDatasetError(
1049 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1050 )
1051 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1053 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1054 dataId: Optional[DataId] = None, *,
1055 parameters: Union[dict, None] = None,
1056 collections: Any = None,
1057 **kwargs: Any) -> DeferredDatasetHandle:
1058 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1059 after an immediate registry lookup.
1061 Parameters
1062 ----------
1063 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1064 When `DatasetRef` is provided, the `dataId` should be `None`.
1065 Otherwise the `DatasetType` or name thereof.
1066 dataId : `dict` or `DataCoordinate`, optional
1067 A `dict` of `Dimension` link name, value pairs that label the
1068 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1069 should be provided as the first argument.
1070 parameters : `dict`
1071 Additional StorageClass-defined options to control reading,
1072 typically used to efficiently read only a subset of the dataset.
1073 collections : Any, optional
1074 Collections to be searched, overriding ``self.collections``.
1075 Can be any of the types supported by the ``collections`` argument
1076 to butler construction.
1077 **kwargs
1078 Additional keyword arguments used to augment or construct a
1079 `DataId`. See `DataId` parameters.
1081 Returns
1082 -------
1083 obj : `DeferredDatasetHandle`
1084 A handle which can be used to retrieve a dataset at a later time.
1086 Raises
1087 ------
1088 LookupError
1089 Raised if no matching dataset exists in the `Registry` (and
1090 ``allowUnresolved is False``).
1091 ValueError
1092 Raised if a resolved `DatasetRef` was passed as an input, but it
1093 differs from the one found in the registry.
1094 TypeError
1095 Raised if no collections were provided.
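
Examples
--------
A sketch with hypothetical names; the registry lookup happens immediately
but the datastore read is deferred until ``get`` is called on the handle::

    handle = butler.getDeferred("calexp", instrument="HSC", visit=12345,
                                detector=42)
    calexp = handle.get()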
1096 """
1097 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1098 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1100 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1101 dataId: Optional[DataId] = None, *,
1102 parameters: Optional[Dict[str, Any]] = None,
1103 collections: Any = None,
1104 **kwargs: Any) -> Any:
1105 """Retrieve a stored dataset.
1107 Parameters
1108 ----------
1109 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1110 When `DatasetRef` is provided, the `dataId` should be `None`.
1111 Otherwise the `DatasetType` or name thereof.
1112 dataId : `dict` or `DataCoordinate`
1113 A `dict` of `Dimension` link name, value pairs that label the
1114 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1115 should be provided as the first argument.
1116 parameters : `dict`
1117 Additional StorageClass-defined options to control reading,
1118 typically used to efficiently read only a subset of the dataset.
1119 collections : Any, optional
1120 Collections to be searched, overriding ``self.collections``.
1121 Can be any of the types supported by the ``collections`` argument
1122 to butler construction.
1123 **kwargs
1124 Additional keyword arguments used to augment or construct a
1125 `DataCoordinate`. See `DataCoordinate.standardize`
1126 parameters.
1128 Returns
1129 -------
1130 obj : `object`
1131 The dataset.
1133 Raises
1134 ------
1135 ValueError
1136 Raised if a resolved `DatasetRef` was passed as an input, but it
1137 differs from the one found in the registry.
1138 LookupError
1139 Raised if no matching dataset exists in the `Registry`.
1140 TypeError
1141 Raised if no collections were provided.
1143 Notes
1144 -----
1145 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1146 this method requires that the given data ID include temporal dimensions
1147 beyond the dimensions of the dataset type itself, in order to find the
1148 dataset with the appropriate validity range. For example, a "bias"
1149 dataset with native dimensions ``{instrument, detector}`` could be
1150 fetched with a ``{instrument, detector, exposure}`` data ID, because
1151 ``exposure`` is a temporal dimension.
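
Examples
--------
A minimal sketch; the repository path, collection, dataset type, and data
ID values are hypothetical::

    butler = Butler("/path/to/repo", collections=["HSC/defaults"])
    calexp = butler.get("calexp", instrument="HSC", visit=12345, detector=42)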
1152 """
1153 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1154 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1155 return self.getDirect(ref, parameters=parameters)
1157 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1158 dataId: Optional[DataId] = None, *,
1159 predict: bool = False,
1160 collections: Any = None,
1161 run: Optional[str] = None,
1162 **kwargs: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1163 """Returns the URIs associated with the dataset.
1165 Parameters
1166 ----------
1167 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1168 When `DatasetRef` is provided, the `dataId` should be `None`.
1169 Otherwise the `DatasetType` or name thereof.
1170 dataId : `dict` or `DataCoordinate`
1171 A `dict` of `Dimension` link name, value pairs that label the
1172 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1173 should be provided as the first argument.
1174 predict : `bool`
1175 If `True`, allow URIs to be returned for datasets that have not
1176 been written.
1177 collections : Any, optional
1178 Collections to be searched, overriding ``self.collections``.
1179 Can be any of the types supported by the ``collections`` argument
1180 to butler construction.
1181 run : `str`, optional
1182 Run to use for predictions, overriding ``self.run``.
1183 **kwargs
1184 Additional keyword arguments used to augment or construct a
1185 `DataCoordinate`. See `DataCoordinate.standardize`
1186 parameters.
1188 Returns
1189 -------
1190 primary : `ButlerURI`
1191 The URI to the primary artifact associated with this dataset.
1192 If the dataset was disassembled within the datastore this
1193 may be `None`.
1194 components : `dict`
1195 URIs to any components associated with the dataset artifact.
1196 Can be empty if there are no components.
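
Examples
--------
A sketch with hypothetical names, handling the case where the datastore
disassembled the dataset into per-component artifacts::

    primary, components = butler.getURIs("calexp", instrument="HSC",
                                         visit=12345, detector=42)
    if primary is None:
        print("Stored as components:", list(components))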
1197 """
1198 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1199 collections=collections, **kwargs)
1200 if ref.id is None: # only possible if predict is True
1201 if run is None:
1202 run = self.run
1203 if run is None:
1204 raise TypeError("Cannot predict location with run=None.")
1205 # Lie about ID, because we can't guess it, and only
1206 # Datastore.getURIs() will ever see it (and it doesn't use it).
1207 ref = ref.resolved(id=0, run=run)
1208 return self.datastore.getURIs(ref, predict)
1210 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1211 dataId: Optional[DataId] = None, *,
1212 predict: bool = False,
1213 collections: Any = None,
1214 run: Optional[str] = None,
1215 **kwargs: Any) -> ButlerURI:
1216 """Return the URI to the Dataset.
1218 Parameters
1219 ----------
1220 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1221 When `DatasetRef` is provided, the `dataId` should be `None`.
1222 Otherwise the `DatasetType` or name thereof.
1223 dataId : `dict` or `DataCoordinate`
1224 A `dict` of `Dimension` link name, value pairs that label the
1225 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1226 should be provided as the first argument.
1227 predict : `bool`
1228 If `True`, allow URIs to be returned for datasets that have not
1229 been written.
1230 collections : Any, optional
1231 Collections to be searched, overriding ``self.collections``.
1232 Can be any of the types supported by the ``collections`` argument
1233 to butler construction.
1234 run : `str`, optional
1235 Run to use for predictions, overriding ``self.run``.
1236 **kwargs
1237 Additional keyword arguments used to augment or construct a
1238 `DataCoordinate`. See `DataCoordinate.standardize`
1239 parameters.
1241 Returns
1242 -------
1243 uri : `ButlerURI`
1244 URI pointing to the Dataset within the datastore. If the
1245 Dataset does not exist in the datastore, and if ``predict`` is
1246 `True`, the URI will be a prediction and will include a URI
1247 fragment "#predicted".
1248 If the datastore does not have entities that relate well
1249 to the concept of a URI the returned URI string will be
1250 descriptive. The returned URI is not guaranteed to be obtainable.
1252 Raises
1253 ------
1254 LookupError
1255 A URI has been requested for a dataset that does not exist and
1256 guessing is not allowed.
1257 ValueError
1258 Raised if a resolved `DatasetRef` was passed as an input, but it
1259 differs from the one found in the registry.
1260 TypeError
1261 Raised if no collections were provided.
1262 RuntimeError
1263 Raised if a URI is requested for a dataset that consists of
1264 multiple artifacts.
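
Examples
--------
A sketch with hypothetical names; use `getURIs` instead if the dataset may
have been disassembled into multiple artifacts::

    uri = butler.getURI("raw", instrument="HSC", exposure=12345, detector=42)
    print(uri.geturl())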
1265 """
1266 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1267 collections=collections, run=run, **kwargs)
1269 if primary is None or components:
1270 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1271 "Use Butler.getURIs() instead.")
1272 return primary
1274 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1275 destination: Union[str, ButlerURI], transfer: str = "auto",
1276 preserve_path: bool = True,
1277 overwrite: bool = False) -> List[ButlerURI]:
1278 """Retrieve the artifacts associated with the supplied refs.
1280 Parameters
1281 ----------
1282 refs : iterable of `DatasetRef`
1283 The datasets for which artifacts are to be retrieved.
1284 A single ref can result in multiple artifacts. The refs must
1285 be resolved.
1286 destination : `ButlerURI` or `str`
1287 Location to write the artifacts.
1288 transfer : `str`, optional
1289 Method to use to transfer the artifacts. Must be one of the options
1290 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1291 preserve_path : `bool`, optional
1292 If `True` the full path of the artifact within the datastore
1293 is preserved. If `False` the final file component of the path
1294 is used.
1295 overwrite : `bool`, optional
1296 If `True` allow transfers to overwrite existing files at the
1297 destination.
1299 Returns
1300 -------
1301 targets : `list` of `ButlerURI`
1302 URIs of file artifacts in destination location. Order is not
1303 preserved.
1305 Notes
1306 -----
1307 For non-file datastores the artifacts written to the destination
1308 may not match the representation inside the datastore. For example
1309 a hierarchical data structure in a NoSQL database may well be stored
1310 as a JSON file.
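
Examples
--------
A sketch copying the artifacts behind a query result to a scratch
directory; the dataset type, collection, and destination are hypothetical::

    refs = butler.registry.queryDatasets("raw", collections="HSC/raw/all")
    paths = butler.retrieveArtifacts(refs, "/tmp/export", transfer="copy")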
1311 """
1312 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer,
1313 preserve_path=preserve_path, overwrite=overwrite)
1315 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1316 dataId: Optional[DataId] = None, *,
1317 collections: Any = None,
1318 **kwargs: Any) -> bool:
1319 """Return True if the Dataset is actually present in the Datastore.
1321 Parameters
1322 ----------
1323 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1324 When `DatasetRef` is provided, the `dataId` should be `None`.
1325 Otherwise the `DatasetType` or name thereof.
1326 dataId : `dict` or `DataCoordinate`
1327 A `dict` of `Dimension` link name, value pairs that label the
1328 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1329 should be provided as the first argument.
1330 collections : Any, optional
1331 Collections to be searched, overriding ``self.collections``.
1332 Can be any of the types supported by the ``collections`` argument
1333 to butler construction.
1334 **kwargs
1335 Additional keyword arguments used to augment or construct a
1336 `DataCoordinate`. See `DataCoordinate.standardize`
1337 parameters.
1339 Raises
1340 ------
1341 LookupError
1342 Raised if the dataset is not even present in the Registry.
1343 ValueError
1344 Raised if a resolved `DatasetRef` was passed as an input, but it
1345 differs from the one found in the registry.
1346 TypeError
1347 Raised if no collections were provided.
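
Examples
--------
A sketch guarding a read on datastore existence; the dataset type and data
ID values are hypothetical::

    dataId = {"instrument": "HSC", "visit": 12345, "detector": 42}
    if butler.datasetExists("calexp", dataId):
        calexp = butler.get("calexp", dataId)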
1348 """
1349 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1350 return self.datastore.exists(ref)
1352 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1353 """Remove one or more `~CollectionType.RUN` collections and the
1354 datasets within them.
1356 Parameters
1357 ----------
1358 names : `Iterable` [ `str` ]
1359 The names of the collections to remove.
1360 unstore : `bool`, optional
1361 If `True` (default), delete datasets from all datastores in which
1362 they are present, and attempt to roll back the registry deletions if
1363 datastore deletions fail (which may not always be possible). If
1364 `False`, datastore records for these datasets are still removed,
1365 but any artifacts (e.g. files) will not be.
1367 Raises
1368 ------
1369 TypeError
1370 Raised if one or more collections are not of type
1371 `~CollectionType.RUN`.
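
Examples
--------
A sketch removing a hypothetical scratch run together with its file
artifacts::

    butler = Butler("/path/to/repo", writeable=True)
    butler.removeRuns(["u/alice/scratch"], unstore=True)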
1372 """
1373 if not self.isWriteable():
1374 raise TypeError("Butler is read-only.")
1375 names = list(names)
1376 refs: List[DatasetRef] = []
1377 for name in names:
1378 collectionType = self.registry.getCollectionType(name)
1379 if collectionType is not CollectionType.RUN:
1380 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1381 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1382 with self.registry.transaction():
1383 if unstore:
1384 self.datastore.trash(refs)
1385 else:
1386 self.datastore.forget(refs)
1387 for name in names:
1388 self.registry.removeCollection(name)
1389 if unstore:
1390 # Point of no return for removing artifacts
1391 self.datastore.emptyTrash()
1393 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
1394 unlink: Optional[List[str]] = None) -> None:
1395 """Remove a collection and possibly prune datasets within it.
1397 Parameters
1398 ----------
1399 name : `str`
1400 Name of the collection to remove. If this is a
1401 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1402 datasets within the collection are not modified unless ``unstore``
1403 is `True`. If this is a `~CollectionType.RUN` collection,
1404 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1405 are fully removed from the data repository.
1406 purge : `bool`, optional
1407 If `True`, permit `~CollectionType.RUN` collections to be removed,
1408 fully removing datasets within them. Requires ``unstore=True`` as
1409 well as an added precaution against accidental deletion. Must be
1410 `False` (default) if the collection is not a ``RUN``.
1411 unstore : `bool`, optional
1412 If `True`, remove all datasets in the collection from all
1413 datastores in which they appear.
1414 unlink : `list` [`str`], optional
1415 Before removing the given collection, unlink it from these
1416 parent collections.
1418 Raises
1419 ------
1420 TypeError
1421 Raised if the butler is read-only or arguments are mutually
1422 inconsistent.
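
Examples
--------
A sketch fully removing a hypothetical RUN collection and the datasets
within it (both flags are required for RUN collections)::

    butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)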
1423 """
1424 # See pruneDatasets comments for more information about the logic here;
1425 # the cases are almost the same, but here we can rely on Registry to
1426 # take care of everything but Datastore deletion when we remove the
1427 # collection.
1428 if not self.isWriteable():
1429 raise TypeError("Butler is read-only.")
1430 collectionType = self.registry.getCollectionType(name)
1431 if purge and not unstore:
1432 raise PurgeWithoutUnstorePruneCollectionsError()
1433 if collectionType is CollectionType.RUN and not purge:
1434 raise RunWithoutPurgePruneCollectionsError(collectionType)
1435 if collectionType is not CollectionType.RUN and purge:
1436 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1438 def remove(child: str, parent: str) -> None:
1439 """Remove a child collection from a parent collection."""
1440 # Remove child from parent.
1441 chain = list(self.registry.getCollectionChain(parent))
1442 try:
1443 chain.remove(child)
1444 except ValueError as e:
1445 raise RuntimeError(f"{child} is not a child of {parent}") from e
1446 self.registry.setCollectionChain(parent, chain)
1448 with self.registry.transaction():
1449 if unlink:
1450 for parent in unlink:
1451 remove(name, parent)
1452 if unstore:
1453 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1454 self.datastore.trash(refs)
1455 self.registry.removeCollection(name)
1457 if unstore:
1458 # Point of no return for removing artifacts
1459 self.datastore.emptyTrash()
1461 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1462 disassociate: bool = True,
1463 unstore: bool = False,
1464 tags: Iterable[str] = (),
1465 purge: bool = False,
1466 run: Optional[str] = None) -> None:
1467 """Remove one or more datasets from a collection and/or storage.
1469 Parameters
1470 ----------
1471 refs : `~collections.abc.Iterable` of `DatasetRef`
1472 Datasets to prune. These must be "resolved" references (not just
1473 a `DatasetType` and data ID).
1474 disassociate : `bool`, optional
1475 Disassociate pruned datasets from ``tags``, or from all collections
1476 if ``purge=True``.
1477 unstore : `bool`, optional
1478 If `True` (`False` is default) remove these datasets from all
1479 datastores known to this butler. Note that this will make it
1480 impossible to retrieve these datasets even via other collections.
1481 Datasets that are already not stored are ignored by this option.
1482 tags : `Iterable` [ `str` ], optional
1483 `~CollectionType.TAGGED` collections to disassociate the datasets
1484 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1485 `True`.
1486 purge : `bool`, optional
1487 If `True` (`False` is default), completely remove the dataset from
1488 the `Registry`. To prevent accidental deletions, ``purge`` may
1489 only be `True` if all of the following conditions are met:
1491 - All given datasets are in the given run;
1492 - ``disassociate`` is `True`;
1493 - ``unstore`` is `True`.
1495 This mode may remove provenance information from datasets other
1496 than those provided, and should be used with extreme care.
1498 Raises
1499 ------
1500 TypeError
1501 Raised if the butler is read-only, if no tags were provided but
1502 ``disassociate`` is `True`, or if the conditions for ``purge=True`` were not met.
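Examples
--------
An illustrative sketch; the dataset type, run, and tag names are
hypothetical::

    refs = list(butler.registry.queryDatasets("flat",
                                              collections="u/alice/run1"))

    # Only remove the datasets from a TAGGED collection.
    butler.pruneDatasets(refs, disassociate=True, tags=["my-tag"])

    # Fully remove the datasets from the repository.
    butler.pruneDatasets(refs, disassociate=True, unstore=True,
                         purge=True, run="u/alice/run1")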
1503 """
1504 if not self.isWriteable():
1505 raise TypeError("Butler is read-only.")
1506 if purge:
1507 if not disassociate:
1508 raise TypeError("Cannot pass purge=True without disassociate=True.")
1509 if not unstore:
1510 raise TypeError("Cannot pass purge=True without unstore=True.")
1511 elif disassociate:
1512 tags = tuple(tags)
1513 if not tags:
1514 raise TypeError("No tags provided but disassociate=True.")
1515 for tag in tags:
1516 collectionType = self.registry.getCollectionType(tag)
1517 if collectionType is not CollectionType.TAGGED:
1518 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1519 f"of non-TAGGED type {collectionType.name}.")
1520 # Transform possibly-single-pass iterable into something we can iterate
1521 # over multiple times.
1522 refs = list(refs)
1523 # Pruning a component of a DatasetRef makes no sense since registry
1524 # doesn't know about components and datastore might not store
1525 # components in a separate file
1526 for ref in refs:
1527 if ref.datasetType.component():
1528 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")
1529 # We don't need an unreliable Datastore transaction for this, because
1530 # we've been extra careful to ensure that Datastore.trash only involves
1531 # mutating the Registry (it can _look_ at Datastore-specific things,
1532 # but shouldn't change them), and hence all operations here are
1533 # Registry operations.
1534 with self.registry.transaction():
1535 if unstore:
1536 self.datastore.trash(refs)
1537 if purge:
1538 self.registry.removeDatasets(refs)
1539 elif disassociate:
1540 assert tags, "Guaranteed by earlier logic in this function."
1541 for tag in tags:
1542 self.registry.disassociate(tag, refs)
1543 # We've exited the Registry transaction, and apparently committed.
1544 # (if there was an exception, everything rolled back, and it's as if
1545 # nothing happened - and we never get here).
1546 # Datastore artifacts are not yet gone, but they're clearly marked
1547 # as trash, so if we fail to delete now because of (e.g.) filesystem
1548 # problems we can try again later, and if manual administrative
1549 # intervention is required, it's pretty clear what that should entail:
1550 # deleting everything on disk and in private Datastore tables that is
1551 # in the dataset_location_trash table.
1552 if unstore:
1553 # Point of no return for removing artifacts
1554 self.datastore.emptyTrash()
1556 @transactional
1557 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1558 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1559 ) -> None:
1560 """Store and register one or more datasets that already exist on disk.
1562 Parameters
1563 ----------
1564 datasets : `FileDataset`
1565 Each positional argument is a struct containing information about
1566 a file to be ingested, including its URI (either absolute or
1567 relative to the datastore root, if applicable), a `DatasetRef`,
1568 and optionally a formatter class or its fully-qualified string
1569 name. If a formatter is not provided, the formatter that would be
1570 used for `put` is assumed. On successful return, all
1571 `FileDataset.ref` attributes will have their `DatasetRef.id`
1572 attribute populated and all `FileDataset.formatter` attributes will
1573 be set to the formatter class used. `FileDataset.path` attributes
1574 may be modified to put paths in whatever the datastore considers a
1575 standardized form.
1576 transfer : `str`, optional
1577 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1578 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1579 transfer the file.
1580 run : `str`, optional
1581 The name of the run ingested datasets should be added to,
1582 overriding ``self.run``.
1583 idGenerationMode : `DatasetIdGenEnum`, optional
1584 Specifies option for generating dataset IDs. By default unique IDs
1585 are generated for each inserted dataset.
1587 Raises
1588 ------
1589 TypeError
1590 Raised if the butler is read-only or if no run was provided.
1591 NotImplementedError
1592 Raised if the `Datastore` does not support the given transfer mode.
1593 DatasetTypeNotSupportedError
1594 Raised if one or more files to be ingested have a dataset type that
1595 is not supported by the `Datastore`.
1596 FileNotFoundError
1597 Raised if one of the given files does not exist.
1598 FileExistsError
1599 Raised if transfer is not `None` but the (internal) location the
1600 file would be moved to is already occupied.
1602 Notes
1603 -----
1604 This operation is not fully exception safe: if a database operation
1605 fails, the given `FileDataset` instances may be only partially updated.
1607 It is atomic in terms of database operations (they will either all
1608 succeed or all fail) provided the database engine implements
1609 transactions correctly. It will attempt to be atomic in terms of
1610 filesystem operations as well, but this cannot be implemented
1611 rigorously for most datastores.
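Examples
--------
A minimal sketch of a single-file ingest; the path, dataset type name,
and data ID are purely illustrative and assume the dataset type is
already registered::

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType,
                     {"instrument": "HSC", "exposure": 1234, "detector": 0})
    butler.ingest(FileDataset(path="/path/to/file.fits", refs=ref),
                  transfer="copy", run="u/alice/ingest")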
1612 """
1613 if not self.isWriteable():
1614 raise TypeError("Butler is read-only.")
1615 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1616 # Reorganize the inputs so they're grouped by DatasetType and then
1617 # data ID. We also include a list of DatasetRefs for each FileDataset
1618 # to hold the resolved DatasetRefs returned by the Registry, before
1619 # it's safe to swap them into FileDataset.refs.
1620 # Some type annotation aliases to make that clearer:
1621 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1622 GroupedData = MutableMapping[DatasetType, GroupForType]
1623 # The actual data structure:
1624 groupedData: GroupedData = defaultdict(dict)
1625 # And the nested loop that populates it:
1626 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1627 # This list is intentionally shared across the inner loop, since it's
1628 # associated with `dataset`.
1629 resolvedRefs: List[DatasetRef] = []
1631 # Somewhere to store pre-existing refs if we have an
1632 # execution butler.
1633 existingRefs: List[DatasetRef] = []
1635 for ref in dataset.refs:
1636 if ref.dataId in groupedData[ref.datasetType]:
1637 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same"
1638 " DataId as other ingest dataset"
1639 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1640 f" ({ref.dataId})")
1641 if self._allow_put_of_predefined_dataset:
1642 existing_ref = self.registry.findDataset(ref.datasetType,
1643 dataId=ref.dataId,
1644 collections=run)
1645 if existing_ref:
1646 if self.datastore.knows(existing_ref):
1647 raise ConflictingDefinitionError(f"Dataset associated with path {dataset.path}"
1648 f" already exists as {existing_ref}.")
1649 # Store this ref elsewhere since it already exists
1650 # and we do not want to remake it but we do want
1651 # to store it in the datastore.
1652 existingRefs.append(existing_ref)
1654 # Nothing else to do until we have finished
1655 # iterating.
1656 continue
1658 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1660 if existingRefs:
1662 if len(dataset.refs) != len(existingRefs):
1663 # Keeping track of partially pre-existing datasets is hard
1664 # and should generally never happen. For now don't allow
1665 # it.
1666 raise ConflictingDefinitionError(f"For dataset {dataset.path} some dataIds already exist"
1667 " in registry but others do not. This is not supported.")
1669 # Attach the resolved refs if we found them.
1670 dataset.refs = existingRefs
1672 # Now we can bulk-insert into Registry for each DatasetType.
1673 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(),
1674 desc="Bulk-inserting datasets by type"):
1675 refs = self.registry.insertDatasets(
1676 datasetType,
1677 dataIds=groupForType.keys(),
1678 run=run,
1679 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1680 idGenerationMode=idGenerationMode,
1681 )
1682 # Append those resolved DatasetRefs to the new lists we set up for
1683 # them.
1684 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1685 resolvedRefs.append(ref)
1687 # Go back to the original FileDatasets to replace their refs with the
1688 # new resolved ones.
1689 for groupForType in progress.iter_chunks(groupedData.values(),
1690 desc="Reassociating resolved dataset refs with files"):
1691 for dataset, resolvedRefs in groupForType.values():
1692 dataset.refs = resolvedRefs
1694 # Bulk-insert everything into Datastore.
1695 self.datastore.ingest(*datasets, transfer=transfer)
1697 @contextlib.contextmanager
1698 def export(self, *, directory: Optional[str] = None,
1699 filename: Optional[str] = None,
1700 format: Optional[str] = None,
1701 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1702 """Export datasets from the repository represented by this `Butler`.
1704 This method is a context manager that returns a helper object
1705 (`RepoExportContext`) that is used to indicate what information from
1706 the repository should be exported.
1708 Parameters
1709 ----------
1710 directory : `str`, optional
1711 Directory dataset files should be written to if ``transfer`` is not
1712 `None`.
1713 filename : `str`, optional
1714 Name for the file that will include database information associated
1715 with the exported datasets. If this is not an absolute path and
1716 ``directory`` is not `None`, it will be written to ``directory``
1717 instead of the current working directory. Defaults to
1718 "export.{format}".
1719 format : `str`, optional
1720 File format for the database information file. If `None`, the
1721 extension of ``filename`` will be used.
1722 transfer : `str`, optional
1723 Transfer mode passed to `Datastore.export`.
1725 Raises
1726 ------
1727 TypeError
1728 Raised if the set of arguments passed is inconsistent.
1730 Examples
1731 --------
1732 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1733 methods are used to provide the iterables over data IDs and/or datasets
1734 to be exported::
1736 with butler.export(filename="exports.yaml") as export:
1737 # Export all flats, but none of the dimension element rows
1738 # (i.e. data ID information) associated with them.
1739 export.saveDatasets(butler.registry.queryDatasets("flat"),
1740 elements=())
1741 # Export all datasets that start with "deepCoadd_" and all of
1742 # their associated data ID information.
1743 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1744 """
1745 if directory is None and transfer is not None:
1746 raise TypeError("Cannot transfer without providing a directory.")
1747 if transfer == "move":
1748 raise TypeError("Transfer may not be 'move': export is read-only")
1749 if format is None:
1750 if filename is None:
1751 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1752 else:
1753 _, format = os.path.splitext(filename)
1754 elif filename is None:
1755 filename = f"export.{format}"
1756 if directory is not None:
1757 filename = os.path.join(directory, filename)
1758 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
1759 with open(filename, 'w') as stream:
1760 backend = BackendClass(stream)
1761 try:
1762 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1763 directory=directory, transfer=transfer)
1764 yield helper
1765 except BaseException:
1766 raise
1767 else:
1768 helper._finish()
1770 def import_(self, *, directory: Optional[str] = None,
1771 filename: Union[str, TextIO, None] = None,
1772 format: Optional[str] = None,
1773 transfer: Optional[str] = None,
1774 skip_dimensions: Optional[Set] = None,
1775 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1776 reuseIds: bool = False) -> None:
1777 """Import datasets into this repository that were exported from a
1778 different butler repository via `~lsst.daf.butler.Butler.export`.
1780 Parameters
1781 ----------
1782 directory : `str`, optional
1783 Directory containing dataset files to import from. If `None`,
1784 ``filename`` and all dataset file paths specified therein must
1785 be absolute.
1786 filename : `str` or `TextIO`, optional
1787 A stream or name of file that contains database information
1788 associated with the exported datasets, typically generated by
1789 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1790 is not an absolute path, does not exist in the current working
1791 directory, and ``directory`` is not `None`, it is assumed to be in
1792 ``directory``. Defaults to "export.{format}".
1793 format : `str`, optional
1794 File format for ``filename``. If `None`, the extension of
1795 ``filename`` will be used.
1796 transfer : `str`, optional
1797 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1798 skip_dimensions : `set`, optional
1799 Names of dimensions that should be skipped and not imported.
1800 idGenerationMode : `DatasetIdGenEnum`, optional
1801 Specifies option for generating dataset IDs when IDs are not
1802 provided or their type does not match backend type. By default
1803 unique IDs are generated for each inserted dataset.
1804 reuseIds : `bool`, optional
1805 If `True`, force re-use of imported dataset IDs for integer
1806 IDs, which are normally generated as auto-incremented; an
1807 exception will be raised if imported IDs clash with existing
1808 ones. This option has no effect on globally-unique IDs, which
1809 are always re-used (or generated if integer IDs are being imported).
1811 Raises
1812 ------
1813 TypeError
1814 Raised if the set of arguments passed is inconsistent, or if the
1815 butler is read-only.
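Examples
--------
A sketch of importing a previously exported repository subset; the
directory and file names are illustrative only::

    butler.import_(directory="/path/to/exports", filename="export.yaml",
                   transfer="auto")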
1816 """
1817 if not self.isWriteable():
1818 raise TypeError("Butler is read-only.")
1819 if format is None:
1820 if filename is None:
1821 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1822 else:
1823 _, format = os.path.splitext(filename) # type: ignore
1824 elif filename is None:
1825 filename = f"export.{format}"
1826 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1827 filename = os.path.join(directory, filename)
1828 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
1830 def doImport(importStream: TextIO) -> None:
1831 backend = BackendClass(importStream, self.registry)
1832 backend.register()
1833 with self.transaction():
1834 backend.load(self.datastore, directory=directory, transfer=transfer,
1835 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode,
1836 reuseIds=reuseIds)
1838 if isinstance(filename, str):
1839 with open(filename, "r") as stream:
1840 doImport(stream)
1841 else:
1842 doImport(filename)
1844 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef],
1845 transfer: str = "auto",
1846 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
1847 skip_missing: bool = True,
1848 register_dataset_types: bool = False) -> List[DatasetRef]:
1849 """Transfer datasets to this Butler from a run in another Butler.
1851 Parameters
1852 ----------
1853 source_butler : `Butler`
1854 Butler from which the datasets are to be transferred.
1855 source_refs : iterable of `DatasetRef`
1856 Datasets defined in the source butler that should be transferred to
1857 this butler.
1858 transfer : `str`, optional
1859 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1860 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
1861 A mapping of dataset type to ID generation mode. Only used if
1862 the source butler is using integer IDs. Should not be used
1863 if this receiving butler uses integer IDs. Without this mapping,
1864 dataset import always uses ``DatasetIdGenEnum.UNIQUE``.
1865 skip_missing : `bool`
1866 If `True`, datasets with no datastore artifact associated with
1867 them are not transferred. If `False`, a registry entry will be
1868 created even if no datastore record is created (and so will
1869 look equivalent to the dataset being unstored).
1870 register_dataset_types : `bool`
1871 If `True`, any missing dataset types are registered; otherwise
1872 an exception is raised.
1874 Returns
1875 -------
1876 refs : `list` of `DatasetRef`
1877 The refs added to this Butler.
1879 Notes
1880 -----
1881 Requires that any dimension definitions are already present in the
1882 receiving Butler. The datastore artifact has to exist for a transfer
1883 to be made but non-existence is not an error.
1885 Datasets that already exist in this run will be skipped.
1887 The datasets are imported as part of a transaction, although
1888 dataset types are registered before the transaction is started.
1889 This means that it is possible for a dataset type to be registered
1890 even though transfer has failed.
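Examples
--------
An illustrative sketch; the source repository path, dataset type, and
collection names are hypothetical::

    source_butler = Butler("/path/to/source/repo")
    refs = source_butler.registry.queryDatasets(
        "calexp", collections="HSC/runs/RC2")
    transferred = butler.transfer_from(source_butler, refs,
                                       transfer="copy",
                                       register_dataset_types=True)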
1891 """
1892 if not self.isWriteable():
1893 raise TypeError("Butler is read-only.")
1894 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1896 # Will iterate through the refs multiple times so need to convert
1897 # to a list if this isn't a collection.
1898 if not isinstance(source_refs, collections.abc.Collection):
1899 source_refs = list(source_refs)
1901 original_count = len(source_refs)
1902 log.info("Transferring %d datasets into %s", original_count, str(self))
1904 if id_gen_map is None:
1905 id_gen_map = {}
1907 # In some situations the datastore artifact may be missing
1908 # and we do not want that registry entry to be imported.
1909 # Asking the datastore is not sufficient; the records may have been
1910 # purged, so we have to ask for the (predicted) URI and check
1911 # existence explicitly. Execution butler is set up exactly like
1912 # this with no datastore records.
1913 artifact_existence: Dict[ButlerURI, bool] = {}
1914 if skip_missing:
1915 dataset_existence = source_butler.datastore.mexists(source_refs,
1916 artifact_existence=artifact_existence)
1917 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1918 filtered_count = len(source_refs)
1919 log.verbose("%d datasets removed because the artifact does not exist. Now have %d.",
1920 original_count - filtered_count, filtered_count)
1922 # Importing requires that we group the refs by dataset type and run
1923 # before doing the import.
1924 source_dataset_types = set()
1925 grouped_refs = defaultdict(list)
1926 grouped_indices = defaultdict(list)
1927 for i, ref in enumerate(source_refs):
1928 grouped_refs[ref.datasetType, ref.run].append(ref)
1929 grouped_indices[ref.datasetType, ref.run].append(i)
1930 source_dataset_types.add(ref.datasetType)
1932 # Check to see if the dataset type in the source butler has
1933 # the same definition in the target butler and register missing
1934 # ones if requested. Registration must happen outside a transaction.
1935 newly_registered_dataset_types = set()
1936 for datasetType in source_dataset_types:
1937 if register_dataset_types:
1938 # Let this raise immediately if inconsistent. Continuing
1939 # on to find additional inconsistent dataset types
1940 # might result in additional unwanted dataset types being
1941 # registered.
1942 if self.registry.registerDatasetType(datasetType):
1943 newly_registered_dataset_types.add(datasetType)
1944 else:
1945 # If the dataset type is missing, let it fail immediately.
1946 target_dataset_type = self.registry.getDatasetType(datasetType.name)
1947 if target_dataset_type != datasetType:
1948 raise ConflictingDefinitionError("Source butler dataset type differs from definition"
1949 f" in target butler: {datasetType} !="
1950 f" {target_dataset_type}")
1951 if newly_registered_dataset_types:
1952 # We may have registered some even if there were inconsistencies
1953 # but should let people know (or else remove them again).
1954 log.log(VERBOSE, "Registered the following dataset types in the target Butler: %s",
1955 ", ".join(d.name for d in newly_registered_dataset_types))
1956 else:
1957 log.log(VERBOSE, "All required dataset types are known to the target Butler")
1959 # The returned refs should be identical for UUIDs.
1960 # For now must also support integers and so need to retain the
1961 # newly-created refs from this registry.
1962 # Pre-size it so we can assign refs into the correct slots
1963 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
1964 default_id_gen = DatasetIdGenEnum.UNIQUE
1966 handled_collections: Set[str] = set()
1968 # Do all the importing in a single transaction.
1969 with self.transaction():
1970 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(),
1971 desc="Importing to registry"
1972 " by run and dataset type"):
1973 if run not in handled_collections:
1974 run_doc = source_butler.registry.getCollectionDocumentation(run)
1975 registered = self.registry.registerRun(run, doc=run_doc)
1976 handled_collections.add(run)
1977 if registered:
1978 log.log(VERBOSE, "Creating output run %s", run)
1980 id_generation_mode = default_id_gen
1981 if isinstance(refs_to_import[0].id, int):
1982 # ID generation mode might need to be overridden when
1983 # targeting UUID
1984 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
1986 n_refs = len(refs_to_import)
1987 log.verbose("Importing %d ref%s of dataset type %s into run %s",
1988 n_refs, "" if n_refs == 1 else "s", datasetType.name, run)
1990 # No way to know if this butler's registry uses UUID.
1991 # We have to trust the caller on this. If it fails they will
1992 # have to change their approach. We can't catch the exception
1993 # and retry with unique because that will mess up the
1994 # transaction handling. We aren't allowed to ask the registry
1995 # manager what type of ID it is using.
1996 imported_refs = self.registry._importDatasets(refs_to_import,
1997 idGenerationMode=id_generation_mode,
1998 expand=False)
2000 # Map them into the correct slots to match the initial order
2001 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2002 transferred_refs_tmp[i] = ref
2004 # Mypy insists that we might have None in here so we have to make
2005 # that explicit by assigning to a new variable and filtering out
2006 # something that won't be there.
2007 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2009 # Check consistency
2010 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2012 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2014 # The transferred refs were mapped back into the caller's original
2015 # ordering above; without that ordering the datastore transfer
2016 # below would be broken.
2018 # Ask the datastore to transfer. The datastore has to check that
2019 # the source datastore is compatible with the target datastore.
2020 self.datastore.transfer_from(source_butler.datastore, source_refs,
2021 local_refs=transferred_refs, transfer=transfer,
2022 artifact_existence=artifact_existence)
2024 return transferred_refs
2026 def validateConfiguration(self, logFailures: bool = False,
2027 datasetTypeNames: Optional[Iterable[str]] = None,
2028 ignore: Optional[Iterable[str]] = None) -> None:
2029 """Validate butler configuration.
2031 Checks that each `DatasetType` can be stored in the `Datastore`.
2033 Parameters
2034 ----------
2035 logFailures : `bool`, optional
2036 If `True`, output a log message for every validation error
2037 detected.
2038 datasetTypeNames : iterable of `str`, optional
2039 The `DatasetType` names that should be checked. This allows
2040 only a subset to be selected.
2041 ignore : iterable of `str`, optional
2042 Names of DatasetTypes to skip over. This can be used to skip
2043 known problems. If a named `DatasetType` corresponds to a
2044 composite, all components of that `DatasetType` will also be
2045 ignored.
2047 Raises
2048 ------
2049 ButlerValidationError
2050 Raised if there is some inconsistency with how this Butler
2051 is configured.
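Examples
--------
For example, to log every problem found while skipping a dataset type
with known issues (the name here is illustrative)::

    butler.validateConfiguration(logFailures=True, ignore=["raw"])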
2052 """
2053 if datasetTypeNames:
2054 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2055 else:
2056 datasetTypes = list(self.registry.queryDatasetTypes())
2058 # filter out anything from the ignore list
2059 if ignore:
2060 ignore = set(ignore)
2061 datasetTypes = [e for e in datasetTypes
2062 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
2063 else:
2064 ignore = set()
2066 # Find all the registered instruments
2067 instruments = {
2068 record.name for record in self.registry.queryDimensionRecords("instrument")
2069 }
2071 # For each datasetType that has an instrument dimension, create
2072 # a DatasetRef for each defined instrument
2073 datasetRefs = []
2075 for datasetType in datasetTypes:
2076 if "instrument" in datasetType.dimensions:
2077 for instrument in instruments:
2078 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
2079 conform=False)
2080 datasetRefs.append(datasetRef)
2082 entities: List[Union[DatasetType, DatasetRef]] = []
2083 entities.extend(datasetTypes)
2084 entities.extend(datasetRefs)
2086 datastoreErrorStr = None
2087 try:
2088 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2089 except ValidationError as e:
2090 datastoreErrorStr = str(e)
2092 # Also check that the LookupKeys used by the datastores match
2093 # registry and storage class definitions
2094 keys = self.datastore.getLookupKeys()
2096 failedNames = set()
2097 failedDataId = set()
2098 for key in keys:
2099 if key.name is not None:
2100 if key.name in ignore:
2101 continue
2103 # skip if specific datasetType names were requested and this
2104 # name does not match
2105 if datasetTypeNames and key.name not in datasetTypeNames:
2106 continue
2108 # See if it is a StorageClass or a DatasetType
2109 if key.name in self.storageClasses:
2110 pass
2111 else:
2112 try:
2113 self.registry.getDatasetType(key.name)
2114 except KeyError:
2115 if logFailures:
2116 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2117 failedNames.add(key)
2118 else:
2119 # Dimensions are checked for consistency when the Butler
2120 # is created and rendezvoused with a universe.
2121 pass
2123 # Check that the instrument is a valid instrument
2124 # Currently only support instrument so check for that
2125 if key.dataId:
2126 dataIdKeys = set(key.dataId)
2127 if dataIdKeys != {"instrument"}:
2128 if logFailures:
2129 log.critical("Key '%s' has unsupported DataId override", key)
2130 failedDataId.add(key)
2131 elif key.dataId["instrument"] not in instruments:
2132 if logFailures:
2133 log.critical("Key '%s' has unknown instrument", key)
2134 failedDataId.add(key)
2136 messages = []
2138 if datastoreErrorStr:
2139 messages.append(datastoreErrorStr)
2141 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2142 (failedDataId, "Keys with bad DataId entries: ")):
2143 if failed:
2144 msg += ", ".join(str(k) for k in failed)
2145 messages.append(msg)
2147 if messages:
2148 raise ValidationError(";\n".join(messages))
2150 @property
2151 def collections(self) -> CollectionSearch:
2152 """The collections to search by default, in order (`CollectionSearch`).
2154 This is an alias for ``self.registry.defaults.collections``. It cannot
2155 be set directly in isolation, but all defaults may be changed together
2156 by assigning a new `RegistryDefaults` instance to
2157 ``self.registry.defaults``.
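For example (the collection and run names here are illustrative)::

    butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
                                                run="u/alice/run1")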
2158 """
2159 return self.registry.defaults.collections
2161 @property
2162 def run(self) -> Optional[str]:
2163 """Name of the run this butler writes outputs to by default (`str` or
2164 `None`).
2166 This is an alias for ``self.registry.defaults.run``. It cannot be set
2167 directly in isolation, but all defaults may be changed together by
2168 assigning a new `RegistryDefaults` instance to
2169 ``self.registry.defaults``.
2170 """
2171 return self.registry.defaults.run
2173 registry: Registry
2174 """The object that manages dataset metadata and relationships (`Registry`).
2176 Most operations that don't involve reading or writing butler datasets are
2177 accessible only via `Registry` methods.
2178 """
2180 datastore: Datastore
2181 """The object that manages actual dataset storage (`Datastore`).
2183 Direct user access to the datastore should rarely be necessary; the primary
2184 exception is the case where a `Datastore` implementation provides extra
2185 functionality beyond what the base class defines.
2186 """
2188 storageClasses: StorageClassFactory
2189 """An object that maps known storage class names to objects that fully
2190 describe them (`StorageClassFactory`).
2191 """
2193 _allow_put_of_predefined_dataset: bool
2194 """Allow a put to succeed even if there is already a registry entry for it
2195 but not a datastore record. (`bool`)."""