Coverage for python/lsst/daf/butler/_butler.py: 8%
698 statements
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41from collections import defaultdict
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Sequence,
53 Set,
54 TextIO,
55 Tuple,
56 Type,
57 Union,
58)
60from lsst.resources import ResourcePath, ResourcePathExpression
61from lsst.utils import doImportType
62from lsst.utils.introspection import get_class_of
63from lsst.utils.logging import VERBOSE, getLogger
65from ._butlerConfig import ButlerConfig
66from ._butlerRepoIndex import ButlerRepoIndex
67from ._deferredDatasetHandle import DeferredDatasetHandle
68from ._limited_butler import LimitedButler
69from .core import (
70 AmbiguousDatasetError,
71 Config,
72 ConfigSubset,
73 DataCoordinate,
74 DataId,
75 DataIdValue,
76 DatasetRef,
77 DatasetRefURIs,
78 DatasetType,
79 Datastore,
80 Dimension,
81 DimensionConfig,
82 DimensionElement,
83 DimensionRecord,
84 DimensionUniverse,
85 FileDataset,
86 Progress,
87 StorageClass,
88 StorageClassFactory,
89 Timespan,
90 ValidationError,
91)
92from .core.repoRelocation import BUTLER_ROOT_TAG
93from .core.utils import transactional
94from .registry import (
95 CollectionType,
96 ConflictingDefinitionError,
97 DataIdError,
98 DatasetIdGenEnum,
99 Registry,
100 RegistryConfig,
101 RegistryDefaults,
102)
103from .transfers import RepoExportContext
105log = getLogger(__name__)
108class ButlerValidationError(ValidationError):
109 """There is a problem with the Butler configuration."""
111 pass
114class PruneCollectionsArgsError(TypeError):
115 """Base class for errors relating to Butler.pruneCollections input
116 arguments.
117 """
119 pass
122class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
123 """Raised when purge and unstore are both required to be True, and
124 purge is True but unstore is False.
125 """
127 def __init__(self) -> None:
128 super().__init__("Cannot pass purge=True without unstore=True.")
131class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
132 """Raised when pruning a RUN collection but purge is False."""
134 def __init__(self, collectionType: CollectionType):
135 self.collectionType = collectionType
136 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
139class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
140 """Raised when purge is True but is not supported for the given
141 collection."""
143 def __init__(self, collectionType: CollectionType):
144 self.collectionType = collectionType
145 super().__init__(
146 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
147 )
150class Butler(LimitedButler):
151 """Main entry point for the data access system.
153 Parameters
154 ----------
155 config : `ButlerConfig`, `Config` or `str`, optional.
156 Configuration. Anything acceptable to the
157 `ButlerConfig` constructor. If a directory path
158 is given the configuration will be read from a ``butler.yaml`` file in
159 that location. If `None` is given default values will be used.
160 butler : `Butler`, optional.
161 If provided, construct a new Butler that uses the same registry and
162 datastore as the given one, but with the given collection and run.
163 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
164 arguments.
165 collections : `str` or `Iterable` [ `str` ], optional
166 An expression specifying the collections to be searched (in order) when
167 reading datasets.
168 This may be a `str` collection name or an iterable thereof.
169 See :ref:`daf_butler_collection_expressions` for more information.
 170 These collections are not registered automatically and must be
 171 registered manually before they are used by any method; registration
 172 can happen after the `Butler` is initialized.
173 run : `str`, optional
174 Name of the `~CollectionType.RUN` collection new datasets should be
175 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
176 ``collections`` will be set to ``[run]``. If not `None`, this
177 collection will automatically be registered. If this is not set (and
178 ``writeable`` is not set either), a read-only butler will be created.
179 searchPaths : `list` of `str`, optional
180 Directory paths to search when calculating the full Butler
181 configuration. Not used if the supplied config is already a
182 `ButlerConfig`.
183 writeable : `bool`, optional
184 Explicitly sets whether the butler supports write operations. If not
 185 provided, a read-write butler is created if ``run`` is not `None`,
 186 and a read-only butler otherwise.
187 inferDefaults : `bool`, optional
188 If `True` (default) infer default data ID values from the values
189 present in the datasets in ``collections``: if all collections have the
190 same value (or no value) for a governor dimension, that value will be
191 the default for that dimension. Nonexistent collections are ignored.
192 If a default value is provided explicitly for a governor dimension via
193 ``**kwargs``, no default will be inferred for that dimension.
194 **kwargs : `str`
195 Default data ID key-value pairs. These may only identify "governor"
196 dimensions like ``instrument`` and ``skymap``.
198 Examples
199 --------
200 While there are many ways to control exactly how a `Butler` interacts with
201 the collections in its `Registry`, the most common cases are still simple.
203 For a read-only `Butler` that searches one collection, do::
205 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
207 For a read-write `Butler` that writes to and reads from a
208 `~CollectionType.RUN` collection::
210 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
212 The `Butler` passed to a ``PipelineTask`` is often much more complex,
213 because we want to write to one `~CollectionType.RUN` collection but read
214 from several others (as well)::
216 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
217 collections=["u/alice/DM-50000/a",
218 "u/bob/DM-49998",
219 "HSC/defaults"])
221 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
222 Datasets will be read first from that run (since it appears first in the
223 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
225 Finally, one can always create a `Butler` with no collections::
227 butler = Butler("/path/to/repo", writeable=True)
229 This can be extremely useful when you just want to use ``butler.registry``,
230 e.g. for inserting dimension data or managing collections, or when the
231 collections you want to use with the butler are not consistent.
232 Passing ``writeable`` explicitly here is only necessary if you want to be
 233 able to make changes to the repo; usually the value for ``writeable`` can
 234 be guessed from the collection arguments provided, but it defaults to
 235 `False` when there are no collection arguments.
236 """
238 def __init__(
239 self,
240 config: Union[Config, str, None] = None,
241 *,
242 butler: Optional[Butler] = None,
243 collections: Any = None,
244 run: Optional[str] = None,
245 searchPaths: Optional[List[str]] = None,
246 writeable: Optional[bool] = None,
247 inferDefaults: bool = True,
248 **kwargs: str,
249 ):
250 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
251 # Load registry, datastore, etc. from config or existing butler.
252 if butler is not None:
253 if config is not None or searchPaths is not None or writeable is not None:
254 raise TypeError(
255 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
256 )
257 self.registry = butler.registry.copy(defaults)
258 self.datastore = butler.datastore
259 self.storageClasses = butler.storageClasses
260 self._config: ButlerConfig = butler._config
261 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
262 else:
263 # Can only look for strings in the known repos list.
264 if isinstance(config, str) and config in self.get_known_repos():
265 config = str(self.get_repo_uri(config))
266 try:
267 self._config = ButlerConfig(config, searchPaths=searchPaths)
268 except FileNotFoundError as e:
269 if known := self.get_known_repos():
270 aliases = f"(known aliases: {', '.join(known)})"
271 else:
272 aliases = "(no known aliases)"
273 raise FileNotFoundError(f"{e} {aliases}") from e
275 try:
276 if "root" in self._config:
277 butlerRoot = self._config["root"]
278 else:
279 butlerRoot = self._config.configDir
280 if writeable is None:
281 writeable = run is not None
282 self.registry = Registry.fromConfig(
283 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
284 )
285 self.datastore = Datastore.fromConfig(
286 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
287 )
288 self.storageClasses = StorageClassFactory()
289 self.storageClasses.addFromConfig(self._config)
290 self._allow_put_of_predefined_dataset = self._config.get(
291 "allow_put_of_predefined_dataset", False
292 )
293 except Exception:
294 # Failures here usually mean that configuration is incomplete,
295 # just issue an error message which includes config file URI.
296 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
297 raise
299 if "run" in self._config or "collection" in self._config:
300 raise ValueError("Passing a run or collection via configuration is no longer supported.")
302 GENERATION: ClassVar[int] = 3
303 """This is a Generation 3 Butler.
305 This attribute may be removed in the future, once the Generation 2 Butler
306 interface has been fully retired; it should only be used in transitional
307 code.
308 """
310 @classmethod
311 def get_repo_uri(cls, label: str) -> ResourcePath:
312 """Look up the label in a butler repository index.
314 Parameters
315 ----------
316 label : `str`
317 Label of the Butler repository to look up.
319 Returns
320 -------
321 uri : `lsst.resources.ResourcePath`
322 URI to the Butler repository associated with the given label.
324 Raises
325 ------
326 KeyError
327 Raised if the label is not found in the index, or if an index
328 can not be found at all.
330 Notes
331 -----
332 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
333 information is discovered.
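
 Examples
 --------
 A minimal sketch of typical use; ``"my_repo"`` is a hypothetical label
 that would have to be present in the repository index::

     from lsst.daf.butler import Butler

     uri = Butler.get_repo_uri("my_repo")
     # Known labels can also be passed directly to the constructor,
     # which resolves them through this same index.
     butler = Butler("my_repo")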
334 """
335 return ButlerRepoIndex.get_repo_uri(label)
337 @classmethod
338 def get_known_repos(cls) -> Set[str]:
339 """Retrieve the list of known repository labels.
341 Returns
342 -------
343 repos : `set` of `str`
344 All the known labels. Can be empty if no index can be found.
346 Notes
347 -----
348 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
349 information is discovered.
350 """
351 return ButlerRepoIndex.get_known_repos()
353 @staticmethod
354 def makeRepo(
355 root: ResourcePathExpression,
356 config: Union[Config, str, None] = None,
357 dimensionConfig: Union[Config, str, None] = None,
358 standalone: bool = False,
359 searchPaths: Optional[List[str]] = None,
360 forceConfigRoot: bool = True,
361 outfile: Optional[ResourcePathExpression] = None,
362 overwrite: bool = False,
363 ) -> Config:
364 """Create an empty data repository by adding a butler.yaml config
365 to a repository root directory.
367 Parameters
368 ----------
369 root : `lsst.resources.ResourcePathExpression`
370 Path or URI to the root location of the new repository. Will be
371 created if it does not exist.
372 config : `Config` or `str`, optional
373 Configuration to write to the repository, after setting any
374 root-dependent Registry or Datastore config options. Can not
375 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
376 configuration will be used. Root-dependent config options
377 specified in this config are overwritten if ``forceConfigRoot``
378 is `True`.
379 dimensionConfig : `Config` or `str`, optional
380 Configuration for dimensions, will be used to initialize registry
381 database.
382 standalone : `bool`
383 If True, write all expanded defaults, not just customized or
384 repository-specific settings.
385 This (mostly) decouples the repository from the default
386 configuration, insulating it from changes to the defaults (which
387 may be good or bad, depending on the nature of the changes).
388 Future *additions* to the defaults will still be picked up when
389 initializing `Butlers` to repos created with ``standalone=True``.
390 searchPaths : `list` of `str`, optional
391 Directory paths to search when calculating the full butler
392 configuration.
393 forceConfigRoot : `bool`, optional
394 If `False`, any values present in the supplied ``config`` that
395 would normally be reset are not overridden and will appear
396 directly in the output config. This allows non-standard overrides
397 of the root directory for a datastore or registry to be given.
398 If this parameter is `True` the values for ``root`` will be
399 forced into the resulting config if appropriate.
 400 outfile : `lsst.resources.ResourcePathExpression`, optional
401 If not-`None`, the output configuration will be written to this
402 location rather than into the repository itself. Can be a URI
403 string. Can refer to a directory that will be used to write
404 ``butler.yaml``.
405 overwrite : `bool`, optional
406 Create a new configuration file even if one already exists
407 in the specified output location. Default is to raise
408 an exception.
410 Returns
411 -------
412 config : `Config`
413 The updated `Config` instance written to the repo.
415 Raises
416 ------
417 ValueError
418 Raised if a ButlerConfig or ConfigSubset is passed instead of a
419 regular Config (as these subclasses would make it impossible to
420 support ``standalone=False``).
421 FileExistsError
422 Raised if the output config file already exists.
423 os.error
424 Raised if the directory does not exist, exists but is not a
425 directory, or cannot be created.
427 Notes
428 -----
429 Note that when ``standalone=False`` (the default), the configuration
430 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
431 construct the repository should also be used to construct any Butlers
432 to avoid configuration inconsistencies.
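
 Examples
 --------
 A minimal sketch of creating a repository and then constructing a
 `Butler` against it; the root path is an illustrative placeholder::

     from lsst.daf.butler import Butler

     # Write butler.yaml and create the registry schema under the root.
     Butler.makeRepo("/path/to/new/repo")

     # The new repository starts out empty; a writeable Butler can then
     # be used to register collections and ingest data.
     butler = Butler("/path/to/new/repo", writeable=True)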
433 """
434 if isinstance(config, (ButlerConfig, ConfigSubset)):
435 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
437 # Ensure that the root of the repository exists or can be made
438 root_uri = ResourcePath(root, forceDirectory=True)
439 root_uri.mkdir()
441 config = Config(config)
443 # If we are creating a new repo from scratch with relative roots,
444 # do not propagate an explicit root from the config file
445 if "root" in config:
446 del config["root"]
448 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
449 imported_class = doImportType(full["datastore", "cls"])
450 if not issubclass(imported_class, Datastore):
451 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
452 datastoreClass: Type[Datastore] = imported_class
453 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
455 # if key exists in given config, parse it, otherwise parse the defaults
456 # in the expanded config
457 if config.get(("registry", "db")):
458 registryConfig = RegistryConfig(config)
459 else:
460 registryConfig = RegistryConfig(full)
461 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
462 if defaultDatabaseUri is not None:
463 Config.updateParameters(
464 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
465 )
466 else:
467 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
469 if standalone:
470 config.merge(full)
471 else:
472 # Always expand the registry.managers section into the per-repo
473 # config, because after the database schema is created, it's not
474 # allowed to change anymore. Note that in the standalone=True
475 # branch, _everything_ in the config is expanded, so there's no
476 # need to special case this.
477 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
478 configURI: ResourcePathExpression
479 if outfile is not None:
480 # When writing to a separate location we must include
481 # the root of the butler repo in the config else it won't know
482 # where to look.
483 config["root"] = root_uri.geturl()
484 configURI = outfile
485 else:
486 configURI = root_uri
487 # Strip obscore configuration, if it is present, before writing config
 488 # to a file; the obscore config will be stored in the registry.
489 config_to_write = config
490 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
491 config_to_write = config.copy()
492 del config_to_write[obscore_config_key]
493 config_to_write.dumpToUri(configURI, overwrite=overwrite)
495 # Create Registry and populate tables
496 registryConfig = RegistryConfig(config.get("registry"))
497 dimensionConfig = DimensionConfig(dimensionConfig)
498 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
500 log.verbose("Wrote new Butler configuration file to %s", configURI)
502 return config
504 @classmethod
505 def _unpickle(
506 cls,
507 config: ButlerConfig,
508 collections: Optional[tuple[str, ...]],
509 run: Optional[str],
510 defaultDataId: Dict[str, str],
511 writeable: bool,
512 ) -> Butler:
513 """Callable used to unpickle a Butler.
515 We prefer not to use ``Butler.__init__`` directly so we can force some
516 of its many arguments to be keyword-only (note that ``__reduce__``
517 can only invoke callables with positional arguments).
519 Parameters
520 ----------
521 config : `ButlerConfig`
522 Butler configuration, already coerced into a true `ButlerConfig`
523 instance (and hence after any search paths for overrides have been
524 utilized).
525 collections : `tuple` [ `str` ]
526 Names of the default collections to read from.
527 run : `str`, optional
528 Name of the default `~CollectionType.RUN` collection to write to.
529 defaultDataId : `dict` [ `str`, `str` ]
530 Default data ID values.
531 writeable : `bool`
532 Whether the Butler should support write operations.
534 Returns
535 -------
536 butler : `Butler`
537 A new `Butler` instance.
538 """
539 # MyPy doesn't recognize that the kwargs below are totally valid; it
 540 # seems to think ``**defaultDataId`` is a _positional_ argument!
541 return cls(
542 config=config,
543 collections=collections,
544 run=run,
545 writeable=writeable,
546 **defaultDataId, # type: ignore
547 )
549 def __reduce__(self) -> tuple:
550 """Support pickling."""
551 return (
552 Butler._unpickle,
553 (
554 self._config,
555 self.collections,
556 self.run,
557 self.registry.defaults.dataId.byName(),
558 self.registry.isWriteable(),
559 ),
560 )
562 def __str__(self) -> str:
563 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
564 self.collections, self.run, self.datastore, self.registry
565 )
567 def isWriteable(self) -> bool:
568 """Return `True` if this `Butler` supports write operations."""
569 return self.registry.isWriteable()
571 @contextlib.contextmanager
572 def transaction(self) -> Iterator[None]:
573 """Context manager supporting `Butler` transactions.
575 Transactions can be nested.
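
 Examples
 --------
 A sketch of grouping two writes so that neither is committed unless
 both succeed; the dataset type, data IDs, and run are placeholders::

     with butler.transaction():
         butler.put(catalog_a, "sourceTable", visit=1, instrument="HSC",
                    run="u/alice/run")
         butler.put(catalog_b, "sourceTable", visit=2, instrument="HSC",
                    run="u/alice/run")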
576 """
577 with self.registry.transaction():
578 with self.datastore.transaction():
579 yield
581 def _standardizeArgs(
582 self,
583 datasetRefOrType: Union[DatasetRef, DatasetType, str],
584 dataId: Optional[DataId] = None,
585 for_put: bool = True,
586 **kwargs: Any,
587 ) -> Tuple[DatasetType, Optional[DataId]]:
588 """Standardize the arguments passed to several Butler APIs.
590 Parameters
591 ----------
592 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
593 When `DatasetRef` the `dataId` should be `None`.
594 Otherwise the `DatasetType` or name thereof.
595 dataId : `dict` or `DataCoordinate`
596 A `dict` of `Dimension` link name, value pairs that label the
597 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
 598 should be provided as the first argument.
599 for_put : `bool`, optional
600 If `True` this call is invoked as part of a `Butler.put()`.
601 Otherwise it is assumed to be part of a `Butler.get()`. This
602 parameter is only relevant if there is dataset type
603 inconsistency.
604 **kwargs
605 Additional keyword arguments used to augment or construct a
606 `DataCoordinate`. See `DataCoordinate.standardize`
607 parameters.
609 Returns
610 -------
611 datasetType : `DatasetType`
612 A `DatasetType` instance extracted from ``datasetRefOrType``.
613 dataId : `dict` or `DataId`, optional
614 Argument that can be used (along with ``kwargs``) to construct a
615 `DataId`.
617 Notes
618 -----
619 Butler APIs that conceptually need a DatasetRef also allow passing a
620 `DatasetType` (or the name of one) and a `DataId` (or a dict and
621 keyword arguments that can be used to construct one) separately. This
622 method accepts those arguments and always returns a true `DatasetType`
623 and a `DataId` or `dict`.
625 Standardization of `dict` vs `DataId` is best handled by passing the
626 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
627 generally similarly flexible.
628 """
629 externalDatasetType: Optional[DatasetType] = None
630 internalDatasetType: Optional[DatasetType] = None
631 if isinstance(datasetRefOrType, DatasetRef):
632 if dataId is not None or kwargs:
633 raise ValueError("DatasetRef given, cannot use dataId as well")
634 externalDatasetType = datasetRefOrType.datasetType
635 dataId = datasetRefOrType.dataId
636 else:
637 # Don't check whether DataId is provided, because Registry APIs
638 # can usually construct a better error message when it wasn't.
639 if isinstance(datasetRefOrType, DatasetType):
640 externalDatasetType = datasetRefOrType
641 else:
642 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
644 # Check that they are self-consistent
645 if externalDatasetType is not None:
646 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
647 if externalDatasetType != internalDatasetType:
648 # We can allow differences if they are compatible, depending
649 # on whether this is a get or a put. A get requires that
650 # the python type associated with the datastore can be
651 # converted to the user type. A put requires that the user
652 # supplied python type can be converted to the internal
653 # type expected by registry.
654 relevantDatasetType = internalDatasetType
655 if for_put:
656 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
657 else:
658 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
659 relevantDatasetType = externalDatasetType
660 if not is_compatible:
661 raise ValueError(
662 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
663 f"registry definition ({internalDatasetType})"
664 )
665 # Override the internal definition.
666 internalDatasetType = relevantDatasetType
668 assert internalDatasetType is not None
669 return internalDatasetType, dataId
671 def _rewrite_data_id(
672 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
673 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
674 """Rewrite a data ID taking into account dimension records.
676 Take a Data ID and keyword args and rewrite it if necessary to
677 allow the user to specify dimension records rather than dimension
678 primary values.
680 This allows a user to include a dataId dict with keys of
681 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
682 the integer exposure ID. It also allows a string to be given
683 for a dimension value rather than the integer ID if that is more
 684 convenient. For example, rather than having to specify the
685 detector with ``detector.full_name``, a string given for ``detector``
686 will be interpreted as the full name and converted to the integer
687 value.
689 Keyword arguments can also use strings for dimensions like detector
690 and exposure but python does not allow them to include ``.`` and
691 so the ``exposure.day_obs`` syntax can not be used in a keyword
692 argument.
694 Parameters
695 ----------
696 dataId : `dict` or `DataCoordinate`
697 A `dict` of `Dimension` link name, value pairs that will label the
698 `DatasetRef` within a Collection.
699 datasetType : `DatasetType`
700 The dataset type associated with this dataId. Required to
701 determine the relevant dimensions.
702 **kwargs
703 Additional keyword arguments used to augment or construct a
704 `DataId`. See `DataId` parameters.
706 Returns
707 -------
708 dataId : `dict` or `DataCoordinate`
 709 The dataId, possibly rewritten. If given a `DataCoordinate` and
710 no keyword arguments, the original dataId will be returned
711 unchanged.
712 **kwargs : `dict`
713 Any unused keyword arguments (would normally be empty dict).
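
 Examples
 --------
 A sketch of the data ID forms this rewriting enables at the public
 `Butler.get`/`Butler.put` level; the dataset type, instrument, and
 values are illustrative placeholders::

     # Compound record keys must be passed through the dataId dict
     # because "." is not allowed in a keyword argument.
     raw = butler.get(
         "raw",
         dataId={"exposure.day_obs": 20220101, "exposure.seq_num": 42},
         instrument="HSC",
         detector=50,
     )

     # A string given for detector is matched against the alternate
     # (full name) key and converted to the integer primary key.
     raw = butler.get("raw", instrument="HSC", exposure=903334,
                      detector="1_04")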
714 """
715 # Do nothing if we have a standalone DataCoordinate.
716 if isinstance(dataId, DataCoordinate) and not kwargs:
717 return dataId, kwargs
719 # Process dimension records that are using record information
720 # rather than ids
721 newDataId: Dict[str, DataIdValue] = {}
722 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
724 # if all the dataId comes from keyword parameters we do not need
725 # to do anything here because they can't be of the form
726 # exposure.obs_id because a "." is not allowed in a keyword parameter.
727 if dataId:
728 for k, v in dataId.items():
729 # If we have a Dimension we do not need to do anything
730 # because it cannot be a compound key.
731 if isinstance(k, str) and "." in k:
732 # Someone is using a more human-readable dataId
733 dimensionName, record = k.split(".", 1)
734 byRecord[dimensionName][record] = v
735 elif isinstance(k, Dimension):
736 newDataId[k.name] = v
737 else:
738 newDataId[k] = v
740 # Go through the updated dataId and check the type in case someone is
741 # using an alternate key. We have already filtered out the compound
742 # keys dimensions.record format.
743 not_dimensions = {}
745 # Will need to look in the dataId and the keyword arguments
746 # and will remove them if they need to be fixed or are unrecognized.
747 for dataIdDict in (newDataId, kwargs):
748 # Use a list so we can adjust the dict safely in the loop
749 for dimensionName in list(dataIdDict):
750 value = dataIdDict[dimensionName]
751 try:
752 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
753 except KeyError:
754 # This is not a real dimension
755 not_dimensions[dimensionName] = value
756 del dataIdDict[dimensionName]
757 continue
759 # Convert an integral type to an explicit int to simplify
760 # comparisons here
761 if isinstance(value, numbers.Integral):
762 value = int(value)
764 if not isinstance(value, dimension.primaryKey.getPythonType()):
765 for alternate in dimension.alternateKeys:
766 if isinstance(value, alternate.getPythonType()):
767 byRecord[dimensionName][alternate.name] = value
768 del dataIdDict[dimensionName]
769 log.debug(
770 "Converting dimension %s to %s.%s=%s",
771 dimensionName,
772 dimensionName,
773 alternate.name,
774 value,
775 )
776 break
777 else:
778 log.warning(
779 "Type mismatch found for value '%r' provided for dimension %s. "
780 "Could not find matching alternative (primary key has type %s) "
781 "so attempting to use as-is.",
782 value,
783 dimensionName,
784 dimension.primaryKey.getPythonType(),
785 )
787 # By this point kwargs and newDataId should only include valid
 788 # dimensions. Merge kwargs into the new dataId and log if there
789 # are dimensions in both (rather than calling update).
790 for k, v in kwargs.items():
791 if k in newDataId and newDataId[k] != v:
792 log.debug(
793 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
794 )
795 newDataId[k] = v
796 # No need to retain any values in kwargs now.
797 kwargs = {}
799 # If we have some unrecognized dimensions we have to try to connect
800 # them to records in other dimensions. This is made more complicated
801 # by some dimensions having records with clashing names. A mitigation
802 # is that we can tell by this point which dimensions are missing
803 # for the DatasetType but this does not work for calibrations
804 # where additional dimensions can be used to constrain the temporal
805 # axis.
806 if not_dimensions:
807 # Search for all dimensions even if we have been given a value
808 # explicitly. In some cases records are given as well as the
 809 # actual dimension and this should not be an error if they
810 # match.
811 mandatoryDimensions = datasetType.dimensions.names # - provided
813 candidateDimensions: Set[str] = set()
814 candidateDimensions.update(mandatoryDimensions)
816 # For calibrations we may well be needing temporal dimensions
817 # so rather than always including all dimensions in the scan
818 # restrict things a little. It is still possible for there
819 # to be confusion over day_obs in visit vs exposure for example.
820 # If we are not searching calibration collections things may
821 # fail but they are going to fail anyway because of the
822 # ambiguousness of the dataId...
823 if datasetType.isCalibration():
824 for dim in self.registry.dimensions.getStaticDimensions():
825 if dim.temporal:
826 candidateDimensions.add(str(dim))
828 # Look up table for the first association with a dimension
829 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
831 # Keep track of whether an item is associated with multiple
832 # dimensions.
833 counter: Counter[str] = Counter()
834 assigned: Dict[str, Set[str]] = defaultdict(set)
836 # Go through the missing dimensions and associate the
837 # given names with records within those dimensions
838 matched_dims = set()
839 for dimensionName in candidateDimensions:
840 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
841 fields = dimension.metadata.names | dimension.uniqueKeys.names
842 for field in not_dimensions:
843 if field in fields:
844 guessedAssociation[dimensionName][field] = not_dimensions[field]
845 counter[dimensionName] += 1
846 assigned[field].add(dimensionName)
847 matched_dims.add(field)
849 # Calculate the fields that matched nothing.
850 never_found = set(not_dimensions) - matched_dims
852 if never_found:
853 raise ValueError(f"Unrecognized keyword args given: {never_found}")
855 # There is a chance we have allocated a single dataId item
856 # to multiple dimensions. Need to decide which should be retained.
857 # For now assume that the most popular alternative wins.
858 # This means that day_obs with seq_num will result in
859 # exposure.day_obs and not visit.day_obs
860 # Also prefer an explicitly missing dimension over an inferred
861 # temporal dimension.
862 for fieldName, assignedDimensions in assigned.items():
863 if len(assignedDimensions) > 1:
864 # Pick the most popular (preferring mandatory dimensions)
865 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
866 if requiredButMissing:
867 candidateDimensions = requiredButMissing
868 else:
869 candidateDimensions = assignedDimensions
871 # If this is a choice between visit and exposure and
872 # neither was a required part of the dataset type,
873 # (hence in this branch) always prefer exposure over
874 # visit since exposures are always defined and visits
875 # are defined from exposures.
876 if candidateDimensions == {"exposure", "visit"}:
877 candidateDimensions = {"exposure"}
879 # Select the relevant items and get a new restricted
880 # counter.
881 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
882 duplicatesCounter: Counter[str] = Counter()
883 duplicatesCounter.update(theseCounts)
885 # Choose the most common. If they are equally common
886 # we will pick the one that was found first.
887 # Returns a list of tuples
888 selected = duplicatesCounter.most_common(1)[0][0]
890 log.debug(
891 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
892 " Removed ambiguity by choosing dimension %s.",
893 fieldName,
894 ", ".join(assignedDimensions),
895 selected,
896 )
898 for candidateDimension in assignedDimensions:
899 if candidateDimension != selected:
900 del guessedAssociation[candidateDimension][fieldName]
902 # Update the record look up dict with the new associations
903 for dimensionName, values in guessedAssociation.items():
904 if values: # A dict might now be empty
905 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
906 byRecord[dimensionName].update(values)
908 if byRecord:
909 # Some record specifiers were found so we need to convert
910 # them to the Id form
911 for dimensionName, values in byRecord.items():
912 if dimensionName in newDataId:
913 log.debug(
914 "DataId specified explicit %s dimension value of %s in addition to"
915 " general record specifiers for it of %s. Ignoring record information.",
916 dimensionName,
917 newDataId[dimensionName],
918 str(values),
919 )
920 # Get the actual record and compare with these values.
921 try:
922 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
923 except DataIdError:
924 raise ValueError(
925 f"Could not find dimension '{dimensionName}'"
926 f" with dataId {newDataId} as part of comparing with"
927 f" record values {byRecord[dimensionName]}"
928 ) from None
929 if len(recs) == 1:
930 errmsg: List[str] = []
931 for k, v in values.items():
932 if (recval := getattr(recs[0], k)) != v:
933 errmsg.append(f"{k}({recval} != {v})")
934 if errmsg:
935 raise ValueError(
936 f"Dimension {dimensionName} in dataId has explicit value"
937 " inconsistent with records: " + ", ".join(errmsg)
938 )
939 else:
940 # Multiple matches for an explicit dimension
941 # should never happen but let downstream complain.
942 pass
943 continue
945 # Build up a WHERE expression
946 bind = {k: v for k, v in values.items()}
947 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
949 # Hopefully we get a single record that matches
950 records = set(
951 self.registry.queryDimensionRecords(
952 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
953 )
954 )
956 if len(records) != 1:
957 if len(records) > 1:
958 # visit can have an ambiguous answer without involving
959 # visit_system. The default visit_system is defined
960 # by the instrument.
961 if (
962 dimensionName == "visit"
963 and "visit_system_membership" in self.registry.dimensions
964 and "visit_system" in self.registry.dimensions["instrument"].metadata
965 ):
966 instrument_records = list(
967 self.registry.queryDimensionRecords(
968 "instrument",
969 dataId=newDataId,
970 **kwargs,
971 )
972 )
973 if len(instrument_records) == 1:
974 visit_system = instrument_records[0].visit_system
975 if visit_system is None:
976 # Set to a value that will never match.
977 visit_system = -1
979 # Look up each visit in the
980 # visit_system_membership records.
981 for rec in records:
982 membership = list(
983 self.registry.queryDimensionRecords(
984 # Use bind to allow zero results.
985 # This is a fully-specified query.
986 "visit_system_membership",
987 where="instrument = inst AND visit_system = system AND visit = v",
988 bind=dict(
989 inst=instrument_records[0].name, system=visit_system, v=rec.id
990 ),
991 )
992 )
993 if membership:
994 # This record is the right answer.
995 records = set([rec])
996 break
998 # The ambiguity may have been resolved so check again.
999 if len(records) > 1:
1000 log.debug("Received %d records from constraints of %s", len(records), str(values))
1001 for r in records:
1002 log.debug("- %s", str(r))
1003 raise ValueError(
1004 f"DataId specification for dimension {dimensionName} is not"
1005 f" uniquely constrained to a single dataset by {values}."
1006 f" Got {len(records)} results."
1007 )
1008 else:
1009 raise ValueError(
1010 f"DataId specification for dimension {dimensionName} matched no"
1011 f" records when constrained by {values}"
1012 )
1014 # Get the primary key from the real dimension object
1015 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
1016 if not isinstance(dimension, Dimension):
1017 raise RuntimeError(
1018 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
1019 )
1020 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
1022 return newDataId, kwargs
1024 def _findDatasetRef(
1025 self,
1026 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1027 dataId: Optional[DataId] = None,
1028 *,
1029 collections: Any = None,
1030 allowUnresolved: bool = False,
1031 **kwargs: Any,
1032 ) -> DatasetRef:
1033 """Shared logic for methods that start with a search for a dataset in
1034 the registry.
1036 Parameters
1037 ----------
1038 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1039 When `DatasetRef` the `dataId` should be `None`.
1040 Otherwise the `DatasetType` or name thereof.
1041 dataId : `dict` or `DataCoordinate`, optional
1042 A `dict` of `Dimension` link name, value pairs that label the
1043 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1044 should be provided as the first argument.
1045 collections : Any, optional
1046 Collections to be searched, overriding ``self.collections``.
1047 Can be any of the types supported by the ``collections`` argument
1048 to butler construction.
1049 allowUnresolved : `bool`, optional
1050 If `True`, return an unresolved `DatasetRef` if finding a resolved
1051 one in the `Registry` fails. Defaults to `False`.
1052 **kwargs
1053 Additional keyword arguments used to augment or construct a
1054 `DataId`. See `DataId` parameters.
1056 Returns
1057 -------
1058 ref : `DatasetRef`
1059 A reference to the dataset identified by the given arguments.
1061 Raises
1062 ------
1063 LookupError
1064 Raised if no matching dataset exists in the `Registry` (and
1065 ``allowUnresolved is False``).
1066 ValueError
1067 Raised if a resolved `DatasetRef` was passed as an input, but it
1068 differs from the one found in the registry.
1069 TypeError
1070 Raised if no collections were provided.
1071 """
1072 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1073 if isinstance(datasetRefOrType, DatasetRef):
1074 idNumber = datasetRefOrType.id
1075 else:
1076 idNumber = None
1077 timespan: Optional[Timespan] = None
1079 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1081 if datasetType.isCalibration():
 1082 # Because this is a calibration dataset, first try to standardize
 1083 # the data ID without restricting the dimensions to
1084 # those of the dataset type requested, because there may be extra
1085 # dimensions that provide temporal information for a validity-range
1086 # lookup.
1087 dataId = DataCoordinate.standardize(
1088 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1089 )
1090 if dataId.graph.temporal:
1091 dataId = self.registry.expandDataId(dataId)
1092 timespan = dataId.timespan
1093 else:
1094 # Standardize the data ID to just the dimensions of the dataset
1095 # type instead of letting registry.findDataset do it, so we get the
1096 # result even if no dataset is found.
1097 dataId = DataCoordinate.standardize(
1098 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1099 )
1100 # Always lookup the DatasetRef, even if one is given, to ensure it is
1101 # present in the current collection.
1102 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1103 if ref is None:
1104 if allowUnresolved:
1105 return DatasetRef(datasetType, dataId)
1106 else:
1107 if collections is None:
1108 collections = self.registry.defaults.collections
1109 raise LookupError(
1110 f"Dataset {datasetType.name} with data ID {dataId} "
1111 f"could not be found in collections {collections}."
1112 )
1113 if idNumber is not None and idNumber != ref.id:
1114 if collections is None:
1115 collections = self.registry.defaults.collections
1116 raise ValueError(
1117 f"DatasetRef.id provided ({idNumber}) does not match "
1118 f"id ({ref.id}) in registry in collections {collections}."
1119 )
1120 if datasetType != ref.datasetType:
1121 # If they differ it is because the user explicitly specified
1122 # a compatible dataset type to this call rather than using the
1123 # registry definition. The DatasetRef must therefore be recreated
1124 # using the user definition such that the expected type is
1125 # returned.
1126 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1128 return ref
1130 @transactional
1131 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
1132 # Docstring inherited.
1133 (imported_ref,) = self.registry._importDatasets(
1134 [ref],
1135 expand=True,
1136 )
1137 if imported_ref.id != ref.getCheckedId():
1138 raise RuntimeError("This registry configuration does not support putDirect.")
1139 self.datastore.put(obj, ref)
1140 return ref
1142 @transactional
1143 def put(
1144 self,
1145 obj: Any,
1146 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1147 dataId: Optional[DataId] = None,
1148 *,
1149 run: Optional[str] = None,
1150 **kwargs: Any,
1151 ) -> DatasetRef:
1152 """Store and register a dataset.
1154 Parameters
1155 ----------
1156 obj : `object`
1157 The dataset.
1158 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1159 When `DatasetRef` is provided, ``dataId`` should be `None`.
1160 Otherwise the `DatasetType` or name thereof.
1161 dataId : `dict` or `DataCoordinate`
1162 A `dict` of `Dimension` link name, value pairs that label the
1163 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1164 should be provided as the second argument.
1165 run : `str`, optional
1166 The name of the run the dataset should be added to, overriding
1167 ``self.run``.
1168 **kwargs
1169 Additional keyword arguments used to augment or construct a
1170 `DataCoordinate`. See `DataCoordinate.standardize`
1171 parameters.
1173 Returns
1174 -------
1175 ref : `DatasetRef`
1176 A reference to the stored dataset, updated with the correct id if
1177 given.
1179 Raises
1180 ------
1181 TypeError
1182 Raised if the butler is read-only or if no run has been provided.
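
 Examples
 --------
 A sketch of a typical call; the dataset type, data ID values, and run
 name are illustrative placeholders::

     ref = butler.put(exposure, "calexp", instrument="HSC",
                      visit=903334, detector=50,
                      run="u/alice/DM-50000/a")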
1183 """
1184 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1185 if not self.isWriteable():
1186 raise TypeError("Butler is read-only.")
1187 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1188 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1189 raise ValueError("DatasetRef must not be in registry, must have None id")
1191 # Handle dimension records in dataId
1192 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1194 # Add Registry Dataset entry.
1195 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1197 # For an execution butler the datasets will be pre-defined.
1198 # If the butler is configured that way datasets should only be inserted
1199 # if they do not already exist in registry. Trying and catching
1200 # ConflictingDefinitionError will not work because the transaction
1201 # will be corrupted. Instead, in this mode always check first.
1202 ref = None
1203 ref_is_predefined = False
1204 if self._allow_put_of_predefined_dataset:
1205 # Get the matching ref for this run.
1206 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1208 if ref:
1209 # Must be expanded form for datastore templating
1210 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1211 ref = ref.expanded(dataId)
1212 ref_is_predefined = True
1214 if not ref:
1215 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1217 # If the ref is predefined it is possible that the datastore also
1218 # has the record. Asking datastore to put it again will result in
1219 # the artifact being recreated, overwriting previous, then will cause
1220 # a failure in writing the record which will cause the artifact
1221 # to be removed. Much safer to ask first before attempting to
1222 # overwrite. Race conditions should not be an issue for the
1223 # execution butler environment.
1224 if ref_is_predefined:
1225 if self.datastore.knows(ref):
 1226 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
1228 self.datastore.put(obj, ref)
1230 return ref
1232 def getDirect(
1233 self,
1234 ref: DatasetRef,
1235 *,
1236 parameters: Optional[Dict[str, Any]] = None,
1237 storageClass: Optional[Union[StorageClass, str]] = None,
1238 ) -> Any:
1239 """Retrieve a stored dataset.
1241 Unlike `Butler.get`, this method allows datasets outside the Butler's
1242 collection to be read as long as the `DatasetRef` that identifies them
1243 can be obtained separately.
1245 Parameters
1246 ----------
1247 ref : `DatasetRef`
1248 Resolved reference to an already stored dataset.
1249 parameters : `dict`
1250 Additional StorageClass-defined options to control reading,
1251 typically used to efficiently read only a subset of the dataset.
1252 storageClass : `StorageClass` or `str`, optional
1253 The storage class to be used to override the Python type
1254 returned by this method. By default the returned type matches
1255 the dataset type definition for this dataset. Specifying a
1256 read `StorageClass` can force a different type to be returned.
1257 This type must be compatible with the original type.
1259 Returns
1260 -------
1261 obj : `object`
1262 The dataset.
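
 Examples
 --------
 A sketch of reading a dataset through a ref obtained from a registry
 query rather than through this Butler's default collections; the
 dataset type, data ID, and collection are placeholders::

     ref = butler.registry.findDataset(
         "calexp",
         dataId={"instrument": "HSC", "visit": 903334, "detector": 50},
         collections="HSC/runs/RC2",
     )
     calexp = butler.getDirect(ref)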
1263 """
1264 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1266 def getDirectDeferred(
1267 self,
1268 ref: DatasetRef,
1269 *,
1270 parameters: Union[dict, None] = None,
1271 storageClass: str | StorageClass | None = None,
1272 ) -> DeferredDatasetHandle:
1273 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1274 from a resolved `DatasetRef`.
1276 Parameters
1277 ----------
1278 ref : `DatasetRef`
1279 Resolved reference to an already stored dataset.
1280 parameters : `dict`
1281 Additional StorageClass-defined options to control reading,
1282 typically used to efficiently read only a subset of the dataset.
1283 storageClass : `StorageClass` or `str`, optional
1284 The storage class to be used to override the Python type
1285 returned by this method. By default the returned type matches
1286 the dataset type definition for this dataset. Specifying a
1287 read `StorageClass` can force a different type to be returned.
1288 This type must be compatible with the original type.
1290 Returns
1291 -------
1292 obj : `DeferredDatasetHandle`
1293 A handle which can be used to retrieve a dataset at a later time.
1295 Raises
1296 ------
1297 AmbiguousDatasetError
1298 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1299 """
1300 if ref.id is None:
1301 raise AmbiguousDatasetError(
1302 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1303 )
1304 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1306 def getDeferred(
1307 self,
1308 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1309 dataId: Optional[DataId] = None,
1310 *,
1311 parameters: Union[dict, None] = None,
1312 collections: Any = None,
1313 storageClass: str | StorageClass | None = None,
1314 **kwargs: Any,
1315 ) -> DeferredDatasetHandle:
1316 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1317 after an immediate registry lookup.
1319 Parameters
1320 ----------
1321 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1322 When `DatasetRef` the `dataId` should be `None`.
1323 Otherwise the `DatasetType` or name thereof.
1324 dataId : `dict` or `DataCoordinate`, optional
1325 A `dict` of `Dimension` link name, value pairs that label the
1326 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1327 should be provided as the first argument.
1328 parameters : `dict`
1329 Additional StorageClass-defined options to control reading,
1330 typically used to efficiently read only a subset of the dataset.
1331 collections : Any, optional
1332 Collections to be searched, overriding ``self.collections``.
1333 Can be any of the types supported by the ``collections`` argument
1334 to butler construction.
1335 storageClass : `StorageClass` or `str`, optional
1336 The storage class to be used to override the Python type
1337 returned by this method. By default the returned type matches
1338 the dataset type definition for this dataset. Specifying a
1339 read `StorageClass` can force a different type to be returned.
1340 This type must be compatible with the original type.
1341 **kwargs
1342 Additional keyword arguments used to augment or construct a
1343 `DataId`. See `DataId` parameters.
1345 Returns
1346 -------
1347 obj : `DeferredDatasetHandle`
1348 A handle which can be used to retrieve a dataset at a later time.
1350 Raises
1351 ------
1352 LookupError
 1353 Raised if no matching dataset exists in the `Registry`.
1355 ValueError
1356 Raised if a resolved `DatasetRef` was passed as an input, but it
1357 differs from the one found in the registry.
1358 TypeError
1359 Raised if no collections were provided.
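
 Examples
 --------
 A sketch of deferring the actual read until the dataset is needed; the
 dataset type and data ID are placeholders, and the read itself is
 performed later through the handle's ``get`` method::

     handle = butler.getDeferred("calexp", instrument="HSC",
                                 visit=903334, detector=50)
     # ... later, once it is known the dataset is really needed ...
     calexp = handle.get()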
1360 """
1361 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1362 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1364 def get(
1365 self,
1366 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1367 dataId: Optional[DataId] = None,
1368 *,
1369 parameters: Optional[Dict[str, Any]] = None,
1370 collections: Any = None,
1371 storageClass: Optional[Union[StorageClass, str]] = None,
1372 **kwargs: Any,
1373 ) -> Any:
1374 """Retrieve a stored dataset.
1376 Parameters
1377 ----------
1378 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1379 When `DatasetRef` the `dataId` should be `None`.
1380 Otherwise the `DatasetType` or name thereof.
1381 dataId : `dict` or `DataCoordinate`
1382 A `dict` of `Dimension` link name, value pairs that label the
1383 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1384 should be provided as the first argument.
1385 parameters : `dict`
1386 Additional StorageClass-defined options to control reading,
1387 typically used to efficiently read only a subset of the dataset.
1388 collections : Any, optional
1389 Collections to be searched, overriding ``self.collections``.
1390 Can be any of the types supported by the ``collections`` argument
1391 to butler construction.
1392 storageClass : `StorageClass` or `str`, optional
1393 The storage class to be used to override the Python type
1394 returned by this method. By default the returned type matches
1395 the dataset type definition for this dataset. Specifying a
1396 read `StorageClass` can force a different type to be returned.
1397 This type must be compatible with the original type.
1398 **kwargs
1399 Additional keyword arguments used to augment or construct a
1400 `DataCoordinate`. See `DataCoordinate.standardize`
1401 parameters.
1403 Returns
1404 -------
1405 obj : `object`
1406 The dataset.
1408 Raises
1409 ------
1410 ValueError
1411 Raised if a resolved `DatasetRef` was passed as an input, but it
1412 differs from the one found in the registry.
1413 LookupError
1414 Raised if no matching dataset exists in the `Registry`.
1415 TypeError
1416 Raised if no collections were provided.
1418 Notes
1419 -----
1420 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1421 this method requires that the given data ID include temporal dimensions
1422 beyond the dimensions of the dataset type itself, in order to find the
1423 dataset with the appropriate validity range. For example, a "bias"
1424 dataset with native dimensions ``{instrument, detector}`` could be
1425 fetched with a ``{instrument, detector, exposure}`` data ID, because
1426 ``exposure`` is a temporal dimension.
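
 Examples
 --------
 Sketches of typical calls; dataset types, data ID values, and
 collection names are illustrative placeholders::

     # Regular dataset, found in the Butler's default collections.
     calexp = butler.get("calexp", instrument="HSC", visit=903334,
                         detector=50)

     # Calibration dataset: the extra ``exposure`` dimension supplies
     # the temporal information for the validity-range lookup described
     # above.
     bias = butler.get("bias", instrument="HSC", detector=50,
                       exposure=903334, collections="HSC/calib")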
1427 """
1428 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1429 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1430 return self.getDirect(ref, parameters=parameters, storageClass=storageClass)
1432 def getURIs(
1433 self,
1434 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1435 dataId: Optional[DataId] = None,
1436 *,
1437 predict: bool = False,
1438 collections: Any = None,
1439 run: Optional[str] = None,
1440 **kwargs: Any,
1441 ) -> DatasetRefURIs:
1442 """Returns the URIs associated with the dataset.
1444 Parameters
1445 ----------
1446 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1447 When `DatasetRef` the `dataId` should be `None`.
1448 Otherwise the `DatasetType` or name thereof.
1449 dataId : `dict` or `DataCoordinate`
1450 A `dict` of `Dimension` link name, value pairs that label the
1451 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1452 should be provided as the first argument.
1453 predict : `bool`
1454 If `True`, allow URIs to be returned of datasets that have not
1455 been written.
1456 collections : Any, optional
1457 Collections to be searched, overriding ``self.collections``.
1458 Can be any of the types supported by the ``collections`` argument
1459 to butler construction.
1460 run : `str`, optional
1461 Run to use for predictions, overriding ``self.run``.
1462 **kwargs
1463 Additional keyword arguments used to augment or construct a
1464 `DataCoordinate`. See `DataCoordinate.standardize`
1465 parameters.
1467 Returns
1468 -------
1469 uris : `DatasetRefURIs`
1470 The URI to the primary artifact associated with this dataset (if
1471 the dataset was disassembled within the datastore this may be
1472 `None`), and the URIs to any components associated with the dataset
 1473 artifact (can be empty if there are no components).
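
 Examples
 --------
 A sketch of retrieving the artifact URIs of a possibly disassembled
 dataset; the dataset type and data ID are placeholders. The return
 value unpacks into the primary URI and the component mapping, as done
 internally by `getURI`::

     primary, components = butler.getURIs("calexp", instrument="HSC",
                                          visit=903334, detector=50)
     if primary is not None:
         print(primary.geturl())
     for component, uri in components.items():
         print(component, uri.geturl())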
1474 """
1475 ref = self._findDatasetRef(
1476 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1477 )
1478 if ref.id is None: # only possible if predict is True
1479 if run is None:
1480 run = self.run
1481 if run is None:
1482 raise TypeError("Cannot predict location with run=None.")
1483 # Lie about ID, because we can't guess it, and only
1484 # Datastore.getURIs() will ever see it (and it doesn't use it).
1485 ref = ref.resolved(id=0, run=run)
1486 return self.datastore.getURIs(ref, predict)
1488 def getURI(
1489 self,
1490 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1491 dataId: Optional[DataId] = None,
1492 *,
1493 predict: bool = False,
1494 collections: Any = None,
1495 run: Optional[str] = None,
1496 **kwargs: Any,
1497 ) -> ResourcePath:
1498 """Return the URI to the Dataset.
1500 Parameters
1501 ----------
1502 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1503 When `DatasetRef` the `dataId` should be `None`.
1504 Otherwise the `DatasetType` or name thereof.
1505 dataId : `dict` or `DataCoordinate`
1506 A `dict` of `Dimension` link name, value pairs that label the
1507 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1508 should be provided as the first argument.
1509 predict : `bool`
1510 If `True`, allow URIs to be returned for datasets that have not
1511 yet been written.
1512 collections : Any, optional
1513 Collections to be searched, overriding ``self.collections``.
1514 Can be any of the types supported by the ``collections`` argument
1515 to butler construction.
1516 run : `str`, optional
1517 Run to use for predictions, overriding ``self.run``.
1518 **kwargs
1519 Additional keyword arguments used to augment or construct a
1520 `DataCoordinate`. See `DataCoordinate.standardize`
1521 parameters.
1523 Returns
1524 -------
1525 uri : `lsst.resources.ResourcePath`
1526 URI pointing to the Dataset within the datastore. If the
1527 Dataset does not exist in the datastore, and if ``predict`` is
1528 `True`, the URI will be a prediction and will include a URI
1529 fragment "#predicted".
1530 If the datastore does not have entities that relate well
1531 to the concept of a URI, the returned URI string will be
1532 descriptive. The returned URI is not guaranteed to be obtainable.
1534 Raises
1535 ------
1536 LookupError
1537 Raised if a URI has been requested for a dataset that does not
1538 exist and guessing is not allowed.
1539 ValueError
1540 Raised if a resolved `DatasetRef` was passed as an input, but it
1541 differs from the one found in the registry.
1542 TypeError
1543 Raised if no collections were provided.
1544 RuntimeError
1545 Raised if a URI is requested for a dataset that consists of
1546 multiple artifacts.
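Examples
--------
A minimal sketch (the dataset type, data ID values, and collection
name are illustrative)::
    uri = butler.getURI("calexp", instrument="HSC", detector=10,
                        visit=1234, collections="HSC/runs/example")
A predicted location for a dataset that has not yet been written can be
requested by passing ``predict=True`` together with an explicit ``run``.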
1547 """
1548 primary, components = self.getURIs(
1549 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1550 )
1552 if primary is None or components:
1553 raise RuntimeError(
1554 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1555 "Use Butler.getURIs() instead."
1556 )
1557 return primary
1559 def retrieveArtifacts(
1560 self,
1561 refs: Iterable[DatasetRef],
1562 destination: ResourcePathExpression,
1563 transfer: str = "auto",
1564 preserve_path: bool = True,
1565 overwrite: bool = False,
1566 ) -> List[ResourcePath]:
1567 """Retrieve the artifacts associated with the supplied refs.
1569 Parameters
1570 ----------
1571 refs : iterable of `DatasetRef`
1572 The datasets for which artifacts are to be retrieved.
1573 A single ref can result in multiple artifacts. The refs must
1574 be resolved.
1575 destination : `lsst.resources.ResourcePath` or `str`
1576 Location to write the artifacts.
1577 transfer : `str`, optional
1578 Method to use to transfer the artifacts. Must be one of the options
1579 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1580 "move" is not allowed.
1581 preserve_path : `bool`, optional
1582 If `True` the full path of the artifact within the datastore
1583 is preserved. If `False` the final file component of the path
1584 is used.
1585 overwrite : `bool`, optional
1586 If `True` allow transfers to overwrite existing files at the
1587 destination.
1589 Returns
1590 -------
1591 targets : `list` of `lsst.resources.ResourcePath`
1592 URIs of file artifacts in destination location. Order is not
1593 preserved.
1595 Notes
1596 -----
1597 For non-file datastores the artifacts written to the destination
1598 may not match the representation inside the datastore. For example
1599 a hierarchical data structure in a NoSQL database may well be stored
1600 as a JSON file.
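Examples
--------
A minimal sketch that copies the file artifacts for a set of datasets
into a local directory (the dataset type, collection, and destination
are illustrative)::
    refs = butler.registry.queryDatasets(
        "calexp", collections="HSC/runs/example"
    )
    paths = butler.retrieveArtifacts(
        refs, destination="./artifacts", transfer="copy"
    )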
1601 """
1602 return self.datastore.retrieveArtifacts(
1603 refs,
1604 ResourcePath(destination),
1605 transfer=transfer,
1606 preserve_path=preserve_path,
1607 overwrite=overwrite,
1608 )
1610 def datasetExists(
1611 self,
1612 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1613 dataId: Optional[DataId] = None,
1614 *,
1615 collections: Any = None,
1616 **kwargs: Any,
1617 ) -> bool:
1618 """Return True if the Dataset is actually present in the Datastore.
1620 Parameters
1621 ----------
1622 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1623 When `DatasetRef` the `dataId` should be `None`.
1624 Otherwise the `DatasetType` or name thereof.
1625 dataId : `dict` or `DataCoordinate`
1626 A `dict` of `Dimension` link name, value pairs that label the
1627 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1628 should be provided as the first argument.
1629 collections : Any, optional
1630 Collections to be searched, overriding ``self.collections``.
1631 Can be any of the types supported by the ``collections`` argument
1632 to butler construction.
1633 **kwargs
1634 Additional keyword arguments used to augment or construct a
1635 `DataCoordinate`. See `DataCoordinate.standardize`
1636 parameters.
1638 Raises
1639 ------
1640 LookupError
1641 Raised if the dataset is not even present in the Registry.
1642 ValueError
1643 Raised if a resolved `DatasetRef` was passed as an input, but it
1644 differs from the one found in the registry.
1645 TypeError
1646 Raised if no collections were provided.
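Examples
--------
A minimal sketch (the dataset type, data ID values, and collection
name are illustrative)::
    exists = butler.datasetExists(
        "calexp", instrument="HSC", detector=10, visit=1234,
        collections="HSC/runs/example",
    )
Note that a dataset completely unknown to the registry raises
`LookupError` rather than returning `False`.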
1647 """
1648 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1649 return self.datastore.exists(ref)
1651 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1652 """Remove one or more `~CollectionType.RUN` collections and the
1653 datasets within them.
1655 Parameters
1656 ----------
1657 names : `Iterable` [ `str` ]
1658 The names of the collections to remove.
1659 unstore : `bool`, optional
1660 If `True` (default), delete datasets from all datastores in which
1661 they are present, and attempt to rollback the registry deletions if
1662 datastore deletions fail (which may not always be possible). If
1663 `False`, datastore records for these datasets are still removed,
1664 but any artifacts (e.g. files) will not be deleted.
1666 Raises
1667 ------
1668 TypeError
1669 Raised if one or more collections are not of type
1670 `~CollectionType.RUN`.
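Examples
--------
A minimal sketch removing two hypothetical RUN collections together
with their datastore artifacts::
    butler.removeRuns(["u/someone/scratch-1", "u/someone/scratch-2"])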
1671 """
1672 if not self.isWriteable():
1673 raise TypeError("Butler is read-only.")
1674 names = list(names)
1675 refs: List[DatasetRef] = []
1676 for name in names:
1677 collectionType = self.registry.getCollectionType(name)
1678 if collectionType is not CollectionType.RUN:
1679 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1680 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1681 with self.datastore.transaction():
1682 with self.registry.transaction():
1683 if unstore:
1684 self.datastore.trash(refs)
1685 else:
1686 self.datastore.forget(refs)
1687 for name in names:
1688 self.registry.removeCollection(name)
1689 if unstore:
1690 # Point of no return for removing artifacts
1691 self.datastore.emptyTrash()
1693 def pruneCollection(
1694 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1695 ) -> None:
1696 """Remove a collection and possibly prune datasets within it.
1698 Parameters
1699 ----------
1700 name : `str`
1701 Name of the collection to remove. If this is a
1702 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1703 datasets within the collection are not modified unless ``unstore``
1704 is `True`. If this is a `~CollectionType.RUN` collection,
1705 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1706 are fully removed from the data repository.
1707 purge : `bool`, optional
1708 If `True`, permit `~CollectionType.RUN` collections to be removed,
1709 fully removing datasets within them. Requires ``unstore=True`` as
1710 well, as an added precaution against accidental deletion. Must be
1711 `False` (default) if the collection is not a ``RUN``.
1712 unstore : `bool`, optional
1713 If `True`, remove all datasets in the collection from all
1714 datastores in which they appear.
1715 unlink : `list` [`str`], optional
1716 Before removing the collection given by ``name``, unlink it from
1717 these parent collections.
1719 Raises
1720 ------
1721 TypeError
1722 Raised if the butler is read-only or arguments are mutually
1723 inconsistent.
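Examples
--------
A minimal sketch (collection names are illustrative). Fully remove a
RUN collection and its datasets, then remove a TAGGED collection while
leaving its datasets in place::
    butler.pruneCollection("u/someone/scratch", purge=True, unstore=True)
    butler.pruneCollection("u/someone/tagged-selection")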
1724 """
1725 # See pruneDatasets comments for more information about the logic here;
1726 # the cases are almost the same, but here we can rely on Registry to
1727 # take care of everything but Datastore deletion when we remove the
1728 # collection.
1729 if not self.isWriteable():
1730 raise TypeError("Butler is read-only.")
1731 collectionType = self.registry.getCollectionType(name)
1732 if purge and not unstore:
1733 raise PurgeWithoutUnstorePruneCollectionsError()
1734 if collectionType is CollectionType.RUN and not purge:
1735 raise RunWithoutPurgePruneCollectionsError(collectionType)
1736 if collectionType is not CollectionType.RUN and purge:
1737 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1739 def remove(child: str, parent: str) -> None:
1740 """Remove a child collection from a parent collection."""
1741 # Remove child from parent.
1742 chain = list(self.registry.getCollectionChain(parent))
1743 try:
1744 chain.remove(child)
1745 except ValueError as e:
1746 raise RuntimeError(f"{child} is not a child of {parent}") from e
1747 self.registry.setCollectionChain(parent, chain)
1749 with self.datastore.transaction():
1750 with self.registry.transaction():
1751 if unlink:
1752 for parent in unlink:
1753 remove(name, parent)
1754 if unstore:
1755 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1756 self.datastore.trash(refs)
1757 self.registry.removeCollection(name)
1759 if unstore:
1760 # Point of no return for removing artifacts
1761 self.datastore.emptyTrash()
1763 def pruneDatasets(
1764 self,
1765 refs: Iterable[DatasetRef],
1766 *,
1767 disassociate: bool = True,
1768 unstore: bool = False,
1769 tags: Iterable[str] = (),
1770 purge: bool = False,
1771 ) -> None:
1772 # docstring inherited from LimitedButler
1774 if not self.isWriteable():
1775 raise TypeError("Butler is read-only.")
1776 if purge:
1777 if not disassociate:
1778 raise TypeError("Cannot pass purge=True without disassociate=True.")
1779 if not unstore:
1780 raise TypeError("Cannot pass purge=True without unstore=True.")
1781 elif disassociate:
1782 tags = tuple(tags)
1783 if not tags:
1784 raise TypeError("No tags provided but disassociate=True.")
1785 for tag in tags:
1786 collectionType = self.registry.getCollectionType(tag)
1787 if collectionType is not CollectionType.TAGGED:
1788 raise TypeError(
1789 f"Cannot disassociate from collection '{tag}' "
1790 f"of non-TAGGED type {collectionType.name}."
1791 )
1792 # For an execution butler we want to keep existing UUIDs for the
1793 # datasets, for that we need to keep them in the collections but
1794 # remove from datastore.
1795 if self._allow_put_of_predefined_dataset and purge:
1796 purge = False
1797 disassociate = False
1798 # Transform possibly-single-pass iterable into something we can iterate
1799 # over multiple times.
1800 refs = list(refs)
1801 # Pruning a component of a DatasetRef makes no sense since the
1802 # registry doesn't know about components and the datastore might
1803 # not store components in a separate file.
1804 for ref in refs:
1805 if ref.datasetType.component():
1806 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1807 # We don't need an unreliable Datastore transaction for this, because
1808 # we've been extra careful to ensure that Datastore.trash only involves
1809 # mutating the Registry (it can _look_ at Datastore-specific things,
1810 # but shouldn't change them), and hence all operations here are
1811 # Registry operations.
1812 with self.datastore.transaction():
1813 with self.registry.transaction():
1814 if unstore:
1815 self.datastore.trash(refs)
1816 if purge:
1817 self.registry.removeDatasets(refs)
1818 elif disassociate:
1819 assert tags, "Guaranteed by earlier logic in this function."
1820 for tag in tags:
1821 self.registry.disassociate(tag, refs)
1822 # We've exited the Registry transaction, and apparently committed.
1823 # (if there was an exception, everything rolled back, and it's as if
1824 # nothing happened - and we never get here).
1825 # Datastore artifacts are not yet gone, but they're clearly marked
1826 # as trash, so if we fail to delete now because of (e.g.) filesystem
1827 # problems we can try again later, and if manual administrative
1828 # intervention is required, it's pretty clear what that should entail:
1829 # deleting everything on disk and in private Datastore tables that is
1830 # in the dataset_location_trash table.
1831 if unstore:
1832 # Point of no return for removing artifacts
1833 self.datastore.emptyTrash()
1835 @transactional
1836 def ingest(
1837 self,
1838 *datasets: FileDataset,
1839 transfer: Optional[str] = "auto",
1840 run: Optional[str] = None,
1841 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1842 record_validation_info: bool = True,
1843 ) -> None:
1844 """Store and register one or more datasets that already exist on disk.
1846 Parameters
1847 ----------
1848 datasets : `FileDataset`
1849 Each positional argument is a struct containing information about
1850 a file to be ingested, including its URI (either absolute or
1851 relative to the datastore root, if applicable), a `DatasetRef`,
1852 and optionally a formatter class or its fully-qualified string
1853 name. If a formatter is not provided, the formatter that would be
1854 used for `put` is assumed. On successful return, all
1855 `FileDataset.ref` attributes will have their `DatasetRef.id`
1856 attribute populated and all `FileDataset.formatter` attributes will
1857 be set to the formatter class used. `FileDataset.path` attributes
1858 may be modified to put paths in whatever the datastore considers a
1859 standardized form.
1860 transfer : `str`, optional
1861 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1862 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1863 transfer the file.
1864 run : `str`, optional
1865 The name of the run ingested datasets should be added to,
1866 overriding ``self.run``.
1867 idGenerationMode : `DatasetIdGenEnum`, optional
1868 Specifies option for generating dataset IDs. By default unique IDs
1869 are generated for each inserted dataset.
1870 record_validation_info : `bool`, optional
1871 If `True`, the default, the datastore can record validation
1872 information associated with the file. If `False` the datastore
1873 will not attempt to track any information such as checksums
1874 or file sizes. This can be useful if such information is tracked
1875 in an external system or if the file is to be compressed in place.
1876 It is up to the datastore whether this parameter is relevant.
1878 Raises
1879 ------
1880 TypeError
1881 Raised if the butler is read-only or if no run was provided.
1882 NotImplementedError
1883 Raised if the `Datastore` does not support the given transfer mode.
1884 DatasetTypeNotSupportedError
1885 Raised if one or more files to be ingested have a dataset type that
1886 is not supported by the `Datastore`.
1887 FileNotFoundError
1888 Raised if one of the given files does not exist.
1889 FileExistsError
1890 Raised if transfer is not `None` but the (internal) location the
1891 file would be moved to is already occupied.
1893 Notes
1894 -----
1895 This operation is not fully exception safe: if a database operation
1896 fails, the given `FileDataset` instances may be only partially updated.
1898 It is atomic in terms of database operations (they will either all
1899 succeed or all fail) providing the database engine implements
1900 transactions correctly. It will attempt to be atomic in terms of
1901 filesystem operations as well, but this cannot be implemented
1902 rigorously for most datastores.
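Examples
--------
A rough sketch (the file path, data ID, and run are illustrative, and
the "raw" dataset type is assumed to be registered already)::
    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(
        datasetType, {"instrument": "HSC", "detector": 10, "exposure": 1234}
    )
    butler.ingest(
        FileDataset(path="/data/HSC-1234-10.fits", refs=[ref]),
        transfer="copy",
        run="HSC/raw/example",
    )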
1903 """
1904 if not self.isWriteable():
1905 raise TypeError("Butler is read-only.")
1906 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1907 # Reorganize the inputs so they're grouped by DatasetType and then
1908 # data ID. We also include a list of DatasetRefs for each FileDataset
1909 # to hold the resolved DatasetRefs returned by the Registry, before
1910 # it's safe to swap them into FileDataset.refs.
1911 # Some type annotation aliases to make that clearer:
1912 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1913 GroupedData = MutableMapping[DatasetType, GroupForType]
1914 # The actual data structure:
1915 groupedData: GroupedData = defaultdict(dict)
1916 # And the nested loop that populates it:
1917 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1918 # This list intentionally shared across the inner loop, since it's
1919 # associated with `dataset`.
1920 resolvedRefs: List[DatasetRef] = []
1922 # Somewhere to store pre-existing refs if we have an
1923 # execution butler.
1924 existingRefs: List[DatasetRef] = []
1926 for ref in dataset.refs:
1927 if ref.dataId in groupedData[ref.datasetType]:
1928 raise ConflictingDefinitionError(
1929 f"Ingest conflict. Dataset {dataset.path} has same"
1930 " DataId as other ingest dataset"
1931 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1932 f" ({ref.dataId})"
1933 )
1934 if self._allow_put_of_predefined_dataset:
1935 existing_ref = self.registry.findDataset(
1936 ref.datasetType, dataId=ref.dataId, collections=run
1937 )
1938 if existing_ref:
1939 if self.datastore.knows(existing_ref):
1940 raise ConflictingDefinitionError(
1941 f"Dataset associated with path {dataset.path}"
1942 f" already exists as {existing_ref}."
1943 )
1944 # Store this ref elsewhere since it already exists
1945 # and we do not want to remake it but we do want
1946 # to store it in the datastore.
1947 existingRefs.append(existing_ref)
1949 # Nothing else to do until we have finished
1950 # iterating.
1951 continue
1953 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1955 if existingRefs:
1956 if len(dataset.refs) != len(existingRefs):
1957 # Keeping track of partially pre-existing datasets is hard
1958 # and should generally never happen. For now don't allow
1959 # it.
1960 raise ConflictingDefinitionError(
1961 f"For dataset {dataset.path} some dataIds already exist"
1962 " in registry but others do not. This is not supported."
1963 )
1965 # Attach the resolved refs if we found them.
1966 dataset.refs = existingRefs
1968 # Now we can bulk-insert into Registry for each DatasetType.
1969 for datasetType, groupForType in progress.iter_item_chunks(
1970 groupedData.items(), desc="Bulk-inserting datasets by type"
1971 ):
1972 refs = self.registry.insertDatasets(
1973 datasetType,
1974 dataIds=groupForType.keys(),
1975 run=run,
1976 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1977 idGenerationMode=idGenerationMode,
1978 )
1979 # Append those resolved DatasetRefs to the new lists we set up for
1980 # them.
1981 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1982 resolvedRefs.append(ref)
1984 # Go back to the original FileDatasets to replace their refs with the
1985 # new resolved ones.
1986 for groupForType in progress.iter_chunks(
1987 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1988 ):
1989 for dataset, resolvedRefs in groupForType.values():
1990 dataset.refs = resolvedRefs
1992 # Bulk-insert everything into Datastore.
1993 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1995 @contextlib.contextmanager
1996 def export(
1997 self,
1998 *,
1999 directory: Optional[str] = None,
2000 filename: Optional[str] = None,
2001 format: Optional[str] = None,
2002 transfer: Optional[str] = None,
2003 ) -> Iterator[RepoExportContext]:
2004 """Export datasets from the repository represented by this `Butler`.
2006 This method is a context manager that returns a helper object
2007 (`RepoExportContext`) that is used to indicate what information from
2008 the repository should be exported.
2010 Parameters
2011 ----------
2012 directory : `str`, optional
2013 Directory dataset files should be written to if ``transfer`` is not
2014 `None`.
2015 filename : `str`, optional
2016 Name for the file that will include database information associated
2017 with the exported datasets. If this is not an absolute path and
2018 ``directory`` is not `None`, it will be written to ``directory``
2019 instead of the current working directory. Defaults to
2020 "export.{format}".
2021 format : `str`, optional
2022 File format for the database information file. If `None`, the
2023 extension of ``filename`` will be used.
2024 transfer : `str`, optional
2025 Transfer mode passed to `Datastore.export`.
2027 Raises
2028 ------
2029 TypeError
2030 Raised if the set of arguments passed is inconsistent.
2032 Examples
2033 --------
2034 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
2035 methods are used to provide the iterables over data IDs and/or datasets
2036 to be exported::
2038 with butler.export(filename="exports.yaml") as export:
2039 # Export all flats, but none of the dimension element rows
2040 # (i.e. data ID information) associated with them.
2041 export.saveDatasets(butler.registry.queryDatasets("flat"),
2042 elements=())
2043 # Export all datasets that start with "deepCoadd_" and all of
2044 # their associated data ID information.
2045 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
2046 """
2047 if directory is None and transfer is not None:
2048 raise TypeError("Cannot transfer without providing a directory.")
2049 if transfer == "move":
2050 raise TypeError("Transfer may not be 'move': export is read-only")
2051 if format is None:
2052 if filename is None:
2053 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2054 else:
2055 _, format = os.path.splitext(filename)
2056 if not format:
2057 raise ValueError("Please specify a file extension to determine export format.")
2058 format = format[1:]  # Strip leading "."
2059 elif filename is None:
2060 filename = f"export.{format}"
2061 if directory is not None:
2062 filename = os.path.join(directory, filename)
2063 formats = self._config["repo_transfer_formats"]
2064 if format not in formats:
2065 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
2066 BackendClass = get_class_of(formats[format, "export"])
2067 with open(filename, "w") as stream:
2068 backend = BackendClass(stream, universe=self.registry.dimensions)
2069 try:
2070 helper = RepoExportContext(
2071 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2072 )
2073 yield helper
2074 except BaseException:
2075 raise
2076 else:
2077 helper._finish()
2079 def import_(
2080 self,
2081 *,
2082 directory: Optional[str] = None,
2083 filename: Union[str, TextIO, None] = None,
2084 format: Optional[str] = None,
2085 transfer: Optional[str] = None,
2086 skip_dimensions: Optional[Set] = None,
2087 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2088 reuseIds: bool = False,
2089 ) -> None:
2090 """Import datasets into this repository that were exported from a
2091 different butler repository via `~lsst.daf.butler.Butler.export`.
2093 Parameters
2094 ----------
2095 directory : `str`, optional
2096 Directory containing dataset files to import from. If `None`,
2097 ``filename`` and all dataset file paths specified therein must
2098 be absolute.
2099 filename : `str` or `TextIO`, optional
2100 A stream or name of file that contains database information
2101 associated with the exported datasets, typically generated by
2102 `~lsst.daf.butler.Butler.export`. If this a string (name) and
2103 is not an absolute path, does not exist in the current working
2104 directory, and ``directory`` is not `None`, it is assumed to be in
2105 ``directory``. Defaults to "export.{format}".
2106 format : `str`, optional
2107 File format for ``filename``. If `None`, the extension of
2108 ``filename`` will be used.
2109 transfer : `str`, optional
2110 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2111 skip_dimensions : `set`, optional
2112 Names of dimensions that should be skipped and not imported.
2113 idGenerationMode : `DatasetIdGenEnum`, optional
2114 Specifies the option for generating dataset IDs when IDs are not
2115 provided or their type does not match the backend type. By default
2116 unique IDs are generated for each inserted dataset.
2117 reuseIds : `bool`, optional
2118 If `True`, force re-use of imported dataset IDs for integer
2119 IDs, which are normally generated as auto-incremented; an exception
2120 will be raised if imported IDs clash with existing ones. This
2121 option has no effect on globally unique IDs, which are
2122 always re-used (or generated if integer IDs are being imported).
2124 Raises
2125 ------
2126 TypeError
2127 Raised if the set of arguments passed is inconsistent, or if the
2128 butler is read-only.
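Examples
--------
A minimal sketch importing a previously exported repository subset
(the paths are illustrative)::
    butler.import_(
        directory="/path/to/exported/data",
        filename="export.yaml",
        format="yaml",
        transfer="copy",
    )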
2129 """
2130 if not self.isWriteable():
2131 raise TypeError("Butler is read-only.")
2132 if format is None:
2133 if filename is None:
2134 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2135 else:
2136 _, format = os.path.splitext(filename) # type: ignore
2137 elif filename is None:
2138 filename = f"export.{format}"
2139 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2140 filename = os.path.join(directory, filename)
2141 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2143 def doImport(importStream: TextIO) -> None:
2144 backend = BackendClass(importStream, self.registry)
2145 backend.register()
2146 with self.transaction():
2147 backend.load(
2148 self.datastore,
2149 directory=directory,
2150 transfer=transfer,
2151 skip_dimensions=skip_dimensions,
2152 idGenerationMode=idGenerationMode,
2153 reuseIds=reuseIds,
2154 )
2156 if isinstance(filename, str):
2157 with open(filename, "r") as stream:
2158 doImport(stream)
2159 else:
2160 doImport(filename)
2162 def transfer_from(
2163 self,
2164 source_butler: LimitedButler,
2165 source_refs: Iterable[DatasetRef],
2166 transfer: str = "auto",
2167 id_gen_map: Dict[str, DatasetIdGenEnum] | None = None,
2168 skip_missing: bool = True,
2169 register_dataset_types: bool = False,
2170 transfer_dimensions: bool = False,
2171 ) -> List[DatasetRef]:
2172 """Transfer datasets to this Butler from a run in another Butler.
2174 Parameters
2175 ----------
2176 source_butler : `LimitedButler`
2177 Butler from which the datasets are to be transferred. If data IDs
2178 in ``source_refs`` are not expanded then this has to be a full
2179 `Butler` whose registry will be used to expand data IDs.
2180 source_refs : iterable of `DatasetRef`
2181 Datasets defined in the source butler that should be transferred to
2182 this butler.
2183 transfer : `str`, optional
2184 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2185 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
2186 A mapping of dataset type to ID generation mode. Only used if
2187 the source butler is using integer IDs. Should not be used
2188 if this receiving butler uses integer IDs. Without this, dataset
2189 import always uses `DatasetIdGenEnum.UNIQUE`.
2190 skip_missing : `bool`
2191 If `True`, datasets with no datastore artifact associated with
2192 them are not transferred. If `False` a registry entry will be
2193 created even if no datastore record is created (and so will
2194 look equivalent to the dataset being unstored).
2195 register_dataset_types : `bool`
2196 If `True` any missing dataset types are registered. Otherwise
2197 an exception is raised.
2198 transfer_dimensions : `bool`, optional
2199 If `True`, dimension record data associated with the new datasets
2200 will be transferred.
2202 Returns
2203 -------
2204 refs : `list` of `DatasetRef`
2205 The refs added to this Butler.
2207 Notes
2208 -----
2209 Requires that any dimension definitions are already present in the
2210 receiving Butler. The datastore artifact has to exist for a transfer
2211 to be made but non-existence is not an error.
2213 Datasets that already exist in this run will be skipped.
2215 The datasets are imported as part of a transaction, although
2216 dataset types are registered before the transaction is started.
2217 This means that it is possible for a dataset type to be registered
2218 even though transfer has failed.
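Examples
--------
A minimal sketch (the source repository path, dataset type, and
collection are illustrative)::
    source_butler = Butler("/path/to/source/repo")
    refs = source_butler.registry.queryDatasets(
        "calexp", collections="HSC/runs/example"
    )
    butler.transfer_from(
        source_butler, refs, transfer="copy", register_dataset_types=True
    )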
2219 """
2220 if not self.isWriteable():
2221 raise TypeError("Butler is read-only.")
2222 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2224 # Will iterate through the refs multiple times so need to convert
2225 # to a list if this isn't a collection.
2226 if not isinstance(source_refs, collections.abc.Collection):
2227 source_refs = list(source_refs)
2229 original_count = len(source_refs)
2230 log.info("Transferring %d datasets into %s", original_count, str(self))
2232 if id_gen_map is None:
2233 id_gen_map = {}
2235 # In some situations the datastore artifact may be missing
2236 # and we do not want that registry entry to be imported.
2237 # Asking the datastore is not sufficient: the records may have been
2238 # purged, so we have to ask for the (predicted) URI and check
2239 # existence explicitly. Execution butler is set up exactly like
2240 # this with no datastore records.
2241 artifact_existence: Dict[ResourcePath, bool] = {}
2242 if skip_missing:
2243 dataset_existence = source_butler.datastore.mexists(
2244 source_refs, artifact_existence=artifact_existence
2245 )
2246 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2247 filtered_count = len(source_refs)
2248 log.verbose(
2249 "%d datasets removed because the artifact does not exist. Now have %d.",
2250 original_count - filtered_count,
2251 filtered_count,
2252 )
2254 # Importing requires that we group the refs by dataset type and run
2255 # before doing the import.
2256 source_dataset_types = set()
2257 grouped_refs = defaultdict(list)
2258 grouped_indices = defaultdict(list)
2259 for i, ref in enumerate(source_refs):
2260 grouped_refs[ref.datasetType, ref.run].append(ref)
2261 grouped_indices[ref.datasetType, ref.run].append(i)
2262 source_dataset_types.add(ref.datasetType)
2264 # Check to see if the dataset type in the source butler has
2265 # the same definition in the target butler and register missing
2266 # ones if requested. Registration must happen outside a transaction.
2267 newly_registered_dataset_types = set()
2268 for datasetType in source_dataset_types:
2269 if register_dataset_types:
2270 # Let this raise immediately if inconsistent. Continuing
2271 # on to find additional inconsistent dataset types
2272 # might result in additional unwanted dataset types being
2273 # registered.
2274 if self.registry.registerDatasetType(datasetType):
2275 newly_registered_dataset_types.add(datasetType)
2276 else:
2277 # If the dataset type is missing, let it fail immediately.
2278 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2279 if target_dataset_type != datasetType:
2280 raise ConflictingDefinitionError(
2281 "Source butler dataset type differs from definition"
2282 f" in target butler: {datasetType} !="
2283 f" {target_dataset_type}"
2284 )
2285 if newly_registered_dataset_types:
2286 # We may have registered some even if there were inconsistencies
2287 # but should let people know (or else remove them again).
2288 log.log(
2289 VERBOSE,
2290 "Registered the following dataset types in the target Butler: %s",
2291 ", ".join(d.name for d in newly_registered_dataset_types),
2292 )
2293 else:
2294 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2296 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2297 if transfer_dimensions:
2298 # Collect all the dimension records for these refs.
2299 # All dimensions are to be copied but the list of valid dimensions
2300 # comes from this butler's universe.
2301 elements = frozenset(
2302 element
2303 for element in self.registry.dimensions.getStaticElements()
2304 if element.hasTable() and element.viewOf is None
2305 )
2306 dataIds = set(ref.dataId for ref in source_refs)
2307 # This logic comes from saveDataIds.
2308 for dataId in dataIds:
2309 # Need an expanded record; if it is not expanded we need a full
2310 # butler with a registry (allow mocks with registry too).
2311 if not dataId.hasRecords():
2312 if registry := getattr(source_butler, "registry", None):
2313 dataId = registry.expandDataId(dataId)
2314 else:
2315 raise TypeError("Input butler needs to be a full butler to expand DataId.")
2316 # If this butler doesn't know about a dimension in the source
2317 # butler, things will break later.
2318 for record in dataId.records.values():
2319 if record is not None and record.definition in elements:
2320 dimension_records[record.definition].setdefault(record.dataId, record)
2322 # The returned refs should be identical for UUIDs.
2323 # For now we must also support integer IDs and so need to retain
2324 # the newly-created refs from this registry.
2325 # Pre-size it so we can assign refs into the correct slots
2326 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2327 default_id_gen = DatasetIdGenEnum.UNIQUE
2329 handled_collections: Set[str] = set()
2331 # Do all the importing in a single transaction.
2332 with self.transaction():
2333 if dimension_records:
2334 log.verbose("Ensuring that dimension records exist for transferred datasets.")
2335 for element, r in dimension_records.items():
2336 records = [r[dataId] for dataId in r]
2337 # Assume that if the record is already present that we can
2338 # use it without having to check that the record metadata
2339 # is consistent.
2340 self.registry.insertDimensionData(element, *records, skip_existing=True)
2342 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2343 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2344 ):
2345 if run not in handled_collections:
2346 # May need to create output collection. If source butler
2347 # has a registry, ask for documentation string.
2348 run_doc = None
2349 if registry := getattr(source_butler, "registry", None):
2350 run_doc = registry.getCollectionDocumentation(run)
2351 registered = self.registry.registerRun(run, doc=run_doc)
2352 handled_collections.add(run)
2353 if registered:
2354 log.log(VERBOSE, "Creating output run %s", run)
2356 id_generation_mode = default_id_gen
2357 if isinstance(refs_to_import[0].id, int):
2358 # ID generation mode might need to be overridden when
2359 # targeting UUIDs.
2360 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2362 n_refs = len(refs_to_import)
2363 log.verbose(
2364 "Importing %d ref%s of dataset type %s into run %s",
2365 n_refs,
2366 "" if n_refs == 1 else "s",
2367 datasetType.name,
2368 run,
2369 )
2371 # No way to know if this butler's registry uses UUID.
2372 # We have to trust the caller on this. If it fails they will
2373 # have to change their approach. We can't catch the exception
2374 # and retry with unique because that will mess up the
2375 # transaction handling. We aren't allowed to ask the registry
2376 # manager what type of ID it is using.
2377 imported_refs = self.registry._importDatasets(
2378 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2379 )
2381 # Map them into the correct slots to match the initial order
2382 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2383 transferred_refs_tmp[i] = ref
2385 # Mypy insists that we might have None in here so we have to make
2386 # that explicit by assigning to a new variable and filtering out
2387 # something that won't be there.
2388 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2390 # Check consistency
2391 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2393 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2395 # The transferred refs have been reordered to match the original
2396 # ordering given by the caller. Without this the datastore transfer
2397 # would be broken.
2399 # Ask the datastore to transfer. The datastore has to check that
2400 # the source datastore is compatible with the target datastore.
2401 self.datastore.transfer_from(
2402 source_butler.datastore,
2403 source_refs,
2404 local_refs=transferred_refs,
2405 transfer=transfer,
2406 artifact_existence=artifact_existence,
2407 )
2409 return transferred_refs
2411 def validateConfiguration(
2412 self,
2413 logFailures: bool = False,
2414 datasetTypeNames: Optional[Iterable[str]] = None,
2415 ignore: Iterable[str] | None = None,
2416 ) -> None:
2417 """Validate butler configuration.
2419 Checks that each `DatasetType` can be stored in the `Datastore`.
2421 Parameters
2422 ----------
2423 logFailures : `bool`, optional
2424 If `True`, output a log message for every validation error
2425 detected.
2426 datasetTypeNames : iterable of `str`, optional
2427 The `DatasetType` names that should be checked. This allows
2428 only a subset to be selected.
2429 ignore : iterable of `str`, optional
2430 Names of DatasetTypes to skip over. This can be used to skip
2431 known problems. If a named `DatasetType` corresponds to a
2432 composite, all components of that `DatasetType` will also be
2433 ignored.
2435 Raises
2436 ------
2437 ButlerValidationError
2438 Raised if there is some inconsistency with how this Butler
2439 is configured.
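Examples
--------
A minimal sketch checking two hypothetical dataset types and logging
any problems that are found::
    butler.validateConfiguration(
        logFailures=True, datasetTypeNames=["raw", "calexp"]
    )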
2440 """
2441 if datasetTypeNames:
2442 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2443 else:
2444 datasetTypes = list(self.registry.queryDatasetTypes())
2446 # filter out anything from the ignore list
2447 if ignore:
2448 ignore = set(ignore)
2449 datasetTypes = [
2450 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2451 ]
2452 else:
2453 ignore = set()
2455 # Find all the registered instruments
2456 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2458 # For each datasetType that has an instrument dimension, create
2459 # a DatasetRef for each defined instrument
2460 datasetRefs = []
2462 for datasetType in datasetTypes:
2463 if "instrument" in datasetType.dimensions:
2464 for instrument in instruments:
2465 datasetRef = DatasetRef(
2466 datasetType, {"instrument": instrument}, conform=False # type: ignore
2467 )
2468 datasetRefs.append(datasetRef)
2470 entities: List[Union[DatasetType, DatasetRef]] = []
2471 entities.extend(datasetTypes)
2472 entities.extend(datasetRefs)
2474 datastoreErrorStr = None
2475 try:
2476 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2477 except ValidationError as e:
2478 datastoreErrorStr = str(e)
2480 # Also check that the LookupKeys used by the datastores match
2481 # registry and storage class definitions
2482 keys = self.datastore.getLookupKeys()
2484 failedNames = set()
2485 failedDataId = set()
2486 for key in keys:
2487 if key.name is not None:
2488 if key.name in ignore:
2489 continue
2491 # skip if specific datasetType names were requested and this
2492 # name does not match
2493 if datasetTypeNames and key.name not in datasetTypeNames:
2494 continue
2496 # See if it is a StorageClass or a DatasetType
2497 if key.name in self.storageClasses:
2498 pass
2499 else:
2500 try:
2501 self.registry.getDatasetType(key.name)
2502 except KeyError:
2503 if logFailures:
2504 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2505 failedNames.add(key)
2506 else:
2507 # Dimensions are checked for consistency when the Butler
2508 # is created and rendezvoused with a universe.
2509 pass
2511 # Check that the instrument is a valid instrument
2512 # Currently only support instrument so check for that
2513 if key.dataId:
2514 dataIdKeys = set(key.dataId)
2515 if {"instrument"} != dataIdKeys:
2516 if logFailures:
2517 log.critical("Key '%s' has unsupported DataId override", key)
2518 failedDataId.add(key)
2519 elif key.dataId["instrument"] not in instruments:
2520 if logFailures:
2521 log.critical("Key '%s' has unknown instrument", key)
2522 failedDataId.add(key)
2524 messages = []
2526 if datastoreErrorStr:
2527 messages.append(datastoreErrorStr)
2529 for failed, msg in (
2530 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2531 (failedDataId, "Keys with bad DataId entries: "),
2532 ):
2533 if failed:
2534 msg += ", ".join(str(k) for k in failed)
2535 messages.append(msg)
2537 if messages:
2538 raise ValidationError(";\n".join(messages))
2540 @property
2541 def collections(self) -> Sequence[str]:
2542 """The collections to search by default, in order
2543 (`Sequence` [ `str` ]).
2545 This is an alias for ``self.registry.defaults.collections``. It cannot
2546 be set directly in isolation, but all defaults may be changed together
2547 by assigning a new `RegistryDefaults` instance to
2548 ``self.registry.defaults``.
2549 """
2550 return self.registry.defaults.collections
2552 @property
2553 def run(self) -> Optional[str]:
2554 """Name of the run this butler writes outputs to by default (`str` or
2555 `None`).
2557 This is an alias for ``self.registry.defaults.run``. It cannot be set
2558 directly in isolation, but all defaults may be changed together by
2559 assigning a new `RegistryDefaults` instance to
2560 ``self.registry.defaults``.
2561 """
2562 return self.registry.defaults.run
2564 @property
2565 def dimensions(self) -> DimensionUniverse:
2566 # Docstring inherited.
2567 return self.registry.dimensions
2569 registry: Registry
2570 """The object that manages dataset metadata and relationships (`Registry`).
2572 Most operations that don't involve reading or writing butler datasets are
2573 accessible only via `Registry` methods.
2574 """
2576 datastore: Datastore
2577 """The object that manages actual dataset storage (`Datastore`).
2579 Direct user access to the datastore should rarely be necessary; the primary
2580 exception is the case where a `Datastore` implementation provides extra
2581 functionality beyond what the base class defines.
2582 """
2584 storageClasses: StorageClassFactory
2585 """An object that maps known storage class names to objects that fully
2586 describe them (`StorageClassFactory`).
2587 """
2589 _allow_put_of_predefined_dataset: bool
2590 """Allow a put to succeed even if there is already a registry entry for it
2591 but not a datastore record. (`bool`)."""