Coverage for python/lsst/daf/butler/_butler.py: 8%
718 statements
coverage.py v7.2.7, created at 2023-06-23 09:30 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30)
32import collections.abc
33import contextlib
34import io
35import logging
36import numbers
37import os
38import warnings
39from collections import Counter, defaultdict
40from collections.abc import Iterable, Iterator, MutableMapping, Sequence
41from typing import TYPE_CHECKING, Any, ClassVar, TextIO
43from deprecated.sphinx import deprecated
44from lsst.resources import ResourcePath, ResourcePathExpression
45from lsst.utils import doImportType
46from lsst.utils.introspection import get_class_of
47from lsst.utils.logging import VERBOSE, getLogger
48from sqlalchemy.exc import IntegrityError
50from ._butlerConfig import ButlerConfig
51from ._butlerRepoIndex import ButlerRepoIndex
52from ._dataset_existence import DatasetExistence
53from ._deferredDatasetHandle import DeferredDatasetHandle
54from ._limited_butler import LimitedButler
55from .core import (
56 Config,
57 ConfigSubset,
58 DataCoordinate,
59 DataId,
60 DataIdValue,
61 DatasetIdGenEnum,
62 DatasetRef,
63 DatasetRefURIs,
64 DatasetType,
65 Datastore,
66 Dimension,
67 DimensionConfig,
68 DimensionElement,
69 DimensionRecord,
70 DimensionUniverse,
71 FileDataset,
72 Progress,
73 StorageClass,
74 StorageClassFactory,
75 Timespan,
76 ValidationError,
77)
78from .core.repoRelocation import BUTLER_ROOT_TAG
79from .core.utils import transactional
80from .registry import (
81 CollectionType,
82 ConflictingDefinitionError,
83 DataIdError,
84 MissingDatasetTypeError,
85 NoDefaultCollectionError,
86 Registry,
87 RegistryConfig,
88 RegistryDefaults,
89)
90from .transfers import RepoExportContext
92if TYPE_CHECKING:
93 from lsst.resources import ResourceHandleProtocol
95 from .transfers import RepoImportBackend
97log = getLogger(__name__)
100class ButlerValidationError(ValidationError):
101 """There is a problem with the Butler configuration."""
103 pass
106class Butler(LimitedButler):
107 """Main entry point for the data access system.
109 Parameters
110 ----------
 111 config : `ButlerConfig`, `Config` or `str`, optional
112 Configuration. Anything acceptable to the
113 `ButlerConfig` constructor. If a directory path
114 is given the configuration will be read from a ``butler.yaml`` file in
115 that location. If `None` is given default values will be used.
 116 butler : `Butler`, optional
117 If provided, construct a new Butler that uses the same registry and
118 datastore as the given one, but with the given collection and run.
119 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
120 arguments.
121 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
122 An expression specifying the collections to be searched (in order) when
123 reading datasets.
124 This may be a `str` collection name or an iterable thereof.
125 See :ref:`daf_butler_collection_expressions` for more information.
126 These collections are not registered automatically and must be
127 manually registered before they are used by any method, but they may be
128 manually registered after the `Butler` is initialized.
129 run : `str`, optional
130 Name of the `~CollectionType.RUN` collection new datasets should be
131 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
132 ``collections`` will be set to ``[run]``. If not `None`, this
133 collection will automatically be registered. If this is not set (and
134 ``writeable`` is not set either), a read-only butler will be created.
135 searchPaths : `list` of `str`, optional
136 Directory paths to search when calculating the full Butler
137 configuration. Not used if the supplied config is already a
138 `ButlerConfig`.
139 writeable : `bool`, optional
140 Explicitly sets whether the butler supports write operations. If not
 141 provided, a read-write butler is created if ``run`` is set and a
 142 read-only butler otherwise.
143 inferDefaults : `bool`, optional
144 If `True` (default) infer default data ID values from the values
145 present in the datasets in ``collections``: if all collections have the
146 same value (or no value) for a governor dimension, that value will be
147 the default for that dimension. Nonexistent collections are ignored.
148 If a default value is provided explicitly for a governor dimension via
149 ``**kwargs``, no default will be inferred for that dimension.
150 **kwargs : `str`
151 Default data ID key-value pairs. These may only identify "governor"
152 dimensions like ``instrument`` and ``skymap``.
154 Examples
155 --------
156 While there are many ways to control exactly how a `Butler` interacts with
157 the collections in its `Registry`, the most common cases are still simple.
159 For a read-only `Butler` that searches one collection, do::
161 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
163 For a read-write `Butler` that writes to and reads from a
164 `~CollectionType.RUN` collection::
166 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
168 The `Butler` passed to a ``PipelineTask`` is often much more complex,
169 because we want to write to one `~CollectionType.RUN` collection but read
170 from several others (as well)::
172 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
173 collections=["u/alice/DM-50000/a",
174 "u/bob/DM-49998",
175 "HSC/defaults"])
177 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
178 Datasets will be read first from that run (since it appears first in the
179 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
181 Finally, one can always create a `Butler` with no collections::
183 butler = Butler("/path/to/repo", writeable=True)
185 This can be extremely useful when you just want to use ``butler.registry``,
186 e.g. for inserting dimension data or managing collections, or when the
187 collections you want to use with the butler are not consistent.
188 Passing ``writeable`` explicitly here is only necessary if you want to be
189 able to make changes to the repo - usually the value for ``writeable`` can
190 be guessed from the collection arguments provided, but it defaults to
 191 `False` when there are no collection arguments.
192 """
194 def __init__(
195 self,
196 config: Config | ResourcePathExpression | None = None,
197 *,
198 butler: Butler | None = None,
199 collections: Any = None,
200 run: str | None = None,
201 searchPaths: Sequence[ResourcePathExpression] | None = None,
202 writeable: bool | None = None,
203 inferDefaults: bool = True,
204 **kwargs: str,
205 ):
206 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
207 # Load registry, datastore, etc. from config or existing butler.
208 if butler is not None:
209 if config is not None or searchPaths is not None or writeable is not None:
210 raise TypeError(
211 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
212 )
213 self.registry = butler.registry.copy(defaults)
214 self.datastore = butler.datastore
215 self.storageClasses = butler.storageClasses
216 self._config: ButlerConfig = butler._config
217 else:
218 self._config = ButlerConfig(config, searchPaths=searchPaths)
219 try:
220 if "root" in self._config:
221 butlerRoot = self._config["root"]
222 else:
223 butlerRoot = self._config.configDir
224 if writeable is None:
225 writeable = run is not None
226 self.registry = Registry.fromConfig(
227 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
228 )
229 self.datastore = Datastore.fromConfig(
230 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
231 )
232 self.storageClasses = StorageClassFactory()
233 self.storageClasses.addFromConfig(self._config)
234 except Exception:
 235 # Failures here usually mean that the configuration is incomplete,
 236 # so just issue an error message that includes the config file URI.
237 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
238 raise
240 # For execution butler the datastore needs a special
241 # dependency-inversion trick. This is not used by regular butler,
242 # but we do not have a way to distinguish regular butler from execution
243 # butler.
244 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
246 if "run" in self._config or "collection" in self._config:
247 raise ValueError("Passing a run or collection via configuration is no longer supported.")
249 GENERATION: ClassVar[int] = 3
250 """This is a Generation 3 Butler.
252 This attribute may be removed in the future, once the Generation 2 Butler
253 interface has been fully retired; it should only be used in transitional
254 code.
255 """
257 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
258 """Return DatasetType defined in registry given dataset type name."""
259 try:
260 return self.registry.getDatasetType(name)
261 except MissingDatasetTypeError:
262 return None
264 @classmethod
265 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
266 """Look up the label in a butler repository index.
268 Parameters
269 ----------
270 label : `str`
271 Label of the Butler repository to look up.
272 return_label : `bool`, optional
273 If ``label`` cannot be found in the repository index (either
274 because index is not defined or ``label`` is not in the index) and
275 ``return_label`` is `True` then return ``ResourcePath(label)``.
276 If ``return_label`` is `False` (default) then an exception will be
277 raised instead.
279 Returns
280 -------
281 uri : `lsst.resources.ResourcePath`
282 URI to the Butler repository associated with the given label or
283 default value if it is provided.
285 Raises
286 ------
287 KeyError
288 Raised if the label is not found in the index, or if an index
289 is not defined, and ``return_label`` is `False`.
291 Notes
292 -----
293 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
294 information is discovered.
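Examples
--------
A short sketch; the ``my_repo`` label is a placeholder that would have to
exist in the repository index::
    uri = Butler.get_repo_uri("my_repo")
    # With return_label=True an unknown label is returned as
    # ResourcePath(label) instead of raising KeyError.
    uri = Butler.get_repo_uri("/some/local/path", return_label=True)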
295 """
296 return ButlerRepoIndex.get_repo_uri(label, return_label)
298 @classmethod
299 def get_known_repos(cls) -> set[str]:
300 """Retrieve the list of known repository labels.
302 Returns
303 -------
304 repos : `set` of `str`
305 All the known labels. Can be empty if no index can be found.
307 Notes
308 -----
309 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
310 information is discovered.
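Examples
--------
A small sketch listing whatever labels are defined (possibly none)::
    for label in sorted(Butler.get_known_repos()):
        print(label, Butler.get_repo_uri(label))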
311 """
312 return ButlerRepoIndex.get_known_repos()
314 @staticmethod
315 def makeRepo(
316 root: ResourcePathExpression,
317 config: Config | str | None = None,
318 dimensionConfig: Config | str | None = None,
319 standalone: bool = False,
320 searchPaths: list[str] | None = None,
321 forceConfigRoot: bool = True,
322 outfile: ResourcePathExpression | None = None,
323 overwrite: bool = False,
324 ) -> Config:
325 """Create an empty data repository by adding a butler.yaml config
326 to a repository root directory.
328 Parameters
329 ----------
330 root : `lsst.resources.ResourcePathExpression`
331 Path or URI to the root location of the new repository. Will be
332 created if it does not exist.
333 config : `Config` or `str`, optional
334 Configuration to write to the repository, after setting any
335 root-dependent Registry or Datastore config options. Can not
336 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
337 configuration will be used. Root-dependent config options
338 specified in this config are overwritten if ``forceConfigRoot``
339 is `True`.
340 dimensionConfig : `Config` or `str`, optional
341 Configuration for dimensions, will be used to initialize registry
342 database.
343 standalone : `bool`
 344 If `True`, write all expanded defaults, not just customized or
345 repository-specific settings.
346 This (mostly) decouples the repository from the default
347 configuration, insulating it from changes to the defaults (which
348 may be good or bad, depending on the nature of the changes).
349 Future *additions* to the defaults will still be picked up when
350 initializing `Butlers` to repos created with ``standalone=True``.
351 searchPaths : `list` of `str`, optional
352 Directory paths to search when calculating the full butler
353 configuration.
354 forceConfigRoot : `bool`, optional
355 If `False`, any values present in the supplied ``config`` that
356 would normally be reset are not overridden and will appear
357 directly in the output config. This allows non-standard overrides
358 of the root directory for a datastore or registry to be given.
359 If this parameter is `True` the values for ``root`` will be
360 forced into the resulting config if appropriate.
 361 outfile : `lsst.resources.ResourcePathExpression`, optional
 362 If not `None`, the output configuration will be written to this
363 location rather than into the repository itself. Can be a URI
364 string. Can refer to a directory that will be used to write
365 ``butler.yaml``.
366 overwrite : `bool`, optional
367 Create a new configuration file even if one already exists
368 in the specified output location. Default is to raise
369 an exception.
371 Returns
372 -------
373 config : `Config`
374 The updated `Config` instance written to the repo.
376 Raises
377 ------
378 ValueError
379 Raised if a ButlerConfig or ConfigSubset is passed instead of a
380 regular Config (as these subclasses would make it impossible to
381 support ``standalone=False``).
382 FileExistsError
383 Raised if the output config file already exists.
384 os.error
385 Raised if the directory does not exist, exists but is not a
386 directory, or cannot be created.
388 Notes
389 -----
390 Note that when ``standalone=False`` (the default), the configuration
391 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
392 construct the repository should also be used to construct any Butlers
393 to avoid configuration inconsistencies.
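Examples
--------
A minimal sketch using the default datastore and registry configuration;
the repository path is a placeholder::
    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)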
394 """
395 if isinstance(config, (ButlerConfig, ConfigSubset)):
396 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
398 # Ensure that the root of the repository exists or can be made
399 root_uri = ResourcePath(root, forceDirectory=True)
400 root_uri.mkdir()
402 config = Config(config)
404 # If we are creating a new repo from scratch with relative roots,
405 # do not propagate an explicit root from the config file
406 if "root" in config:
407 del config["root"]
409 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
410 imported_class = doImportType(full["datastore", "cls"])
411 if not issubclass(imported_class, Datastore):
412 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
413 datastoreClass: type[Datastore] = imported_class
414 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
416 # if key exists in given config, parse it, otherwise parse the defaults
417 # in the expanded config
418 if config.get(("registry", "db")):
419 registryConfig = RegistryConfig(config)
420 else:
421 registryConfig = RegistryConfig(full)
422 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
423 if defaultDatabaseUri is not None:
424 Config.updateParameters(
425 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
426 )
427 else:
428 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
430 if standalone:
431 config.merge(full)
432 else:
433 # Always expand the registry.managers section into the per-repo
434 # config, because after the database schema is created, it's not
435 # allowed to change anymore. Note that in the standalone=True
436 # branch, _everything_ in the config is expanded, so there's no
437 # need to special case this.
438 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
439 configURI: ResourcePathExpression
440 if outfile is not None:
441 # When writing to a separate location we must include
442 # the root of the butler repo in the config else it won't know
443 # where to look.
444 config["root"] = root_uri.geturl()
445 configURI = outfile
446 else:
447 configURI = root_uri
448 # Strip obscore configuration, if it is present, before writing config
 449 # to a file; the obscore config will be stored in the registry.
450 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
451 config_to_write = config.copy()
452 del config_to_write[obscore_config_key]
453 config_to_write.dumpToUri(configURI, overwrite=overwrite)
 454 # The configFile attribute is updated; copy it back to the original.
455 config.configFile = config_to_write.configFile
456 else:
457 config.dumpToUri(configURI, overwrite=overwrite)
459 # Create Registry and populate tables
460 registryConfig = RegistryConfig(config.get("registry"))
461 dimensionConfig = DimensionConfig(dimensionConfig)
462 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
464 log.verbose("Wrote new Butler configuration file to %s", configURI)
466 return config
468 @classmethod
469 def _unpickle(
470 cls,
471 config: ButlerConfig,
472 collections: tuple[str, ...] | None,
473 run: str | None,
474 defaultDataId: dict[str, str],
475 writeable: bool,
476 ) -> Butler:
477 """Callable used to unpickle a Butler.
479 We prefer not to use ``Butler.__init__`` directly so we can force some
480 of its many arguments to be keyword-only (note that ``__reduce__``
481 can only invoke callables with positional arguments).
483 Parameters
484 ----------
485 config : `ButlerConfig`
486 Butler configuration, already coerced into a true `ButlerConfig`
487 instance (and hence after any search paths for overrides have been
488 utilized).
489 collections : `tuple` [ `str` ]
490 Names of the default collections to read from.
491 run : `str`, optional
492 Name of the default `~CollectionType.RUN` collection to write to.
493 defaultDataId : `dict` [ `str`, `str` ]
494 Default data ID values.
495 writeable : `bool`
496 Whether the Butler should support write operations.
498 Returns
499 -------
500 butler : `Butler`
501 A new `Butler` instance.
502 """
503 # MyPy doesn't recognize that the kwargs below are totally valid; it
 504 # seems to think ``**defaultDataId`` is a _positional_ argument!
505 return cls(
506 config=config,
507 collections=collections,
508 run=run,
509 writeable=writeable,
510 **defaultDataId, # type: ignore
511 )
513 def __reduce__(self) -> tuple:
514 """Support pickling."""
515 return (
516 Butler._unpickle,
517 (
518 self._config,
519 self.collections,
520 self.run,
521 self.registry.defaults.dataId.byName(),
522 self.registry.isWriteable(),
523 ),
524 )
526 def __str__(self) -> str:
527 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
528 self.collections, self.run, self.datastore, self.registry
529 )
531 def isWriteable(self) -> bool:
532 """Return `True` if this `Butler` supports write operations."""
533 return self.registry.isWriteable()
535 @contextlib.contextmanager
536 def transaction(self) -> Iterator[None]:
537 """Context manager supporting `Butler` transactions.
539 Transactions can be nested.
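Examples
--------
A sketch grouping two writes atomically; ``obj1``, ``obj2``, the dataset
type names, and ``dataId`` are placeholders::
    with butler.transaction():
        butler.put(obj1, "typeA", dataId)
        butler.put(obj2, "typeB", dataId)
    # If either put raises, the registry and datastore changes made
    # inside the block are rolled back together.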
540 """
541 with self.registry.transaction():
542 with self.datastore.transaction():
543 yield
545 def _standardizeArgs(
546 self,
547 datasetRefOrType: DatasetRef | DatasetType | str,
548 dataId: DataId | None = None,
549 for_put: bool = True,
550 **kwargs: Any,
551 ) -> tuple[DatasetType, DataId | None]:
552 """Standardize the arguments passed to several Butler APIs.
554 Parameters
555 ----------
556 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
557 When `DatasetRef` the `dataId` should be `None`.
558 Otherwise the `DatasetType` or name thereof.
559 dataId : `dict` or `DataCoordinate`
560 A `dict` of `Dimension` link name, value pairs that label the
561 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
562 should be provided as the second argument.
563 for_put : `bool`, optional
564 If `True` this call is invoked as part of a `Butler.put()`.
565 Otherwise it is assumed to be part of a `Butler.get()`. This
566 parameter is only relevant if there is dataset type
567 inconsistency.
568 **kwargs
569 Additional keyword arguments used to augment or construct a
570 `DataCoordinate`. See `DataCoordinate.standardize`
571 parameters.
573 Returns
574 -------
575 datasetType : `DatasetType`
576 A `DatasetType` instance extracted from ``datasetRefOrType``.
577 dataId : `dict` or `DataId`, optional
578 Argument that can be used (along with ``kwargs``) to construct a
579 `DataId`.
581 Notes
582 -----
583 Butler APIs that conceptually need a DatasetRef also allow passing a
584 `DatasetType` (or the name of one) and a `DataId` (or a dict and
585 keyword arguments that can be used to construct one) separately. This
586 method accepts those arguments and always returns a true `DatasetType`
587 and a `DataId` or `dict`.
589 Standardization of `dict` vs `DataId` is best handled by passing the
590 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
591 generally similarly flexible.
592 """
593 externalDatasetType: DatasetType | None = None
594 internalDatasetType: DatasetType | None = None
595 if isinstance(datasetRefOrType, DatasetRef):
596 if dataId is not None or kwargs:
597 raise ValueError("DatasetRef given, cannot use dataId as well")
598 externalDatasetType = datasetRefOrType.datasetType
599 dataId = datasetRefOrType.dataId
600 else:
601 # Don't check whether DataId is provided, because Registry APIs
602 # can usually construct a better error message when it wasn't.
603 if isinstance(datasetRefOrType, DatasetType):
604 externalDatasetType = datasetRefOrType
605 else:
606 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
608 # Check that they are self-consistent
609 if externalDatasetType is not None:
610 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
611 if externalDatasetType != internalDatasetType:
612 # We can allow differences if they are compatible, depending
613 # on whether this is a get or a put. A get requires that
614 # the python type associated with the datastore can be
615 # converted to the user type. A put requires that the user
616 # supplied python type can be converted to the internal
617 # type expected by registry.
618 relevantDatasetType = internalDatasetType
619 if for_put:
620 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
621 else:
622 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
623 relevantDatasetType = externalDatasetType
624 if not is_compatible:
625 raise ValueError(
626 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
627 f"registry definition ({internalDatasetType})"
628 )
629 # Override the internal definition.
630 internalDatasetType = relevantDatasetType
632 assert internalDatasetType is not None
633 return internalDatasetType, dataId
635 def _rewrite_data_id(
636 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
637 ) -> tuple[DataId | None, dict[str, Any]]:
638 """Rewrite a data ID taking into account dimension records.
640 Take a Data ID and keyword args and rewrite it if necessary to
641 allow the user to specify dimension records rather than dimension
642 primary values.
644 This allows a user to include a dataId dict with keys of
645 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
646 the integer exposure ID. It also allows a string to be given
647 for a dimension value rather than the integer ID if that is more
 648 convenient. For example, rather than having to specify the
649 detector with ``detector.full_name``, a string given for ``detector``
650 will be interpreted as the full name and converted to the integer
651 value.
653 Keyword arguments can also use strings for dimensions like detector
 654 and exposure, but Python does not allow them to include ``.``, so
 655 the ``exposure.day_obs`` syntax cannot be used in a keyword
656 argument.
658 Parameters
659 ----------
660 dataId : `dict` or `DataCoordinate`
661 A `dict` of `Dimension` link name, value pairs that will label the
662 `DatasetRef` within a Collection.
663 datasetType : `DatasetType`
664 The dataset type associated with this dataId. Required to
665 determine the relevant dimensions.
666 **kwargs
667 Additional keyword arguments used to augment or construct a
668 `DataId`. See `DataId` parameters.
670 Returns
671 -------
672 dataId : `dict` or `DataCoordinate`
 673 The possibly rewritten dataId. If given a `DataCoordinate` and
674 no keyword arguments, the original dataId will be returned
675 unchanged.
676 **kwargs : `dict`
 677 Any unused keyword arguments (normally an empty dict).
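Examples
--------
An illustrative sketch of the rewriting described above; ``datasetType``
and the instrument/record values are placeholders::
    dataId = {"instrument": "MyCam", "exposure.day_obs": 20231201,
              "exposure.seq_num": 45}
    dataId, kwargs = butler._rewrite_data_id(dataId, datasetType)
    # dataId now holds the resolved integer exposure ID in place of the
    # day_obs/seq_num record values.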
678 """
679 # Do nothing if we have a standalone DataCoordinate.
680 if isinstance(dataId, DataCoordinate) and not kwargs:
681 return dataId, kwargs
683 # Process dimension records that are using record information
684 # rather than ids
685 newDataId: dict[str, DataIdValue] = {}
686 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
 688 # If all of the dataId comes from keyword parameters we do not need
 689 # to do anything here because they can't be of the form
 690 # exposure.obs_id, since a "." is not allowed in a keyword parameter.
691 if dataId:
692 for k, v in dataId.items():
693 # If we have a Dimension we do not need to do anything
694 # because it cannot be a compound key.
695 if isinstance(k, str) and "." in k:
696 # Someone is using a more human-readable dataId
697 dimensionName, record = k.split(".", 1)
698 byRecord[dimensionName][record] = v
699 elif isinstance(k, Dimension):
700 newDataId[k.name] = v
701 else:
702 newDataId[k] = v
704 # Go through the updated dataId and check the type in case someone is
 705 # using an alternate key. We have already filtered out the compound
 706 # dimension.record keys.
707 not_dimensions = {}
709 # Will need to look in the dataId and the keyword arguments
710 # and will remove them if they need to be fixed or are unrecognized.
711 for dataIdDict in (newDataId, kwargs):
712 # Use a list so we can adjust the dict safely in the loop
713 for dimensionName in list(dataIdDict):
714 value = dataIdDict[dimensionName]
715 try:
716 dimension = self.dimensions.getStaticDimensions()[dimensionName]
717 except KeyError:
718 # This is not a real dimension
719 not_dimensions[dimensionName] = value
720 del dataIdDict[dimensionName]
721 continue
723 # Convert an integral type to an explicit int to simplify
724 # comparisons here
725 if isinstance(value, numbers.Integral):
726 value = int(value)
728 if not isinstance(value, dimension.primaryKey.getPythonType()):
729 for alternate in dimension.alternateKeys:
730 if isinstance(value, alternate.getPythonType()):
731 byRecord[dimensionName][alternate.name] = value
732 del dataIdDict[dimensionName]
733 log.debug(
734 "Converting dimension %s to %s.%s=%s",
735 dimensionName,
736 dimensionName,
737 alternate.name,
738 value,
739 )
740 break
741 else:
742 log.warning(
743 "Type mismatch found for value '%r' provided for dimension %s. "
744 "Could not find matching alternative (primary key has type %s) "
745 "so attempting to use as-is.",
746 value,
747 dimensionName,
748 dimension.primaryKey.getPythonType(),
749 )
751 # By this point kwargs and newDataId should only include valid
752 # dimensions. Merge kwargs in to the new dataId and log if there
753 # are dimensions in both (rather than calling update).
754 for k, v in kwargs.items():
755 if k in newDataId and newDataId[k] != v:
756 log.debug(
757 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
758 )
759 newDataId[k] = v
760 # No need to retain any values in kwargs now.
761 kwargs = {}
763 # If we have some unrecognized dimensions we have to try to connect
764 # them to records in other dimensions. This is made more complicated
765 # by some dimensions having records with clashing names. A mitigation
766 # is that we can tell by this point which dimensions are missing
767 # for the DatasetType but this does not work for calibrations
768 # where additional dimensions can be used to constrain the temporal
769 # axis.
770 if not_dimensions:
771 # Search for all dimensions even if we have been given a value
772 # explicitly. In some cases records are given as well as the
 773 # actual dimension and this should not be an error if they
774 # match.
775 mandatoryDimensions = datasetType.dimensions.names # - provided
777 candidateDimensions: set[str] = set()
778 candidateDimensions.update(mandatoryDimensions)
780 # For calibrations we may well be needing temporal dimensions
781 # so rather than always including all dimensions in the scan
782 # restrict things a little. It is still possible for there
783 # to be confusion over day_obs in visit vs exposure for example.
784 # If we are not searching calibration collections things may
785 # fail but they are going to fail anyway because of the
 786 # ambiguity of the dataId...
787 if datasetType.isCalibration():
788 for dim in self.dimensions.getStaticDimensions():
789 if dim.temporal:
790 candidateDimensions.add(str(dim))
792 # Look up table for the first association with a dimension
793 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
795 # Keep track of whether an item is associated with multiple
796 # dimensions.
797 counter: Counter[str] = Counter()
798 assigned: dict[str, set[str]] = defaultdict(set)
800 # Go through the missing dimensions and associate the
801 # given names with records within those dimensions
802 matched_dims = set()
803 for dimensionName in candidateDimensions:
804 dimension = self.dimensions.getStaticDimensions()[dimensionName]
805 fields = dimension.metadata.names | dimension.uniqueKeys.names
806 for field in not_dimensions:
807 if field in fields:
808 guessedAssociation[dimensionName][field] = not_dimensions[field]
809 counter[dimensionName] += 1
810 assigned[field].add(dimensionName)
811 matched_dims.add(field)
813 # Calculate the fields that matched nothing.
814 never_found = set(not_dimensions) - matched_dims
816 if never_found:
817 raise ValueError(f"Unrecognized keyword args given: {never_found}")
819 # There is a chance we have allocated a single dataId item
820 # to multiple dimensions. Need to decide which should be retained.
821 # For now assume that the most popular alternative wins.
822 # This means that day_obs with seq_num will result in
823 # exposure.day_obs and not visit.day_obs
824 # Also prefer an explicitly missing dimension over an inferred
825 # temporal dimension.
826 for fieldName, assignedDimensions in assigned.items():
827 if len(assignedDimensions) > 1:
828 # Pick the most popular (preferring mandatory dimensions)
829 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
830 if requiredButMissing:
831 candidateDimensions = requiredButMissing
832 else:
833 candidateDimensions = assignedDimensions
835 # If this is a choice between visit and exposure and
836 # neither was a required part of the dataset type,
837 # (hence in this branch) always prefer exposure over
838 # visit since exposures are always defined and visits
839 # are defined from exposures.
840 if candidateDimensions == {"exposure", "visit"}:
841 candidateDimensions = {"exposure"}
843 # Select the relevant items and get a new restricted
844 # counter.
845 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
846 duplicatesCounter: Counter[str] = Counter()
847 duplicatesCounter.update(theseCounts)
849 # Choose the most common. If they are equally common
850 # we will pick the one that was found first.
851 # Returns a list of tuples
852 selected = duplicatesCounter.most_common(1)[0][0]
854 log.debug(
855 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
856 " Removed ambiguity by choosing dimension %s.",
857 fieldName,
858 ", ".join(assignedDimensions),
859 selected,
860 )
862 for candidateDimension in assignedDimensions:
863 if candidateDimension != selected:
864 del guessedAssociation[candidateDimension][fieldName]
866 # Update the record look up dict with the new associations
867 for dimensionName, values in guessedAssociation.items():
868 if values: # A dict might now be empty
869 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
870 byRecord[dimensionName].update(values)
872 if byRecord:
873 # Some record specifiers were found so we need to convert
874 # them to the Id form
875 for dimensionName, values in byRecord.items():
876 if dimensionName in newDataId:
877 log.debug(
878 "DataId specified explicit %s dimension value of %s in addition to"
879 " general record specifiers for it of %s. Ignoring record information.",
880 dimensionName,
881 newDataId[dimensionName],
882 str(values),
883 )
884 # Get the actual record and compare with these values.
885 try:
886 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
887 except DataIdError:
888 raise ValueError(
889 f"Could not find dimension '{dimensionName}'"
890 f" with dataId {newDataId} as part of comparing with"
891 f" record values {byRecord[dimensionName]}"
892 ) from None
893 if len(recs) == 1:
894 errmsg: list[str] = []
895 for k, v in values.items():
896 if (recval := getattr(recs[0], k)) != v:
897 errmsg.append(f"{k}({recval} != {v})")
898 if errmsg:
899 raise ValueError(
900 f"Dimension {dimensionName} in dataId has explicit value"
901 " inconsistent with records: " + ", ".join(errmsg)
902 )
903 else:
904 # Multiple matches for an explicit dimension
905 # should never happen but let downstream complain.
906 pass
907 continue
909 # Build up a WHERE expression
910 bind = {k: v for k, v in values.items()}
911 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
913 # Hopefully we get a single record that matches
914 records = set(
915 self.registry.queryDimensionRecords(
916 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
917 )
918 )
920 if len(records) != 1:
921 if len(records) > 1:
922 # visit can have an ambiguous answer without involving
923 # visit_system. The default visit_system is defined
924 # by the instrument.
925 if (
926 dimensionName == "visit"
927 and "visit_system_membership" in self.dimensions
928 and "visit_system" in self.dimensions["instrument"].metadata
929 ):
930 instrument_records = list(
931 self.registry.queryDimensionRecords(
932 "instrument",
933 dataId=newDataId,
934 **kwargs,
935 )
936 )
937 if len(instrument_records) == 1:
938 visit_system = instrument_records[0].visit_system
939 if visit_system is None:
940 # Set to a value that will never match.
941 visit_system = -1
943 # Look up each visit in the
944 # visit_system_membership records.
945 for rec in records:
946 membership = list(
947 self.registry.queryDimensionRecords(
948 # Use bind to allow zero results.
949 # This is a fully-specified query.
950 "visit_system_membership",
951 where="instrument = inst AND visit_system = system AND visit = v",
952 bind=dict(
953 inst=instrument_records[0].name, system=visit_system, v=rec.id
954 ),
955 )
956 )
957 if membership:
958 # This record is the right answer.
959 records = {rec}
960 break
962 # The ambiguity may have been resolved so check again.
963 if len(records) > 1:
964 log.debug("Received %d records from constraints of %s", len(records), str(values))
965 for r in records:
966 log.debug("- %s", str(r))
967 raise ValueError(
968 f"DataId specification for dimension {dimensionName} is not"
969 f" uniquely constrained to a single dataset by {values}."
970 f" Got {len(records)} results."
971 )
972 else:
973 raise ValueError(
974 f"DataId specification for dimension {dimensionName} matched no"
975 f" records when constrained by {values}"
976 )
978 # Get the primary key from the real dimension object
979 dimension = self.dimensions.getStaticDimensions()[dimensionName]
980 if not isinstance(dimension, Dimension):
981 raise RuntimeError(
982 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
983 )
984 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
986 return newDataId, kwargs
988 def _findDatasetRef(
989 self,
990 datasetRefOrType: DatasetRef | DatasetType | str,
991 dataId: DataId | None = None,
992 *,
993 collections: Any = None,
994 predict: bool = False,
995 run: str | None = None,
996 **kwargs: Any,
997 ) -> DatasetRef:
998 """Shared logic for methods that start with a search for a dataset in
999 the registry.
1001 Parameters
1002 ----------
1003 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1004 When `DatasetRef` the `dataId` should be `None`.
1005 Otherwise the `DatasetType` or name thereof.
1006 dataId : `dict` or `DataCoordinate`, optional
1007 A `dict` of `Dimension` link name, value pairs that label the
1008 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1009 should be provided as the first argument.
1010 collections : Any, optional
1011 Collections to be searched, overriding ``self.collections``.
1012 Can be any of the types supported by the ``collections`` argument
1013 to butler construction.
1014 predict : `bool`, optional
1015 If `True`, return a newly created `DatasetRef` with a unique
1016 dataset ID if finding a reference in the `Registry` fails.
1017 Defaults to `False`.
1018 run : `str`, optional
1019 Run collection name to use for creating `DatasetRef` for predicted
1020 datasets. Only used if ``predict`` is `True`.
1021 **kwargs
1022 Additional keyword arguments used to augment or construct a
1023 `DataId`. See `DataId` parameters.
1025 Returns
1026 -------
1027 ref : `DatasetRef`
1028 A reference to the dataset identified by the given arguments.
1029 This can be the same dataset reference as given if it was
1030 resolved.
1032 Raises
1033 ------
1034 LookupError
1035 Raised if no matching dataset exists in the `Registry` (and
1036 ``predict`` is `False`).
1037 ValueError
1038 Raised if a resolved `DatasetRef` was passed as an input, but it
1039 differs from the one found in the registry.
1040 TypeError
1041 Raised if no collections were provided.
1042 """
1043 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1044 if isinstance(datasetRefOrType, DatasetRef):
1045 if collections is not None:
1046 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
1047 return datasetRefOrType
1048 timespan: Timespan | None = None
1050 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1052 if datasetType.isCalibration():
 1053 # Because this is a calibration dataset, first try to
 1054 # standardize the data ID without restricting the dimensions to
1055 # those of the dataset type requested, because there may be extra
1056 # dimensions that provide temporal information for a validity-range
1057 # lookup.
1058 dataId = DataCoordinate.standardize(
1059 dataId, universe=self.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1060 )
1061 if dataId.graph.temporal:
1062 dataId = self.registry.expandDataId(dataId)
1063 timespan = dataId.timespan
1064 else:
1065 # Standardize the data ID to just the dimensions of the dataset
1066 # type instead of letting registry.findDataset do it, so we get the
1067 # result even if no dataset is found.
1068 dataId = DataCoordinate.standardize(
1069 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1070 )
 1071 # Always look up the DatasetRef, even if one is given, to ensure it is
1072 # present in the current collection.
1073 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1074 if ref is None:
1075 if predict:
1076 if run is None:
1077 run = self.run
1078 if run is None:
1079 raise TypeError("Cannot predict dataset ID/location with run=None.")
1080 return DatasetRef(datasetType, dataId, run=run)
1081 else:
1082 if collections is None:
1083 collections = self.registry.defaults.collections
1084 raise LookupError(
1085 f"Dataset {datasetType.name} with data ID {dataId} "
1086 f"could not be found in collections {collections}."
1087 )
1088 if datasetType != ref.datasetType:
1089 # If they differ it is because the user explicitly specified
1090 # a compatible dataset type to this call rather than using the
1091 # registry definition. The DatasetRef must therefore be recreated
1092 # using the user definition such that the expected type is
1093 # returned.
1094 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1096 return ref
1098 @transactional
1099 @deprecated(
1100 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
1101 " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
1102 " were relying on the run parameter to determine the run."
1103 " Will be removed after v27.0.",
1104 version="v26.0",
1105 category=FutureWarning,
1106 )
1107 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
1108 # Docstring inherited.
1109 return self.put(obj, ref)
1111 @transactional
1112 def put(
1113 self,
1114 obj: Any,
1115 datasetRefOrType: DatasetRef | DatasetType | str,
1116 /,
1117 dataId: DataId | None = None,
1118 *,
1119 run: str | None = None,
1120 **kwargs: Any,
1121 ) -> DatasetRef:
1122 """Store and register a dataset.
1124 Parameters
1125 ----------
1126 obj : `object`
1127 The dataset.
1128 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1129 When `DatasetRef` is provided, ``dataId`` should be `None`.
1130 Otherwise the `DatasetType` or name thereof. If a fully resolved
1131 `DatasetRef` is given the run and ID are used directly.
1132 dataId : `dict` or `DataCoordinate`
1133 A `dict` of `Dimension` link name, value pairs that label the
1134 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1135 should be provided as the second argument.
1136 run : `str`, optional
1137 The name of the run the dataset should be added to, overriding
1138 ``self.run``. Not used if a resolved `DatasetRef` is provided.
1139 **kwargs
1140 Additional keyword arguments used to augment or construct a
1141 `DataCoordinate`. See `DataCoordinate.standardize`
 1142 parameters. Not used if a resolved `DatasetRef` is provided.
1144 Returns
1145 -------
1146 ref : `DatasetRef`
1147 A reference to the stored dataset, updated with the correct id if
1148 given.
1150 Raises
1151 ------
1152 TypeError
1153 Raised if the butler is read-only or if no run has been provided.
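Examples
--------
A minimal sketch; ``catalog`` and the dataset type, data ID, and run names
are placeholders::
    ref = butler.put(catalog, "src", instrument="MyCam", visit=12345,
                     detector=10, run="u/alice/example-run")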
1154 """
1155 if isinstance(datasetRefOrType, DatasetRef):
1156 # This is a direct put of predefined DatasetRef.
1157 log.debug("Butler put direct: %s", datasetRefOrType)
1158 if run is not None:
1159 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
1160 # If registry already has a dataset with the same dataset ID,
1161 # dataset type and DataId, then _importDatasets will do nothing and
 1162 # just return the original ref. We have to raise in this case;
 1163 # the datastore check below handles that.
1164 self.registry._importDatasets([datasetRefOrType], expand=True)
1165 # Before trying to write to the datastore check that it does not
1166 # know this dataset. This is prone to races, of course.
1167 if self.datastore.knows(datasetRefOrType):
1168 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
1169 # Try to write dataset to the datastore, if it fails due to a race
1170 # with another write, the content of stored data may be
1171 # unpredictable.
1172 try:
1173 self.datastore.put(obj, datasetRefOrType)
1174 except IntegrityError as e:
1175 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}")
1176 return datasetRefOrType
1178 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1179 if not self.isWriteable():
1180 raise TypeError("Butler is read-only.")
1181 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1183 # Handle dimension records in dataId
1184 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1186 # Add Registry Dataset entry.
1187 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1188 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1189 self.datastore.put(obj, ref)
1191 return ref
1193 @deprecated(
1194 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
1195 " Please use Butler.get(). Will be removed after v27.0.",
1196 version="v26.0",
1197 category=FutureWarning,
1198 )
1199 def getDirect(
1200 self,
1201 ref: DatasetRef,
1202 *,
1203 parameters: dict[str, Any] | None = None,
1204 storageClass: StorageClass | str | None = None,
1205 ) -> Any:
1206 """Retrieve a stored dataset.
1208 Parameters
1209 ----------
1210 ref : `DatasetRef`
1211 Resolved reference to an already stored dataset.
1212 parameters : `dict`
1213 Additional StorageClass-defined options to control reading,
1214 typically used to efficiently read only a subset of the dataset.
1215 storageClass : `StorageClass` or `str`, optional
1216 The storage class to be used to override the Python type
1217 returned by this method. By default the returned type matches
1218 the dataset type definition for this dataset. Specifying a
1219 read `StorageClass` can force a different type to be returned.
1220 This type must be compatible with the original type.
1222 Returns
1223 -------
1224 obj : `object`
1225 The dataset.
1226 """
1227 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1229 @deprecated(
1230 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
1231 "Please use Butler.getDeferred(). Will be removed after v27.0.",
1232 version="v26.0",
1233 category=FutureWarning,
1234 )
1235 def getDirectDeferred(
1236 self,
1237 ref: DatasetRef,
1238 *,
1239 parameters: dict | None = None,
1240 storageClass: str | StorageClass | None = None,
1241 ) -> DeferredDatasetHandle:
1242 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1243 from a resolved `DatasetRef`.
1245 Parameters
1246 ----------
1247 ref : `DatasetRef`
1248 Resolved reference to an already stored dataset.
1249 parameters : `dict`
1250 Additional StorageClass-defined options to control reading,
1251 typically used to efficiently read only a subset of the dataset.
1252 storageClass : `StorageClass` or `str`, optional
1253 The storage class to be used to override the Python type
1254 returned by this method. By default the returned type matches
1255 the dataset type definition for this dataset. Specifying a
1256 read `StorageClass` can force a different type to be returned.
1257 This type must be compatible with the original type.
1259 Returns
1260 -------
1261 obj : `DeferredDatasetHandle`
1262 A handle which can be used to retrieve a dataset at a later time.
1264 Raises
1265 ------
1266 LookupError
1267 Raised if no matching dataset exists in the `Registry`.
1268 """
 1269 # Check that the dataset actually exists.
1270 if not self.datastore.exists(ref):
1271 raise LookupError(f"Dataset reference {ref} does not exist.")
1272 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1274 def getDeferred(
1275 self,
1276 datasetRefOrType: DatasetRef | DatasetType | str,
1277 /,
1278 dataId: DataId | None = None,
1279 *,
1280 parameters: dict | None = None,
1281 collections: Any = None,
1282 storageClass: str | StorageClass | None = None,
1283 **kwargs: Any,
1284 ) -> DeferredDatasetHandle:
1285 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1286 after an immediate registry lookup.
1288 Parameters
1289 ----------
1290 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1291 When `DatasetRef` the `dataId` should be `None`.
1292 Otherwise the `DatasetType` or name thereof.
1293 dataId : `dict` or `DataCoordinate`, optional
1294 A `dict` of `Dimension` link name, value pairs that label the
1295 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1296 should be provided as the first argument.
1297 parameters : `dict`
1298 Additional StorageClass-defined options to control reading,
1299 typically used to efficiently read only a subset of the dataset.
1300 collections : Any, optional
1301 Collections to be searched, overriding ``self.collections``.
1302 Can be any of the types supported by the ``collections`` argument
1303 to butler construction.
1304 storageClass : `StorageClass` or `str`, optional
1305 The storage class to be used to override the Python type
1306 returned by this method. By default the returned type matches
1307 the dataset type definition for this dataset. Specifying a
1308 read `StorageClass` can force a different type to be returned.
1309 This type must be compatible with the original type.
1310 **kwargs
1311 Additional keyword arguments used to augment or construct a
1312 `DataId`. See `DataId` parameters.
1314 Returns
1315 -------
1316 obj : `DeferredDatasetHandle`
1317 A handle which can be used to retrieve a dataset at a later time.
1319 Raises
1320 ------
1321 LookupError
1322 Raised if no matching dataset exists in the `Registry`.
1323 ValueError
1324 Raised if a resolved `DatasetRef` was passed as an input, but it
1325 differs from the one found in the registry.
1326 TypeError
1327 Raised if no collections were provided.
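Examples
--------
A sketch that defers the actual read; the dataset type and data ID values
are placeholders::
    handle = butler.getDeferred("calexp", instrument="MyCam", visit=12345,
                                detector=10)
    # Later, when the dataset is actually needed:
    exposure = handle.get()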
1328 """
1329 if isinstance(datasetRefOrType, DatasetRef) and not self.datastore.exists(datasetRefOrType):
1330 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1331 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1332 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1334 def get(
1335 self,
1336 datasetRefOrType: DatasetRef | DatasetType | str,
1337 /,
1338 dataId: DataId | None = None,
1339 *,
1340 parameters: dict[str, Any] | None = None,
1341 collections: Any = None,
1342 storageClass: StorageClass | str | None = None,
1343 **kwargs: Any,
1344 ) -> Any:
1345 """Retrieve a stored dataset.
1347 Parameters
1348 ----------
1349 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1350 When `DatasetRef` the `dataId` should be `None`.
1351 Otherwise the `DatasetType` or name thereof.
1352 If a resolved `DatasetRef`, the associated dataset
1353 is returned directly without additional querying.
1354 dataId : `dict` or `DataCoordinate`
1355 A `dict` of `Dimension` link name, value pairs that label the
1356 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1357 should be provided as the first argument.
1358 parameters : `dict`
1359 Additional StorageClass-defined options to control reading,
1360 typically used to efficiently read only a subset of the dataset.
1361 collections : Any, optional
1362 Collections to be searched, overriding ``self.collections``.
1363 Can be any of the types supported by the ``collections`` argument
1364 to butler construction.
1365 storageClass : `StorageClass` or `str`, optional
1366 The storage class to be used to override the Python type
1367 returned by this method. By default the returned type matches
1368 the dataset type definition for this dataset. Specifying a
1369 read `StorageClass` can force a different type to be returned.
1370 This type must be compatible with the original type.
1371 **kwargs
1372 Additional keyword arguments used to augment or construct a
1373 `DataCoordinate`. See `DataCoordinate.standardize`
1374 parameters.
1376 Returns
1377 -------
1378 obj : `object`
1379 The dataset.
1381 Raises
1382 ------
1383 LookupError
1384 Raised if no matching dataset exists in the `Registry`.
1385 TypeError
1386 Raised if no collections were provided.
1388 Notes
1389 -----
1390 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1391 this method requires that the given data ID include temporal dimensions
1392 beyond the dimensions of the dataset type itself, in order to find the
1393 dataset with the appropriate validity range. For example, a "bias"
1394 dataset with native dimensions ``{instrument, detector}`` could be
1395 fetched with a ``{instrument, detector, exposure}`` data ID, because
1396 ``exposure`` is a temporal dimension.
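Examples
--------
A sketch of the calibration lookup described above; the data ID values and
collection name are placeholders::
    bias = butler.get("bias", instrument="MyCam", detector=10,
                      exposure=12345, collections="MyCam/calib")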
1397 """
1398 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1399 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1400 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1402 def getURIs(
1403 self,
1404 datasetRefOrType: DatasetRef | DatasetType | str,
1405 /,
1406 dataId: DataId | None = None,
1407 *,
1408 predict: bool = False,
1409 collections: Any = None,
1410 run: str | None = None,
1411 **kwargs: Any,
1412 ) -> DatasetRefURIs:
1413 """Returns the URIs associated with the dataset.
1415 Parameters
1416 ----------
1417 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1418 When `DatasetRef` the `dataId` should be `None`.
1419 Otherwise the `DatasetType` or name thereof.
1420 dataId : `dict` or `DataCoordinate`
1421 A `dict` of `Dimension` link name, value pairs that label the
1422 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1423 should be provided as the first argument.
1424 predict : `bool`
1425 If `True`, allow URIs to be returned of datasets that have not
1426 been written.
1427 collections : Any, optional
1428 Collections to be searched, overriding ``self.collections``.
1429 Can be any of the types supported by the ``collections`` argument
1430 to butler construction.
1431 run : `str`, optional
1432 Run to use for predictions, overriding ``self.run``.
1433 **kwargs
1434 Additional keyword arguments used to augment or construct a
1435 `DataCoordinate`. See `DataCoordinate.standardize`
1436 parameters.
1438 Returns
1439 -------
1440 uris : `DatasetRefURIs`
1441 The URI to the primary artifact associated with this dataset (if
1442 the dataset was disassembled within the datastore this may be
1443 `None`), and the URIs to any components associated with the dataset
 1444 artifact (can be empty if there are no components).
1445 """
1446 ref = self._findDatasetRef(
1447 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1448 )
1449 return self.datastore.getURIs(ref, predict)
1451 def getURI(
1452 self,
1453 datasetRefOrType: DatasetRef | DatasetType | str,
1454 /,
1455 dataId: DataId | None = None,
1456 *,
1457 predict: bool = False,
1458 collections: Any = None,
1459 run: str | None = None,
1460 **kwargs: Any,
1461 ) -> ResourcePath:
1462 """Return the URI to the Dataset.
1464 Parameters
1465 ----------
1466 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1467 When `DatasetRef` the `dataId` should be `None`.
1468 Otherwise the `DatasetType` or name thereof.
1469 dataId : `dict` or `DataCoordinate`
1470 A `dict` of `Dimension` link name, value pairs that label the
1471 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1472 should be provided as the first argument.
1473 predict : `bool`
1474 If `True`, allow URIs to be returned of datasets that have not
1475 been written.
1476 collections : Any, optional
1477 Collections to be searched, overriding ``self.collections``.
1478 Can be any of the types supported by the ``collections`` argument
1479 to butler construction.
1480 run : `str`, optional
1481 Run to use for predictions, overriding ``self.run``.
1482 **kwargs
1483 Additional keyword arguments used to augment or construct a
1484 `DataCoordinate`. See `DataCoordinate.standardize`
1485 parameters.
1487 Returns
1488 -------
1489 uri : `lsst.resources.ResourcePath`
1490 URI pointing to the Dataset within the datastore. If the
1491 Dataset does not exist in the datastore, and if ``predict`` is
1492 `True`, the URI will be a prediction and will include a URI
1493 fragment "#predicted".
1494 If the datastore does not have entities that relate well
1495 to the concept of a URI, the returned URI will be
1496 descriptive. The returned URI is not guaranteed to be obtainable.
1498 Raises
1499 ------
1500 LookupError
1501 Raised if a URI has been requested for a dataset that does not
1502 exist and guessing is not allowed.
1503 ValueError
1504 Raised if a resolved `DatasetRef` was passed as an input, but it
1505 differs from the one found in the registry.
1506 TypeError
1507 Raised if no collections were provided.
1508 RuntimeError
1509 Raised if a URI is requested for a dataset that consists of
1510 multiple artifacts.
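Examples
--------
An illustrative sketch; the dataset type, data ID and collection names
are assumptions about the repository contents::

    uri = butler.getURI(
        "calexp",
        collections="HSC/runs/example",
        instrument="HSC",
        visit=903334,
        detector=20,
    )

Passing ``predict=True`` (typically together with ``run``) instead
returns a predicted URI, marked with a "#predicted" fragment, for a
dataset that has not been written yet.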
1511 """
1512 primary, components = self.getURIs(
1513 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1514 )
1516 if primary is None or components:
1517 raise RuntimeError(
1518 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1519 "Use Butler.getURIs() instead."
1520 )
1521 return primary
1523 def retrieveArtifacts(
1524 self,
1525 refs: Iterable[DatasetRef],
1526 destination: ResourcePathExpression,
1527 transfer: str = "auto",
1528 preserve_path: bool = True,
1529 overwrite: bool = False,
1530 ) -> list[ResourcePath]:
1531 """Retrieve the artifacts associated with the supplied refs.
1533 Parameters
1534 ----------
1535 refs : iterable of `DatasetRef`
1536 The datasets for which artifacts are to be retrieved.
1537 A single ref can result in multiple artifacts. The refs must
1538 be resolved.
1539 destination : `lsst.resources.ResourcePath` or `str`
1540 Location to write the artifacts.
1541 transfer : `str`, optional
1542 Method to use to transfer the artifacts. Must be one of the options
1543 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1544 "move" is not allowed.
1545 preserve_path : `bool`, optional
1546 If `True` the full path of the artifact within the datastore
1547 is preserved. If `False` the final file component of the path
1548 is used.
1549 overwrite : `bool`, optional
1550 If `True`, allow transfers to overwrite existing files at the
1551 destination.
1553 Returns
1554 -------
1555 targets : `list` of `lsst.resources.ResourcePath`
1556 URIs of file artifacts in destination location. Order is not
1557 preserved.
1559 Notes
1560 -----
1561 For non-file datastores the artifacts written to the destination
1562 may not match the representation inside the datastore. For example
1563 a hierarchical data structure in a NoSQL database may well be stored
1564 as a JSON file.
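Examples
--------
A sketch of copying the file artifacts for a query result to a local
directory; the dataset type, collection name and destination are
illustrative::

    refs = butler.registry.queryDatasets(
        "calexp", collections="HSC/runs/example"
    )
    paths = butler.retrieveArtifacts(
        refs, destination="/tmp/calexp-export", transfer="copy"
    )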
1565 """
1566 return self.datastore.retrieveArtifacts(
1567 refs,
1568 ResourcePath(destination),
1569 transfer=transfer,
1570 preserve_path=preserve_path,
1571 overwrite=overwrite,
1572 )
1574 def exists(
1575 self,
1576 dataset_ref_or_type: DatasetRef | DatasetType | str,
1577 /,
1578 data_id: DataId | None = None,
1579 *,
1580 full_check: bool = True,
1581 collections: Any = None,
1582 **kwargs: Any,
1583 ) -> DatasetExistence:
1584 """Indicate whether a dataset is known to Butler registry and
1585 datastore.
1587 Parameters
1588 ----------
1589 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str`
1590 When `DatasetRef` the `data_id` should be `None`.
1591 Otherwise the `DatasetType` or name thereof.
1592 data_id : `dict` or `DataCoordinate`
1593 A `dict` of `Dimension` link name, value pairs that label the
1594 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1595 should be provided as the first argument.
1596 full_check : `bool`, optional
1597 If `True`, an additional check will be made for dataset artifact
1598 existence. This will involve additional overhead due to the need
1599 to query an external system. If `False`, registry and datastore
1600 will only be asked whether they know about the dataset; no
1601 check for the artifact will be performed.
1602 collections : Any, optional
1603 Collections to be searched, overriding ``self.collections``.
1604 Can be any of the types supported by the ``collections`` argument
1605 to butler construction.
1606 **kwargs
1607 Additional keyword arguments used to augment or construct a
1608 `DataCoordinate`. See `DataCoordinate.standardize`
1609 parameters.
1611 Returns
1612 -------
1613 existence : `DatasetExistence`
1614 Object indicating whether the dataset is known to registry and
1615 datastore. Evaluates to `True` if the dataset is present and known
1616 to both.
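Examples
--------
An illustrative check; the dataset type and data ID values are
assumptions about the repository contents::

    existence = butler.exists(
        "calexp", instrument="HSC", visit=903334, detector=20
    )
    if not existence:
        print("Dataset is not fully present:", existence)

Pass ``full_check=False`` to skip the (potentially slow) check for the
artifact itself.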
1617 """
1618 existence = DatasetExistence.UNRECOGNIZED
1620 if isinstance(dataset_ref_or_type, DatasetRef):
1621 if collections is not None:
1622 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1623 if data_id is not None:
1624 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1625 ref = dataset_ref_or_type
1626 registry_ref = self.registry.getDataset(dataset_ref_or_type.id)
1627 if registry_ref is not None:
1628 existence |= DatasetExistence.RECORDED
1630 if dataset_ref_or_type != registry_ref:
1631 # This could mean that storage classes differ, so we should
1632 # check for that but use the registry ref for the rest of
1633 # the method.
1634 if registry_ref.is_compatible_with(dataset_ref_or_type):
1635 # Use the registry version from now on.
1636 ref = registry_ref
1637 else:
1638 raise ValueError(
1639 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1640 f"in registry but has different incompatible values ({registry_ref})."
1641 )
1642 else:
1643 try:
1644 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1645 except (LookupError, TypeError, NoDefaultCollectionError):
1646 return existence
1647 existence |= DatasetExistence.RECORDED
1649 if self.datastore.knows(ref):
1650 existence |= DatasetExistence.DATASTORE
1652 if full_check:
1653 if self.datastore.exists(ref):
1654 existence |= DatasetExistence._ARTIFACT
1655 elif existence != DatasetExistence.UNRECOGNIZED:
1656 # Do not add this flag if we have no other idea about a dataset.
1657 existence |= DatasetExistence._ASSUMED
1659 return existence
1661 def _exists_many(
1662 self,
1663 refs: Iterable[DatasetRef],
1664 /,
1665 *,
1666 full_check: bool = True,
1667 ) -> dict[DatasetRef, DatasetExistence]:
1668 """Indicate whether multiple datasets are known to Butler registry and
1669 datastore.
1671 This is an experimental API that may change at any moment.
1673 Parameters
1674 ----------
1675 refs : iterable of `DatasetRef`
1676 The datasets to be checked.
1677 full_check : `bool`, optional
1678 If `True`, an additional check will be made for dataset artifact
1679 existence. This will involve additional overhead due to the need
1680 to query an external system. If `False`, registry and datastore
1681 will only be asked whether they know about the dataset; no
1682 check for the artifact will be performed.
1684 Returns
1685 -------
1686 existence : `dict` [ `DatasetRef`, `DatasetExistence` ]
1687 Mapping from the given dataset refs to an enum indicating the
1688 status of the dataset in registry and datastore.
1689 Each value evaluates to `True` if the dataset is present and known
1690 to both.
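Examples
--------
A sketch, assuming ``refs`` is a collection of resolved `DatasetRef`
objects::

    existence = butler._exists_many(refs, full_check=False)
    incomplete = [ref for ref, exists in existence.items() if not exists]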
1691 """
1692 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1694 # Registry does not have a bulk API to check for a ref.
1695 for ref in refs:
1696 registry_ref = self.registry.getDataset(ref.id)
1697 if registry_ref is not None:
1698 # It is possible, albeit unlikely, that the given ref does
1699 # not match the one in registry even though the UUID matches.
1700 # When checking a single ref we raise, but it's impolite to
1701 # do that when potentially hundreds of refs are being checked.
1702 # We could change the API to only accept UUIDs and that would
1703 # remove the ability to even check and remove the worry
1704 # about differing storage classes. Given the ongoing discussion
1705 # on refs vs UUIDs and whether to raise or have a new
1706 # private flag, treat this as a private API for now.
1707 existence[ref] |= DatasetExistence.RECORDED
1709 # Ask datastore if it knows about these refs.
1710 knows = self.datastore.knows_these(refs)
1711 for ref, known in knows.items():
1712 if known:
1713 existence[ref] |= DatasetExistence.DATASTORE
1715 if full_check:
1716 mexists = self.datastore.mexists(refs)
1717 for ref, exists in mexists.items():
1718 if exists:
1719 existence[ref] |= DatasetExistence._ARTIFACT
1720 else:
1721 # Do not set this flag if nothing is known about the dataset.
1722 for ref in existence.keys():
1723 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1724 existence[ref] |= DatasetExistence._ASSUMED
1726 return existence
1728 @deprecated(
1729 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v27.0.",
1730 version="v26.0",
1731 category=FutureWarning,
1732 )
1733 def datasetExists(
1734 self,
1735 datasetRefOrType: DatasetRef | DatasetType | str,
1736 dataId: DataId | None = None,
1737 *,
1738 collections: Any = None,
1739 **kwargs: Any,
1740 ) -> bool:
1741 """Return True if the Dataset is actually present in the Datastore.
1743 Parameters
1744 ----------
1745 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1746 When `DatasetRef` the `dataId` should be `None`.
1747 Otherwise the `DatasetType` or name thereof.
1748 dataId : `dict` or `DataCoordinate`
1749 A `dict` of `Dimension` link name, value pairs that label the
1750 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1751 should be provided as the first argument.
1752 collections : Any, optional
1753 Collections to be searched, overriding ``self.collections``.
1754 Can be any of the types supported by the ``collections`` argument
1755 to butler construction.
1756 **kwargs
1757 Additional keyword arguments used to augment or construct a
1758 `DataCoordinate`. See `DataCoordinate.standardize`
1759 parameters.
1761 Raises
1762 ------
1763 LookupError
1764 Raised if the dataset is not even present in the Registry.
1765 ValueError
1766 Raised if a resolved `DatasetRef` was passed as an input, but it
1767 differs from the one found in the registry.
1768 NoDefaultCollectionError
1769 Raised if no collections were provided.
1770 """
1771 # A resolved ref may be given that is not known to this butler.
1772 if isinstance(datasetRefOrType, DatasetRef):
1773 ref = self.registry.getDataset(datasetRefOrType.id)
1774 if ref is None:
1775 raise LookupError(
1776 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
1777 )
1778 else:
1779 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1780 return self.datastore.exists(ref)
1782 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1783 """Remove one or more `~CollectionType.RUN` collections and the
1784 datasets within them.
1786 Parameters
1787 ----------
1788 names : `~collections.abc.Iterable` [ `str` ]
1789 The names of the collections to remove.
1790 unstore : `bool`, optional
1791 If `True` (default), delete datasets from all datastores in which
1792 they are present, and attempt to roll back the registry deletions if
1793 datastore deletions fail (which may not always be possible). If
1794 `False`, datastore records for these datasets are still removed,
1795 but any artifacts (e.g. files) will not be.
1797 Raises
1798 ------
1799 TypeError
1800 Raised if one or more collections are not of type
1801 `~CollectionType.RUN`.
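Examples
--------
A sketch of removing a scratch run and deleting its artifacts; the
collection name is illustrative::

    butler.removeRuns(["u/someone/scratch"], unstore=True)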
1802 """
1803 if not self.isWriteable():
1804 raise TypeError("Butler is read-only.")
1805 names = list(names)
1806 refs: list[DatasetRef] = []
1807 for name in names:
1808 collectionType = self.registry.getCollectionType(name)
1809 if collectionType is not CollectionType.RUN:
1810 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1811 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1812 with self.datastore.transaction():
1813 with self.registry.transaction():
1814 if unstore:
1815 self.datastore.trash(refs)
1816 else:
1817 self.datastore.forget(refs)
1818 for name in names:
1819 self.registry.removeCollection(name)
1820 if unstore:
1821 # Point of no return for removing artifacts
1822 self.datastore.emptyTrash()
1824 def pruneDatasets(
1825 self,
1826 refs: Iterable[DatasetRef],
1827 *,
1828 disassociate: bool = True,
1829 unstore: bool = False,
1830 tags: Iterable[str] = (),
1831 purge: bool = False,
1832 ) -> None:
1833 # docstring inherited from LimitedButler
1835 if not self.isWriteable():
1836 raise TypeError("Butler is read-only.")
1837 if purge:
1838 if not disassociate:
1839 raise TypeError("Cannot pass purge=True without disassociate=True.")
1840 if not unstore:
1841 raise TypeError("Cannot pass purge=True without unstore=True.")
1842 elif disassociate:
1843 tags = tuple(tags)
1844 if not tags:
1845 raise TypeError("No tags provided but disassociate=True.")
1846 for tag in tags:
1847 collectionType = self.registry.getCollectionType(tag)
1848 if collectionType is not CollectionType.TAGGED:
1849 raise TypeError(
1850 f"Cannot disassociate from collection '{tag}' "
1851 f"of non-TAGGED type {collectionType.name}."
1852 )
1853 # Transform possibly-single-pass iterable into something we can iterate
1854 # over multiple times.
1855 refs = list(refs)
1856 # Pruning a component of a DatasetRef makes no sense since registry
1857 # doesn't know about components and datastore might not store
1858 # components in a separate file
1859 for ref in refs:
1860 if ref.datasetType.component():
1861 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")
1862 # We don't need an unreliable Datastore transaction for this, because
1863 # we've been extra careful to ensure that Datastore.trash only involves
1864 # mutating the Registry (it can _look_ at Datastore-specific things,
1865 # but shouldn't change them), and hence all operations here are
1866 # Registry operations.
1867 with self.datastore.transaction():
1868 with self.registry.transaction():
1869 if unstore:
1870 self.datastore.trash(refs)
1871 if purge:
1872 self.registry.removeDatasets(refs)
1873 elif disassociate:
1874 assert tags, "Guaranteed by earlier logic in this function."
1875 for tag in tags:
1876 self.registry.disassociate(tag, refs)
1877 # We've exited the Registry transaction, and apparently committed.
1878 # (if there was an exception, everything rolled back, and it's as if
1879 # nothing happened - and we never get here).
1880 # Datastore artifacts are not yet gone, but they're clearly marked
1881 # as trash, so if we fail to delete now because of (e.g.) filesystem
1882 # problems we can try again later, and if manual administrative
1883 # intervention is required, it's pretty clear what that should entail:
1884 # deleting everything on disk and in private Datastore tables that is
1885 # in the dataset_location_trash table.
1886 if unstore:
1887 # Point of no return for removing artifacts
1888 self.datastore.emptyTrash()
1890 @transactional
1891 def ingest(
1892 self,
1893 *datasets: FileDataset,
1894 transfer: str | None = "auto",
1895 run: str | None = None,
1896 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1897 record_validation_info: bool = True,
1898 ) -> None:
1899 """Store and register one or more datasets that already exist on disk.
1901 Parameters
1902 ----------
1903 datasets : `FileDataset`
1904 Each positional argument is a struct containing information about
1905 a file to be ingested, including its URI (either absolute or
1906 relative to the datastore root, if applicable), a resolved
1907 `DatasetRef`, and optionally a formatter class or its
1908 fully-qualified string name. If a formatter is not provided, the
1909 formatter that would be used for `put` is assumed. On successful
1910 ingest all `FileDataset.formatter` attributes will be set to the
1911 formatter class used. `FileDataset.path` attributes may be modified
1912 to put paths in whatever the datastore considers a standardized
1913 form.
1914 transfer : `str`, optional
1915 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1916 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1917 transfer the file.
1918 run : `str`, optional
1919 The name of the run ingested datasets should be added to,
1920 overriding ``self.run``. This parameter is now deprecated since
1921 the run is encoded in the ``FileDataset``.
1922 idGenerationMode : `DatasetIdGenEnum`, optional
1923 Specifies option for generating dataset IDs. By default unique IDs
1924 are generated for each inserted dataset.
1925 record_validation_info : `bool`, optional
1926 If `True`, the default, the datastore can record validation
1927 information associated with the file. If `False` the datastore
1928 will not attempt to track any information such as checksums
1929 or file sizes. This can be useful if such information is tracked
1930 in an external system or if the file is to be compressed in place.
1931 It is up to the datastore whether this parameter is relevant.
1933 Raises
1934 ------
1935 TypeError
1936 Raised if the butler is read-only or if no run was provided.
1937 NotImplementedError
1938 Raised if the `Datastore` does not support the given transfer mode.
1939 DatasetTypeNotSupportedError
1940 Raised if one or more files to be ingested have a dataset type that
1941 is not supported by the `Datastore`.
1942 FileNotFoundError
1943 Raised if one of the given files does not exist.
1944 FileExistsError
1945 Raised if transfer is not `None` but the (internal) location the
1946 file would be moved to is already occupied.
1948 Notes
1949 -----
1950 This operation is not fully exception safe: if a database operation
1951 fails, the given `FileDataset` instances may be only partially updated.
1953 It is atomic in terms of database operations (they will either all
1954 succeed or all fail) providing the database engine implements
1955 transactions correctly. It will attempt to be atomic in terms of
1956 filesystem operations as well, but this cannot be implemented
1957 rigorously for most datastores.
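Examples
--------
A minimal sketch, assuming ``raw_type`` is a registered `DatasetType`,
``data_id`` is a matching `DataCoordinate`, the run already exists and
the file path is illustrative::

    from lsst.daf.butler import DatasetRef, FileDataset

    ref = DatasetRef(raw_type, data_id, run="HSC/raw/all")
    butler.ingest(
        FileDataset(path="/data/HSC/raw_0001.fits", refs=[ref]),
        transfer="symlink",
    )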
1958 """
1959 if not self.isWriteable():
1960 raise TypeError("Butler is read-only.")
1962 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1963 if not datasets:
1964 return
1966 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1968 # We need to reorganize all the inputs so that they are grouped
1969 # by dataset type and run. Multiple refs in a single FileDataset
1970 # are required to share the run and dataset type.
1971 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]]
1972 groupedData: GroupedData = defaultdict(list)
1974 # Track DataIDs that are being ingested so we can spot issues early
1975 # with duplication. Retain previous FileDataset so we can report it.
1976 groupedDataIds: MutableMapping[
1977 tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
1978 ] = defaultdict(dict)
1980 used_run = False
1982 # And the nested loop that populates it:
1983 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1984 # Somewhere to store pre-existing refs if we have an
1985 # execution butler.
1986 existingRefs: list[DatasetRef] = []
1988 for ref in dataset.refs:
1989 assert ref.run is not None # For mypy
1990 group_key = (ref.datasetType, ref.run)
1992 if ref.dataId in groupedDataIds[group_key]:
1993 raise ConflictingDefinitionError(
1994 f"Ingest conflict. Dataset {dataset.path} has same"
1995 " DataId as other ingest dataset"
1996 f" {groupedDataIds[group_key][ref.dataId].path} "
1997 f" ({ref.dataId})"
1998 )
2000 groupedDataIds[group_key][ref.dataId] = dataset
2002 if existingRefs:
2003 if len(dataset.refs) != len(existingRefs):
2004 # Keeping track of partially pre-existing datasets is hard
2005 # and should generally never happen. For now don't allow
2006 # it.
2007 raise ConflictingDefinitionError(
2008 f"For dataset {dataset.path} some dataIds already exist"
2009 " in registry but others do not. This is not supported."
2010 )
2012 # Store expanded form in the original FileDataset.
2013 dataset.refs = existingRefs
2014 else:
2015 groupedData[group_key].append(dataset)
2017 if not used_run and run is not None:
2018 warnings.warn(
2019 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the "
2020 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.",
2021 category=FutureWarning,
2022 stacklevel=3, # Take into account the @transactional decorator.
2023 )
2025 # Now we can bulk-insert into Registry for each DatasetType.
2026 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
2027 groupedData.items(), desc="Bulk-inserting datasets by type"
2028 ):
2029 refs_to_import = []
2030 for dataset in grouped_datasets:
2031 refs_to_import.extend(dataset.refs)
2033 n_refs = len(refs_to_import)
2034 log.verbose(
2035 "Importing %d ref%s of dataset type %r into run %r",
2036 n_refs,
2037 "" if n_refs == 1 else "s",
2038 datasetType.name,
2039 this_run,
2040 )
2042 # Import the refs and expand the DataCoordinates since we can't
2043 # guarantee that they are expanded and Datastore will need
2044 # the records.
2045 imported_refs = self.registry._importDatasets(refs_to_import, expand=True)
2046 assert set(imported_refs) == set(refs_to_import)
2048 # Replace all the refs in the FileDataset with expanded versions.
2049 # Pull them off in the order we put them on the list.
2050 for dataset in grouped_datasets:
2051 n_dataset_refs = len(dataset.refs)
2052 dataset.refs = imported_refs[:n_dataset_refs]
2053 del imported_refs[:n_dataset_refs]
2055 # Bulk-insert everything into Datastore.
2056 # We do not know if any of the registry entries already existed
2057 # (_importDatasets only complains if they exist but differ) so
2058 # we have to catch IntegrityError explicitly.
2059 try:
2060 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
2061 except IntegrityError as e:
2062 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
2064 @contextlib.contextmanager
2065 def export(
2066 self,
2067 *,
2068 directory: str | None = None,
2069 filename: str | None = None,
2070 format: str | None = None,
2071 transfer: str | None = None,
2072 ) -> Iterator[RepoExportContext]:
2073 """Export datasets from the repository represented by this `Butler`.
2075 This method is a context manager that returns a helper object
2076 (`RepoExportContext`) that is used to indicate what information from
2077 the repository should be exported.
2079 Parameters
2080 ----------
2081 directory : `str`, optional
2082 Directory dataset files should be written to if ``transfer`` is not
2083 `None`.
2084 filename : `str`, optional
2085 Name for the file that will include database information associated
2086 with the exported datasets. If this is not an absolute path and
2087 ``directory`` is not `None`, it will be written to ``directory``
2088 instead of the current working directory. Defaults to
2089 "export.{format}".
2090 format : `str`, optional
2091 File format for the database information file. If `None`, the
2092 extension of ``filename`` will be used.
2093 transfer : `str`, optional
2094 Transfer mode passed to `Datastore.export`.
2096 Raises
2097 ------
2098 TypeError
2099 Raised if the set of arguments passed is inconsistent.
2101 Examples
2102 --------
2103 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
2104 methods are used to provide the iterables over data IDs and/or datasets
2105 to be exported::
2107 with butler.export(filename="exports.yaml") as export:
2108 # Export all flats, but none of the dimension element rows
2109 # (i.e. data ID information) associated with them.
2110 export.saveDatasets(butler.registry.queryDatasets("flat"),
2111 elements=())
2112 # Export all datasets that start with "deepCoadd_" and all of
2113 # their associated data ID information.
2114 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
2115 """
2116 if directory is None and transfer is not None:
2117 raise TypeError("Cannot transfer without providing a directory.")
2118 if transfer == "move":
2119 raise TypeError("Transfer may not be 'move': export is read-only")
2120 if format is None:
2121 if filename is None:
2122 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2123 else:
2124 _, format = os.path.splitext(filename)
2125 if not format:
2126 raise ValueError("Please specify a file extension to determine export format.")
2127 format = format[1:] # Strip leading "."
2128 elif filename is None:
2129 filename = f"export.{format}"
2130 if directory is not None:
2131 filename = os.path.join(directory, filename)
2132 formats = self._config["repo_transfer_formats"]
2133 if format not in formats:
2134 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
2135 BackendClass = get_class_of(formats[format, "export"])
2136 with open(filename, "w") as stream:
2137 backend = BackendClass(stream, universe=self.dimensions)
2138 try:
2139 helper = RepoExportContext(
2140 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2141 )
2142 yield helper
2143 except BaseException:
2144 raise
2145 else:
2146 helper._finish()
2148 def import_(
2149 self,
2150 *,
2151 directory: ResourcePathExpression | None = None,
2152 filename: ResourcePathExpression | TextIO | None = None,
2153 format: str | None = None,
2154 transfer: str | None = None,
2155 skip_dimensions: set | None = None,
2156 ) -> None:
2157 """Import datasets into this repository that were exported from a
2158 different butler repository via `~lsst.daf.butler.Butler.export`.
2160 Parameters
2161 ----------
2162 directory : `~lsst.resources.ResourcePathExpression`, optional
2163 Directory containing dataset files to import from. If `None`,
2164 ``filename`` and all dataset file paths specified therein must
2165 be absolute.
2166 filename : `~lsst.resources.ResourcePathExpression` or `TextIO`
2167 A stream or name of file that contains database information
2168 associated with the exported datasets, typically generated by
2169 `~lsst.daf.butler.Butler.export`. If this is a string (name) or
2170 `~lsst.resources.ResourcePath` and is not an absolute path,
2171 it will first be looked for relative to ``directory`` and if not
2172 found there it will be looked for in the current working
2173 directory. Defaults to "export.{format}".
2174 format : `str`, optional
2175 File format for ``filename``. If `None`, the extension of
2176 ``filename`` will be used.
2177 transfer : `str`, optional
2178 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2179 skip_dimensions : `set`, optional
2180 Names of dimensions that should be skipped and not imported.
2182 Raises
2183 ------
2184 TypeError
2185 Raised if the set of arguments passed is inconsistent, or if the
2186 butler is read-only.
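Examples
--------
A sketch of importing a previously exported repository subset; the
paths are illustrative::

    butler.import_(
        directory="/path/to/exported/data",
        filename="export.yaml",
        transfer="symlink",
    )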
2187 """
2188 if not self.isWriteable():
2189 raise TypeError("Butler is read-only.")
2190 if format is None:
2191 if filename is None:
2192 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2193 else:
2194 _, format = os.path.splitext(filename) # type: ignore
2195 elif filename is None:
2196 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
2197 if directory is not None:
2198 directory = ResourcePath(directory, forceDirectory=True)
2199 # mypy doesn't think this will work but it does in python >= 3.10.
2200 if isinstance(filename, ResourcePathExpression): # type: ignore
2201 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
2202 if not filename.isabs() and directory is not None:
2203 potential = directory.join(filename)
2204 exists_in_cwd = filename.exists()
2205 exists_in_dir = potential.exists()
2206 if exists_in_cwd and exists_in_dir:
2207 log.warning(
2208 "A relative path for filename was specified (%s) which exists relative to cwd. "
2209 "Additionally, the file exists relative to the given search directory (%s). "
2210 "Using the export file in the given directory.",
2211 filename,
2212 potential,
2213 )
2214 # Given they specified an explicit directory and that
2215 # directory has the export file in it, assume that that
2216 # is what was meant despite the file in cwd.
2217 filename = potential
2218 elif exists_in_dir:
2219 filename = potential
2220 elif not exists_in_cwd and not exists_in_dir:
2221 # Raise early.
2222 raise FileNotFoundError(
2223 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
2224 )
2225 BackendClass: type[RepoImportBackend] = get_class_of(
2226 self._config["repo_transfer_formats"][format]["import"]
2227 )
2229 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
2230 backend = BackendClass(importStream, self.registry) # type: ignore[call-arg]
2231 backend.register()
2232 with self.transaction():
2233 backend.load(
2234 self.datastore,
2235 directory=directory,
2236 transfer=transfer,
2237 skip_dimensions=skip_dimensions,
2238 )
2240 if isinstance(filename, ResourcePath):
2241 # We cannot use open() here at the moment because of
2242 # DM-38589 since yaml does stream.read(8192) in a loop.
2243 stream = io.StringIO(filename.read().decode())
2244 doImport(stream)
2245 else:
2246 doImport(filename) # type: ignore
2248 def transfer_from(
2249 self,
2250 source_butler: LimitedButler,
2251 source_refs: Iterable[DatasetRef],
2252 transfer: str = "auto",
2253 skip_missing: bool = True,
2254 register_dataset_types: bool = False,
2255 transfer_dimensions: bool = False,
2256 ) -> collections.abc.Collection[DatasetRef]:
2257 """Transfer datasets to this Butler from a run in another Butler.
2259 Parameters
2260 ----------
2261 source_butler : `LimitedButler`
2262 Butler from which the datasets are to be transferred. If data IDs
2263 in ``source_refs`` are not expanded then this has to be a full
2264 `Butler` whose registry will be used to expand data IDs.
2265 source_refs : iterable of `DatasetRef`
2266 Datasets defined in the source butler that should be transferred to
2267 this butler.
2268 transfer : `str`, optional
2269 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2270 skip_missing : `bool`
2271 If `True`, datasets with no datastore artifact associated with
2272 them are not transferred. If `False` a registry entry will be
2273 created even if no datastore record is created (and so will
2274 look equivalent to the dataset being unstored).
2275 register_dataset_types : `bool`
2276 If `True` any missing dataset types are registered. Otherwise
2277 an exception is raised.
2278 transfer_dimensions : `bool`, optional
2279 If `True`, dimension record data associated with the new datasets
2280 will be transferred.
2282 Returns
2283 -------
2284 refs : `list` of `DatasetRef`
2285 The refs added to this Butler.
2287 Notes
2288 -----
2289 The datastore artifact has to exist for a transfer
2290 to be made but non-existence is not an error.
2292 Datasets that already exist in this run will be skipped.
2294 The datasets are imported as part of a transaction, although
2295 dataset types are registered before the transaction is started.
2296 This means that it is possible for a dataset type to be registered
2297 even though transfer has failed.
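Examples
--------
A sketch, assuming ``source_butler`` is a full `Butler` and the
collection name is illustrative::

    refs = source_butler.registry.queryDatasets(
        ..., collections="HSC/runs/example"
    )
    transferred = butler.transfer_from(
        source_butler,
        refs,
        transfer="copy",
        register_dataset_types=True,
        transfer_dimensions=True,
    )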
2298 """
2299 if not self.isWriteable():
2300 raise TypeError("Butler is read-only.")
2301 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2303 # Will iterate through the refs multiple times so need to convert
2304 # to a list if this isn't a collection.
2305 if not isinstance(source_refs, collections.abc.Collection):
2306 source_refs = list(source_refs)
2308 original_count = len(source_refs)
2309 log.info("Transferring %d datasets into %s", original_count, str(self))
2311 # In some situations the datastore artifact may be missing
2312 # and we do not want that registry entry to be imported.
2313 # Asking datastore is not sufficient, the records may have been
2314 # purged, we have to ask for the (predicted) URI and check
2315 # existence explicitly. Execution butler is set up exactly like
2316 # this with no datastore records.
2317 artifact_existence: dict[ResourcePath, bool] = {}
2318 if skip_missing:
2319 dataset_existence = source_butler.datastore.mexists(
2320 source_refs, artifact_existence=artifact_existence
2321 )
2322 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2323 filtered_count = len(source_refs)
2324 n_missing = original_count - filtered_count
2325 log.verbose(
2326 "%d dataset%s removed because the artifact does not exist. Now have %d.",
2327 n_missing,
2328 "" if n_missing == 1 else "s",
2329 filtered_count,
2330 )
2332 # Importing requires that we group the refs by dataset type and run
2333 # before doing the import.
2334 source_dataset_types = set()
2335 grouped_refs = defaultdict(list)
2336 for ref in source_refs:
2337 grouped_refs[ref.datasetType, ref.run].append(ref)
2338 source_dataset_types.add(ref.datasetType)
2340 # Check to see if the dataset type in the source butler has
2341 # the same definition in the target butler and register missing
2342 # ones if requested. Registration must happen outside a transaction.
2343 newly_registered_dataset_types = set()
2344 for datasetType in source_dataset_types:
2345 if register_dataset_types:
2346 # Let this raise immediately if inconsistent. Continuing
2347 # on to find additional inconsistent dataset types
2348 # might result in additional unwanted dataset types being
2349 # registered.
2350 if self.registry.registerDatasetType(datasetType):
2351 newly_registered_dataset_types.add(datasetType)
2352 else:
2353 # If the dataset type is missing, let it fail immediately.
2354 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2355 if target_dataset_type != datasetType:
2356 raise ConflictingDefinitionError(
2357 "Source butler dataset type differs from definition"
2358 f" in target butler: {datasetType} !="
2359 f" {target_dataset_type}"
2360 )
2361 if newly_registered_dataset_types:
2362 # We may have registered some even if there were inconsistencies
2363 # but should let people know (or else remove them again).
2364 log.log(
2365 VERBOSE,
2366 "Registered the following dataset types in the target Butler: %s",
2367 ", ".join(d.name for d in newly_registered_dataset_types),
2368 )
2369 else:
2370 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2372 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2373 if transfer_dimensions:
2374 # Collect all the dimension records for these refs.
2375 # All dimensions are to be copied but the list of valid dimensions
2376 # comes from this butler's universe.
2377 elements = frozenset(
2378 element
2379 for element in self.dimensions.getStaticElements()
2380 if element.hasTable() and element.viewOf is None
2381 )
2382 dataIds = {ref.dataId for ref in source_refs}
2383 # This logic comes from saveDataIds.
2384 for dataId in dataIds:
2385 # Need an expanded record, if not expanded that we need a full
2386 # butler with registry (allow mocks with registry too).
2387 if not dataId.hasRecords():
2388 if registry := getattr(source_butler, "registry", None):
2389 dataId = registry.expandDataId(dataId)
2390 else:
2391 raise TypeError("Input butler needs to be a full butler to expand DataId.")
2392 # If this butler doesn't know about a dimension in the source
2393 # butler things will break later.
2394 for record in dataId.records.values():
2395 if record is not None and record.definition in elements:
2396 dimension_records[record.definition].setdefault(record.dataId, record)
2398 handled_collections: set[str] = set()
2400 # Do all the importing in a single transaction.
2401 with self.transaction():
2402 if dimension_records:
2403 log.verbose("Ensuring that dimension records exist for transferred datasets.")
2404 for element, r in dimension_records.items():
2405 records = [r[dataId] for dataId in r]
2406 # Assume that if the record is already present that we can
2407 # use it without having to check that the record metadata
2408 # is consistent.
2409 self.registry.insertDimensionData(element, *records, skip_existing=True)
2411 n_imported = 0
2412 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2413 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2414 ):
2415 if run not in handled_collections:
2416 # May need to create output collection. If source butler
2417 # has a registry, ask for documentation string.
2418 run_doc = None
2419 if registry := getattr(source_butler, "registry", None):
2420 run_doc = registry.getCollectionDocumentation(run)
2421 registered = self.registry.registerRun(run, doc=run_doc)
2422 handled_collections.add(run)
2423 if registered:
2424 log.log(VERBOSE, "Creating output run %s", run)
2426 n_refs = len(refs_to_import)
2427 log.verbose(
2428 "Importing %d ref%s of dataset type %s into run %s",
2429 n_refs,
2430 "" if n_refs == 1 else "s",
2431 datasetType.name,
2432 run,
2433 )
2435 # Assume we are using UUIDs and the source refs will match
2436 # those imported.
2437 imported_refs = self.registry._importDatasets(refs_to_import, expand=False)
2438 assert set(imported_refs) == set(refs_to_import)
2439 n_imported += len(imported_refs)
2441 assert len(source_refs) == n_imported
2442 log.verbose("Imported %d datasets into destination butler", n_imported)
2444 # Ask the datastore to transfer. The datastore has to check that
2445 # the source datastore is compatible with the target datastore.
2446 accepted, rejected = self.datastore.transfer_from(
2447 source_butler.datastore,
2448 source_refs,
2449 transfer=transfer,
2450 artifact_existence=artifact_existence,
2451 )
2452 if rejected:
2453 # For now, accept the registry entries but not the files.
2454 log.warning(
2455 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
2456 len(rejected),
2457 len(accepted),
2458 datasetType,
2459 run,
2460 )
2462 return source_refs
2464 def validateConfiguration(
2465 self,
2466 logFailures: bool = False,
2467 datasetTypeNames: Iterable[str] | None = None,
2468 ignore: Iterable[str] | None = None,
2469 ) -> None:
2470 """Validate butler configuration.
2472 Checks that each `DatasetType` can be stored in the `Datastore`.
2474 Parameters
2475 ----------
2476 logFailures : `bool`, optional
2477 If `True`, output a log message for every validation error
2478 detected.
2479 datasetTypeNames : iterable of `str`, optional
2480 The `DatasetType` names that should be checked. This allows
2481 only a subset to be selected.
2482 ignore : iterable of `str`, optional
2483 Names of DatasetTypes to skip over. This can be used to skip
2484 known problems. If a named `DatasetType` corresponds to a
2485 composite, all components of that `DatasetType` will also be
2486 ignored.
2488 Raises
2489 ------
2490 ButlerValidationError
2491 Raised if there is some inconsistency with how this Butler
2492 is configured.
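Examples
--------
An illustrative invocation; the dataset type name passed to ``ignore``
is an assumption::

    try:
        butler.validateConfiguration(logFailures=True, ignore=["badType"])
    except ButlerValidationError as err:
        print(f"Butler configuration problem: {err}")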
2493 """
2494 if datasetTypeNames:
2495 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2496 else:
2497 datasetTypes = list(self.registry.queryDatasetTypes())
2499 # filter out anything from the ignore list
2500 if ignore:
2501 ignore = set(ignore)
2502 datasetTypes = [
2503 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2504 ]
2505 else:
2506 ignore = set()
2508 # For each datasetType that has an instrument dimension, create
2509 # a DatasetRef for each defined instrument
2510 datasetRefs = []
2512 # Find all the registered instruments (if "instrument" is in the
2513 # universe).
2514 if "instrument" in self.dimensions:
2515 instruments = {record.name for record in self.registry.queryDimensionRecords("instrument")}
2517 for datasetType in datasetTypes:
2518 if "instrument" in datasetType.dimensions:
2519 # In order to create a conforming dataset ref, create
2520 # fake DataCoordinate values for the non-instrument
2521 # dimensions. The type of the value does not matter here.
2522 dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"}
2524 for instrument in instruments:
2525 datasetRef = DatasetRef(
2526 datasetType,
2527 DataCoordinate.standardize(
2528 dataId, instrument=instrument, graph=datasetType.dimensions
2529 ),
2530 run="validate",
2531 )
2532 datasetRefs.append(datasetRef)
2534 entities: list[DatasetType | DatasetRef] = []
2535 entities.extend(datasetTypes)
2536 entities.extend(datasetRefs)
2538 datastoreErrorStr = None
2539 try:
2540 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2541 except ValidationError as e:
2542 datastoreErrorStr = str(e)
2544 # Also check that the LookupKeys used by the datastores match
2545 # registry and storage class definitions
2546 keys = self.datastore.getLookupKeys()
2548 failedNames = set()
2549 failedDataId = set()
2550 for key in keys:
2551 if key.name is not None:
2552 if key.name in ignore:
2553 continue
2555 # skip if specific datasetType names were requested and this
2556 # name does not match
2557 if datasetTypeNames and key.name not in datasetTypeNames:
2558 continue
2560 # See if it is a StorageClass or a DatasetType
2561 if key.name in self.storageClasses:
2562 pass
2563 else:
2564 try:
2565 self.registry.getDatasetType(key.name)
2566 except KeyError:
2567 if logFailures:
2568 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2569 failedNames.add(key)
2570 else:
2571 # Dimensions are checked for consistency when the Butler
2572 # is created and rendezvoused with a universe.
2573 pass
2575 # Check that the instrument in the DataId is a valid instrument.
2576 # Currently only "instrument" DataId overrides are supported, so check for that.
2577 if key.dataId:
2578 dataIdKeys = set(key.dataId)
2579 if {"instrument"} != dataIdKeys:
2580 if logFailures:
2581 log.critical("Key '%s' has unsupported DataId override", key)
2582 failedDataId.add(key)
2583 elif key.dataId["instrument"] not in instruments:
2584 if logFailures:
2585 log.critical("Key '%s' has unknown instrument", key)
2586 failedDataId.add(key)
2588 messages = []
2590 if datastoreErrorStr:
2591 messages.append(datastoreErrorStr)
2593 for failed, msg in (
2594 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2595 (failedDataId, "Keys with bad DataId entries: "),
2596 ):
2597 if failed:
2598 msg += ", ".join(str(k) for k in failed)
2599 messages.append(msg)
2601 if messages:
2602 raise ButlerValidationError(";\n".join(messages))
2604 @property
2605 def collections(self) -> Sequence[str]:
2606 """The collections to search by default, in order
2607 (`~collections.abc.Sequence` [ `str` ]).
2609 This is an alias for ``self.registry.defaults.collections``. It cannot
2610 be set directly in isolation, but all defaults may be changed together
2611 by assigning a new `RegistryDefaults` instance to
2612 ``self.registry.defaults``.
2613 """
2614 return self.registry.defaults.collections
2616 @property
2617 def run(self) -> str | None:
2618 """Name of the run this butler writes outputs to by default (`str` or
2619 `None`).
2621 This is an alias for ``self.registry.defaults.run``. It cannot be set
2622 directly in isolation, but all defaults may be changed together by
2623 assigning a new `RegistryDefaults` instance to
2624 ``self.registry.defaults``.
2625 """
2626 return self.registry.defaults.run
2628 @property
2629 def dimensions(self) -> DimensionUniverse:
2630 # Docstring inherited.
2631 return self.registry.dimensions
2633 registry: Registry
2634 """The object that manages dataset metadata and relationships (`Registry`).
2636 Most operations that don't involve reading or writing butler datasets are
2637 accessible only via `Registry` methods.
2638 """
2640 datastore: Datastore
2641 """The object that manages actual dataset storage (`Datastore`).
2643 Direct user access to the datastore should rarely be necessary; the primary
2644 exception is the case where a `Datastore` implementation provides extra
2645 functionality beyond what the base class defines.
2646 """
2648 storageClasses: StorageClassFactory
2649 """An object that maps known storage class names to objects that fully
2650 describe them (`StorageClassFactory`).
2651 """