Coverage for python/lsst/daf/butler/_butler.py: 8%
671 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-04-07 00:58 -0700
« prev ^ index » next coverage.py v6.5.0, created at 2023-04-07 00:58 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30)
32import collections.abc
33import contextlib
34import logging
35import numbers
36import os
37import uuid
38from collections import defaultdict
39from typing import (
40 Any,
41 ClassVar,
42 Counter,
43 Dict,
44 Iterable,
45 Iterator,
46 List,
47 MutableMapping,
48 Optional,
49 Sequence,
50 Set,
51 TextIO,
52 Tuple,
53 Type,
54 Union,
55)
57from deprecated.sphinx import deprecated
58from lsst.resources import ResourcePath, ResourcePathExpression
59from lsst.utils import doImportType
60from lsst.utils.introspection import get_class_of
61from lsst.utils.logging import VERBOSE, getLogger
63from ._butlerConfig import ButlerConfig
64from ._butlerRepoIndex import ButlerRepoIndex
65from ._deferredDatasetHandle import DeferredDatasetHandle
66from ._limited_butler import LimitedButler
67from .core import (
68 AmbiguousDatasetError,
69 Config,
70 ConfigSubset,
71 DataCoordinate,
72 DataId,
73 DataIdValue,
74 DatasetRef,
75 DatasetRefURIs,
76 DatasetType,
77 Datastore,
78 Dimension,
79 DimensionConfig,
80 DimensionElement,
81 DimensionRecord,
82 DimensionUniverse,
83 FileDataset,
84 Progress,
85 StorageClass,
86 StorageClassFactory,
87 Timespan,
88 ValidationError,
89)
90from .core.repoRelocation import BUTLER_ROOT_TAG
91from .core.utils import transactional
92from .registry import (
93 CollectionType,
94 ConflictingDefinitionError,
95 DataIdError,
96 DatasetIdGenEnum,
97 MissingDatasetTypeError,
98 Registry,
99 RegistryConfig,
100 RegistryDefaults,
101)
102from .transfers import RepoExportContext
104log = getLogger(__name__)
107class ButlerValidationError(ValidationError):
108 """There is a problem with the Butler configuration."""
110 pass
113class Butler(LimitedButler):
114 """Main entry point for the data access system.
116 Parameters
117 ----------
118 config : `ButlerConfig`, `Config` or `str`, optional.
119 Configuration. Anything acceptable to the
120 `ButlerConfig` constructor. If a directory path
121 is given the configuration will be read from a ``butler.yaml`` file in
122 that location. If `None` is given default values will be used.
123 butler : `Butler`, optional.
124 If provided, construct a new Butler that uses the same registry and
125 datastore as the given one, but with the given collection and run.
126 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
127 arguments.
128 collections : `str` or `Iterable` [ `str` ], optional
129 An expression specifying the collections to be searched (in order) when
130 reading datasets.
131 This may be a `str` collection name or an iterable thereof.
132 See :ref:`daf_butler_collection_expressions` for more information.
133 These collections are not registered automatically and must be
134 manually registered before they are used by any method, but they may be
135 manually registered after the `Butler` is initialized.
136 run : `str`, optional
137 Name of the `~CollectionType.RUN` collection new datasets should be
138 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
139 ``collections`` will be set to ``[run]``. If not `None`, this
140 collection will automatically be registered. If this is not set (and
141 ``writeable`` is not set either), a read-only butler will be created.
142 searchPaths : `list` of `str`, optional
143 Directory paths to search when calculating the full Butler
144 configuration. Not used if the supplied config is already a
145 `ButlerConfig`.
146 writeable : `bool`, optional
147 Explicitly sets whether the butler supports write operations. If not
148 provided, a read-write butler is created if any of ``run``, ``tags``,
149 or ``chains`` is non-empty.
150 inferDefaults : `bool`, optional
151 If `True` (default) infer default data ID values from the values
152 present in the datasets in ``collections``: if all collections have the
153 same value (or no value) for a governor dimension, that value will be
154 the default for that dimension. Nonexistent collections are ignored.
155 If a default value is provided explicitly for a governor dimension via
156 ``**kwargs``, no default will be inferred for that dimension.
157 **kwargs : `str`
158 Default data ID key-value pairs. These may only identify "governor"
159 dimensions like ``instrument`` and ``skymap``.
161 Examples
162 --------
163 While there are many ways to control exactly how a `Butler` interacts with
164 the collections in its `Registry`, the most common cases are still simple.
166 For a read-only `Butler` that searches one collection, do::
168 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
170 For a read-write `Butler` that writes to and reads from a
171 `~CollectionType.RUN` collection::
173 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
175 The `Butler` passed to a ``PipelineTask`` is often much more complex,
176 because we want to write to one `~CollectionType.RUN` collection but read
177 from several others (as well)::
179 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
180 collections=["u/alice/DM-50000/a",
181 "u/bob/DM-49998",
182 "HSC/defaults"])
184 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
185 Datasets will be read first from that run (since it appears first in the
186 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
188 Finally, one can always create a `Butler` with no collections::
190 butler = Butler("/path/to/repo", writeable=True)
192 This can be extremely useful when you just want to use ``butler.registry``,
193 e.g. for inserting dimension data or managing collections, or when the
194 collections you want to use with the butler are not consistent.
195 Passing ``writeable`` explicitly here is only necessary if you want to be
196 able to make changes to the repo - usually the value for ``writeable`` can
197 be guessed from the collection arguments provided, but it defaults to
198 `False` when there are not collection arguments.
199 """
201 def __init__(
202 self,
203 config: Union[Config, str, None] = None,
204 *,
205 butler: Optional[Butler] = None,
206 collections: Any = None,
207 run: Optional[str] = None,
208 searchPaths: Optional[List[str]] = None,
209 writeable: Optional[bool] = None,
210 inferDefaults: bool = True,
211 **kwargs: str,
212 ):
213 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
214 # Load registry, datastore, etc. from config or existing butler.
215 if butler is not None:
216 if config is not None or searchPaths is not None or writeable is not None:
217 raise TypeError(
218 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
219 )
220 self.registry = butler.registry.copy(defaults)
221 self.datastore = butler.datastore
222 self.storageClasses = butler.storageClasses
223 self._config: ButlerConfig = butler._config
224 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
225 else:
226 # Can only look for strings in the known repos list.
227 if isinstance(config, str) and config in self.get_known_repos():
228 config = str(self.get_repo_uri(config))
229 try:
230 self._config = ButlerConfig(config, searchPaths=searchPaths)
231 except FileNotFoundError as e:
232 if known := self.get_known_repos():
233 aliases = f"(known aliases: {', '.join(known)})"
234 else:
235 aliases = "(no known aliases)"
236 raise FileNotFoundError(f"{e} {aliases}") from e
237 self._config = ButlerConfig(config, searchPaths=searchPaths)
238 try:
239 if "root" in self._config:
240 butlerRoot = self._config["root"]
241 else:
242 butlerRoot = self._config.configDir
243 if writeable is None:
244 writeable = run is not None
245 self.registry = Registry.fromConfig(
246 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
247 )
248 self.datastore = Datastore.fromConfig(
249 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
250 )
251 self.storageClasses = StorageClassFactory()
252 self.storageClasses.addFromConfig(self._config)
253 self._allow_put_of_predefined_dataset = self._config.get(
254 "allow_put_of_predefined_dataset", False
255 )
256 except Exception:
257 # Failures here usually mean that configuration is incomplete,
258 # just issue an error message which includes config file URI.
259 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
260 raise
262 # For execution butler the datastore needs a special
263 # dependency-inversion trick. This is not used by regular butler,
264 # but we do not have a way to distinguish regular butler from execution
265 # butler.
266 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
268 if "run" in self._config or "collection" in self._config:
269 raise ValueError("Passing a run or collection via configuration is no longer supported.")
271 GENERATION: ClassVar[int] = 3
272 """This is a Generation 3 Butler.
274 This attribute may be removed in the future, once the Generation 2 Butler
275 interface has been fully retired; it should only be used in transitional
276 code.
277 """
279 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
280 """Return DatasetType defined in registry given dataset type name."""
281 try:
282 return self.registry.getDatasetType(name)
283 except MissingDatasetTypeError:
284 return None
286 @classmethod
287 def get_repo_uri(cls, label: str) -> ResourcePath:
288 """Look up the label in a butler repository index.
290 Parameters
291 ----------
292 label : `str`
293 Label of the Butler repository to look up.
295 Returns
296 -------
297 uri : `lsst.resources.ResourcePath`
298 URI to the Butler repository associated with the given label.
300 Raises
301 ------
302 KeyError
303 Raised if the label is not found in the index, or if an index
304 can not be found at all.
306 Notes
307 -----
308 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
309 information is discovered.
310 """
311 return ButlerRepoIndex.get_repo_uri(label)
313 @classmethod
314 def get_known_repos(cls) -> Set[str]:
315 """Retrieve the list of known repository labels.
317 Returns
318 -------
319 repos : `set` of `str`
320 All the known labels. Can be empty if no index can be found.
322 Notes
323 -----
324 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
325 information is discovered.
326 """
327 return ButlerRepoIndex.get_known_repos()
329 @staticmethod
330 def makeRepo(
331 root: ResourcePathExpression,
332 config: Union[Config, str, None] = None,
333 dimensionConfig: Union[Config, str, None] = None,
334 standalone: bool = False,
335 searchPaths: Optional[List[str]] = None,
336 forceConfigRoot: bool = True,
337 outfile: Optional[ResourcePathExpression] = None,
338 overwrite: bool = False,
339 ) -> Config:
340 """Create an empty data repository by adding a butler.yaml config
341 to a repository root directory.
343 Parameters
344 ----------
345 root : `lsst.resources.ResourcePathExpression`
346 Path or URI to the root location of the new repository. Will be
347 created if it does not exist.
348 config : `Config` or `str`, optional
349 Configuration to write to the repository, after setting any
350 root-dependent Registry or Datastore config options. Can not
351 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
352 configuration will be used. Root-dependent config options
353 specified in this config are overwritten if ``forceConfigRoot``
354 is `True`.
355 dimensionConfig : `Config` or `str`, optional
356 Configuration for dimensions, will be used to initialize registry
357 database.
358 standalone : `bool`
359 If True, write all expanded defaults, not just customized or
360 repository-specific settings.
361 This (mostly) decouples the repository from the default
362 configuration, insulating it from changes to the defaults (which
363 may be good or bad, depending on the nature of the changes).
364 Future *additions* to the defaults will still be picked up when
365 initializing `Butlers` to repos created with ``standalone=True``.
366 searchPaths : `list` of `str`, optional
367 Directory paths to search when calculating the full butler
368 configuration.
369 forceConfigRoot : `bool`, optional
370 If `False`, any values present in the supplied ``config`` that
371 would normally be reset are not overridden and will appear
372 directly in the output config. This allows non-standard overrides
373 of the root directory for a datastore or registry to be given.
374 If this parameter is `True` the values for ``root`` will be
375 forced into the resulting config if appropriate.
376 outfile : `lss.resources.ResourcePathExpression`, optional
377 If not-`None`, the output configuration will be written to this
378 location rather than into the repository itself. Can be a URI
379 string. Can refer to a directory that will be used to write
380 ``butler.yaml``.
381 overwrite : `bool`, optional
382 Create a new configuration file even if one already exists
383 in the specified output location. Default is to raise
384 an exception.
386 Returns
387 -------
388 config : `Config`
389 The updated `Config` instance written to the repo.
391 Raises
392 ------
393 ValueError
394 Raised if a ButlerConfig or ConfigSubset is passed instead of a
395 regular Config (as these subclasses would make it impossible to
396 support ``standalone=False``).
397 FileExistsError
398 Raised if the output config file already exists.
399 os.error
400 Raised if the directory does not exist, exists but is not a
401 directory, or cannot be created.
403 Notes
404 -----
405 Note that when ``standalone=False`` (the default), the configuration
406 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
407 construct the repository should also be used to construct any Butlers
408 to avoid configuration inconsistencies.
409 """
410 if isinstance(config, (ButlerConfig, ConfigSubset)):
411 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
413 # Ensure that the root of the repository exists or can be made
414 root_uri = ResourcePath(root, forceDirectory=True)
415 root_uri.mkdir()
417 config = Config(config)
419 # If we are creating a new repo from scratch with relative roots,
420 # do not propagate an explicit root from the config file
421 if "root" in config:
422 del config["root"]
424 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
425 imported_class = doImportType(full["datastore", "cls"])
426 if not issubclass(imported_class, Datastore):
427 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
428 datastoreClass: Type[Datastore] = imported_class
429 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
431 # if key exists in given config, parse it, otherwise parse the defaults
432 # in the expanded config
433 if config.get(("registry", "db")):
434 registryConfig = RegistryConfig(config)
435 else:
436 registryConfig = RegistryConfig(full)
437 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
438 if defaultDatabaseUri is not None:
439 Config.updateParameters(
440 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
441 )
442 else:
443 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
445 if standalone:
446 config.merge(full)
447 else:
448 # Always expand the registry.managers section into the per-repo
449 # config, because after the database schema is created, it's not
450 # allowed to change anymore. Note that in the standalone=True
451 # branch, _everything_ in the config is expanded, so there's no
452 # need to special case this.
453 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
454 configURI: ResourcePathExpression
455 if outfile is not None:
456 # When writing to a separate location we must include
457 # the root of the butler repo in the config else it won't know
458 # where to look.
459 config["root"] = root_uri.geturl()
460 configURI = outfile
461 else:
462 configURI = root_uri
463 # Strip obscore configuration, if it is present, before writing config
464 # to a file, obscore config will be stored in registry.
465 config_to_write = config
466 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
467 config_to_write = config.copy()
468 del config_to_write[obscore_config_key]
469 config_to_write.dumpToUri(configURI, overwrite=overwrite)
471 # Create Registry and populate tables
472 registryConfig = RegistryConfig(config.get("registry"))
473 dimensionConfig = DimensionConfig(dimensionConfig)
474 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
476 log.verbose("Wrote new Butler configuration file to %s", configURI)
478 return config
480 @classmethod
481 def _unpickle(
482 cls,
483 config: ButlerConfig,
484 collections: Optional[tuple[str, ...]],
485 run: Optional[str],
486 defaultDataId: Dict[str, str],
487 writeable: bool,
488 ) -> Butler:
489 """Callable used to unpickle a Butler.
491 We prefer not to use ``Butler.__init__`` directly so we can force some
492 of its many arguments to be keyword-only (note that ``__reduce__``
493 can only invoke callables with positional arguments).
495 Parameters
496 ----------
497 config : `ButlerConfig`
498 Butler configuration, already coerced into a true `ButlerConfig`
499 instance (and hence after any search paths for overrides have been
500 utilized).
501 collections : `tuple` [ `str` ]
502 Names of the default collections to read from.
503 run : `str`, optional
504 Name of the default `~CollectionType.RUN` collection to write to.
505 defaultDataId : `dict` [ `str`, `str` ]
506 Default data ID values.
507 writeable : `bool`
508 Whether the Butler should support write operations.
510 Returns
511 -------
512 butler : `Butler`
513 A new `Butler` instance.
514 """
515 # MyPy doesn't recognize that the kwargs below are totally valid; it
516 # seems to think '**defaultDataId* is a _positional_ argument!
517 return cls(
518 config=config,
519 collections=collections,
520 run=run,
521 writeable=writeable,
522 **defaultDataId, # type: ignore
523 )
525 def __reduce__(self) -> tuple:
526 """Support pickling."""
527 return (
528 Butler._unpickle,
529 (
530 self._config,
531 self.collections,
532 self.run,
533 self.registry.defaults.dataId.byName(),
534 self.registry.isWriteable(),
535 ),
536 )
538 def __str__(self) -> str:
539 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
540 self.collections, self.run, self.datastore, self.registry
541 )
543 def isWriteable(self) -> bool:
544 """Return `True` if this `Butler` supports write operations."""
545 return self.registry.isWriteable()
547 @contextlib.contextmanager
548 def transaction(self) -> Iterator[None]:
549 """Context manager supporting `Butler` transactions.
551 Transactions can be nested.
552 """
553 with self.registry.transaction():
554 with self.datastore.transaction():
555 yield
557 def _standardizeArgs(
558 self,
559 datasetRefOrType: Union[DatasetRef, DatasetType, str],
560 dataId: Optional[DataId] = None,
561 for_put: bool = True,
562 **kwargs: Any,
563 ) -> Tuple[DatasetType, Optional[DataId]]:
564 """Standardize the arguments passed to several Butler APIs.
566 Parameters
567 ----------
568 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
569 When `DatasetRef` the `dataId` should be `None`.
570 Otherwise the `DatasetType` or name thereof.
571 dataId : `dict` or `DataCoordinate`
572 A `dict` of `Dimension` link name, value pairs that label the
573 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
574 should be provided as the second argument.
575 for_put : `bool`, optional
576 If `True` this call is invoked as part of a `Butler.put()`.
577 Otherwise it is assumed to be part of a `Butler.get()`. This
578 parameter is only relevant if there is dataset type
579 inconsistency.
580 **kwargs
581 Additional keyword arguments used to augment or construct a
582 `DataCoordinate`. See `DataCoordinate.standardize`
583 parameters.
585 Returns
586 -------
587 datasetType : `DatasetType`
588 A `DatasetType` instance extracted from ``datasetRefOrType``.
589 dataId : `dict` or `DataId`, optional
590 Argument that can be used (along with ``kwargs``) to construct a
591 `DataId`.
593 Notes
594 -----
595 Butler APIs that conceptually need a DatasetRef also allow passing a
596 `DatasetType` (or the name of one) and a `DataId` (or a dict and
597 keyword arguments that can be used to construct one) separately. This
598 method accepts those arguments and always returns a true `DatasetType`
599 and a `DataId` or `dict`.
601 Standardization of `dict` vs `DataId` is best handled by passing the
602 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
603 generally similarly flexible.
604 """
605 externalDatasetType: Optional[DatasetType] = None
606 internalDatasetType: Optional[DatasetType] = None
607 if isinstance(datasetRefOrType, DatasetRef):
608 if dataId is not None or kwargs:
609 raise ValueError("DatasetRef given, cannot use dataId as well")
610 externalDatasetType = datasetRefOrType.datasetType
611 dataId = datasetRefOrType.dataId
612 else:
613 # Don't check whether DataId is provided, because Registry APIs
614 # can usually construct a better error message when it wasn't.
615 if isinstance(datasetRefOrType, DatasetType):
616 externalDatasetType = datasetRefOrType
617 else:
618 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
620 # Check that they are self-consistent
621 if externalDatasetType is not None:
622 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
623 if externalDatasetType != internalDatasetType:
624 # We can allow differences if they are compatible, depending
625 # on whether this is a get or a put. A get requires that
626 # the python type associated with the datastore can be
627 # converted to the user type. A put requires that the user
628 # supplied python type can be converted to the internal
629 # type expected by registry.
630 relevantDatasetType = internalDatasetType
631 if for_put:
632 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
633 else:
634 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
635 relevantDatasetType = externalDatasetType
636 if not is_compatible:
637 raise ValueError(
638 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
639 f"registry definition ({internalDatasetType})"
640 )
641 # Override the internal definition.
642 internalDatasetType = relevantDatasetType
644 assert internalDatasetType is not None
645 return internalDatasetType, dataId
647 def _rewrite_data_id(
648 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
649 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
650 """Rewrite a data ID taking into account dimension records.
652 Take a Data ID and keyword args and rewrite it if necessary to
653 allow the user to specify dimension records rather than dimension
654 primary values.
656 This allows a user to include a dataId dict with keys of
657 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
658 the integer exposure ID. It also allows a string to be given
659 for a dimension value rather than the integer ID if that is more
660 convenient. For example, rather than having to specifyin the
661 detector with ``detector.full_name``, a string given for ``detector``
662 will be interpreted as the full name and converted to the integer
663 value.
665 Keyword arguments can also use strings for dimensions like detector
666 and exposure but python does not allow them to include ``.`` and
667 so the ``exposure.day_obs`` syntax can not be used in a keyword
668 argument.
670 Parameters
671 ----------
672 dataId : `dict` or `DataCoordinate`
673 A `dict` of `Dimension` link name, value pairs that will label the
674 `DatasetRef` within a Collection.
675 datasetType : `DatasetType`
676 The dataset type associated with this dataId. Required to
677 determine the relevant dimensions.
678 **kwargs
679 Additional keyword arguments used to augment or construct a
680 `DataId`. See `DataId` parameters.
682 Returns
683 -------
684 dataId : `dict` or `DataCoordinate`
685 The, possibly rewritten, dataId. If given a `DataCoordinate` and
686 no keyword arguments, the original dataId will be returned
687 unchanged.
688 **kwargs : `dict`
689 Any unused keyword arguments (would normally be empty dict).
690 """
691 # Do nothing if we have a standalone DataCoordinate.
692 if isinstance(dataId, DataCoordinate) and not kwargs:
693 return dataId, kwargs
695 # Process dimension records that are using record information
696 # rather than ids
697 newDataId: Dict[str, DataIdValue] = {}
698 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
700 # if all the dataId comes from keyword parameters we do not need
701 # to do anything here because they can't be of the form
702 # exposure.obs_id because a "." is not allowed in a keyword parameter.
703 if dataId:
704 for k, v in dataId.items():
705 # If we have a Dimension we do not need to do anything
706 # because it cannot be a compound key.
707 if isinstance(k, str) and "." in k:
708 # Someone is using a more human-readable dataId
709 dimensionName, record = k.split(".", 1)
710 byRecord[dimensionName][record] = v
711 elif isinstance(k, Dimension):
712 newDataId[k.name] = v
713 else:
714 newDataId[k] = v
716 # Go through the updated dataId and check the type in case someone is
717 # using an alternate key. We have already filtered out the compound
718 # keys dimensions.record format.
719 not_dimensions = {}
721 # Will need to look in the dataId and the keyword arguments
722 # and will remove them if they need to be fixed or are unrecognized.
723 for dataIdDict in (newDataId, kwargs):
724 # Use a list so we can adjust the dict safely in the loop
725 for dimensionName in list(dataIdDict):
726 value = dataIdDict[dimensionName]
727 try:
728 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
729 except KeyError:
730 # This is not a real dimension
731 not_dimensions[dimensionName] = value
732 del dataIdDict[dimensionName]
733 continue
735 # Convert an integral type to an explicit int to simplify
736 # comparisons here
737 if isinstance(value, numbers.Integral):
738 value = int(value)
740 if not isinstance(value, dimension.primaryKey.getPythonType()):
741 for alternate in dimension.alternateKeys:
742 if isinstance(value, alternate.getPythonType()):
743 byRecord[dimensionName][alternate.name] = value
744 del dataIdDict[dimensionName]
745 log.debug(
746 "Converting dimension %s to %s.%s=%s",
747 dimensionName,
748 dimensionName,
749 alternate.name,
750 value,
751 )
752 break
753 else:
754 log.warning(
755 "Type mismatch found for value '%r' provided for dimension %s. "
756 "Could not find matching alternative (primary key has type %s) "
757 "so attempting to use as-is.",
758 value,
759 dimensionName,
760 dimension.primaryKey.getPythonType(),
761 )
763 # By this point kwargs and newDataId should only include valid
764 # dimensions. Merge kwargs in to the new dataId and log if there
765 # are dimensions in both (rather than calling update).
766 for k, v in kwargs.items():
767 if k in newDataId and newDataId[k] != v:
768 log.debug(
769 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
770 )
771 newDataId[k] = v
772 # No need to retain any values in kwargs now.
773 kwargs = {}
775 # If we have some unrecognized dimensions we have to try to connect
776 # them to records in other dimensions. This is made more complicated
777 # by some dimensions having records with clashing names. A mitigation
778 # is that we can tell by this point which dimensions are missing
779 # for the DatasetType but this does not work for calibrations
780 # where additional dimensions can be used to constrain the temporal
781 # axis.
782 if not_dimensions:
783 # Search for all dimensions even if we have been given a value
784 # explicitly. In some cases records are given as well as the
785 # actually dimension and this should not be an error if they
786 # match.
787 mandatoryDimensions = datasetType.dimensions.names # - provided
789 candidateDimensions: Set[str] = set()
790 candidateDimensions.update(mandatoryDimensions)
792 # For calibrations we may well be needing temporal dimensions
793 # so rather than always including all dimensions in the scan
794 # restrict things a little. It is still possible for there
795 # to be confusion over day_obs in visit vs exposure for example.
796 # If we are not searching calibration collections things may
797 # fail but they are going to fail anyway because of the
798 # ambiguousness of the dataId...
799 if datasetType.isCalibration():
800 for dim in self.registry.dimensions.getStaticDimensions():
801 if dim.temporal:
802 candidateDimensions.add(str(dim))
804 # Look up table for the first association with a dimension
805 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
807 # Keep track of whether an item is associated with multiple
808 # dimensions.
809 counter: Counter[str] = Counter()
810 assigned: Dict[str, Set[str]] = defaultdict(set)
812 # Go through the missing dimensions and associate the
813 # given names with records within those dimensions
814 matched_dims = set()
815 for dimensionName in candidateDimensions:
816 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
817 fields = dimension.metadata.names | dimension.uniqueKeys.names
818 for field in not_dimensions:
819 if field in fields:
820 guessedAssociation[dimensionName][field] = not_dimensions[field]
821 counter[dimensionName] += 1
822 assigned[field].add(dimensionName)
823 matched_dims.add(field)
825 # Calculate the fields that matched nothing.
826 never_found = set(not_dimensions) - matched_dims
828 if never_found:
829 raise ValueError(f"Unrecognized keyword args given: {never_found}")
831 # There is a chance we have allocated a single dataId item
832 # to multiple dimensions. Need to decide which should be retained.
833 # For now assume that the most popular alternative wins.
834 # This means that day_obs with seq_num will result in
835 # exposure.day_obs and not visit.day_obs
836 # Also prefer an explicitly missing dimension over an inferred
837 # temporal dimension.
838 for fieldName, assignedDimensions in assigned.items():
839 if len(assignedDimensions) > 1:
840 # Pick the most popular (preferring mandatory dimensions)
841 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
842 if requiredButMissing:
843 candidateDimensions = requiredButMissing
844 else:
845 candidateDimensions = assignedDimensions
847 # If this is a choice between visit and exposure and
848 # neither was a required part of the dataset type,
849 # (hence in this branch) always prefer exposure over
850 # visit since exposures are always defined and visits
851 # are defined from exposures.
852 if candidateDimensions == {"exposure", "visit"}:
853 candidateDimensions = {"exposure"}
855 # Select the relevant items and get a new restricted
856 # counter.
857 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
858 duplicatesCounter: Counter[str] = Counter()
859 duplicatesCounter.update(theseCounts)
861 # Choose the most common. If they are equally common
862 # we will pick the one that was found first.
863 # Returns a list of tuples
864 selected = duplicatesCounter.most_common(1)[0][0]
866 log.debug(
867 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
868 " Removed ambiguity by choosing dimension %s.",
869 fieldName,
870 ", ".join(assignedDimensions),
871 selected,
872 )
874 for candidateDimension in assignedDimensions:
875 if candidateDimension != selected:
876 del guessedAssociation[candidateDimension][fieldName]
878 # Update the record look up dict with the new associations
879 for dimensionName, values in guessedAssociation.items():
880 if values: # A dict might now be empty
881 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
882 byRecord[dimensionName].update(values)
884 if byRecord:
885 # Some record specifiers were found so we need to convert
886 # them to the Id form
887 for dimensionName, values in byRecord.items():
888 if dimensionName in newDataId:
889 log.debug(
890 "DataId specified explicit %s dimension value of %s in addition to"
891 " general record specifiers for it of %s. Ignoring record information.",
892 dimensionName,
893 newDataId[dimensionName],
894 str(values),
895 )
896 # Get the actual record and compare with these values.
897 try:
898 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
899 except DataIdError:
900 raise ValueError(
901 f"Could not find dimension '{dimensionName}'"
902 f" with dataId {newDataId} as part of comparing with"
903 f" record values {byRecord[dimensionName]}"
904 ) from None
905 if len(recs) == 1:
906 errmsg: List[str] = []
907 for k, v in values.items():
908 if (recval := getattr(recs[0], k)) != v:
909 errmsg.append(f"{k}({recval} != {v})")
910 if errmsg:
911 raise ValueError(
912 f"Dimension {dimensionName} in dataId has explicit value"
913 " inconsistent with records: " + ", ".join(errmsg)
914 )
915 else:
916 # Multiple matches for an explicit dimension
917 # should never happen but let downstream complain.
918 pass
919 continue
921 # Build up a WHERE expression
922 bind = {k: v for k, v in values.items()}
923 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
925 # Hopefully we get a single record that matches
926 records = set(
927 self.registry.queryDimensionRecords(
928 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
929 )
930 )
932 if len(records) != 1:
933 if len(records) > 1:
934 # visit can have an ambiguous answer without involving
935 # visit_system. The default visit_system is defined
936 # by the instrument.
937 if (
938 dimensionName == "visit"
939 and "visit_system_membership" in self.registry.dimensions
940 and "visit_system" in self.registry.dimensions["instrument"].metadata
941 ):
942 instrument_records = list(
943 self.registry.queryDimensionRecords(
944 "instrument",
945 dataId=newDataId,
946 **kwargs,
947 )
948 )
949 if len(instrument_records) == 1:
950 visit_system = instrument_records[0].visit_system
951 if visit_system is None:
952 # Set to a value that will never match.
953 visit_system = -1
955 # Look up each visit in the
956 # visit_system_membership records.
957 for rec in records:
958 membership = list(
959 self.registry.queryDimensionRecords(
960 # Use bind to allow zero results.
961 # This is a fully-specified query.
962 "visit_system_membership",
963 where="instrument = inst AND visit_system = system AND visit = v",
964 bind=dict(
965 inst=instrument_records[0].name, system=visit_system, v=rec.id
966 ),
967 )
968 )
969 if membership:
970 # This record is the right answer.
971 records = set([rec])
972 break
974 # The ambiguity may have been resolved so check again.
975 if len(records) > 1:
976 log.debug("Received %d records from constraints of %s", len(records), str(values))
977 for r in records:
978 log.debug("- %s", str(r))
979 raise ValueError(
980 f"DataId specification for dimension {dimensionName} is not"
981 f" uniquely constrained to a single dataset by {values}."
982 f" Got {len(records)} results."
983 )
984 else:
985 raise ValueError(
986 f"DataId specification for dimension {dimensionName} matched no"
987 f" records when constrained by {values}"
988 )
990 # Get the primary key from the real dimension object
991 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
992 if not isinstance(dimension, Dimension):
993 raise RuntimeError(
994 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
995 )
996 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
998 return newDataId, kwargs
1000 def _findDatasetRef(
1001 self,
1002 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1003 dataId: Optional[DataId] = None,
1004 *,
1005 collections: Any = None,
1006 allowUnresolved: bool = False,
1007 **kwargs: Any,
1008 ) -> DatasetRef:
1009 """Shared logic for methods that start with a search for a dataset in
1010 the registry.
1012 Parameters
1013 ----------
1014 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1015 When `DatasetRef` the `dataId` should be `None`.
1016 Otherwise the `DatasetType` or name thereof.
1017 dataId : `dict` or `DataCoordinate`, optional
1018 A `dict` of `Dimension` link name, value pairs that label the
1019 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1020 should be provided as the first argument.
1021 collections : Any, optional
1022 Collections to be searched, overriding ``self.collections``.
1023 Can be any of the types supported by the ``collections`` argument
1024 to butler construction.
1025 allowUnresolved : `bool`, optional
1026 If `True`, return an unresolved `DatasetRef` if finding a resolved
1027 one in the `Registry` fails. Defaults to `False`.
1028 **kwargs
1029 Additional keyword arguments used to augment or construct a
1030 `DataId`. See `DataId` parameters.
1032 Returns
1033 -------
1034 ref : `DatasetRef`
1035 A reference to the dataset identified by the given arguments.
1036 This can be the same dataset reference as given if it was
1037 resolved.
1039 Raises
1040 ------
1041 LookupError
1042 Raised if no matching dataset exists in the `Registry` (and
1043 ``allowUnresolved is False``).
1044 ValueError
1045 Raised if a resolved `DatasetRef` was passed as an input, but it
1046 differs from the one found in the registry.
1047 TypeError
1048 Raised if no collections were provided.
1049 """
1050 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1051 if isinstance(datasetRefOrType, DatasetRef):
1052 idNumber = datasetRefOrType.id
1053 # This is a resolved ref, return it immediately.
1054 if idNumber:
1055 return datasetRefOrType
1056 else:
1057 idNumber = None
1058 timespan: Optional[Timespan] = None
1060 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1062 if datasetType.isCalibration():
1063 # Because this is a calibration dataset, first try to make a
1064 # standardize the data ID without restricting the dimensions to
1065 # those of the dataset type requested, because there may be extra
1066 # dimensions that provide temporal information for a validity-range
1067 # lookup.
1068 dataId = DataCoordinate.standardize(
1069 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1070 )
1071 if dataId.graph.temporal:
1072 dataId = self.registry.expandDataId(dataId)
1073 timespan = dataId.timespan
1074 else:
1075 # Standardize the data ID to just the dimensions of the dataset
1076 # type instead of letting registry.findDataset do it, so we get the
1077 # result even if no dataset is found.
1078 dataId = DataCoordinate.standardize(
1079 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1080 )
1081 # Always lookup the DatasetRef, even if one is given, to ensure it is
1082 # present in the current collection.
1083 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1084 if ref is None:
1085 if allowUnresolved:
1086 return DatasetRef(datasetType, dataId)
1087 else:
1088 if collections is None:
1089 collections = self.registry.defaults.collections
1090 raise LookupError(
1091 f"Dataset {datasetType.name} with data ID {dataId} "
1092 f"could not be found in collections {collections}."
1093 )
1094 if idNumber is not None and idNumber != ref.id:
1095 if collections is None:
1096 collections = self.registry.defaults.collections
1097 raise ValueError(
1098 f"DatasetRef.id provided ({idNumber}) does not match "
1099 f"id ({ref.id}) in registry in collections {collections}."
1100 )
1101 if datasetType != ref.datasetType:
1102 # If they differ it is because the user explicitly specified
1103 # a compatible dataset type to this call rather than using the
1104 # registry definition. The DatasetRef must therefore be recreated
1105 # using the user definition such that the expected type is
1106 # returned.
1107 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1109 return ref
1111 @transactional
1112 @deprecated(
1113 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
1114 " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
1115 " were relying on the run parameter to determine the run."
1116 " Will be removed after v27.0.",
1117 version="v26.0",
1118 category=FutureWarning,
1119 )
1120 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
1121 # Docstring inherited.
1122 return self.put(obj, ref)
1124 @transactional
1125 def put(
1126 self,
1127 obj: Any,
1128 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1129 /,
1130 dataId: Optional[DataId] = None,
1131 *,
1132 run: Optional[str] = None,
1133 **kwargs: Any,
1134 ) -> DatasetRef:
1135 """Store and register a dataset.
1137 Parameters
1138 ----------
1139 obj : `object`
1140 The dataset.
1141 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1142 When `DatasetRef` is provided, ``dataId`` should be `None`.
1143 Otherwise the `DatasetType` or name thereof. If a fully resolved
1144 `DatasetRef` is given the run and ID are used directly.
1145 dataId : `dict` or `DataCoordinate`
1146 A `dict` of `Dimension` link name, value pairs that label the
1147 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1148 should be provided as the second argument.
1149 run : `str`, optional
1150 The name of the run the dataset should be added to, overriding
1151 ``self.run``. Not used if a resolved `DatasetRef` is provided.
1152 **kwargs
1153 Additional keyword arguments used to augment or construct a
1154 `DataCoordinate`. See `DataCoordinate.standardize`
1155 parameters. Not used if a resolve `DatasetRef` is provided.
1157 Returns
1158 -------
1159 ref : `DatasetRef`
1160 A reference to the stored dataset, updated with the correct id if
1161 given.
1163 Raises
1164 ------
1165 TypeError
1166 Raised if the butler is read-only or if no run has been provided.
1167 """
1168 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1169 # This is a direct put of predefined DatasetRef.
1170 log.debug("Butler put direct: %s", datasetRefOrType)
1171 (imported_ref,) = self.registry._importDatasets(
1172 [datasetRefOrType],
1173 expand=True,
1174 )
1175 if imported_ref.id != datasetRefOrType.getCheckedId():
1176 raise RuntimeError("This registry configuration does not support direct put of ref.")
1177 self.datastore.put(obj, datasetRefOrType)
1178 return datasetRefOrType
1180 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1181 if not self.isWriteable():
1182 raise TypeError("Butler is read-only.")
1183 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1184 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1185 raise ValueError("DatasetRef must not be in registry, must have None id")
1187 # Handle dimension records in dataId
1188 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1190 # Add Registry Dataset entry.
1191 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1193 # For an execution butler the datasets will be pre-defined.
1194 # If the butler is configured that way datasets should only be inserted
1195 # if they do not already exist in registry. Trying and catching
1196 # ConflictingDefinitionError will not work because the transaction
1197 # will be corrupted. Instead, in this mode always check first.
1198 ref = None
1199 ref_is_predefined = False
1200 if self._allow_put_of_predefined_dataset:
1201 # Get the matching ref for this run.
1202 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1204 if ref:
1205 # Must be expanded form for datastore templating
1206 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1207 ref = ref.expanded(dataId)
1208 ref_is_predefined = True
1210 if not ref:
1211 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1213 # If the ref is predefined it is possible that the datastore also
1214 # has the record. Asking datastore to put it again will result in
1215 # the artifact being recreated, overwriting previous, then will cause
1216 # a failure in writing the record which will cause the artifact
1217 # to be removed. Much safer to ask first before attempting to
1218 # overwrite. Race conditions should not be an issue for the
1219 # execution butler environment.
1220 if ref_is_predefined:
1221 if self.datastore.knows(ref):
1222 raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.")
1224 self.datastore.put(obj, ref)
1226 return ref
1228 @deprecated(
1229 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
1230 " Please use Butler.get(). Will be removed after v27.0.",
1231 version="v26.0",
1232 category=FutureWarning,
1233 )
1234 def getDirect(
1235 self,
1236 ref: DatasetRef,
1237 *,
1238 parameters: Optional[Dict[str, Any]] = None,
1239 storageClass: Optional[Union[StorageClass, str]] = None,
1240 ) -> Any:
1241 """Retrieve a stored dataset.
1243 Parameters
1244 ----------
1245 ref : `DatasetRef`
1246 Resolved reference to an already stored dataset.
1247 parameters : `dict`
1248 Additional StorageClass-defined options to control reading,
1249 typically used to efficiently read only a subset of the dataset.
1250 storageClass : `StorageClass` or `str`, optional
1251 The storage class to be used to override the Python type
1252 returned by this method. By default the returned type matches
1253 the dataset type definition for this dataset. Specifying a
1254 read `StorageClass` can force a different type to be returned.
1255 This type must be compatible with the original type.
1257 Returns
1258 -------
1259 obj : `object`
1260 The dataset.
1261 """
1262 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1264 @deprecated(
1265 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
1266 "Please use Butler.getDeferred(). Will be removed after v27.0.",
1267 version="v26.0",
1268 category=FutureWarning,
1269 )
1270 def getDirectDeferred(
1271 self,
1272 ref: DatasetRef,
1273 *,
1274 parameters: Union[dict, None] = None,
1275 storageClass: str | StorageClass | None = None,
1276 ) -> DeferredDatasetHandle:
1277 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1278 from a resolved `DatasetRef`.
1280 Parameters
1281 ----------
1282 ref : `DatasetRef`
1283 Resolved reference to an already stored dataset.
1284 parameters : `dict`
1285 Additional StorageClass-defined options to control reading,
1286 typically used to efficiently read only a subset of the dataset.
1287 storageClass : `StorageClass` or `str`, optional
1288 The storage class to be used to override the Python type
1289 returned by this method. By default the returned type matches
1290 the dataset type definition for this dataset. Specifying a
1291 read `StorageClass` can force a different type to be returned.
1292 This type must be compatible with the original type.
1294 Returns
1295 -------
1296 obj : `DeferredDatasetHandle`
1297 A handle which can be used to retrieve a dataset at a later time.
1299 Raises
1300 ------
1301 AmbiguousDatasetError
1302 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1303 """
1304 if ref.id is None:
1305 raise AmbiguousDatasetError(
1306 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1307 )
1308 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1310 def getDeferred(
1311 self,
1312 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1313 /,
1314 dataId: Optional[DataId] = None,
1315 *,
1316 parameters: Union[dict, None] = None,
1317 collections: Any = None,
1318 storageClass: str | StorageClass | None = None,
1319 **kwargs: Any,
1320 ) -> DeferredDatasetHandle:
1321 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1322 after an immediate registry lookup.
1324 Parameters
1325 ----------
1326 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1327 When `DatasetRef` the `dataId` should be `None`.
1328 Otherwise the `DatasetType` or name thereof.
1329 dataId : `dict` or `DataCoordinate`, optional
1330 A `dict` of `Dimension` link name, value pairs that label the
1331 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1332 should be provided as the first argument.
1333 parameters : `dict`
1334 Additional StorageClass-defined options to control reading,
1335 typically used to efficiently read only a subset of the dataset.
1336 collections : Any, optional
1337 Collections to be searched, overriding ``self.collections``.
1338 Can be any of the types supported by the ``collections`` argument
1339 to butler construction.
1340 storageClass : `StorageClass` or `str`, optional
1341 The storage class to be used to override the Python type
1342 returned by this method. By default the returned type matches
1343 the dataset type definition for this dataset. Specifying a
1344 read `StorageClass` can force a different type to be returned.
1345 This type must be compatible with the original type.
1346 **kwargs
1347 Additional keyword arguments used to augment or construct a
1348 `DataId`. See `DataId` parameters.
1350 Returns
1351 -------
1352 obj : `DeferredDatasetHandle`
1353 A handle which can be used to retrieve a dataset at a later time.
1355 Raises
1356 ------
1357 LookupError
1358 Raised if no matching dataset exists in the `Registry` (and
1359 ``allowUnresolved is False``).
1360 ValueError
1361 Raised if a resolved `DatasetRef` was passed as an input, but it
1362 differs from the one found in the registry.
1363 TypeError
1364 Raised if no collections were provided.
1365 """
1366 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1367 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1369 def get(
1370 self,
1371 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1372 /,
1373 dataId: Optional[DataId] = None,
1374 *,
1375 parameters: Optional[Dict[str, Any]] = None,
1376 collections: Any = None,
1377 storageClass: Optional[Union[StorageClass, str]] = None,
1378 **kwargs: Any,
1379 ) -> Any:
1380 """Retrieve a stored dataset.
1382 Parameters
1383 ----------
1384 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1385 When `DatasetRef` the `dataId` should be `None`.
1386 Otherwise the `DatasetType` or name thereof.
1387 If a resolved `DatasetRef`, the associated dataset
1388 is returned directly without additional querying.
1389 dataId : `dict` or `DataCoordinate`
1390 A `dict` of `Dimension` link name, value pairs that label the
1391 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1392 should be provided as the first argument.
1393 parameters : `dict`
1394 Additional StorageClass-defined options to control reading,
1395 typically used to efficiently read only a subset of the dataset.
1396 collections : Any, optional
1397 Collections to be searched, overriding ``self.collections``.
1398 Can be any of the types supported by the ``collections`` argument
1399 to butler construction.
1400 storageClass : `StorageClass` or `str`, optional
1401 The storage class to be used to override the Python type
1402 returned by this method. By default the returned type matches
1403 the dataset type definition for this dataset. Specifying a
1404 read `StorageClass` can force a different type to be returned.
1405 This type must be compatible with the original type.
1406 **kwargs
1407 Additional keyword arguments used to augment or construct a
1408 `DataCoordinate`. See `DataCoordinate.standardize`
1409 parameters.
1411 Returns
1412 -------
1413 obj : `object`
1414 The dataset.
1416 Raises
1417 ------
1418 LookupError
1419 Raised if no matching dataset exists in the `Registry`.
1420 TypeError
1421 Raised if no collections were provided.
1423 Notes
1424 -----
1425 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1426 this method requires that the given data ID include temporal dimensions
1427 beyond the dimensions of the dataset type itself, in order to find the
1428 dataset with the appropriate validity range. For example, a "bias"
1429 dataset with native dimensions ``{instrument, detector}`` could be
1430 fetched with a ``{instrument, detector, exposure}`` data ID, because
1431 ``exposure`` is a temporal dimension.
1432 """
1433 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1434 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1435 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1437 def getURIs(
1438 self,
1439 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1440 /,
1441 dataId: Optional[DataId] = None,
1442 *,
1443 predict: bool = False,
1444 collections: Any = None,
1445 run: Optional[str] = None,
1446 **kwargs: Any,
1447 ) -> DatasetRefURIs:
1448 """Returns the URIs associated with the dataset.
1450 Parameters
1451 ----------
1452 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1453 When `DatasetRef` the `dataId` should be `None`.
1454 Otherwise the `DatasetType` or name thereof.
1455 dataId : `dict` or `DataCoordinate`
1456 A `dict` of `Dimension` link name, value pairs that label the
1457 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1458 should be provided as the first argument.
1459 predict : `bool`
1460 If `True`, allow URIs to be returned of datasets that have not
1461 been written.
1462 collections : Any, optional
1463 Collections to be searched, overriding ``self.collections``.
1464 Can be any of the types supported by the ``collections`` argument
1465 to butler construction.
1466 run : `str`, optional
1467 Run to use for predictions, overriding ``self.run``.
1468 **kwargs
1469 Additional keyword arguments used to augment or construct a
1470 `DataCoordinate`. See `DataCoordinate.standardize`
1471 parameters.
1473 Returns
1474 -------
1475 uris : `DatasetRefURIs`
1476 The URI to the primary artifact associated with this dataset (if
1477 the dataset was disassembled within the datastore this may be
1478 `None`), and the URIs to any components associated with the dataset
1479 artifact. (can be empty if there are no components).
1480 """
1481 ref = self._findDatasetRef(
1482 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1483 )
1484 if ref.id is None: # only possible if predict is True
1485 if run is None:
1486 run = self.run
1487 if run is None:
1488 raise TypeError("Cannot predict location with run=None.")
1489 # Lie about ID, because we can't guess it, and only
1490 # Datastore.getURIs() will ever see it (and it doesn't use it).
1491 ref = ref.resolved(id=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), run=run)
1492 return self.datastore.getURIs(ref, predict)
1494 def getURI(
1495 self,
1496 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1497 /,
1498 dataId: Optional[DataId] = None,
1499 *,
1500 predict: bool = False,
1501 collections: Any = None,
1502 run: Optional[str] = None,
1503 **kwargs: Any,
1504 ) -> ResourcePath:
1505 """Return the URI to the Dataset.
1507 Parameters
1508 ----------
1509 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1510 When `DatasetRef` the `dataId` should be `None`.
1511 Otherwise the `DatasetType` or name thereof.
1512 dataId : `dict` or `DataCoordinate`
1513 A `dict` of `Dimension` link name, value pairs that label the
1514 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1515 should be provided as the first argument.
1516 predict : `bool`
1517 If `True`, allow URIs to be returned of datasets that have not
1518 been written.
1519 collections : Any, optional
1520 Collections to be searched, overriding ``self.collections``.
1521 Can be any of the types supported by the ``collections`` argument
1522 to butler construction.
1523 run : `str`, optional
1524 Run to use for predictions, overriding ``self.run``.
1525 **kwargs
1526 Additional keyword arguments used to augment or construct a
1527 `DataCoordinate`. See `DataCoordinate.standardize`
1528 parameters.
1530 Returns
1531 -------
1532 uri : `lsst.resources.ResourcePath`
1533 URI pointing to the Dataset within the datastore. If the
1534 Dataset does not exist in the datastore, and if ``predict`` is
1535 `True`, the URI will be a prediction and will include a URI
1536 fragment "#predicted".
1537 If the datastore does not have entities that relate well
1538 to the concept of a URI the returned URI string will be
1539 descriptive. The returned URI is not guaranteed to be obtainable.
1541 Raises
1542 ------
1543 LookupError
1544 A URI has been requested for a dataset that does not exist and
1545 guessing is not allowed.
1546 ValueError
1547 Raised if a resolved `DatasetRef` was passed as an input, but it
1548 differs from the one found in the registry.
1549 TypeError
1550 Raised if no collections were provided.
1551 RuntimeError
1552 Raised if a URI is requested for a dataset that consists of
1553 multiple artifacts.
1554 """
1555 primary, components = self.getURIs(
1556 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1557 )
1559 if primary is None or components:
1560 raise RuntimeError(
1561 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1562 "Use Butler.getURIs() instead."
1563 )
1564 return primary
1566 def retrieveArtifacts(
1567 self,
1568 refs: Iterable[DatasetRef],
1569 destination: ResourcePathExpression,
1570 transfer: str = "auto",
1571 preserve_path: bool = True,
1572 overwrite: bool = False,
1573 ) -> List[ResourcePath]:
1574 """Retrieve the artifacts associated with the supplied refs.
1576 Parameters
1577 ----------
1578 refs : iterable of `DatasetRef`
1579 The datasets for which artifacts are to be retrieved.
1580 A single ref can result in multiple artifacts. The refs must
1581 be resolved.
1582 destination : `lsst.resources.ResourcePath` or `str`
1583 Location to write the artifacts.
1584 transfer : `str`, optional
1585 Method to use to transfer the artifacts. Must be one of the options
1586 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1587 "move" is not allowed.
1588 preserve_path : `bool`, optional
1589 If `True` the full path of the artifact within the datastore
1590 is preserved. If `False` the final file component of the path
1591 is used.
1592 overwrite : `bool`, optional
1593 If `True` allow transfers to overwrite existing files at the
1594 destination.
1596 Returns
1597 -------
1598 targets : `list` of `lsst.resources.ResourcePath`
1599 URIs of file artifacts in destination location. Order is not
1600 preserved.
1602 Notes
1603 -----
1604 For non-file datastores the artifacts written to the destination
1605 may not match the representation inside the datastore. For example
1606 a hierarchical data structure in a NoSQL database may well be stored
1607 as a JSON file.
1608 """
1609 return self.datastore.retrieveArtifacts(
1610 refs,
1611 ResourcePath(destination),
1612 transfer=transfer,
1613 preserve_path=preserve_path,
1614 overwrite=overwrite,
1615 )
1617 def datasetExists(
1618 self,
1619 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1620 dataId: Optional[DataId] = None,
1621 *,
1622 collections: Any = None,
1623 **kwargs: Any,
1624 ) -> bool:
1625 """Return True if the Dataset is actually present in the Datastore.
1627 Parameters
1628 ----------
1629 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1630 When `DatasetRef` the `dataId` should be `None`.
1631 Otherwise the `DatasetType` or name thereof.
1632 dataId : `dict` or `DataCoordinate`
1633 A `dict` of `Dimension` link name, value pairs that label the
1634 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1635 should be provided as the first argument.
1636 collections : Any, optional
1637 Collections to be searched, overriding ``self.collections``.
1638 Can be any of the types supported by the ``collections`` argument
1639 to butler construction.
1640 **kwargs
1641 Additional keyword arguments used to augment or construct a
1642 `DataCoordinate`. See `DataCoordinate.standardize`
1643 parameters.
1645 Raises
1646 ------
1647 LookupError
1648 Raised if the dataset is not even present in the Registry.
1649 ValueError
1650 Raised if a resolved `DatasetRef` was passed as an input, but it
1651 differs from the one found in the registry.
1652 TypeError
1653 Raised if no collections were provided.
1654 """
1655 # A resolved ref may be given that is not known to this butler.
1656 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1657 ref = self.registry.getDataset(datasetRefOrType.id)
1658 if ref is None:
1659 raise LookupError(
1660 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
1661 )
1662 else:
1663 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1664 return self.datastore.exists(ref)
1666 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1667 """Remove one or more `~CollectionType.RUN` collections and the
1668 datasets within them.
1670 Parameters
1671 ----------
1672 names : `Iterable` [ `str` ]
1673 The names of the collections to remove.
1674 unstore : `bool`, optional
1675 If `True` (default), delete datasets from all datastores in which
1676 they are present, and attempt to rollback the registry deletions if
1677 datastore deletions fail (which may not always be possible). If
1678 `False`, datastore records for these datasets are still removed,
1679 but any artifacts (e.g. files) will not be.
1681 Raises
1682 ------
1683 TypeError
1684 Raised if one or more collections are not of type
1685 `~CollectionType.RUN`.
1686 """
1687 if not self.isWriteable():
1688 raise TypeError("Butler is read-only.")
1689 names = list(names)
1690 refs: List[DatasetRef] = []
1691 for name in names:
1692 collectionType = self.registry.getCollectionType(name)
1693 if collectionType is not CollectionType.RUN:
1694 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1695 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1696 with self.datastore.transaction():
1697 with self.registry.transaction():
1698 if unstore:
1699 self.datastore.trash(refs)
1700 else:
1701 self.datastore.forget(refs)
1702 for name in names:
1703 self.registry.removeCollection(name)
1704 if unstore:
1705 # Point of no return for removing artifacts
1706 self.datastore.emptyTrash()
1708 def pruneDatasets(
1709 self,
1710 refs: Iterable[DatasetRef],
1711 *,
1712 disassociate: bool = True,
1713 unstore: bool = False,
1714 tags: Iterable[str] = (),
1715 purge: bool = False,
1716 ) -> None:
1717 # docstring inherited from LimitedButler
1719 if not self.isWriteable():
1720 raise TypeError("Butler is read-only.")
1721 if purge:
1722 if not disassociate:
1723 raise TypeError("Cannot pass purge=True without disassociate=True.")
1724 if not unstore:
1725 raise TypeError("Cannot pass purge=True without unstore=True.")
1726 elif disassociate:
1727 tags = tuple(tags)
1728 if not tags:
1729 raise TypeError("No tags provided but disassociate=True.")
1730 for tag in tags:
1731 collectionType = self.registry.getCollectionType(tag)
1732 if collectionType is not CollectionType.TAGGED:
1733 raise TypeError(
1734 f"Cannot disassociate from collection '{tag}' "
1735 f"of non-TAGGED type {collectionType.name}."
1736 )
1737 # For an execution butler we want to keep existing UUIDs for the
1738 # datasets, for that we need to keep them in the collections but
1739 # remove from datastore.
1740 if self._allow_put_of_predefined_dataset and purge:
1741 purge = False
1742 disassociate = False
1743 # Transform possibly-single-pass iterable into something we can iterate
1744 # over multiple times.
1745 refs = list(refs)
1746 # Pruning a component of a DatasetRef makes no sense since registry
1747 # doesn't know about components and datastore might not store
1748 # components in a separate file
1749 for ref in refs:
1750 if ref.datasetType.component():
1751 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1752 # We don't need an unreliable Datastore transaction for this, because
1753 # we've been extra careful to ensure that Datastore.trash only involves
1754 # mutating the Registry (it can _look_ at Datastore-specific things,
1755 # but shouldn't change them), and hence all operations here are
1756 # Registry operations.
1757 with self.datastore.transaction():
1758 with self.registry.transaction():
1759 if unstore:
1760 self.datastore.trash(refs)
1761 if purge:
1762 self.registry.removeDatasets(refs)
1763 elif disassociate:
1764 assert tags, "Guaranteed by earlier logic in this function."
1765 for tag in tags:
1766 self.registry.disassociate(tag, refs)
1767 # We've exited the Registry transaction, and apparently committed.
1768 # (if there was an exception, everything rolled back, and it's as if
1769 # nothing happened - and we never get here).
1770 # Datastore artifacts are not yet gone, but they're clearly marked
1771 # as trash, so if we fail to delete now because of (e.g.) filesystem
1772 # problems we can try again later, and if manual administrative
1773 # intervention is required, it's pretty clear what that should entail:
1774 # deleting everything on disk and in private Datastore tables that is
1775 # in the dataset_location_trash table.
1776 if unstore:
1777 # Point of no return for removing artifacts
1778 self.datastore.emptyTrash()
1780 @transactional
1781 def ingest(
1782 self,
1783 *datasets: FileDataset,
1784 transfer: Optional[str] = "auto",
1785 run: Optional[str] = None,
1786 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1787 record_validation_info: bool = True,
1788 ) -> None:
1789 """Store and register one or more datasets that already exist on disk.
1791 Parameters
1792 ----------
1793 datasets : `FileDataset`
1794 Each positional argument is a struct containing information about
1795 a file to be ingested, including its URI (either absolute or
1796 relative to the datastore root, if applicable), a `DatasetRef`,
1797 and optionally a formatter class or its fully-qualified string
1798 name. If a formatter is not provided, the formatter that would be
1799 used for `put` is assumed. On successful return, all
1800 `FileDataset.ref` attributes will have their `DatasetRef.id`
1801 attribute populated and all `FileDataset.formatter` attributes will
1802 be set to the formatter class used. `FileDataset.path` attributes
1803 may be modified to put paths in whatever the datastore considers a
1804 standardized form.
1805 transfer : `str`, optional
1806 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1807 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1808 transfer the file.
1809 run : `str`, optional
1810 The name of the run ingested datasets should be added to,
1811 overriding ``self.run``.
1812 idGenerationMode : `DatasetIdGenEnum`, optional
1813 Specifies option for generating dataset IDs. By default unique IDs
1814 are generated for each inserted dataset.
1815 record_validation_info : `bool`, optional
1816 If `True`, the default, the datastore can record validation
1817 information associated with the file. If `False` the datastore
1818 will not attempt to track any information such as checksums
1819 or file sizes. This can be useful if such information is tracked
1820 in an external system or if the file is to be compressed in place.
1821 It is up to the datastore whether this parameter is relevant.
1823 Raises
1824 ------
1825 TypeError
1826 Raised if the butler is read-only or if no run was provided.
1827 NotImplementedError
1828 Raised if the `Datastore` does not support the given transfer mode.
1829 DatasetTypeNotSupportedError
1830 Raised if one or more files to be ingested have a dataset type that
1831 is not supported by the `Datastore`..
1832 FileNotFoundError
1833 Raised if one of the given files does not exist.
1834 FileExistsError
1835 Raised if transfer is not `None` but the (internal) location the
1836 file would be moved to is already occupied.
1838 Notes
1839 -----
1840 This operation is not fully exception safe: if a database operation
1841 fails, the given `FileDataset` instances may be only partially updated.
1843 It is atomic in terms of database operations (they will either all
1844 succeed or all fail) providing the database engine implements
1845 transactions correctly. It will attempt to be atomic in terms of
1846 filesystem operations as well, but this cannot be implemented
1847 rigorously for most datastores.
1848 """
1849 if not self.isWriteable():
1850 raise TypeError("Butler is read-only.")
1851 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1852 # Reorganize the inputs so they're grouped by DatasetType and then
1853 # data ID. We also include a list of DatasetRefs for each FileDataset
1854 # to hold the resolved DatasetRefs returned by the Registry, before
1855 # it's safe to swap them into FileDataset.refs.
1856 # Some type annotation aliases to make that clearer:
1857 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1858 GroupedData = MutableMapping[DatasetType, GroupForType]
1859 # The actual data structure:
1860 groupedData: GroupedData = defaultdict(dict)
1861 # And the nested loop that populates it:
1862 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1863 # This list intentionally shared across the inner loop, since it's
1864 # associated with `dataset`.
1865 resolvedRefs: List[DatasetRef] = []
1867 # Somewhere to store pre-existing refs if we have an
1868 # execution butler.
1869 existingRefs: List[DatasetRef] = []
1871 for ref in dataset.refs:
1872 if ref.dataId in groupedData[ref.datasetType]:
1873 raise ConflictingDefinitionError(
1874 f"Ingest conflict. Dataset {dataset.path} has same"
1875 " DataId as other ingest dataset"
1876 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1877 f" ({ref.dataId})"
1878 )
1879 if self._allow_put_of_predefined_dataset:
1880 existing_ref = self.registry.findDataset(
1881 ref.datasetType, dataId=ref.dataId, collections=run
1882 )
1883 if existing_ref:
1884 if self.datastore.knows(existing_ref):
1885 raise ConflictingDefinitionError(
1886 f"Dataset associated with path {dataset.path}"
1887 f" already exists as {existing_ref}."
1888 )
1889 # Store this ref elsewhere since it already exists
1890 # and we do not want to remake it but we do want
1891 # to store it in the datastore.
1892 existingRefs.append(existing_ref)
1894 # Nothing else to do until we have finished
1895 # iterating.
1896 continue
1898 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1900 if existingRefs:
1901 if len(dataset.refs) != len(existingRefs):
1902 # Keeping track of partially pre-existing datasets is hard
1903 # and should generally never happen. For now don't allow
1904 # it.
1905 raise ConflictingDefinitionError(
1906 f"For dataset {dataset.path} some dataIds already exist"
1907 " in registry but others do not. This is not supported."
1908 )
1910 # Attach the resolved refs if we found them.
1911 dataset.refs = existingRefs
1913 # Now we can bulk-insert into Registry for each DatasetType.
1914 for datasetType, groupForType in progress.iter_item_chunks(
1915 groupedData.items(), desc="Bulk-inserting datasets by type"
1916 ):
1917 refs = self.registry.insertDatasets(
1918 datasetType,
1919 dataIds=groupForType.keys(),
1920 run=run,
1921 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1922 idGenerationMode=idGenerationMode,
1923 )
1924 # Append those resolved DatasetRefs to the new lists we set up for
1925 # them.
1926 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1927 resolvedRefs.append(ref)
1929 # Go back to the original FileDatasets to replace their refs with the
1930 # new resolved ones.
1931 for groupForType in progress.iter_chunks(
1932 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1933 ):
1934 for dataset, resolvedRefs in groupForType.values():
1935 dataset.refs = resolvedRefs
1937 # Bulk-insert everything into Datastore.
1938 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1940 @contextlib.contextmanager
1941 def export(
1942 self,
1943 *,
1944 directory: Optional[str] = None,
1945 filename: Optional[str] = None,
1946 format: Optional[str] = None,
1947 transfer: Optional[str] = None,
1948 ) -> Iterator[RepoExportContext]:
1949 """Export datasets from the repository represented by this `Butler`.
1951 This method is a context manager that returns a helper object
1952 (`RepoExportContext`) that is used to indicate what information from
1953 the repository should be exported.
1955 Parameters
1956 ----------
1957 directory : `str`, optional
1958 Directory dataset files should be written to if ``transfer`` is not
1959 `None`.
1960 filename : `str`, optional
1961 Name for the file that will include database information associated
1962 with the exported datasets. If this is not an absolute path and
1963 ``directory`` is not `None`, it will be written to ``directory``
1964 instead of the current working directory. Defaults to
1965 "export.{format}".
1966 format : `str`, optional
1967 File format for the database information file. If `None`, the
1968 extension of ``filename`` will be used.
1969 transfer : `str`, optional
1970 Transfer mode passed to `Datastore.export`.
1972 Raises
1973 ------
1974 TypeError
1975 Raised if the set of arguments passed is inconsistent.
1977 Examples
1978 --------
1979 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1980 methods are used to provide the iterables over data IDs and/or datasets
1981 to be exported::
1983 with butler.export("exports.yaml") as export:
1984 # Export all flats, but none of the dimension element rows
1985 # (i.e. data ID information) associated with them.
1986 export.saveDatasets(butler.registry.queryDatasets("flat"),
1987 elements=())
1988 # Export all datasets that start with "deepCoadd_" and all of
1989 # their associated data ID information.
1990 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1991 """
1992 if directory is None and transfer is not None:
1993 raise TypeError("Cannot transfer without providing a directory.")
1994 if transfer == "move":
1995 raise TypeError("Transfer may not be 'move': export is read-only")
1996 if format is None:
1997 if filename is None:
1998 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1999 else:
2000 _, format = os.path.splitext(filename)
2001 if not format:
2002 raise ValueError("Please specify a file extension to determine export format.")
2003 format = format[1:] # Strip leading ".""
2004 elif filename is None:
2005 filename = f"export.{format}"
2006 if directory is not None:
2007 filename = os.path.join(directory, filename)
2008 formats = self._config["repo_transfer_formats"]
2009 if format not in formats:
2010 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
2011 BackendClass = get_class_of(formats[format, "export"])
2012 with open(filename, "w") as stream:
2013 backend = BackendClass(stream, universe=self.registry.dimensions)
2014 try:
2015 helper = RepoExportContext(
2016 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2017 )
2018 yield helper
2019 except BaseException:
2020 raise
2021 else:
2022 helper._finish()
2024 def import_(
2025 self,
2026 *,
2027 directory: Optional[str] = None,
2028 filename: Union[str, TextIO, None] = None,
2029 format: Optional[str] = None,
2030 transfer: Optional[str] = None,
2031 skip_dimensions: Optional[Set] = None,
2032 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2033 reuseIds: bool = False,
2034 ) -> None:
2035 """Import datasets into this repository that were exported from a
2036 different butler repository via `~lsst.daf.butler.Butler.export`.
2038 Parameters
2039 ----------
2040 directory : `str`, optional
2041 Directory containing dataset files to import from. If `None`,
2042 ``filename`` and all dataset file paths specified therein must
2043 be absolute.
2044 filename : `str` or `TextIO`, optional
2045 A stream or name of file that contains database information
2046 associated with the exported datasets, typically generated by
2047 `~lsst.daf.butler.Butler.export`. If this a string (name) and
2048 is not an absolute path, does not exist in the current working
2049 directory, and ``directory`` is not `None`, it is assumed to be in
2050 ``directory``. Defaults to "export.{format}".
2051 format : `str`, optional
2052 File format for ``filename``. If `None`, the extension of
2053 ``filename`` will be used.
2054 transfer : `str`, optional
2055 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2056 skip_dimensions : `set`, optional
2057 Names of dimensions that should be skipped and not imported.
2058 idGenerationMode : `DatasetIdGenEnum`, optional
2059 Specifies option for generating dataset IDs when IDs are not
2060 provided or their type does not match backend type. By default
2061 unique IDs are generated for each inserted dataset.
2062 reuseIds : `bool`, optional
2063 If `True` then forces re-use of imported dataset IDs for integer
2064 IDs which are normally generated as auto-incremented; exception
2065 will be raised if imported IDs clash with existing ones. This
2066 option has no effect on the use of globally-unique IDs which are
2067 always re-used (or generated if integer IDs are being imported).
2069 Raises
2070 ------
2071 TypeError
2072 Raised if the set of arguments passed is inconsistent, or if the
2073 butler is read-only.
2074 """
2075 if not self.isWriteable():
2076 raise TypeError("Butler is read-only.")
2077 if format is None:
2078 if filename is None:
2079 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2080 else:
2081 _, format = os.path.splitext(filename) # type: ignore
2082 elif filename is None:
2083 filename = f"export.{format}"
2084 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2085 filename = os.path.join(directory, filename)
2086 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2088 def doImport(importStream: TextIO) -> None:
2089 backend = BackendClass(importStream, self.registry)
2090 backend.register()
2091 with self.transaction():
2092 backend.load(
2093 self.datastore,
2094 directory=directory,
2095 transfer=transfer,
2096 skip_dimensions=skip_dimensions,
2097 idGenerationMode=idGenerationMode,
2098 reuseIds=reuseIds,
2099 )
2101 if isinstance(filename, str):
2102 with open(filename, "r") as stream:
2103 doImport(stream)
2104 else:
2105 doImport(filename)
2107 def transfer_from(
2108 self,
2109 source_butler: LimitedButler,
2110 source_refs: Iterable[DatasetRef],
2111 transfer: str = "auto",
2112 skip_missing: bool = True,
2113 register_dataset_types: bool = False,
2114 transfer_dimensions: bool = False,
2115 ) -> collections.abc.Collection[DatasetRef]:
2116 """Transfer datasets to this Butler from a run in another Butler.
2118 Parameters
2119 ----------
2120 source_butler : `LimitedButler`
2121 Butler from which the datasets are to be transferred. If data IDs
2122 in ``source_refs`` are not expanded then this has to be a full
2123 `Butler` whose registry will be used to expand data IDs.
2124 source_refs : iterable of `DatasetRef`
2125 Datasets defined in the source butler that should be transferred to
2126 this butler.
2127 transfer : `str`, optional
2128 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2129 skip_missing : `bool`
2130 If `True`, datasets with no datastore artifact associated with
2131 them are not transferred. If `False` a registry entry will be
2132 created even if no datastore record is created (and so will
2133 look equivalent to the dataset being unstored).
2134 register_dataset_types : `bool`
2135 If `True` any missing dataset types are registered. Otherwise
2136 an exception is raised.
2137 transfer_dimensions : `bool`, optional
2138 If `True`, dimension record data associated with the new datasets
2139 will be transferred.
2141 Returns
2142 -------
2143 refs : `list` of `DatasetRef`
2144 The refs added to this Butler.
2146 Notes
2147 -----
2148 The datastore artifact has to exist for a transfer
2149 to be made but non-existence is not an error.
2151 Datasets that already exist in this run will be skipped.
2153 The datasets are imported as part of a transaction, although
2154 dataset types are registered before the transaction is started.
2155 This means that it is possible for a dataset type to be registered
2156 even though transfer has failed.
2157 """
2158 if not self.isWriteable():
2159 raise TypeError("Butler is read-only.")
2160 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2162 # Will iterate through the refs multiple times so need to convert
2163 # to a list if this isn't a collection.
2164 if not isinstance(source_refs, collections.abc.Collection):
2165 source_refs = list(source_refs)
2167 original_count = len(source_refs)
2168 log.info("Transferring %d datasets into %s", original_count, str(self))
2170 # In some situations the datastore artifact may be missing
2171 # and we do not want that registry entry to be imported.
2172 # Asking datastore is not sufficient, the records may have been
2173 # purged, we have to ask for the (predicted) URI and check
2174 # existence explicitly. Execution butler is set up exactly like
2175 # this with no datastore records.
2176 artifact_existence: Dict[ResourcePath, bool] = {}
2177 if skip_missing:
2178 dataset_existence = source_butler.datastore.mexists(
2179 source_refs, artifact_existence=artifact_existence
2180 )
2181 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2182 filtered_count = len(source_refs)
2183 n_missing = original_count - filtered_count
2184 log.verbose(
2185 "%d dataset%s removed because the artifact does not exist. Now have %d.",
2186 n_missing,
2187 "" if n_missing == 1 else "s",
2188 filtered_count,
2189 )
2191 # Importing requires that we group the refs by dataset type and run
2192 # before doing the import.
2193 source_dataset_types = set()
2194 grouped_refs = defaultdict(list)
2195 for ref in source_refs:
2196 grouped_refs[ref.datasetType, ref.run].append(ref)
2197 source_dataset_types.add(ref.datasetType)
2199 # Check to see if the dataset type in the source butler has
2200 # the same definition in the target butler and register missing
2201 # ones if requested. Registration must happen outside a transaction.
2202 newly_registered_dataset_types = set()
2203 for datasetType in source_dataset_types:
2204 if register_dataset_types:
2205 # Let this raise immediately if inconsistent. Continuing
2206 # on to find additional inconsistent dataset types
2207 # might result in additional unwanted dataset types being
2208 # registered.
2209 if self.registry.registerDatasetType(datasetType):
2210 newly_registered_dataset_types.add(datasetType)
2211 else:
2212 # If the dataset type is missing, let it fail immediately.
2213 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2214 if target_dataset_type != datasetType:
2215 raise ConflictingDefinitionError(
2216 "Source butler dataset type differs from definition"
2217 f" in target butler: {datasetType} !="
2218 f" {target_dataset_type}"
2219 )
2220 if newly_registered_dataset_types:
2221 # We may have registered some even if there were inconsistencies
2222 # but should let people know (or else remove them again).
2223 log.log(
2224 VERBOSE,
2225 "Registered the following dataset types in the target Butler: %s",
2226 ", ".join(d.name for d in newly_registered_dataset_types),
2227 )
2228 else:
2229 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2231 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2232 if transfer_dimensions:
2233 # Collect all the dimension records for these refs.
2234 # All dimensions are to be copied but the list of valid dimensions
2235 # come from this butler's universe.
2236 elements = frozenset(
2237 element
2238 for element in self.registry.dimensions.getStaticElements()
2239 if element.hasTable() and element.viewOf is None
2240 )
2241 dataIds = set(ref.dataId for ref in source_refs)
2242 # This logic comes from saveDataIds.
2243 for dataId in dataIds:
2244 # Need an expanded record, if not expanded that we need a full
2245 # butler with registry (allow mocks with registry too).
2246 if not dataId.hasRecords():
2247 if registry := getattr(source_butler, "registry", None):
2248 dataId = registry.expandDataId(dataId)
2249 else:
2250 raise TypeError("Input butler needs to be a full butler to expand DataId.")
2251 # If this butler doesn't know about a dimension in the source
2252 # butler things will break later.
2253 for record in dataId.records.values():
2254 if record is not None and record.definition in elements:
2255 dimension_records[record.definition].setdefault(record.dataId, record)
2257 handled_collections: Set[str] = set()
2259 # Do all the importing in a single transaction.
2260 with self.transaction():
2261 if dimension_records:
2262 log.verbose("Ensuring that dimension records exist for transferred datasets.")
2263 for element, r in dimension_records.items():
2264 records = [r[dataId] for dataId in r]
2265 # Assume that if the record is already present that we can
2266 # use it without having to check that the record metadata
2267 # is consistent.
2268 self.registry.insertDimensionData(element, *records, skip_existing=True)
2270 n_imported = 0
2271 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2272 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2273 ):
2274 if run not in handled_collections:
2275 # May need to create output collection. If source butler
2276 # has a registry, ask for documentation string.
2277 run_doc = None
2278 if registry := getattr(source_butler, "registry", None):
2279 run_doc = registry.getCollectionDocumentation(run)
2280 registered = self.registry.registerRun(run, doc=run_doc)
2281 handled_collections.add(run)
2282 if registered:
2283 log.log(VERBOSE, "Creating output run %s", run)
2285 n_refs = len(refs_to_import)
2286 log.verbose(
2287 "Importing %d ref%s of dataset type %s into run %s",
2288 n_refs,
2289 "" if n_refs == 1 else "s",
2290 datasetType.name,
2291 run,
2292 )
2294 # Assume we are using UUIDs and the source refs will match
2295 # those imported.
2296 imported_refs = self.registry._importDatasets(refs_to_import, expand=False)
2297 assert set(imported_refs) == set(refs_to_import)
2298 n_imported += len(imported_refs)
2300 assert len(source_refs) == n_imported
2301 log.verbose("Imported %d datasets into destination butler", n_imported)
2303 # Ask the datastore to transfer. The datastore has to check that
2304 # the source datastore is compatible with the target datastore.
2305 accepted, rejected = self.datastore.transfer_from(
2306 source_butler.datastore,
2307 source_refs,
2308 transfer=transfer,
2309 artifact_existence=artifact_existence,
2310 )
2311 if rejected:
2312 # For now, accept the registry entries but not the files.
2313 log.warning(
2314 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
2315 len(rejected),
2316 len(accepted),
2317 datasetType,
2318 run,
2319 )
2321 return source_refs
2323 def validateConfiguration(
2324 self,
2325 logFailures: bool = False,
2326 datasetTypeNames: Optional[Iterable[str]] = None,
2327 ignore: Iterable[str] | None = None,
2328 ) -> None:
2329 """Validate butler configuration.
2331 Checks that each `DatasetType` can be stored in the `Datastore`.
2333 Parameters
2334 ----------
2335 logFailures : `bool`, optional
2336 If `True`, output a log message for every validation error
2337 detected.
2338 datasetTypeNames : iterable of `str`, optional
2339 The `DatasetType` names that should be checked. This allows
2340 only a subset to be selected.
2341 ignore : iterable of `str`, optional
2342 Names of DatasetTypes to skip over. This can be used to skip
2343 known problems. If a named `DatasetType` corresponds to a
2344 composite, all components of that `DatasetType` will also be
2345 ignored.
2347 Raises
2348 ------
2349 ButlerValidationError
2350 Raised if there is some inconsistency with how this Butler
2351 is configured.
2352 """
2353 if datasetTypeNames:
2354 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2355 else:
2356 datasetTypes = list(self.registry.queryDatasetTypes())
2358 # filter out anything from the ignore list
2359 if ignore:
2360 ignore = set(ignore)
2361 datasetTypes = [
2362 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2363 ]
2364 else:
2365 ignore = set()
2367 # Find all the registered instruments
2368 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2370 # For each datasetType that has an instrument dimension, create
2371 # a DatasetRef for each defined instrument
2372 datasetRefs = []
2374 for datasetType in datasetTypes:
2375 if "instrument" in datasetType.dimensions:
2376 for instrument in instruments:
2377 datasetRef = DatasetRef(
2378 datasetType, {"instrument": instrument}, conform=False # type: ignore
2379 )
2380 datasetRefs.append(datasetRef)
2382 entities: List[Union[DatasetType, DatasetRef]] = []
2383 entities.extend(datasetTypes)
2384 entities.extend(datasetRefs)
2386 datastoreErrorStr = None
2387 try:
2388 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2389 except ValidationError as e:
2390 datastoreErrorStr = str(e)
2392 # Also check that the LookupKeys used by the datastores match
2393 # registry and storage class definitions
2394 keys = self.datastore.getLookupKeys()
2396 failedNames = set()
2397 failedDataId = set()
2398 for key in keys:
2399 if key.name is not None:
2400 if key.name in ignore:
2401 continue
2403 # skip if specific datasetType names were requested and this
2404 # name does not match
2405 if datasetTypeNames and key.name not in datasetTypeNames:
2406 continue
2408 # See if it is a StorageClass or a DatasetType
2409 if key.name in self.storageClasses:
2410 pass
2411 else:
2412 try:
2413 self.registry.getDatasetType(key.name)
2414 except KeyError:
2415 if logFailures:
2416 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2417 failedNames.add(key)
2418 else:
2419 # Dimensions are checked for consistency when the Butler
2420 # is created and rendezvoused with a universe.
2421 pass
2423 # Check that the instrument is a valid instrument
2424 # Currently only support instrument so check for that
2425 if key.dataId:
2426 dataIdKeys = set(key.dataId)
2427 if set(["instrument"]) != dataIdKeys:
2428 if logFailures:
2429 log.critical("Key '%s' has unsupported DataId override", key)
2430 failedDataId.add(key)
2431 elif key.dataId["instrument"] not in instruments:
2432 if logFailures:
2433 log.critical("Key '%s' has unknown instrument", key)
2434 failedDataId.add(key)
2436 messages = []
2438 if datastoreErrorStr:
2439 messages.append(datastoreErrorStr)
2441 for failed, msg in (
2442 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2443 (failedDataId, "Keys with bad DataId entries: "),
2444 ):
2445 if failed:
2446 msg += ", ".join(str(k) for k in failed)
2447 messages.append(msg)
2449 if messages:
2450 raise ValidationError(";\n".join(messages))
2452 @property
2453 def collections(self) -> Sequence[str]:
2454 """The collections to search by default, in order
2455 (`Sequence` [ `str` ]).
2457 This is an alias for ``self.registry.defaults.collections``. It cannot
2458 be set directly in isolation, but all defaults may be changed together
2459 by assigning a new `RegistryDefaults` instance to
2460 ``self.registry.defaults``.
2461 """
2462 return self.registry.defaults.collections
2464 @property
2465 def run(self) -> Optional[str]:
2466 """Name of the run this butler writes outputs to by default (`str` or
2467 `None`).
2469 This is an alias for ``self.registry.defaults.run``. It cannot be set
2470 directly in isolation, but all defaults may be changed together by
2471 assigning a new `RegistryDefaults` instance to
2472 ``self.registry.defaults``.
2473 """
2474 return self.registry.defaults.run
2476 @property
2477 def dimensions(self) -> DimensionUniverse:
2478 # Docstring inherited.
2479 return self.registry.dimensions
2481 registry: Registry
2482 """The object that manages dataset metadata and relationships (`Registry`).
2484 Most operations that don't involve reading or writing butler datasets are
2485 accessible only via `Registry` methods.
2486 """
2488 datastore: Datastore
2489 """The object that manages actual dataset storage (`Datastore`).
2491 Direct user access to the datastore should rarely be necessary; the primary
2492 exception is the case where a `Datastore` implementation provides extra
2493 functionality beyond what the base class defines.
2494 """
2496 storageClasses: StorageClassFactory
2497 """An object that maps known storage class names to objects that fully
2498 describe them (`StorageClassFactory`).
2499 """
2501 _allow_put_of_predefined_dataset: bool
2502 """Allow a put to succeed even if there is already a registry entry for it
2503 but not a datastore record. (`bool`)."""