Coverage for python/lsst/daf/butler/_butler.py: 8%
691 statements
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30)
32import collections.abc
33import contextlib
34import io
35import logging
36import numbers
37import os
38import uuid
39import warnings
40from collections import defaultdict
41from typing import (
42 TYPE_CHECKING,
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Sequence,
53 Set,
54 TextIO,
55 Tuple,
56 Type,
57 Union,
58)
60from deprecated.sphinx import deprecated
61from lsst.resources import ResourcePath, ResourcePathExpression
62from lsst.utils import doImportType
63from lsst.utils.introspection import get_class_of
64from lsst.utils.logging import VERBOSE, getLogger
66from ._butlerConfig import ButlerConfig
67from ._butlerRepoIndex import ButlerRepoIndex
68from ._deferredDatasetHandle import DeferredDatasetHandle
69from ._limited_butler import LimitedButler
70from .core import (
71 AmbiguousDatasetError,
72 Config,
73 ConfigSubset,
74 DataCoordinate,
75 DataId,
76 DataIdValue,
77 DatasetIdGenEnum,
78 DatasetRef,
79 DatasetRefURIs,
80 DatasetType,
81 Datastore,
82 Dimension,
83 DimensionConfig,
84 DimensionElement,
85 DimensionRecord,
86 DimensionUniverse,
87 FileDataset,
88 Progress,
89 StorageClass,
90 StorageClassFactory,
91 Timespan,
92 UnresolvedRefWarning,
93 ValidationError,
94)
95from .core.repoRelocation import BUTLER_ROOT_TAG
96from .core.utils import transactional
97from .registry import (
98 CollectionType,
99 ConflictingDefinitionError,
100 DataIdError,
101 MissingDatasetTypeError,
102 Registry,
103 RegistryConfig,
104 RegistryDefaults,
105)
106from .transfers import RepoExportContext
108if TYPE_CHECKING:
109 from lsst.resources import ResourceHandleProtocol
111log = getLogger(__name__)
114class ButlerValidationError(ValidationError):
115 """There is a problem with the Butler configuration."""
117 pass
120class Butler(LimitedButler):
121 """Main entry point for the data access system.
123 Parameters
124 ----------
125 config : `ButlerConfig`, `Config` or `str`, optional.
126 Configuration. Anything acceptable to the
127 `ButlerConfig` constructor. If a directory path
128 is given the configuration will be read from a ``butler.yaml`` file in
129 that location. If `None` is given default values will be used.
130 butler : `Butler`, optional.
131 If provided, construct a new Butler that uses the same registry and
132 datastore as the given one, but with the given collection and run.
133 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
134 arguments.
135 collections : `str` or `Iterable` [ `str` ], optional
136 An expression specifying the collections to be searched (in order) when
137 reading datasets.
138 This may be a `str` collection name or an iterable thereof.
139 See :ref:`daf_butler_collection_expressions` for more information.
140 These collections are not registered automatically and must be
141 manually registered before they are used by any method, but they may be
142 manually registered after the `Butler` is initialized.
143 run : `str`, optional
144 Name of the `~CollectionType.RUN` collection new datasets should be
145 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
146 ``collections`` will be set to ``[run]``. If not `None`, this
147 collection will automatically be registered. If this is not set (and
148 ``writeable`` is not set either), a read-only butler will be created.
149 searchPaths : `list` of `str`, optional
150 Directory paths to search when calculating the full Butler
151 configuration. Not used if the supplied config is already a
152 `ButlerConfig`.
153 writeable : `bool`, optional
154 Explicitly sets whether the butler supports write operations. If not
155 provided, a read-write butler is created if any of ``run``, ``tags``,
156 or ``chains`` is non-empty.
157 inferDefaults : `bool`, optional
158 If `True` (default) infer default data ID values from the values
159 present in the datasets in ``collections``: if all collections have the
160 same value (or no value) for a governor dimension, that value will be
161 the default for that dimension. Nonexistent collections are ignored.
162 If a default value is provided explicitly for a governor dimension via
163 ``**kwargs``, no default will be inferred for that dimension.
164 **kwargs : `str`
165 Default data ID key-value pairs. These may only identify "governor"
166 dimensions like ``instrument`` and ``skymap``.
168 Examples
169 --------
170 While there are many ways to control exactly how a `Butler` interacts with
171 the collections in its `Registry`, the most common cases are still simple.
173 For a read-only `Butler` that searches one collection, do::
175 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
177 For a read-write `Butler` that writes to and reads from a
178 `~CollectionType.RUN` collection::
180 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
182 The `Butler` passed to a ``PipelineTask`` is often much more complex,
183 because we want to write to one `~CollectionType.RUN` collection but read
184 from several others (as well)::
186 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
187 collections=["u/alice/DM-50000/a",
188 "u/bob/DM-49998",
189 "HSC/defaults"])
191 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
192 Datasets will be read first from that run (since it appears first in the
193 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
195 Finally, one can always create a `Butler` with no collections::
197 butler = Butler("/path/to/repo", writeable=True)
199 This can be extremely useful when you just want to use ``butler.registry``,
200 e.g. for inserting dimension data or managing collections, or when the
201 collections you want to use with the butler are not consistent.
202 Passing ``writeable`` explicitly here is only necessary if you want to be
203 able to make changes to the repo - usually the value for ``writeable`` can
204 be guessed from the collection arguments provided, but it defaults to
205 `False` when there are no collection arguments.
206 """
208 def __init__(
209 self,
210 config: Union[Config, str, None] = None,
211 *,
212 butler: Optional[Butler] = None,
213 collections: Any = None,
214 run: Optional[str] = None,
215 searchPaths: Optional[List[str]] = None,
216 writeable: Optional[bool] = None,
217 inferDefaults: bool = True,
218 **kwargs: str,
219 ):
220 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
221 # Load registry, datastore, etc. from config or existing butler.
222 if butler is not None:
223 if config is not None or searchPaths is not None or writeable is not None:
224 raise TypeError(
225 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
226 )
227 self.registry = butler.registry.copy(defaults)
228 self.datastore = butler.datastore
229 self.storageClasses = butler.storageClasses
230 self._config: ButlerConfig = butler._config
231 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
232 else:
233 # Can only look for strings in the known repos list.
234 if isinstance(config, str) and config in self.get_known_repos():
235 config = str(self.get_repo_uri(config))
236 try:
237 self._config = ButlerConfig(config, searchPaths=searchPaths)
238 except FileNotFoundError as e:
239 if known := self.get_known_repos():
240 aliases = f"(known aliases: {', '.join(known)})"
241 else:
242 aliases = "(no known aliases)"
243 raise FileNotFoundError(f"{e} {aliases}") from e
244 self._config = ButlerConfig(config, searchPaths=searchPaths)
245 try:
246 if "root" in self._config:
247 butlerRoot = self._config["root"]
248 else:
249 butlerRoot = self._config.configDir
250 if writeable is None:
251 writeable = run is not None
252 self.registry = Registry.fromConfig(
253 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
254 )
255 self.datastore = Datastore.fromConfig(
256 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
257 )
258 self.storageClasses = StorageClassFactory()
259 self.storageClasses.addFromConfig(self._config)
260 self._allow_put_of_predefined_dataset = self._config.get(
261 "allow_put_of_predefined_dataset", False
262 )
263 except Exception:
264 # Failures here usually mean that configuration is incomplete,
265 # just issue an error message which includes config file URI.
266 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
267 raise
269 # For execution butler the datastore needs a special
270 # dependency-inversion trick. This is not used by regular butler,
271 # but we do not have a way to distinguish regular butler from execution
272 # butler.
273 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
275 if "run" in self._config or "collection" in self._config:
276 raise ValueError("Passing a run or collection via configuration is no longer supported.")
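# Example (hypothetical paths and collection names): constructing a second
# Butler that shares the registry and datastore of an existing one but
# writes to a different run, as described for the ``butler`` parameter.
#
#     base = Butler("/path/to/repo", collections=["HSC/defaults"])
#     writer = Butler(butler=base, run="u/alice/scratch")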
278 GENERATION: ClassVar[int] = 3
279 """This is a Generation 3 Butler.
281 This attribute may be removed in the future, once the Generation 2 Butler
282 interface has been fully retired; it should only be used in transitional
283 code.
284 """
286 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
287 """Return DatasetType defined in registry given dataset type name."""
288 try:
289 return self.registry.getDatasetType(name)
290 except MissingDatasetTypeError:
291 return None
293 @classmethod
294 def get_repo_uri(cls, label: str) -> ResourcePath:
295 """Look up the label in a butler repository index.
297 Parameters
298 ----------
299 label : `str`
300 Label of the Butler repository to look up.
302 Returns
303 -------
304 uri : `lsst.resources.ResourcePath`
305 URI to the Butler repository associated with the given label.
307 Raises
308 ------
309 KeyError
310 Raised if the label is not found in the index, or if an index
311 can not be found at all.
313 Notes
314 -----
315 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
316 information is discovered.
317 """
318 return ButlerRepoIndex.get_repo_uri(label)
320 @classmethod
321 def get_known_repos(cls) -> Set[str]:
322 """Retrieve the list of known repository labels.
324 Returns
325 -------
326 repos : `set` of `str`
327 All the known labels. Can be empty if no index can be found.
329 Notes
330 -----
331 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
332 information is discovered.
333 """
334 return ButlerRepoIndex.get_known_repos()
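# Example (hypothetical alias "main"; assumes a butler repository index has
# been configured, see ButlerRepoIndex): resolving a label before use.
#
#     if "main" in Butler.get_known_repos():
#         print(Butler.get_repo_uri("main"))
#         butler = Butler("main", collections=["HSC/defaults"])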
336 @staticmethod
337 def makeRepo(
338 root: ResourcePathExpression,
339 config: Union[Config, str, None] = None,
340 dimensionConfig: Union[Config, str, None] = None,
341 standalone: bool = False,
342 searchPaths: Optional[List[str]] = None,
343 forceConfigRoot: bool = True,
344 outfile: Optional[ResourcePathExpression] = None,
345 overwrite: bool = False,
346 ) -> Config:
347 """Create an empty data repository by adding a butler.yaml config
348 to a repository root directory.
350 Parameters
351 ----------
352 root : `lsst.resources.ResourcePathExpression`
353 Path or URI to the root location of the new repository. Will be
354 created if it does not exist.
355 config : `Config` or `str`, optional
356 Configuration to write to the repository, after setting any
357 root-dependent Registry or Datastore config options. Can not
358 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
359 configuration will be used. Root-dependent config options
360 specified in this config are overwritten if ``forceConfigRoot``
361 is `True`.
362 dimensionConfig : `Config` or `str`, optional
363 Configuration for dimensions, will be used to initialize registry
364 database.
365 standalone : `bool`
366 If True, write all expanded defaults, not just customized or
367 repository-specific settings.
368 This (mostly) decouples the repository from the default
369 configuration, insulating it from changes to the defaults (which
370 may be good or bad, depending on the nature of the changes).
371 Future *additions* to the defaults will still be picked up when
372 initializing `Butlers` to repos created with ``standalone=True``.
373 searchPaths : `list` of `str`, optional
374 Directory paths to search when calculating the full butler
375 configuration.
376 forceConfigRoot : `bool`, optional
377 If `False`, any values present in the supplied ``config`` that
378 would normally be reset are not overridden and will appear
379 directly in the output config. This allows non-standard overrides
380 of the root directory for a datastore or registry to be given.
381 If this parameter is `True` the values for ``root`` will be
382 forced into the resulting config if appropriate.
383 outfile : `lsst.resources.ResourcePathExpression`, optional
384 If not-`None`, the output configuration will be written to this
385 location rather than into the repository itself. Can be a URI
386 string. Can refer to a directory that will be used to write
387 ``butler.yaml``.
388 overwrite : `bool`, optional
389 Create a new configuration file even if one already exists
390 in the specified output location. Default is to raise
391 an exception.
393 Returns
394 -------
395 config : `Config`
396 The updated `Config` instance written to the repo.
398 Raises
399 ------
400 ValueError
401 Raised if a ButlerConfig or ConfigSubset is passed instead of a
402 regular Config (as these subclasses would make it impossible to
403 support ``standalone=False``).
404 FileExistsError
405 Raised if the output config file already exists.
406 os.error
407 Raised if the directory does not exist, exists but is not a
408 directory, or cannot be created.
410 Notes
411 -----
412 Note that when ``standalone=False`` (the default), the configuration
413 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
414 construct the repository should also be used to construct any Butlers
415 to avoid configuration inconsistencies.
416 """
417 if isinstance(config, (ButlerConfig, ConfigSubset)):
418 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
420 # Ensure that the root of the repository exists or can be made
421 root_uri = ResourcePath(root, forceDirectory=True)
422 root_uri.mkdir()
424 config = Config(config)
426 # If we are creating a new repo from scratch with relative roots,
427 # do not propagate an explicit root from the config file
428 if "root" in config:
429 del config["root"]
431 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
432 imported_class = doImportType(full["datastore", "cls"])
433 if not issubclass(imported_class, Datastore):
434 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
435 datastoreClass: Type[Datastore] = imported_class
436 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
438 # if key exists in given config, parse it, otherwise parse the defaults
439 # in the expanded config
440 if config.get(("registry", "db")):
441 registryConfig = RegistryConfig(config)
442 else:
443 registryConfig = RegistryConfig(full)
444 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
445 if defaultDatabaseUri is not None:
446 Config.updateParameters(
447 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
448 )
449 else:
450 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
452 if standalone:
453 config.merge(full)
454 else:
455 # Always expand the registry.managers section into the per-repo
456 # config, because after the database schema is created, it's not
457 # allowed to change anymore. Note that in the standalone=True
458 # branch, _everything_ in the config is expanded, so there's no
459 # need to special case this.
460 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
461 configURI: ResourcePathExpression
462 if outfile is not None:
463 # When writing to a separate location we must include
464 # the root of the butler repo in the config else it won't know
465 # where to look.
466 config["root"] = root_uri.geturl()
467 configURI = outfile
468 else:
469 configURI = root_uri
470 # Strip obscore configuration, if it is present, before writing config
471 # to a file, obscore config will be stored in registry.
472 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
473 config_to_write = config.copy()
474 del config_to_write[obscore_config_key]
475 config_to_write.dumpToUri(configURI, overwrite=overwrite)
476 # configFile attribute is updated, need to copy it to original.
477 config.configFile = config_to_write.configFile
478 else:
479 config.dumpToUri(configURI, overwrite=overwrite)
481 # Create Registry and populate tables
482 registryConfig = RegistryConfig(config.get("registry"))
483 dimensionConfig = DimensionConfig(dimensionConfig)
484 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
486 log.verbose("Wrote new Butler configuration file to %s", configURI)
488 return config
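# Example (hypothetical path): creating an empty repository with the
# default configuration and dimension universe, then opening it read-write.
#
#     Butler.makeRepo("/path/to/new/repo")
#     butler = Butler("/path/to/new/repo", writeable=True)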
490 @classmethod
491 def _unpickle(
492 cls,
493 config: ButlerConfig,
494 collections: Optional[tuple[str, ...]],
495 run: Optional[str],
496 defaultDataId: Dict[str, str],
497 writeable: bool,
498 ) -> Butler:
499 """Callable used to unpickle a Butler.
501 We prefer not to use ``Butler.__init__`` directly so we can force some
502 of its many arguments to be keyword-only (note that ``__reduce__``
503 can only invoke callables with positional arguments).
505 Parameters
506 ----------
507 config : `ButlerConfig`
508 Butler configuration, already coerced into a true `ButlerConfig`
509 instance (and hence after any search paths for overrides have been
510 utilized).
511 collections : `tuple` [ `str` ]
512 Names of the default collections to read from.
513 run : `str`, optional
514 Name of the default `~CollectionType.RUN` collection to write to.
515 defaultDataId : `dict` [ `str`, `str` ]
516 Default data ID values.
517 writeable : `bool`
518 Whether the Butler should support write operations.
520 Returns
521 -------
522 butler : `Butler`
523 A new `Butler` instance.
524 """
525 # MyPy doesn't recognize that the kwargs below are totally valid; it
526 # seems to think ``**defaultDataId`` is a _positional_ argument!
527 return cls(
528 config=config,
529 collections=collections,
530 run=run,
531 writeable=writeable,
532 **defaultDataId, # type: ignore
533 )
535 def __reduce__(self) -> tuple:
536 """Support pickling."""
537 return (
538 Butler._unpickle,
539 (
540 self._config,
541 self.collections,
542 self.run,
543 self.registry.defaults.dataId.byName(),
544 self.registry.isWriteable(),
545 ),
546 )
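# Example: a Butler can be round-tripped through pickle; the copy is
# reconstructed from the configuration, default collections/run, default
# data ID, and writeability captured above.
#
#     import pickle
#     clone = pickle.loads(pickle.dumps(butler))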
548 def __str__(self) -> str:
549 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
550 self.collections, self.run, self.datastore, self.registry
551 )
553 def isWriteable(self) -> bool:
554 """Return `True` if this `Butler` supports write operations."""
555 return self.registry.isWriteable()
557 @contextlib.contextmanager
558 def transaction(self) -> Iterator[None]:
559 """Context manager supporting `Butler` transactions.
561 Transactions can be nested.
562 """
563 with self.registry.transaction():
564 with self.datastore.transaction():
565 yield
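# Example (hypothetical dataset type and data ID): grouping operations so
# that a failure rolls back both registry and datastore changes together.
#
#     with butler.transaction():
#         butler.put(catalog, "sourceTable_visit", instrument="HSC", visit=1228)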
567 def _standardizeArgs(
568 self,
569 datasetRefOrType: Union[DatasetRef, DatasetType, str],
570 dataId: Optional[DataId] = None,
571 for_put: bool = True,
572 **kwargs: Any,
573 ) -> Tuple[DatasetType, Optional[DataId]]:
574 """Standardize the arguments passed to several Butler APIs.
576 Parameters
577 ----------
578 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
579 When `DatasetRef` the `dataId` should be `None`.
580 Otherwise the `DatasetType` or name thereof.
581 dataId : `dict` or `DataCoordinate`
582 A `dict` of `Dimension` link name, value pairs that label the
583 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
584 should be provided as the second argument.
585 for_put : `bool`, optional
586 If `True` this call is invoked as part of a `Butler.put()`.
587 Otherwise it is assumed to be part of a `Butler.get()`. This
588 parameter is only relevant if there is dataset type
589 inconsistency.
590 **kwargs
591 Additional keyword arguments used to augment or construct a
592 `DataCoordinate`. See `DataCoordinate.standardize`
593 parameters.
595 Returns
596 -------
597 datasetType : `DatasetType`
598 A `DatasetType` instance extracted from ``datasetRefOrType``.
599 dataId : `dict` or `DataId`, optional
600 Argument that can be used (along with ``kwargs``) to construct a
601 `DataId`.
603 Notes
604 -----
605 Butler APIs that conceptually need a DatasetRef also allow passing a
606 `DatasetType` (or the name of one) and a `DataId` (or a dict and
607 keyword arguments that can be used to construct one) separately. This
608 method accepts those arguments and always returns a true `DatasetType`
609 and a `DataId` or `dict`.
611 Standardization of `dict` vs `DataId` is best handled by passing the
612 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
613 generally similarly flexible.
614 """
615 externalDatasetType: Optional[DatasetType] = None
616 internalDatasetType: Optional[DatasetType] = None
617 if isinstance(datasetRefOrType, DatasetRef):
618 if dataId is not None or kwargs:
619 raise ValueError("DatasetRef given, cannot use dataId as well")
620 externalDatasetType = datasetRefOrType.datasetType
621 dataId = datasetRefOrType.dataId
622 else:
623 # Don't check whether DataId is provided, because Registry APIs
624 # can usually construct a better error message when it wasn't.
625 if isinstance(datasetRefOrType, DatasetType):
626 externalDatasetType = datasetRefOrType
627 else:
628 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
630 # Check that they are self-consistent
631 if externalDatasetType is not None:
632 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
633 if externalDatasetType != internalDatasetType:
634 # We can allow differences if they are compatible, depending
635 # on whether this is a get or a put. A get requires that
636 # the python type associated with the datastore can be
637 # converted to the user type. A put requires that the user
638 # supplied python type can be converted to the internal
639 # type expected by registry.
640 relevantDatasetType = internalDatasetType
641 if for_put:
642 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
643 else:
644 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
645 relevantDatasetType = externalDatasetType
646 if not is_compatible:
647 raise ValueError(
648 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
649 f"registry definition ({internalDatasetType})"
650 )
651 # Override the internal definition.
652 internalDatasetType = relevantDatasetType
654 assert internalDatasetType is not None
655 return internalDatasetType, dataId
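# Example (hypothetical names and data ID values): the two calling
# conventions reconciled by _standardizeArgs refer to the same dataset.
#
#     butler.get("calexp", instrument="HSC", visit=1228, detector=42)
#     butler.get(ref)  # ``ref`` being a resolved DatasetRef for that dataset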
657 def _rewrite_data_id(
658 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
659 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
660 """Rewrite a data ID taking into account dimension records.
662 Take a Data ID and keyword args and rewrite it if necessary to
663 allow the user to specify dimension records rather than dimension
664 primary values.
666 This allows a user to include a dataId dict with keys of
667 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
668 the integer exposure ID. It also allows a string to be given
669 for a dimension value rather than the integer ID if that is more
670 convenient. For example, rather than having to specify the
671 detector with ``detector.full_name``, a string given for ``detector``
672 will be interpreted as the full name and converted to the integer
673 value.
675 Keyword arguments can also use strings for dimensions like detector
676 and exposure but python does not allow them to include ``.`` and
677 so the ``exposure.day_obs`` syntax can not be used in a keyword
678 argument.
680 Parameters
681 ----------
682 dataId : `dict` or `DataCoordinate`
683 A `dict` of `Dimension` link name, value pairs that will label the
684 `DatasetRef` within a Collection.
685 datasetType : `DatasetType`
686 The dataset type associated with this dataId. Required to
687 determine the relevant dimensions.
688 **kwargs
689 Additional keyword arguments used to augment or construct a
690 `DataId`. See `DataId` parameters.
692 Returns
693 -------
694 dataId : `dict` or `DataCoordinate`
695 The dataId, possibly rewritten. If given a `DataCoordinate` and
696 no keyword arguments, the original dataId will be returned
697 unchanged.
698 **kwargs : `dict`
699 Any unused keyword arguments (would normally be empty dict).
700 """
701 # Do nothing if we have a standalone DataCoordinate.
702 if isinstance(dataId, DataCoordinate) and not kwargs:
703 return dataId, kwargs
705 # Process dimension records that are using record information
706 # rather than ids
707 newDataId: Dict[str, DataIdValue] = {}
708 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
710 # if all the dataId comes from keyword parameters we do not need
711 # to do anything here because they can't be of the form
712 # exposure.obs_id because a "." is not allowed in a keyword parameter.
713 if dataId:
714 for k, v in dataId.items():
715 # If we have a Dimension we do not need to do anything
716 # because it cannot be a compound key.
717 if isinstance(k, str) and "." in k:
718 # Someone is using a more human-readable dataId
719 dimensionName, record = k.split(".", 1)
720 byRecord[dimensionName][record] = v
721 elif isinstance(k, Dimension):
722 newDataId[k.name] = v
723 else:
724 newDataId[k] = v
726 # Go through the updated dataId and check the type in case someone is
727 # using an alternate key. We have already filtered out the compound
728 # keys dimensions.record format.
729 not_dimensions = {}
731 # Will need to look in the dataId and the keyword arguments
732 # and will remove them if they need to be fixed or are unrecognized.
733 for dataIdDict in (newDataId, kwargs):
734 # Use a list so we can adjust the dict safely in the loop
735 for dimensionName in list(dataIdDict):
736 value = dataIdDict[dimensionName]
737 try:
738 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
739 except KeyError:
740 # This is not a real dimension
741 not_dimensions[dimensionName] = value
742 del dataIdDict[dimensionName]
743 continue
745 # Convert an integral type to an explicit int to simplify
746 # comparisons here
747 if isinstance(value, numbers.Integral):
748 value = int(value)
750 if not isinstance(value, dimension.primaryKey.getPythonType()):
751 for alternate in dimension.alternateKeys:
752 if isinstance(value, alternate.getPythonType()):
753 byRecord[dimensionName][alternate.name] = value
754 del dataIdDict[dimensionName]
755 log.debug(
756 "Converting dimension %s to %s.%s=%s",
757 dimensionName,
758 dimensionName,
759 alternate.name,
760 value,
761 )
762 break
763 else:
764 log.warning(
765 "Type mismatch found for value '%r' provided for dimension %s. "
766 "Could not find matching alternative (primary key has type %s) "
767 "so attempting to use as-is.",
768 value,
769 dimensionName,
770 dimension.primaryKey.getPythonType(),
771 )
773 # By this point kwargs and newDataId should only include valid
774 # dimensions. Merge kwargs in to the new dataId and log if there
775 # are dimensions in both (rather than calling update).
776 for k, v in kwargs.items():
777 if k in newDataId and newDataId[k] != v:
778 log.debug(
779 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
780 )
781 newDataId[k] = v
782 # No need to retain any values in kwargs now.
783 kwargs = {}
785 # If we have some unrecognized dimensions we have to try to connect
786 # them to records in other dimensions. This is made more complicated
787 # by some dimensions having records with clashing names. A mitigation
788 # is that we can tell by this point which dimensions are missing
789 # for the DatasetType but this does not work for calibrations
790 # where additional dimensions can be used to constrain the temporal
791 # axis.
792 if not_dimensions:
793 # Search for all dimensions even if we have been given a value
794 # explicitly. In some cases records are given as well as the
795 actual dimension and this should not be an error if they
796 # match.
797 mandatoryDimensions = datasetType.dimensions.names # - provided
799 candidateDimensions: Set[str] = set()
800 candidateDimensions.update(mandatoryDimensions)
802 # For calibrations we may well be needing temporal dimensions
803 # so rather than always including all dimensions in the scan
804 # restrict things a little. It is still possible for there
805 # to be confusion over day_obs in visit vs exposure for example.
806 # If we are not searching calibration collections things may
807 # fail but they are going to fail anyway because of the
808 ambiguity of the dataId...
809 if datasetType.isCalibration():
810 for dim in self.registry.dimensions.getStaticDimensions():
811 if dim.temporal:
812 candidateDimensions.add(str(dim))
814 # Look up table for the first association with a dimension
815 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
817 # Keep track of whether an item is associated with multiple
818 # dimensions.
819 counter: Counter[str] = Counter()
820 assigned: Dict[str, Set[str]] = defaultdict(set)
822 # Go through the missing dimensions and associate the
823 # given names with records within those dimensions
824 matched_dims = set()
825 for dimensionName in candidateDimensions:
826 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
827 fields = dimension.metadata.names | dimension.uniqueKeys.names
828 for field in not_dimensions:
829 if field in fields:
830 guessedAssociation[dimensionName][field] = not_dimensions[field]
831 counter[dimensionName] += 1
832 assigned[field].add(dimensionName)
833 matched_dims.add(field)
835 # Calculate the fields that matched nothing.
836 never_found = set(not_dimensions) - matched_dims
838 if never_found:
839 raise ValueError(f"Unrecognized keyword args given: {never_found}")
841 # There is a chance we have allocated a single dataId item
842 # to multiple dimensions. Need to decide which should be retained.
843 # For now assume that the most popular alternative wins.
844 # This means that day_obs with seq_num will result in
845 # exposure.day_obs and not visit.day_obs
846 # Also prefer an explicitly missing dimension over an inferred
847 # temporal dimension.
848 for fieldName, assignedDimensions in assigned.items():
849 if len(assignedDimensions) > 1:
850 # Pick the most popular (preferring mandatory dimensions)
851 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
852 if requiredButMissing:
853 candidateDimensions = requiredButMissing
854 else:
855 candidateDimensions = assignedDimensions
857 # If this is a choice between visit and exposure and
858 # neither was a required part of the dataset type,
859 # (hence in this branch) always prefer exposure over
860 # visit since exposures are always defined and visits
861 # are defined from exposures.
862 if candidateDimensions == {"exposure", "visit"}:
863 candidateDimensions = {"exposure"}
865 # Select the relevant items and get a new restricted
866 # counter.
867 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
868 duplicatesCounter: Counter[str] = Counter()
869 duplicatesCounter.update(theseCounts)
871 # Choose the most common. If they are equally common
872 # we will pick the one that was found first.
873 # Returns a list of tuples
874 selected = duplicatesCounter.most_common(1)[0][0]
876 log.debug(
877 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
878 " Removed ambiguity by choosing dimension %s.",
879 fieldName,
880 ", ".join(assignedDimensions),
881 selected,
882 )
884 for candidateDimension in assignedDimensions:
885 if candidateDimension != selected:
886 del guessedAssociation[candidateDimension][fieldName]
888 # Update the record look up dict with the new associations
889 for dimensionName, values in guessedAssociation.items():
890 if values: # A dict might now be empty
891 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
892 byRecord[dimensionName].update(values)
894 if byRecord:
895 # Some record specifiers were found so we need to convert
896 # them to the Id form
897 for dimensionName, values in byRecord.items():
898 if dimensionName in newDataId:
899 log.debug(
900 "DataId specified explicit %s dimension value of %s in addition to"
901 " general record specifiers for it of %s. Ignoring record information.",
902 dimensionName,
903 newDataId[dimensionName],
904 str(values),
905 )
906 # Get the actual record and compare with these values.
907 try:
908 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
909 except DataIdError:
910 raise ValueError(
911 f"Could not find dimension '{dimensionName}'"
912 f" with dataId {newDataId} as part of comparing with"
913 f" record values {byRecord[dimensionName]}"
914 ) from None
915 if len(recs) == 1:
916 errmsg: List[str] = []
917 for k, v in values.items():
918 if (recval := getattr(recs[0], k)) != v:
919 errmsg.append(f"{k}({recval} != {v})")
920 if errmsg:
921 raise ValueError(
922 f"Dimension {dimensionName} in dataId has explicit value"
923 " inconsistent with records: " + ", ".join(errmsg)
924 )
925 else:
926 # Multiple matches for an explicit dimension
927 # should never happen but let downstream complain.
928 pass
929 continue
931 # Build up a WHERE expression
932 bind = {k: v for k, v in values.items()}
933 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
935 # Hopefully we get a single record that matches
936 records = set(
937 self.registry.queryDimensionRecords(
938 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
939 )
940 )
942 if len(records) != 1:
943 if len(records) > 1:
944 # visit can have an ambiguous answer without involving
945 # visit_system. The default visit_system is defined
946 # by the instrument.
947 if (
948 dimensionName == "visit"
949 and "visit_system_membership" in self.registry.dimensions
950 and "visit_system" in self.registry.dimensions["instrument"].metadata
951 ):
952 instrument_records = list(
953 self.registry.queryDimensionRecords(
954 "instrument",
955 dataId=newDataId,
956 **kwargs,
957 )
958 )
959 if len(instrument_records) == 1:
960 visit_system = instrument_records[0].visit_system
961 if visit_system is None:
962 # Set to a value that will never match.
963 visit_system = -1
965 # Look up each visit in the
966 # visit_system_membership records.
967 for rec in records:
968 membership = list(
969 self.registry.queryDimensionRecords(
970 # Use bind to allow zero results.
971 # This is a fully-specified query.
972 "visit_system_membership",
973 where="instrument = inst AND visit_system = system AND visit = v",
974 bind=dict(
975 inst=instrument_records[0].name, system=visit_system, v=rec.id
976 ),
977 )
978 )
979 if membership:
980 # This record is the right answer.
981 records = set([rec])
982 break
984 # The ambiguity may have been resolved so check again.
985 if len(records) > 1:
986 log.debug("Received %d records from constraints of %s", len(records), str(values))
987 for r in records:
988 log.debug("- %s", str(r))
989 raise ValueError(
990 f"DataId specification for dimension {dimensionName} is not"
991 f" uniquely constrained to a single dataset by {values}."
992 f" Got {len(records)} results."
993 )
994 else:
995 raise ValueError(
996 f"DataId specification for dimension {dimensionName} matched no"
997 f" records when constrained by {values}"
998 )
1000 # Get the primary key from the real dimension object
1001 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
1002 if not isinstance(dimension, Dimension):
1003 raise RuntimeError(
1004 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
1005 )
1006 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
1008 return newDataId, kwargs
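# Example (hypothetical instrument, detector name, and observation values):
# record-based keys and alternate string keys that _rewrite_data_id
# converts to primary-key values.
#
#     butler.get(
#         "raw",
#         {"exposure.day_obs": 20240101, "exposure.seq_num": 45},
#         instrument="LSSTCam",
#         detector="R22_S11",  # resolved via the detector full_name alternate key
#     )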
1010 def _findDatasetRef(
1011 self,
1012 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1013 dataId: Optional[DataId] = None,
1014 *,
1015 collections: Any = None,
1016 allowUnresolved: bool = False,
1017 **kwargs: Any,
1018 ) -> DatasetRef:
1019 """Shared logic for methods that start with a search for a dataset in
1020 the registry.
1022 Parameters
1023 ----------
1024 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1025 When `DatasetRef` the `dataId` should be `None`.
1026 Otherwise the `DatasetType` or name thereof.
1027 dataId : `dict` or `DataCoordinate`, optional
1028 A `dict` of `Dimension` link name, value pairs that label the
1029 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1030 should be provided as the first argument.
1031 collections : Any, optional
1032 Collections to be searched, overriding ``self.collections``.
1033 Can be any of the types supported by the ``collections`` argument
1034 to butler construction.
1035 allowUnresolved : `bool`, optional
1036 If `True`, return an unresolved `DatasetRef` if finding a resolved
1037 one in the `Registry` fails. Defaults to `False`.
1038 **kwargs
1039 Additional keyword arguments used to augment or construct a
1040 `DataId`. See `DataId` parameters.
1042 Returns
1043 -------
1044 ref : `DatasetRef`
1045 A reference to the dataset identified by the given arguments.
1046 This can be the same dataset reference as given if it was
1047 resolved.
1049 Raises
1050 ------
1051 LookupError
1052 Raised if no matching dataset exists in the `Registry` (and
1053 ``allowUnresolved is False``).
1054 ValueError
1055 Raised if a resolved `DatasetRef` was passed as an input, but it
1056 differs from the one found in the registry.
1057 TypeError
1058 Raised if no collections were provided.
1059 """
1060 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1061 if isinstance(datasetRefOrType, DatasetRef):
1062 idNumber = datasetRefOrType.id
1063 # This is a resolved ref, return it immediately.
1064 if idNumber:
1065 return datasetRefOrType
1066 else:
1067 idNumber = None
1068 timespan: Optional[Timespan] = None
1070 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1072 if datasetType.isCalibration():
1073 # Because this is a calibration dataset, first try to
1074 # standardize the data ID without restricting the dimensions to
1075 # those of the dataset type requested, because there may be extra
1076 # dimensions that provide temporal information for a validity-range
1077 # lookup.
1078 dataId = DataCoordinate.standardize(
1079 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1080 )
1081 if dataId.graph.temporal:
1082 dataId = self.registry.expandDataId(dataId)
1083 timespan = dataId.timespan
1084 else:
1085 # Standardize the data ID to just the dimensions of the dataset
1086 # type instead of letting registry.findDataset do it, so we get the
1087 # result even if no dataset is found.
1088 dataId = DataCoordinate.standardize(
1089 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1090 )
1091 # Always lookup the DatasetRef, even if one is given, to ensure it is
1092 # present in the current collection.
1093 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1094 if ref is None:
1095 if allowUnresolved:
1096 with warnings.catch_warnings():
1097 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
1098 return DatasetRef(datasetType, dataId)
1099 else:
1100 if collections is None:
1101 collections = self.registry.defaults.collections
1102 raise LookupError(
1103 f"Dataset {datasetType.name} with data ID {dataId} "
1104 f"could not be found in collections {collections}."
1105 )
1106 if idNumber is not None and idNumber != ref.id:
1107 if collections is None:
1108 collections = self.registry.defaults.collections
1109 raise ValueError(
1110 f"DatasetRef.id provided ({idNumber}) does not match "
1111 f"id ({ref.id}) in registry in collections {collections}."
1112 )
1113 if datasetType != ref.datasetType:
1114 # If they differ it is because the user explicitly specified
1115 # a compatible dataset type to this call rather than using the
1116 # registry definition. The DatasetRef must therefore be recreated
1117 # using the user definition such that the expected type is
1118 # returned.
1119 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1121 return ref
1123 @transactional
1124 @deprecated(
1125 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
1126 " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
1127 " were relying on the run parameter to determine the run."
1128 " Will be removed after v27.0.",
1129 version="v26.0",
1130 category=FutureWarning,
1131 )
1132 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
1133 # Docstring inherited.
1134 return self.put(obj, ref)
1136 @transactional
1137 def put(
1138 self,
1139 obj: Any,
1140 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1141 /,
1142 dataId: Optional[DataId] = None,
1143 *,
1144 run: Optional[str] = None,
1145 **kwargs: Any,
1146 ) -> DatasetRef:
1147 """Store and register a dataset.
1149 Parameters
1150 ----------
1151 obj : `object`
1152 The dataset.
1153 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1154 When `DatasetRef` is provided, ``dataId`` should be `None`.
1155 Otherwise the `DatasetType` or name thereof. If a fully resolved
1156 `DatasetRef` is given the run and ID are used directly.
1157 dataId : `dict` or `DataCoordinate`
1158 A `dict` of `Dimension` link name, value pairs that label the
1159 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1160 should be provided as the second argument.
1161 run : `str`, optional
1162 The name of the run the dataset should be added to, overriding
1163 ``self.run``. Not used if a resolved `DatasetRef` is provided.
1164 **kwargs
1165 Additional keyword arguments used to augment or construct a
1166 `DataCoordinate`. See `DataCoordinate.standardize`
1167 parameters. Not used if a resolved `DatasetRef` is provided.
1169 Returns
1170 -------
1171 ref : `DatasetRef`
1172 A reference to the stored dataset, updated with the correct id if
1173 given.
1175 Raises
1176 ------
1177 TypeError
1178 Raised if the butler is read-only or if no run has been provided.
1179 """
1180 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1181 # This is a direct put of predefined DatasetRef.
1182 log.debug("Butler put direct: %s", datasetRefOrType)
1183 (imported_ref,) = self.registry._importDatasets(
1184 [datasetRefOrType],
1185 expand=True,
1186 )
1187 if imported_ref.id != datasetRefOrType.getCheckedId():
1188 raise RuntimeError("This registry configuration does not support direct put of ref.")
1189 self.datastore.put(obj, datasetRefOrType)
1190 return datasetRefOrType
1192 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1193 if not self.isWriteable():
1194 raise TypeError("Butler is read-only.")
1195 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1196 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1197 raise ValueError("DatasetRef must not be in registry, must have None id")
1199 # Handle dimension records in dataId
1200 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1202 # Add Registry Dataset entry.
1203 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1205 # For an execution butler the datasets will be pre-defined.
1206 # If the butler is configured that way, datasets should only be inserted
1207 # if they do not already exist in registry. Trying and catching
1208 # ConflictingDefinitionError will not work because the transaction
1209 # will be corrupted. Instead, in this mode always check first.
1210 ref = None
1211 ref_is_predefined = False
1212 if self._allow_put_of_predefined_dataset:
1213 # Get the matching ref for this run.
1214 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1216 if ref:
1217 # Must be expanded form for datastore templating
1218 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1219 ref = ref.expanded(dataId)
1220 ref_is_predefined = True
1222 if not ref:
1223 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1225 # If the ref is predefined it is possible that the datastore also
1226 # has the record. Asking datastore to put it again will result in
1227 # the artifact being recreated, overwriting previous, then will cause
1228 # a failure in writing the record which will cause the artifact
1229 # to be removed. Much safer to ask first before attempting to
1230 # overwrite. Race conditions should not be an issue for the
1231 # execution butler environment.
1232 if ref_is_predefined:
1233 if self.datastore.knows(ref):
1234 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
1236 self.datastore.put(obj, ref)
1238 return ref
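# Example (hypothetical run, dataset type, and data ID): a standard put
# into the butler's run collection; the dataset type must already be
# registered.
#
#     butler = Butler("/path/to/repo", run="u/alice/scratch")
#     ref = butler.put(catalog, "sourceTable_visit", instrument="HSC", visit=1228)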
1240 @deprecated(
1241 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
1242 " Please use Butler.get(). Will be removed after v27.0.",
1243 version="v26.0",
1244 category=FutureWarning,
1245 )
1246 def getDirect(
1247 self,
1248 ref: DatasetRef,
1249 *,
1250 parameters: Optional[Dict[str, Any]] = None,
1251 storageClass: Optional[Union[StorageClass, str]] = None,
1252 ) -> Any:
1253 """Retrieve a stored dataset.
1255 Parameters
1256 ----------
1257 ref : `DatasetRef`
1258 Resolved reference to an already stored dataset.
1259 parameters : `dict`
1260 Additional StorageClass-defined options to control reading,
1261 typically used to efficiently read only a subset of the dataset.
1262 storageClass : `StorageClass` or `str`, optional
1263 The storage class to be used to override the Python type
1264 returned by this method. By default the returned type matches
1265 the dataset type definition for this dataset. Specifying a
1266 read `StorageClass` can force a different type to be returned.
1267 This type must be compatible with the original type.
1269 Returns
1270 -------
1271 obj : `object`
1272 The dataset.
1273 """
1274 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1276 @deprecated(
1277 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
1278 "Please use Butler.getDeferred(). Will be removed after v27.0.",
1279 version="v26.0",
1280 category=FutureWarning,
1281 )
1282 def getDirectDeferred(
1283 self,
1284 ref: DatasetRef,
1285 *,
1286 parameters: Union[dict, None] = None,
1287 storageClass: str | StorageClass | None = None,
1288 ) -> DeferredDatasetHandle:
1289 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
1290 from a resolved `DatasetRef`.
1292 Parameters
1293 ----------
1294 ref : `DatasetRef`
1295 Resolved reference to an already stored dataset.
1296 parameters : `dict`
1297 Additional StorageClass-defined options to control reading,
1298 typically used to efficiently read only a subset of the dataset.
1299 storageClass : `StorageClass` or `str`, optional
1300 The storage class to be used to override the Python type
1301 returned by this method. By default the returned type matches
1302 the dataset type definition for this dataset. Specifying a
1303 read `StorageClass` can force a different type to be returned.
1304 This type must be compatible with the original type.
1306 Returns
1307 -------
1308 obj : `DeferredDatasetHandle`
1309 A handle which can be used to retrieve a dataset at a later time.
1311 Raises
1312 ------
1313 AmbiguousDatasetError
1314 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1315 """
1316 if ref.id is None:
1317 raise AmbiguousDatasetError(
1318 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1319 )
1320 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1322 def getDeferred(
1323 self,
1324 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1325 /,
1326 dataId: Optional[DataId] = None,
1327 *,
1328 parameters: Union[dict, None] = None,
1329 collections: Any = None,
1330 storageClass: str | StorageClass | None = None,
1331 **kwargs: Any,
1332 ) -> DeferredDatasetHandle:
1333 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1334 after an immediate registry lookup.
1336 Parameters
1337 ----------
1338 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1339 When `DatasetRef` the `dataId` should be `None`.
1340 Otherwise the `DatasetType` or name thereof.
1341 dataId : `dict` or `DataCoordinate`, optional
1342 A `dict` of `Dimension` link name, value pairs that label the
1343 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1344 should be provided as the first argument.
1345 parameters : `dict`
1346 Additional StorageClass-defined options to control reading,
1347 typically used to efficiently read only a subset of the dataset.
1348 collections : Any, optional
1349 Collections to be searched, overriding ``self.collections``.
1350 Can be any of the types supported by the ``collections`` argument
1351 to butler construction.
1352 storageClass : `StorageClass` or `str`, optional
1353 The storage class to be used to override the Python type
1354 returned by this method. By default the returned type matches
1355 the dataset type definition for this dataset. Specifying a
1356 read `StorageClass` can force a different type to be returned.
1357 This type must be compatible with the original type.
1358 **kwargs
1359 Additional keyword arguments used to augment or construct a
1360 `DataId`. See `DataId` parameters.
1362 Returns
1363 -------
1364 obj : `DeferredDatasetHandle`
1365 A handle which can be used to retrieve a dataset at a later time.
1367 Raises
1368 ------
1369 LookupError
1370 Raised if no matching dataset exists in the `Registry` (and
1371 ``allowUnresolved is False``).
1372 ValueError
1373 Raised if a resolved `DatasetRef` was passed as an input, but it
1374 differs from the one found in the registry.
1375 TypeError
1376 Raised if no collections were provided.
1377 """
1378 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1379 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
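# Example (hypothetical names and data ID): resolving the registry lookup
# now but deferring the datastore read until the handle is used.
#
#     handle = butler.getDeferred("calexp", instrument="HSC", visit=1228, detector=42)
#     exposure = handle.get()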
1381 def get(
1382 self,
1383 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1384 /,
1385 dataId: Optional[DataId] = None,
1386 *,
1387 parameters: Optional[Dict[str, Any]] = None,
1388 collections: Any = None,
1389 storageClass: Optional[Union[StorageClass, str]] = None,
1390 **kwargs: Any,
1391 ) -> Any:
1392 """Retrieve a stored dataset.
1394 Parameters
1395 ----------
1396 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1397 When `DatasetRef` the `dataId` should be `None`.
1398 Otherwise the `DatasetType` or name thereof.
1399 If a resolved `DatasetRef`, the associated dataset
1400 is returned directly without additional querying.
1401 dataId : `dict` or `DataCoordinate`
1402 A `dict` of `Dimension` link name, value pairs that label the
1403 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1404 should be provided as the first argument.
1405 parameters : `dict`
1406 Additional StorageClass-defined options to control reading,
1407 typically used to efficiently read only a subset of the dataset.
1408 collections : Any, optional
1409 Collections to be searched, overriding ``self.collections``.
1410 Can be any of the types supported by the ``collections`` argument
1411 to butler construction.
1412 storageClass : `StorageClass` or `str`, optional
1413 The storage class to be used to override the Python type
1414 returned by this method. By default the returned type matches
1415 the dataset type definition for this dataset. Specifying a
1416 read `StorageClass` can force a different type to be returned.
1417 This type must be compatible with the original type.
1418 **kwargs
1419 Additional keyword arguments used to augment or construct a
1420 `DataCoordinate`. See `DataCoordinate.standardize`
1421 parameters.
1423 Returns
1424 -------
1425 obj : `object`
1426 The dataset.
1428 Raises
1429 ------
1430 LookupError
1431 Raised if no matching dataset exists in the `Registry`.
1432 TypeError
1433 Raised if no collections were provided.
1435 Notes
1436 -----
1437 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1438 this method requires that the given data ID include temporal dimensions
1439 beyond the dimensions of the dataset type itself, in order to find the
1440 dataset with the appropriate validity range. For example, a "bias"
1441 dataset with native dimensions ``{instrument, detector}`` could be
1442 fetched with a ``{instrument, detector, exposure}`` data ID, because
1443 ``exposure`` is a temporal dimension.
1444 """
1445 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1446 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1447 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
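# Example (hypothetical names, data IDs, and collections): a plain get by
# dataset type name, and a calibration lookup where ``exposure`` supplies
# the temporal information described in the Notes above.
#
#     calexp = butler.get("calexp", instrument="HSC", visit=1228, detector=42)
#     bias = butler.get(
#         "bias", instrument="HSC", detector=42, exposure=903334,
#         collections="HSC/calib",
#     )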
1449 def getURIs(
1450 self,
1451 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1452 /,
1453 dataId: Optional[DataId] = None,
1454 *,
1455 predict: bool = False,
1456 collections: Any = None,
1457 run: Optional[str] = None,
1458 **kwargs: Any,
1459 ) -> DatasetRefURIs:
1460 """Return the URIs associated with the dataset.
1462 Parameters
1463 ----------
1464 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1465 When `DatasetRef` the `dataId` should be `None`.
1466 Otherwise the `DatasetType` or name thereof.
1467 dataId : `dict` or `DataCoordinate`
1468 A `dict` of `Dimension` link name, value pairs that label the
1469 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1470 should be provided as the first argument.
1471 predict : `bool`
1472 If `True`, allow URIs to be returned of datasets that have not
1473 been written.
1474 collections : Any, optional
1475 Collections to be searched, overriding ``self.collections``.
1476 Can be any of the types supported by the ``collections`` argument
1477 to butler construction.
1478 run : `str`, optional
1479 Run to use for predictions, overriding ``self.run``.
1480 **kwargs
1481 Additional keyword arguments used to augment or construct a
1482 `DataCoordinate`. See `DataCoordinate.standardize`
1483 parameters.
1485 Returns
1486 -------
1487 uris : `DatasetRefURIs`
1488 The URI to the primary artifact associated with this dataset (if
1489 the dataset was disassembled within the datastore this may be
1490 `None`), and the URIs to any components associated with the dataset
1491 artifact (can be empty if there are no components).
1492 """
1493 ref = self._findDatasetRef(
1494 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1495 )
1496 if ref.id is None: # only possible if predict is True
1497 if run is None:
1498 run = self.run
1499 if run is None:
1500 raise TypeError("Cannot predict location with run=None.")
1501 # Lie about ID, because we can't guess it, and only
1502 # Datastore.getURIs() will ever see it (and it doesn't use it).
1503 with warnings.catch_warnings():
1504 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
1505 ref = ref.resolved(id=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), run=run)
1506 return self.datastore.getURIs(ref, predict)
1508 def getURI(
1509 self,
1510 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1511 /,
1512 dataId: Optional[DataId] = None,
1513 *,
1514 predict: bool = False,
1515 collections: Any = None,
1516 run: Optional[str] = None,
1517 **kwargs: Any,
1518 ) -> ResourcePath:
1519 """Return the URI to the Dataset.
1521 Parameters
1522 ----------
1523 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1524 When `DatasetRef` the `dataId` should be `None`.
1525 Otherwise the `DatasetType` or name thereof.
1526 dataId : `dict` or `DataCoordinate`
1527 A `dict` of `Dimension` link name, value pairs that label the
1528 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1529 should be provided as the first argument.
1530 predict : `bool`
1531 If `True`, allow URIs to be returned for datasets that have not
1532 yet been written.
1533 collections : Any, optional
1534 Collections to be searched, overriding ``self.collections``.
1535 Can be any of the types supported by the ``collections`` argument
1536 to butler construction.
1537 run : `str`, optional
1538 Run to use for predictions, overriding ``self.run``.
1539 **kwargs
1540 Additional keyword arguments used to augment or construct a
1541 `DataCoordinate`. See `DataCoordinate.standardize`
1542 parameters.
1544 Returns
1545 -------
1546 uri : `lsst.resources.ResourcePath`
1547 URI pointing to the Dataset within the datastore. If the
1548 Dataset does not exist in the datastore, and if ``predict`` is
1549 `True`, the URI will be a prediction and will include a URI
1550 fragment "#predicted".
1551 If the datastore does not have entities that relate well
1552 to the concept of a URI, the returned URI string will be
1553 descriptive and is not guaranteed to be usable for retrieval.
1555 Raises
1556 ------
1557 LookupError
1558 Raised if a URI has been requested for a dataset that does not
1559 exist and guessing is not allowed.
1560 ValueError
1561 Raised if a resolved `DatasetRef` was passed as an input, but it
1562 differs from the one found in the registry.
1563 TypeError
1564 Raised if no collections were provided.
1565 RuntimeError
1566 Raised if a URI is requested for a dataset that consists of
1567 multiple artifacts.
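Examples
--------
A sketch of predicting the artifact location for a dataset that has
not yet been written; the dataset type, data ID, and run name below
are hypothetical::

    uri = butler.getURI(
        "calexp", instrument="HSC", visit=903334, detector=16,
        predict=True, run="u/someone/processing-run",
    )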
1568 """
1569 primary, components = self.getURIs(
1570 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1571 )
1573 if primary is None or components:
1574 raise RuntimeError(
1575 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1576 "Use Butler.getURIs() instead."
1577 )
1578 return primary
1580 def retrieveArtifacts(
1581 self,
1582 refs: Iterable[DatasetRef],
1583 destination: ResourcePathExpression,
1584 transfer: str = "auto",
1585 preserve_path: bool = True,
1586 overwrite: bool = False,
1587 ) -> List[ResourcePath]:
1588 """Retrieve the artifacts associated with the supplied refs.
1590 Parameters
1591 ----------
1592 refs : iterable of `DatasetRef`
1593 The datasets for which artifacts are to be retrieved.
1594 A single ref can result in multiple artifacts. The refs must
1595 be resolved.
1596 destination : `lsst.resources.ResourcePath` or `str`
1597 Location to write the artifacts.
1598 transfer : `str`, optional
1599 Method to use to transfer the artifacts. Must be one of the options
1600 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1601 "move" is not allowed.
1602 preserve_path : `bool`, optional
1603 If `True` the full path of the artifact within the datastore
1604 is preserved. If `False` the final file component of the path
1605 is used.
1606 overwrite : `bool`, optional
1607 If `True` allow transfers to overwrite existing files at the
1608 destination.
1610 Returns
1611 -------
1612 targets : `list` of `lsst.resources.ResourcePath`
1613 URIs of file artifacts in the destination location. Order is not
1614 preserved.
1616 Notes
1617 -----
1618 For non-file datastores the artifacts written to the destination
1619 may not match the representation inside the datastore. For example
1620 a hierarchical data structure in a NoSQL database may well be stored
1621 as a JSON file.
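Examples
--------
An illustrative sketch; the dataset type query, collection, and
destination below are hypothetical::

    refs = butler.registry.queryDatasets("raw", collections="HSC/raw/all")
    paths = butler.retrieveArtifacts(refs, "/tmp/raw-export", transfer="copy")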
1622 """
1623 return self.datastore.retrieveArtifacts(
1624 refs,
1625 ResourcePath(destination),
1626 transfer=transfer,
1627 preserve_path=preserve_path,
1628 overwrite=overwrite,
1629 )
1631 def datasetExists(
1632 self,
1633 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1634 dataId: Optional[DataId] = None,
1635 *,
1636 collections: Any = None,
1637 **kwargs: Any,
1638 ) -> bool:
1639 """Return True if the Dataset is actually present in the Datastore.
1641 Parameters
1642 ----------
1643 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1644 When `DatasetRef` the `dataId` should be `None`.
1645 Otherwise the `DatasetType` or name thereof.
1646 dataId : `dict` or `DataCoordinate`
1647 A `dict` of `Dimension` link name, value pairs that label the
1648 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1649 should be provided as the first argument.
1650 collections : Any, optional
1651 Collections to be searched, overriding ``self.collections``.
1652 Can be any of the types supported by the ``collections`` argument
1653 to butler construction.
1654 **kwargs
1655 Additional keyword arguments used to augment or construct a
1656 `DataCoordinate`. See `DataCoordinate.standardize`
1657 parameters.
1659 Raises
1660 ------
1661 LookupError
1662 Raised if the dataset is not even present in the Registry.
1663 ValueError
1664 Raised if a resolved `DatasetRef` was passed as an input, but it
1665 differs from the one found in the registry.
1666 TypeError
1667 Raised if no collections were provided.
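Examples
--------
An illustrative check; the dataset type, data ID, and collection below
are hypothetical::

    exists = butler.datasetExists(
        "flat", instrument="HSC", detector=10, physical_filter="HSC-I",
        collections="HSC/calib/unbounded",
    )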
1668 """
1669 # A resolved ref may be given that is not known to this butler.
1670 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1671 ref = self.registry.getDataset(datasetRefOrType.id)
1672 if ref is None:
1673 raise LookupError(
1674 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
1675 )
1676 else:
1677 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1678 return self.datastore.exists(ref)
1680 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1681 """Remove one or more `~CollectionType.RUN` collections and the
1682 datasets within them.
1684 Parameters
1685 ----------
1686 names : `Iterable` [ `str` ]
1687 The names of the collections to remove.
1688 unstore : `bool`, optional
1689 If `True` (default), delete datasets from all datastores in which
1690 they are present, and attempt to roll back the registry deletions if
1691 datastore deletions fail (which may not always be possible). If
1692 `False`, datastore records for these datasets are still removed,
1693 but any artifacts (e.g. files) will not be.
1695 Raises
1696 ------
1697 TypeError
1698 Raised if one or more collections are not of type
1699 `~CollectionType.RUN`.
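Examples
--------
A sketch of removing a scratch RUN collection and its file artifacts;
the collection name below is hypothetical::

    butler.removeRuns(["u/someone/scratch"], unstore=True)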
1700 """
1701 if not self.isWriteable():
1702 raise TypeError("Butler is read-only.")
1703 names = list(names)
1704 refs: List[DatasetRef] = []
1705 for name in names:
1706 collectionType = self.registry.getCollectionType(name)
1707 if collectionType is not CollectionType.RUN:
1708 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1709 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1710 with self.datastore.transaction():
1711 with self.registry.transaction():
1712 if unstore:
1713 self.datastore.trash(refs)
1714 else:
1715 self.datastore.forget(refs)
1716 for name in names:
1717 self.registry.removeCollection(name)
1718 if unstore:
1719 # Point of no return for removing artifacts
1720 self.datastore.emptyTrash()
1722 def pruneDatasets(
1723 self,
1724 refs: Iterable[DatasetRef],
1725 *,
1726 disassociate: bool = True,
1727 unstore: bool = False,
1728 tags: Iterable[str] = (),
1729 purge: bool = False,
1730 ) -> None:
1731 # docstring inherited from LimitedButler
1733 if not self.isWriteable():
1734 raise TypeError("Butler is read-only.")
1735 if purge:
1736 if not disassociate:
1737 raise TypeError("Cannot pass purge=True without disassociate=True.")
1738 if not unstore:
1739 raise TypeError("Cannot pass purge=True without unstore=True.")
1740 elif disassociate:
1741 tags = tuple(tags)
1742 if not tags:
1743 raise TypeError("No tags provided but disassociate=True.")
1744 for tag in tags:
1745 collectionType = self.registry.getCollectionType(tag)
1746 if collectionType is not CollectionType.TAGGED:
1747 raise TypeError(
1748 f"Cannot disassociate from collection '{tag}' "
1749 f"of non-TAGGED type {collectionType.name}."
1750 )
1751 # For an execution butler we want to keep existing UUIDs for the
1752 # datasets; for that we need to keep them in the collections but
1753 # remove them from the datastore.
1754 if self._allow_put_of_predefined_dataset and purge:
1755 purge = False
1756 disassociate = False
1757 # Transform possibly-single-pass iterable into something we can iterate
1758 # over multiple times.
1759 refs = list(refs)
1760 # Pruning a component of a DatasetRef makes no sense since registry
1761 # doesn't know about components and datastore might not store
1762 # components in a separate file
1763 for ref in refs:
1764 if ref.datasetType.component():
1765 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1766 # We don't need an unreliable Datastore transaction for this, because
1767 # we've been extra careful to ensure that Datastore.trash only involves
1768 # mutating the Registry (it can _look_ at Datastore-specific things,
1769 # but shouldn't change them), and hence all operations here are
1770 # Registry operations.
1771 with self.datastore.transaction():
1772 with self.registry.transaction():
1773 if unstore:
1774 self.datastore.trash(refs)
1775 if purge:
1776 self.registry.removeDatasets(refs)
1777 elif disassociate:
1778 assert tags, "Guaranteed by earlier logic in this function."
1779 for tag in tags:
1780 self.registry.disassociate(tag, refs)
1781 # We've exited the Registry transaction, and apparently committed.
1782 # (if there was an exception, everything rolled back, and it's as if
1783 # nothing happened - and we never get here).
1784 # Datastore artifacts are not yet gone, but they're clearly marked
1785 # as trash, so if we fail to delete now because of (e.g.) filesystem
1786 # problems we can try again later, and if manual administrative
1787 # intervention is required, it's pretty clear what that should entail:
1788 # deleting everything on disk and in private Datastore tables that is
1789 # in the dataset_location_trash table.
1790 if unstore:
1791 # Point of no return for removing artifacts
1792 self.datastore.emptyTrash()
1794 @transactional
1795 def ingest(
1796 self,
1797 *datasets: FileDataset,
1798 transfer: Optional[str] = "auto",
1799 run: Optional[str] = None,
1800 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1801 record_validation_info: bool = True,
1802 ) -> None:
1803 """Store and register one or more datasets that already exist on disk.
1805 Parameters
1806 ----------
1807 datasets : `FileDataset`
1808 Each positional argument is a struct containing information about
1809 a file to be ingested, including its URI (either absolute or
1810 relative to the datastore root, if applicable), a `DatasetRef`,
1811 and optionally a formatter class or its fully-qualified string
1812 name. If a formatter is not provided, the formatter that would be
1813 used for `put` is assumed. On successful return, all
1814 `FileDataset.ref` attributes will have their `DatasetRef.id`
1815 attribute populated and all `FileDataset.formatter` attributes will
1816 be set to the formatter class used. `FileDataset.path` attributes
1817 may be modified to put paths in whatever the datastore considers a
1818 standardized form.
1819 transfer : `str`, optional
1820 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1821 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1822 transfer the file.
1823 run : `str`, optional
1824 The name of the run ingested datasets should be added to,
1825 overriding ``self.run``.
1826 idGenerationMode : `DatasetIdGenEnum`, optional
1827 Specifies option for generating dataset IDs. By default unique IDs
1828 are generated for each inserted dataset.
1829 record_validation_info : `bool`, optional
1830 If `True`, the default, the datastore can record validation
1831 information associated with the file. If `False` the datastore
1832 will not attempt to track any information such as checksums
1833 or file sizes. This can be useful if such information is tracked
1834 in an external system or if the file is to be compressed in place.
1835 It is up to the datastore whether this parameter is relevant.
1837 Raises
1838 ------
1839 TypeError
1840 Raised if the butler is read-only or if no run was provided.
1841 NotImplementedError
1842 Raised if the `Datastore` does not support the given transfer mode.
1843 DatasetTypeNotSupportedError
1844 Raised if one or more files to be ingested have a dataset type that
1845 is not supported by the `Datastore`.
1846 FileNotFoundError
1847 Raised if one of the given files does not exist.
1848 FileExistsError
1849 Raised if transfer is not `None` but the (internal) location the
1850 file would be moved to is already occupied.
1852 Notes
1853 -----
1854 This operation is not fully exception safe: if a database operation
1855 fails, the given `FileDataset` instances may be only partially updated.
1857 It is atomic in terms of database operations (they will either all
1858 succeed or all fail), provided the database engine implements
1859 transactions correctly. It will attempt to be atomic in terms of
1860 filesystem operations as well, but this cannot be implemented
1861 rigorously for most datastores.
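Examples
--------
A minimal sketch of ingesting one externally produced file, assuming
the dataset type is already registered and that `DatasetRef` and
`FileDataset` have been imported from `lsst.daf.butler`; the paths and
data ID below are hypothetical::

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 903334,
                                   "detector": 16})
    dataset = FileDataset(path="/data/staging/HSC-903334-016.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy", run="HSC/raw/all")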
1862 """
1863 if not self.isWriteable():
1864 raise TypeError("Butler is read-only.")
1865 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1866 # Reorganize the inputs so they're grouped by DatasetType and then
1867 # data ID. We also include a list of DatasetRefs for each FileDataset
1868 # to hold the resolved DatasetRefs returned by the Registry, before
1869 # it's safe to swap them into FileDataset.refs.
1870 # Some type annotation aliases to make that clearer:
1871 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1872 GroupedData = MutableMapping[DatasetType, GroupForType]
1873 # The actual data structure:
1874 groupedData: GroupedData = defaultdict(dict)
1875 # And the nested loop that populates it:
1876 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1877 # This list intentionally shared across the inner loop, since it's
1878 # associated with `dataset`.
1879 resolvedRefs: List[DatasetRef] = []
1881 # Somewhere to store pre-existing refs if we have an
1882 # execution butler.
1883 existingRefs: List[DatasetRef] = []
1885 for ref in dataset.refs:
1886 if ref.dataId in groupedData[ref.datasetType]:
1887 raise ConflictingDefinitionError(
1888 f"Ingest conflict. Dataset {dataset.path} has the same"
1889 " DataId as other ingest dataset"
1890 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1891 f" ({ref.dataId})"
1892 )
1893 if self._allow_put_of_predefined_dataset:
1894 existing_ref = self.registry.findDataset(
1895 ref.datasetType, dataId=ref.dataId, collections=run
1896 )
1897 if existing_ref:
1898 if self.datastore.knows(existing_ref):
1899 raise ConflictingDefinitionError(
1900 f"Dataset associated with path {dataset.path}"
1901 f" already exists as {existing_ref}."
1902 )
1903 # Store this ref elsewhere since it already exists
1904 # and we do not want to remake it but we do want
1905 # to store it in the datastore.
1906 existingRefs.append(existing_ref)
1908 # Nothing else to do until we have finished
1909 # iterating.
1910 continue
1912 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1914 if existingRefs:
1915 if len(dataset.refs) != len(existingRefs):
1916 # Keeping track of partially pre-existing datasets is hard
1917 # and should generally never happen. For now don't allow
1918 # it.
1919 raise ConflictingDefinitionError(
1920 f"For dataset {dataset.path} some dataIds already exist"
1921 " in registry but others do not. This is not supported."
1922 )
1924 # Attach the resolved refs if we found them.
1925 dataset.refs = existingRefs
1927 # Now we can bulk-insert into Registry for each DatasetType.
1928 for datasetType, groupForType in progress.iter_item_chunks(
1929 groupedData.items(), desc="Bulk-inserting datasets by type"
1930 ):
1931 refs = self.registry.insertDatasets(
1932 datasetType,
1933 dataIds=groupForType.keys(),
1934 run=run,
1935 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1936 idGenerationMode=idGenerationMode,
1937 )
1938 # Append those resolved DatasetRefs to the new lists we set up for
1939 # them.
1940 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1941 resolvedRefs.append(ref)
1943 # Go back to the original FileDatasets to replace their refs with the
1944 # new resolved ones.
1945 for groupForType in progress.iter_chunks(
1946 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1947 ):
1948 for dataset, resolvedRefs in groupForType.values():
1949 dataset.refs = resolvedRefs
1951 # Bulk-insert everything into Datastore.
1952 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1954 @contextlib.contextmanager
1955 def export(
1956 self,
1957 *,
1958 directory: Optional[str] = None,
1959 filename: Optional[str] = None,
1960 format: Optional[str] = None,
1961 transfer: Optional[str] = None,
1962 ) -> Iterator[RepoExportContext]:
1963 """Export datasets from the repository represented by this `Butler`.
1965 This method is a context manager that returns a helper object
1966 (`RepoExportContext`) that is used to indicate what information from
1967 the repository should be exported.
1969 Parameters
1970 ----------
1971 directory : `str`, optional
1972 Directory dataset files should be written to if ``transfer`` is not
1973 `None`.
1974 filename : `str`, optional
1975 Name for the file that will include database information associated
1976 with the exported datasets. If this is not an absolute path and
1977 ``directory`` is not `None`, it will be written to ``directory``
1978 instead of the current working directory. Defaults to
1979 "export.{format}".
1980 format : `str`, optional
1981 File format for the database information file. If `None`, the
1982 extension of ``filename`` will be used.
1983 transfer : `str`, optional
1984 Transfer mode passed to `Datastore.export`.
1986 Raises
1987 ------
1988 TypeError
1989 Raised if the set of arguments passed is inconsistent.
1991 Examples
1992 --------
1993 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1994 methods are used to provide the iterables over data IDs and/or datasets
1995 to be exported::
1997 with butler.export(filename="exports.yaml") as export:
1998 # Export all flats, but none of the dimension element rows
1999 # (i.e. data ID information) associated with them.
2000 export.saveDatasets(butler.registry.queryDatasets("flat"),
2001 elements=())
2002 # Export all datasets that start with "deepCoadd_" and all of
2003 # their associated data ID information.
2004 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
2005 """
2006 if directory is None and transfer is not None:
2007 raise TypeError("Cannot transfer without providing a directory.")
2008 if transfer == "move":
2009 raise TypeError("Transfer may not be 'move': export is read-only")
2010 if format is None:
2011 if filename is None:
2012 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2013 else:
2014 _, format = os.path.splitext(filename)
2015 if not format:
2016 raise ValueError("Please specify a file extension to determine export format.")
2017 format = format[1:] # Strip leading "."
2018 elif filename is None:
2019 filename = f"export.{format}"
2020 if directory is not None:
2021 filename = os.path.join(directory, filename)
2022 formats = self._config["repo_transfer_formats"]
2023 if format not in formats:
2024 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
2025 BackendClass = get_class_of(formats[format, "export"])
2026 with open(filename, "w") as stream:
2027 backend = BackendClass(stream, universe=self.registry.dimensions)
2028 try:
2029 helper = RepoExportContext(
2030 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2031 )
2032 yield helper
2033 except BaseException:
2034 raise
2035 else:
2036 helper._finish()
2038 def import_(
2039 self,
2040 *,
2041 directory: Optional[ResourcePathExpression] = None,
2042 filename: Union[ResourcePathExpression, TextIO, None] = None,
2043 format: Optional[str] = None,
2044 transfer: Optional[str] = None,
2045 skip_dimensions: Optional[Set] = None,
2046 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2047 reuseIds: bool = False,
2048 ) -> None:
2049 """Import datasets into this repository that were exported from a
2050 different butler repository via `~lsst.daf.butler.Butler.export`.
2052 Parameters
2053 ----------
2054 directory : `~lsst.resources.ResourcePathExpression`, optional
2055 Directory containing dataset files to import from. If `None`,
2056 ``filename`` and all dataset file paths specified therein must
2057 be absolute.
2058 filename : `~lsst.resources.ResourcePathExpression` or `TextIO`
2059 A stream or name of file that contains database information
2060 associated with the exported datasets, typically generated by
2061 `~lsst.daf.butler.Butler.export`. If this is a string (name) or
2062 `~lsst.resources.ResourcePath` and is not an absolute path,
2063 it will first be looked for relative to ``directory`` and if not
2064 found there it will be looked for in the current working
2065 directory. Defaults to "export.{format}".
2066 format : `str`, optional
2067 File format for ``filename``. If `None`, the extension of
2068 ``filename`` will be used.
2069 transfer : `str`, optional
2070 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2071 skip_dimensions : `set`, optional
2072 Names of dimensions that should be skipped and not imported.
2073 idGenerationMode : `DatasetIdGenEnum`, optional
2074 Specifies option for generating dataset IDs when IDs are not
2075 provided or their type does not match backend type. By default
2076 unique IDs are generated for each inserted dataset.
2077 reuseIds : `bool`, optional
2078 If `True`, force re-use of imported dataset IDs for integer
2079 IDs, which are normally generated as auto-incremented; an exception
2080 will be raised if imported IDs clash with existing ones. This
2081 option has no effect on globally-unique IDs, which are
2082 always re-used (or generated if integer IDs are being imported).
2084 Raises
2085 ------
2086 TypeError
2087 Raised if the set of arguments passed is inconsistent, or if the
2088 butler is read-only.
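Examples
--------
A sketch of importing a previously exported repository subset; the
directory and file names below are hypothetical::

    butler.import_(directory="/path/to/exported/files",
                   filename="export.yaml", transfer="symlink")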
2089 """
2090 if not self.isWriteable():
2091 raise TypeError("Butler is read-only.")
2092 if format is None:
2093 if filename is None:
2094 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2095 else:
2096 _, format = os.path.splitext(filename) # type: ignore
2097 elif filename is None:
2098 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
2099 if directory is not None:
2100 directory = ResourcePath(directory, forceDirectory=True)
2101 # mypy doesn't think this will work but it does in python >= 3.10.
2102 if isinstance(filename, ResourcePathExpression): # type: ignore
2103 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
2104 if not filename.isabs() and directory is not None:
2105 potential = directory.join(filename)
2106 exists_in_cwd = filename.exists()
2107 exists_in_dir = potential.exists()
2108 if exists_in_cwd and exists_in_dir:
2109 log.warning(
2110 "A relative path for filename was specified (%s) which exists relative to cwd. "
2111 "Additionally, the file exists relative to the given search directory (%s). "
2112 "Using the export file in the given directory.",
2113 filename,
2114 potential,
2115 )
2116 # Given they specified an explicit directory and that
2117 # directory has the export file in it, assume that that
2118 # is what was meant despite the file in cwd.
2119 filename = potential
2120 elif exists_in_dir:
2121 filename = potential
2122 elif not exists_in_cwd and not exists_in_dir:
2123 # Raise early.
2124 raise FileNotFoundError(
2125 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
2126 )
2127 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2129 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
2130 backend = BackendClass(importStream, self.registry)
2131 backend.register()
2132 with self.transaction():
2133 backend.load(
2134 self.datastore,
2135 directory=directory,
2136 transfer=transfer,
2137 skip_dimensions=skip_dimensions,
2138 idGenerationMode=idGenerationMode,
2139 reuseIds=reuseIds,
2140 )
2142 if isinstance(filename, ResourcePath):
2143 # We can not use open() here at the moment because of
2144 # DM-38589 since yaml does stream.read(8192) in a loop.
2145 stream = io.StringIO(filename.read().decode())
2146 doImport(stream)
2147 else:
2148 doImport(filename) # type: ignore
2150 def transfer_from(
2151 self,
2152 source_butler: LimitedButler,
2153 source_refs: Iterable[DatasetRef],
2154 transfer: str = "auto",
2155 skip_missing: bool = True,
2156 register_dataset_types: bool = False,
2157 transfer_dimensions: bool = False,
2158 ) -> collections.abc.Collection[DatasetRef]:
2159 """Transfer datasets to this Butler from a run in another Butler.
2161 Parameters
2162 ----------
2163 source_butler : `LimitedButler`
2164 Butler from which the datasets are to be transferred. If data IDs
2165 in ``source_refs`` are not expanded then this has to be a full
2166 `Butler` whose registry will be used to expand data IDs.
2167 source_refs : iterable of `DatasetRef`
2168 Datasets defined in the source butler that should be transferred to
2169 this butler.
2170 transfer : `str`, optional
2171 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2172 skip_missing : `bool`
2173 If `True`, datasets with no datastore artifact associated with
2174 them are not transferred. If `False` a registry entry will be
2175 created even if no datastore record is created (and so will
2176 look equivalent to the dataset being unstored).
2177 register_dataset_types : `bool`
2178 If `True` any missing dataset types are registered. Otherwise
2179 an exception is raised.
2180 transfer_dimensions : `bool`, optional
2181 If `True`, dimension record data associated with the new datasets
2182 will be transferred.
2184 Returns
2185 -------
2186 refs : `list` of `DatasetRef`
2187 The refs added to this Butler.
2189 Notes
2190 -----
2191 A datastore artifact has to exist for it to be transferred,
2192 but non-existence is not an error (see ``skip_missing``).
2194 Datasets that already exist in this run will be skipped.
2196 The datasets are imported as part of a transaction, although
2197 dataset types are registered before the transaction is started.
2198 This means that it is possible for a dataset type to be registered
2199 even though transfer has failed.
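Examples
--------
An illustrative transfer between two repositories; the repository
path, dataset type, and collection below are hypothetical::

    source = Butler("/path/to/source/repo")
    refs = source.registry.queryDatasets("calexp", collections="HSC/runs/RC2")
    transferred = butler.transfer_from(
        source, refs, transfer="copy", register_dataset_types=True
    )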
2200 """
2201 if not self.isWriteable():
2202 raise TypeError("Butler is read-only.")
2203 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2205 # Will iterate through the refs multiple times so need to convert
2206 # to a list if this isn't a collection.
2207 if not isinstance(source_refs, collections.abc.Collection):
2208 source_refs = list(source_refs)
2210 original_count = len(source_refs)
2211 log.info("Transferring %d datasets into %s", original_count, str(self))
2213 # In some situations the datastore artifact may be missing
2214 # and we do not want that registry entry to be imported.
2215 # Asking the datastore is not sufficient; the records may have been
2216 # purged, so we have to ask for the (predicted) URI and check
2217 # existence explicitly. Execution butler is set up exactly like
2218 # this with no datastore records.
2219 artifact_existence: Dict[ResourcePath, bool] = {}
2220 if skip_missing:
2221 dataset_existence = source_butler.datastore.mexists(
2222 source_refs, artifact_existence=artifact_existence
2223 )
2224 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2225 filtered_count = len(source_refs)
2226 n_missing = original_count - filtered_count
2227 log.verbose(
2228 "%d dataset%s removed because the artifact does not exist. Now have %d.",
2229 n_missing,
2230 "" if n_missing == 1 else "s",
2231 filtered_count,
2232 )
2234 # Importing requires that we group the refs by dataset type and run
2235 # before doing the import.
2236 source_dataset_types = set()
2237 grouped_refs = defaultdict(list)
2238 for ref in source_refs:
2239 grouped_refs[ref.datasetType, ref.run].append(ref)
2240 source_dataset_types.add(ref.datasetType)
2242 # Check to see if the dataset type in the source butler has
2243 # the same definition in the target butler and register missing
2244 # ones if requested. Registration must happen outside a transaction.
2245 newly_registered_dataset_types = set()
2246 for datasetType in source_dataset_types:
2247 if register_dataset_types:
2248 # Let this raise immediately if inconsistent. Continuing
2249 # on to find additional inconsistent dataset types
2250 # might result in additional unwanted dataset types being
2251 # registered.
2252 if self.registry.registerDatasetType(datasetType):
2253 newly_registered_dataset_types.add(datasetType)
2254 else:
2255 # If the dataset type is missing, let it fail immediately.
2256 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2257 if target_dataset_type != datasetType:
2258 raise ConflictingDefinitionError(
2259 "Source butler dataset type differs from definition"
2260 f" in target butler: {datasetType} !="
2261 f" {target_dataset_type}"
2262 )
2263 if newly_registered_dataset_types:
2264 # We may have registered some even if there were inconsistencies
2265 # but should let people know (or else remove them again).
2266 log.log(
2267 VERBOSE,
2268 "Registered the following dataset types in the target Butler: %s",
2269 ", ".join(d.name for d in newly_registered_dataset_types),
2270 )
2271 else:
2272 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2274 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2275 if transfer_dimensions:
2276 # Collect all the dimension records for these refs.
2277 # All dimensions are to be copied but the list of valid dimensions
2278 # comes from this butler's universe.
2279 elements = frozenset(
2280 element
2281 for element in self.registry.dimensions.getStaticElements()
2282 if element.hasTable() and element.viewOf is None
2283 )
2284 dataIds = set(ref.dataId for ref in source_refs)
2285 # This logic comes from saveDataIds.
2286 for dataId in dataIds:
2287 # Need an expanded record; if it is not expanded we need a full
2288 # butler with a registry (allow mocks with a registry too).
2289 if not dataId.hasRecords():
2290 if registry := getattr(source_butler, "registry", None):
2291 dataId = registry.expandDataId(dataId)
2292 else:
2293 raise TypeError("Input butler needs to be a full butler to expand DataId.")
2294 # If this butler doesn't know about a dimension in the source
2295 # butler things will break later.
2296 for record in dataId.records.values():
2297 if record is not None and record.definition in elements:
2298 dimension_records[record.definition].setdefault(record.dataId, record)
2300 handled_collections: Set[str] = set()
2302 # Do all the importing in a single transaction.
2303 with self.transaction():
2304 if dimension_records:
2305 log.verbose("Ensuring that dimension records exist for transferred datasets.")
2306 for element, r in dimension_records.items():
2307 records = [r[dataId] for dataId in r]
2308 # Assume that if the record is already present then we can
2309 # use it without having to check that the record metadata
2310 # is consistent.
2311 self.registry.insertDimensionData(element, *records, skip_existing=True)
2313 n_imported = 0
2314 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2315 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2316 ):
2317 if run not in handled_collections:
2318 # May need to create output collection. If source butler
2319 # has a registry, ask for documentation string.
2320 run_doc = None
2321 if registry := getattr(source_butler, "registry", None):
2322 run_doc = registry.getCollectionDocumentation(run)
2323 registered = self.registry.registerRun(run, doc=run_doc)
2324 handled_collections.add(run)
2325 if registered:
2326 log.log(VERBOSE, "Creating output run %s", run)
2328 n_refs = len(refs_to_import)
2329 log.verbose(
2330 "Importing %d ref%s of dataset type %s into run %s",
2331 n_refs,
2332 "" if n_refs == 1 else "s",
2333 datasetType.name,
2334 run,
2335 )
2337 # Assume we are using UUIDs and the source refs will match
2338 # those imported.
2339 imported_refs = self.registry._importDatasets(refs_to_import, expand=False)
2340 assert set(imported_refs) == set(refs_to_import)
2341 n_imported += len(imported_refs)
2343 assert len(source_refs) == n_imported
2344 log.verbose("Imported %d datasets into destination butler", n_imported)
2346 # Ask the datastore to transfer. The datastore has to check that
2347 # the source datastore is compatible with the target datastore.
2348 accepted, rejected = self.datastore.transfer_from(
2349 source_butler.datastore,
2350 source_refs,
2351 transfer=transfer,
2352 artifact_existence=artifact_existence,
2353 )
2354 if rejected:
2355 # For now, accept the registry entries but not the files.
2356 log.warning(
2357 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
2358 len(rejected),
2359 len(accepted),
2360 datasetType,
2361 run,
2362 )
2364 return source_refs
2366 def validateConfiguration(
2367 self,
2368 logFailures: bool = False,
2369 datasetTypeNames: Optional[Iterable[str]] = None,
2370 ignore: Iterable[str] | None = None,
2371 ) -> None:
2372 """Validate butler configuration.
2374 Checks that each `DatasetType` can be stored in the `Datastore`.
2376 Parameters
2377 ----------
2378 logFailures : `bool`, optional
2379 If `True`, output a log message for every validation error
2380 detected.
2381 datasetTypeNames : iterable of `str`, optional
2382 The `DatasetType` names that should be checked. This allows
2383 only a subset to be selected.
2384 ignore : iterable of `str`, optional
2385 Names of DatasetTypes to skip over. This can be used to skip
2386 known problems. If a named `DatasetType` corresponds to a
2387 composite, all components of that `DatasetType` will also be
2388 ignored.
2390 Raises
2391 ------
2392 ButlerValidationError
2393 Raised if there is some inconsistency with how this Butler
2394 is configured.
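Examples
--------
A sketch of validating a subset of dataset types while logging every
failure; the dataset type names below are hypothetical::

    butler.validateConfiguration(
        logFailures=True,
        datasetTypeNames=["raw", "calexp"],
        ignore=["packages"],
    )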
2395 """
2396 if datasetTypeNames:
2397 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2398 else:
2399 datasetTypes = list(self.registry.queryDatasetTypes())
2401 # filter out anything from the ignore list
2402 if ignore:
2403 ignore = set(ignore)
2404 datasetTypes = [
2405 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2406 ]
2407 else:
2408 ignore = set()
2410 # Find all the registered instruments
2411 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2413 # For each datasetType that has an instrument dimension, create
2414 # a DatasetRef for each defined instrument
2415 datasetRefs = []
2417 for datasetType in datasetTypes:
2418 if "instrument" in datasetType.dimensions:
2419 for instrument in instruments:
2420 datasetRef = DatasetRef(
2421 datasetType,
2422 {"instrument": instrument}, # type: ignore
2423 conform=False,
2424 run="validate",
2425 )
2426 datasetRefs.append(datasetRef)
2428 entities: List[Union[DatasetType, DatasetRef]] = []
2429 entities.extend(datasetTypes)
2430 entities.extend(datasetRefs)
2432 datastoreErrorStr = None
2433 try:
2434 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2435 except ValidationError as e:
2436 datastoreErrorStr = str(e)
2438 # Also check that the LookupKeys used by the datastores match
2439 # registry and storage class definitions
2440 keys = self.datastore.getLookupKeys()
2442 failedNames = set()
2443 failedDataId = set()
2444 for key in keys:
2445 if key.name is not None:
2446 if key.name in ignore:
2447 continue
2449 # skip if specific datasetType names were requested and this
2450 # name does not match
2451 if datasetTypeNames and key.name not in datasetTypeNames:
2452 continue
2454 # See if it is a StorageClass or a DatasetType
2455 if key.name in self.storageClasses:
2456 pass
2457 else:
2458 try:
2459 self.registry.getDatasetType(key.name)
2460 except KeyError:
2461 if logFailures:
2462 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2463 failedNames.add(key)
2464 else:
2465 # Dimensions are checked for consistency when the Butler
2466 # is created and rendezvoused with a universe.
2467 pass
2469 # Check that the instrument is a valid instrument
2470 # Currently we only support instrument, so check for that
2471 if key.dataId:
2472 dataIdKeys = set(key.dataId)
2473 if set(["instrument"]) != dataIdKeys:
2474 if logFailures:
2475 log.critical("Key '%s' has unsupported DataId override", key)
2476 failedDataId.add(key)
2477 elif key.dataId["instrument"] not in instruments:
2478 if logFailures:
2479 log.critical("Key '%s' has unknown instrument", key)
2480 failedDataId.add(key)
2482 messages = []
2484 if datastoreErrorStr:
2485 messages.append(datastoreErrorStr)
2487 for failed, msg in (
2488 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2489 (failedDataId, "Keys with bad DataId entries: "),
2490 ):
2491 if failed:
2492 msg += ", ".join(str(k) for k in failed)
2493 messages.append(msg)
2495 if messages:
2496 raise ValidationError(";\n".join(messages))
2498 @property
2499 def collections(self) -> Sequence[str]:
2500 """The collections to search by default, in order
2501 (`Sequence` [ `str` ]).
2503 This is an alias for ``self.registry.defaults.collections``. It cannot
2504 be set directly in isolation, but all defaults may be changed together
2505 by assigning a new `RegistryDefaults` instance to
2506 ``self.registry.defaults``.
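For example (the collection and run names below are illustrative)::

    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/someone/scratch"
    )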
2507 """
2508 return self.registry.defaults.collections
2510 @property
2511 def run(self) -> Optional[str]:
2512 """Name of the run this butler writes outputs to by default (`str` or
2513 `None`).
2515 This is an alias for ``self.registry.defaults.run``. It cannot be set
2516 directly in isolation, but all defaults may be changed together by
2517 assigning a new `RegistryDefaults` instance to
2518 ``self.registry.defaults``.
2519 """
2520 return self.registry.defaults.run
2522 @property
2523 def dimensions(self) -> DimensionUniverse:
2524 # Docstring inherited.
2525 return self.registry.dimensions
2527 registry: Registry
2528 """The object that manages dataset metadata and relationships (`Registry`).
2530 Most operations that don't involve reading or writing butler datasets are
2531 accessible only via `Registry` methods.
2532 """
2534 datastore: Datastore
2535 """The object that manages actual dataset storage (`Datastore`).
2537 Direct user access to the datastore should rarely be necessary; the primary
2538 exception is the case where a `Datastore` implementation provides extra
2539 functionality beyond what the base class defines.
2540 """
2542 storageClasses: StorageClassFactory
2543 """An object that maps known storage class names to objects that fully
2544 describe them (`StorageClassFactory`).
2545 """
2547 _allow_put_of_predefined_dataset: bool
2548 """Allow a put to succeed even if there is already a registry entry for it
2549 but not a datastore record. (`bool`)."""