Coverage for python/lsst/daf/butler/_butler.py: 8% (733 statements); coverage.py v7.2.5, created at 2023-05-06 09:33 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30)
32import collections.abc
33import contextlib
34import io
35import logging
36import numbers
37import os
38import uuid
39import warnings
40from collections import defaultdict
41from typing import (
42 TYPE_CHECKING,
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Sequence,
53 Set,
54 TextIO,
55 Tuple,
56 Type,
57 Union,
58)
60from deprecated.sphinx import deprecated
61from lsst.resources import ResourcePath, ResourcePathExpression
62from lsst.utils import doImportType
63from lsst.utils.introspection import get_class_of
64from lsst.utils.logging import VERBOSE, getLogger
65from sqlalchemy.exc import IntegrityError
67from ._butlerConfig import ButlerConfig
68from ._butlerRepoIndex import ButlerRepoIndex
69from ._deferredDatasetHandle import DeferredDatasetHandle
70from ._limited_butler import LimitedButler
71from .core import (
72 AmbiguousDatasetError,
73 Config,
74 ConfigSubset,
75 DataCoordinate,
76 DataId,
77 DataIdValue,
78 DatasetIdFactory,
79 DatasetIdGenEnum,
80 DatasetRef,
81 DatasetRefURIs,
82 DatasetType,
83 Datastore,
84 Dimension,
85 DimensionConfig,
86 DimensionElement,
87 DimensionRecord,
88 DimensionUniverse,
89 FileDataset,
90 Progress,
91 StorageClass,
92 StorageClassFactory,
93 Timespan,
94 UnresolvedRefWarning,
95 ValidationError,
96)
97from .core.repoRelocation import BUTLER_ROOT_TAG
98from .core.utils import transactional
99from .registry import (
100 CollectionType,
101 ConflictingDefinitionError,
102 DataIdError,
103 MissingDatasetTypeError,
104 Registry,
105 RegistryConfig,
106 RegistryDefaults,
107)
108from .transfers import RepoExportContext
110if TYPE_CHECKING:
111 from lsst.resources import ResourceHandleProtocol
113log = getLogger(__name__)
116class ButlerValidationError(ValidationError):
117 """There is a problem with the Butler configuration."""
119 pass
122class Butler(LimitedButler):
123 """Main entry point for the data access system.
125 Parameters
126 ----------
127 config : `ButlerConfig`, `Config` or `str`, optional
128 Configuration. Anything acceptable to the
129 `ButlerConfig` constructor. If a directory path
130 is given the configuration will be read from a ``butler.yaml`` file in
131 that location. If `None` is given default values will be used.
132 butler : `Butler`, optional
133 If provided, construct a new Butler that uses the same registry and
134 datastore as the given one, but with the given collection and run.
135 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
136 arguments.
137 collections : `str` or `Iterable` [ `str` ], optional
138 An expression specifying the collections to be searched (in order) when
139 reading datasets.
140 This may be a `str` collection name or an iterable thereof.
141 See :ref:`daf_butler_collection_expressions` for more information.
142 These collections are not registered automatically and must be
143 manually registered before they are used by any method, but they may be
144 manually registered after the `Butler` is initialized.
145 run : `str`, optional
146 Name of the `~CollectionType.RUN` collection new datasets should be
147 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
148 ``collections`` will be set to ``[run]``. If not `None`, this
149 collection will automatically be registered. If this is not set (and
150 ``writeable`` is not set either), a read-only butler will be created.
151 searchPaths : `list` of `str`, optional
152 Directory paths to search when calculating the full Butler
153 configuration. Not used if the supplied config is already a
154 `ButlerConfig`.
155 writeable : `bool`, optional
156 Explicitly sets whether the butler supports write operations. If not
157 provided, a read-write butler is created if any of ``run``, ``tags``,
158 or ``chains`` is non-empty.
159 inferDefaults : `bool`, optional
160 If `True` (default) infer default data ID values from the values
161 present in the datasets in ``collections``: if all collections have the
162 same value (or no value) for a governor dimension, that value will be
163 the default for that dimension. Nonexistent collections are ignored.
164 If a default value is provided explicitly for a governor dimension via
165 ``**kwargs``, no default will be inferred for that dimension.
166 **kwargs : `str`
167 Default data ID key-value pairs. These may only identify "governor"
168 dimensions like ``instrument`` and ``skymap``.
170 Examples
171 --------
172 While there are many ways to control exactly how a `Butler` interacts with
173 the collections in its `Registry`, the most common cases are still simple.
175 For a read-only `Butler` that searches one collection, do::
177 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
179 For a read-write `Butler` that writes to and reads from a
180 `~CollectionType.RUN` collection::
182 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
184 The `Butler` passed to a ``PipelineTask`` is often much more complex,
185 because we want to write to one `~CollectionType.RUN` collection but read
186 from several others (as well)::
188 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
189 collections=["u/alice/DM-50000/a",
190 "u/bob/DM-49998",
191 "HSC/defaults"])
193 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
194 Datasets will be read first from that run (since it appears first in the
195 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
197 Finally, one can always create a `Butler` with no collections::
199 butler = Butler("/path/to/repo", writeable=True)
201 This can be extremely useful when you just want to use ``butler.registry``,
202 e.g. for inserting dimension data or managing collections, or when the
203 collections you want to use with the butler are not consistent.
204 Passing ``writeable`` explicitly here is only necessary if you want to be
205 able to make changes to the repo - usually the value for ``writeable`` can
206 be guessed from the collection arguments provided, but it defaults to
207 `False` when there are no collection arguments.
208 """
210 def __init__(
211 self,
212 config: Union[Config, str, None] = None,
213 *,
214 butler: Optional[Butler] = None,
215 collections: Any = None,
216 run: Optional[str] = None,
217 searchPaths: Optional[List[str]] = None,
218 writeable: Optional[bool] = None,
219 inferDefaults: bool = True,
220 **kwargs: str,
221 ):
222 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
223 # Load registry, datastore, etc. from config or existing butler.
224 if butler is not None:
225 if config is not None or searchPaths is not None or writeable is not None:
226 raise TypeError(
227 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
228 )
229 self.registry = butler.registry.copy(defaults)
230 self.datastore = butler.datastore
231 self.storageClasses = butler.storageClasses
232 self._config: ButlerConfig = butler._config
233 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
234 else:
235 # Can only look for strings in the known repos list.
236 if isinstance(config, str) and config in self.get_known_repos():
237 config = str(self.get_repo_uri(config))
238 try:
239 self._config = ButlerConfig(config, searchPaths=searchPaths)
240 except FileNotFoundError as e:
241 if known := self.get_known_repos():
242 aliases = f"(known aliases: {', '.join(known)})"
243 else:
244 aliases = "(no known aliases)"
245 raise FileNotFoundError(f"{e} {aliases}") from e
247 try:
248 if "root" in self._config:
249 butlerRoot = self._config["root"]
250 else:
251 butlerRoot = self._config.configDir
252 if writeable is None:
253 writeable = run is not None
254 self.registry = Registry.fromConfig(
255 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
256 )
257 self.datastore = Datastore.fromConfig(
258 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
259 )
260 self.storageClasses = StorageClassFactory()
261 self.storageClasses.addFromConfig(self._config)
262 self._allow_put_of_predefined_dataset = self._config.get(
263 "allow_put_of_predefined_dataset", False
264 )
265 except Exception:
266 # Failures here usually mean that configuration is incomplete;
267 # just issue an error message that includes the config file URI.
268 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
269 raise
271 # For execution butler the datastore needs a special
272 # dependency-inversion trick. This is not used by regular butler,
273 # but we do not have a way to distinguish regular butler from execution
274 # butler.
275 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
277 if "run" in self._config or "collection" in self._config:
278 raise ValueError("Passing a run or collection via configuration is no longer supported.")
280 GENERATION: ClassVar[int] = 3
281 """This is a Generation 3 Butler.
283 This attribute may be removed in the future, once the Generation 2 Butler
284 interface has been fully retired; it should only be used in transitional
285 code.
286 """
288 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
289 """Return DatasetType defined in registry given dataset type name."""
290 try:
291 return self.registry.getDatasetType(name)
292 except MissingDatasetTypeError:
293 return None
295 @classmethod
296 def get_repo_uri(cls, label: str) -> ResourcePath:
297 """Look up the label in a butler repository index.
299 Parameters
300 ----------
301 label : `str`
302 Label of the Butler repository to look up.
304 Returns
305 -------
306 uri : `lsst.resources.ResourcePath`
307 URI to the Butler repository associated with the given label.
309 Raises
310 ------
311 KeyError
312 Raised if the label is not found in the index, or if an index
313 can not be found at all.
315 Notes
316 -----
317 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
318 information is discovered.
319 """
320 return ButlerRepoIndex.get_repo_uri(label)
322 @classmethod
323 def get_known_repos(cls) -> Set[str]:
324 """Retrieve the list of known repository labels.
326 Returns
327 -------
328 repos : `set` of `str`
329 All the known labels. Can be empty if no index can be found.
331 Notes
332 -----
333 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
334 information is discovered.
335 """
336 return ButlerRepoIndex.get_known_repos()
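# A minimal usage sketch for the two repository-index helpers above, assuming
# an index is configured and that "main" is one of its labels ("main" is a
# hypothetical alias):
#
#     from lsst.daf.butler import Butler
#
#     labels = Butler.get_known_repos()      # empty set if no index is found
#     if "main" in labels:
#         uri = Butler.get_repo_uri("main")  # lsst.resources.ResourcePath
#         butler = Butler("main")            # the constructor also accepts the alias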
338 @staticmethod
339 def makeRepo(
340 root: ResourcePathExpression,
341 config: Union[Config, str, None] = None,
342 dimensionConfig: Union[Config, str, None] = None,
343 standalone: bool = False,
344 searchPaths: Optional[List[str]] = None,
345 forceConfigRoot: bool = True,
346 outfile: Optional[ResourcePathExpression] = None,
347 overwrite: bool = False,
348 ) -> Config:
349 """Create an empty data repository by adding a butler.yaml config
350 to a repository root directory.
352 Parameters
353 ----------
354 root : `lsst.resources.ResourcePathExpression`
355 Path or URI to the root location of the new repository. Will be
356 created if it does not exist.
357 config : `Config` or `str`, optional
358 Configuration to write to the repository, after setting any
359 root-dependent Registry or Datastore config options. Can not
360 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
361 configuration will be used. Root-dependent config options
362 specified in this config are overwritten if ``forceConfigRoot``
363 is `True`.
364 dimensionConfig : `Config` or `str`, optional
365 Configuration for dimensions, used to initialize the registry
366 database.
367 standalone : `bool`
368 If True, write all expanded defaults, not just customized or
369 repository-specific settings.
370 This (mostly) decouples the repository from the default
371 configuration, insulating it from changes to the defaults (which
372 may be good or bad, depending on the nature of the changes).
373 Future *additions* to the defaults will still be picked up when
374 initializing `Butlers` to repos created with ``standalone=True``.
375 searchPaths : `list` of `str`, optional
376 Directory paths to search when calculating the full butler
377 configuration.
378 forceConfigRoot : `bool`, optional
379 If `False`, any values present in the supplied ``config`` that
380 would normally be reset are not overridden and will appear
381 directly in the output config. This allows non-standard overrides
382 of the root directory for a datastore or registry to be given.
383 If this parameter is `True` the values for ``root`` will be
384 forced into the resulting config if appropriate.
385 outfile : `lsst.resources.ResourcePathExpression`, optional
386 If not `None`, the output configuration will be written to this
387 location rather than into the repository itself. Can be a URI
388 string. Can refer to a directory that will be used to write
389 ``butler.yaml``.
390 overwrite : `bool`, optional
391 Create a new configuration file even if one already exists
392 in the specified output location. Default is to raise
393 an exception.
395 Returns
396 -------
397 config : `Config`
398 The updated `Config` instance written to the repo.
400 Raises
401 ------
402 ValueError
403 Raised if a ButlerConfig or ConfigSubset is passed instead of a
404 regular Config (as these subclasses would make it impossible to
405 support ``standalone=False``).
406 FileExistsError
407 Raised if the output config file already exists.
408 os.error
409 Raised if the directory does not exist, exists but is not a
410 directory, or cannot be created.
412 Notes
413 -----
414 Note that when ``standalone=False`` (the default), the configuration
415 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
416 construct the repository should also be used to construct any Butlers
417 to avoid configuration inconsistencies.
418 """
419 if isinstance(config, (ButlerConfig, ConfigSubset)):
420 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
422 # Ensure that the root of the repository exists or can be made
423 root_uri = ResourcePath(root, forceDirectory=True)
424 root_uri.mkdir()
426 config = Config(config)
428 # If we are creating a new repo from scratch with relative roots,
429 # do not propagate an explicit root from the config file
430 if "root" in config:
431 del config["root"]
433 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
434 imported_class = doImportType(full["datastore", "cls"])
435 if not issubclass(imported_class, Datastore):
436 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
437 datastoreClass: Type[Datastore] = imported_class
438 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
440 # if key exists in given config, parse it, otherwise parse the defaults
441 # in the expanded config
442 if config.get(("registry", "db")):
443 registryConfig = RegistryConfig(config)
444 else:
445 registryConfig = RegistryConfig(full)
446 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
447 if defaultDatabaseUri is not None:
448 Config.updateParameters(
449 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
450 )
451 else:
452 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
454 if standalone:
455 config.merge(full)
456 else:
457 # Always expand the registry.managers section into the per-repo
458 # config, because after the database schema is created, it's not
459 # allowed to change anymore. Note that in the standalone=True
460 # branch, _everything_ in the config is expanded, so there's no
461 # need to special case this.
462 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
463 configURI: ResourcePathExpression
464 if outfile is not None:
465 # When writing to a separate location we must include
466 # the root of the butler repo in the config else it won't know
467 # where to look.
468 config["root"] = root_uri.geturl()
469 configURI = outfile
470 else:
471 configURI = root_uri
472 # Strip obscore configuration, if it is present, before writing config
473 # to a file; the obscore config will be stored in the registry.
474 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
475 config_to_write = config.copy()
476 del config_to_write[obscore_config_key]
477 config_to_write.dumpToUri(configURI, overwrite=overwrite)
478 # The configFile attribute is updated; copy it back to the original.
479 config.configFile = config_to_write.configFile
480 else:
481 config.dumpToUri(configURI, overwrite=overwrite)
483 # Create Registry and populate tables
484 registryConfig = RegistryConfig(config.get("registry"))
485 dimensionConfig = DimensionConfig(dimensionConfig)
486 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
488 log.verbose("Wrote new Butler configuration file to %s", configURI)
490 return config
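# A minimal sketch of creating and then opening a new repository with the
# method above; the path is hypothetical and the default dimension and
# datastore configuration are assumed to be acceptable:
#
#     from lsst.daf.butler import Butler
#
#     Butler.makeRepo("/tmp/demo_repo")                  # writes butler.yaml, creates the registry
#     butler = Butler("/tmp/demo_repo", writeable=True)  # open the new, empty repo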
492 @classmethod
493 def _unpickle(
494 cls,
495 config: ButlerConfig,
496 collections: Optional[tuple[str, ...]],
497 run: Optional[str],
498 defaultDataId: Dict[str, str],
499 writeable: bool,
500 ) -> Butler:
501 """Callable used to unpickle a Butler.
503 We prefer not to use ``Butler.__init__`` directly so we can force some
504 of its many arguments to be keyword-only (note that ``__reduce__``
505 can only invoke callables with positional arguments).
507 Parameters
508 ----------
509 config : `ButlerConfig`
510 Butler configuration, already coerced into a true `ButlerConfig`
511 instance (and hence after any search paths for overrides have been
512 utilized).
513 collections : `tuple` [ `str` ]
514 Names of the default collections to read from.
515 run : `str`, optional
516 Name of the default `~CollectionType.RUN` collection to write to.
517 defaultDataId : `dict` [ `str`, `str` ]
518 Default data ID values.
519 writeable : `bool`
520 Whether the Butler should support write operations.
522 Returns
523 -------
524 butler : `Butler`
525 A new `Butler` instance.
526 """
527 # MyPy doesn't recognize that the kwargs below are totally valid; it
528 # seems to think ``**defaultDataId`` is a _positional_ argument!
529 return cls(
530 config=config,
531 collections=collections,
532 run=run,
533 writeable=writeable,
534 **defaultDataId, # type: ignore
535 )
537 def __reduce__(self) -> tuple:
538 """Support pickling."""
539 return (
540 Butler._unpickle,
541 (
542 self._config,
543 self.collections,
544 self.run,
545 self.registry.defaults.dataId.byName(),
546 self.registry.isWriteable(),
547 ),
548 )
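# A small sketch of the pickling support implemented by ``__reduce__`` and
# ``_unpickle`` above: round-tripping a Butler preserves its configuration,
# default collections/run, default data ID, and writeability (the repo path
# and collection name are hypothetical):
#
#     import pickle
#
#     butler = Butler("/tmp/demo_repo", collections=["u/alice/DM-50000"])
#     clone = pickle.loads(pickle.dumps(butler))  # a new Butler with the same defaults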
550 def __str__(self) -> str:
551 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
552 self.collections, self.run, self.datastore, self.registry
553 )
555 def isWriteable(self) -> bool:
556 """Return `True` if this `Butler` supports write operations."""
557 return self.registry.isWriteable()
559 @contextlib.contextmanager
560 def transaction(self) -> Iterator[None]:
561 """Context manager supporting `Butler` transactions.
563 Transactions can be nested.
564 """
565 with self.registry.transaction():
566 with self.datastore.transaction():
567 yield
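# A minimal sketch of the transaction context manager above: if the second
# put raises, both the registry and datastore changes from the first are
# rolled back (dataset type names and data ID values are hypothetical):
#
#     with butler.transaction():
#         butler.put(catalog, "src", visit=903334, detector=20)
#         butler.put(summary, "src_summary", visit=903334)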
569 def _standardizeArgs(
570 self,
571 datasetRefOrType: Union[DatasetRef, DatasetType, str],
572 dataId: Optional[DataId] = None,
573 for_put: bool = True,
574 **kwargs: Any,
575 ) -> Tuple[DatasetType, Optional[DataId]]:
576 """Standardize the arguments passed to several Butler APIs.
578 Parameters
579 ----------
580 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
581 When `DatasetRef` the `dataId` should be `None`.
582 Otherwise the `DatasetType` or name thereof.
583 dataId : `dict` or `DataCoordinate`
584 A `dict` of `Dimension` link name, value pairs that label the
585 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
586 should be provided as the second argument.
587 for_put : `bool`, optional
588 If `True` this call is invoked as part of a `Butler.put()`.
589 Otherwise it is assumed to be part of a `Butler.get()`. This
590 parameter is only relevant if there is dataset type
591 inconsistency.
592 **kwargs
593 Additional keyword arguments used to augment or construct a
594 `DataCoordinate`. See `DataCoordinate.standardize`
595 parameters.
597 Returns
598 -------
599 datasetType : `DatasetType`
600 A `DatasetType` instance extracted from ``datasetRefOrType``.
601 dataId : `dict` or `DataId`, optional
602 Argument that can be used (along with ``kwargs``) to construct a
603 `DataId`.
605 Notes
606 -----
607 Butler APIs that conceptually need a DatasetRef also allow passing a
608 `DatasetType` (or the name of one) and a `DataId` (or a dict and
609 keyword arguments that can be used to construct one) separately. This
610 method accepts those arguments and always returns a true `DatasetType`
611 and a `DataId` or `dict`.
613 Standardization of `dict` vs `DataId` is best handled by passing the
614 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
615 generally similarly flexible.
616 """
617 externalDatasetType: Optional[DatasetType] = None
618 internalDatasetType: Optional[DatasetType] = None
619 if isinstance(datasetRefOrType, DatasetRef):
620 if dataId is not None or kwargs:
621 raise ValueError("DatasetRef given, cannot use dataId as well")
622 externalDatasetType = datasetRefOrType.datasetType
623 dataId = datasetRefOrType.dataId
624 else:
625 # Don't check whether DataId is provided, because Registry APIs
626 # can usually construct a better error message when it wasn't.
627 if isinstance(datasetRefOrType, DatasetType):
628 externalDatasetType = datasetRefOrType
629 else:
630 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
632 # Check that they are self-consistent
633 if externalDatasetType is not None:
634 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
635 if externalDatasetType != internalDatasetType:
636 # We can allow differences if they are compatible, depending
637 # on whether this is a get or a put. A get requires that
638 # the python type associated with the datastore can be
639 # converted to the user type. A put requires that the user
640 # supplied python type can be converted to the internal
641 # type expected by registry.
642 relevantDatasetType = internalDatasetType
643 if for_put:
644 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
645 else:
646 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
647 relevantDatasetType = externalDatasetType
648 if not is_compatible:
649 raise ValueError(
650 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
651 f"registry definition ({internalDatasetType})"
652 )
653 # Override the internal definition.
654 internalDatasetType = relevantDatasetType
656 assert internalDatasetType is not None
657 return internalDatasetType, dataId
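# The standardization above is what lets the public APIs accept either a
# DatasetRef or a dataset type plus a data ID. A small sketch of the two
# equivalent call styles (dataset type name and data ID values are
# hypothetical):
#
#     obj = butler.get(ref)                                            # resolved DatasetRef
#     obj = butler.get("calexp", instrument="HSC", visit=903334, detector=20)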
659 def _rewrite_data_id(
660 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
661 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
662 """Rewrite a data ID taking into account dimension records.
664 Take a Data ID and keyword args and rewrite it if necessary to
665 allow the user to specify dimension records rather than dimension
666 primary values.
668 This allows a user to include a dataId dict with keys of
669 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
670 the integer exposure ID. It also allows a string to be given
671 for a dimension value rather than the integer ID if that is more
672 convenient. For example, rather than having to specify the
673 detector with ``detector.full_name``, a string given for ``detector``
674 will be interpreted as the full name and converted to the integer
675 value.
677 Keyword arguments can also use strings for dimensions like detector
678 and exposure but python does not allow them to include ``.`` and
679 so the ``exposure.day_obs`` syntax can not be used in a keyword
680 argument.
682 Parameters
683 ----------
684 dataId : `dict` or `DataCoordinate`
685 A `dict` of `Dimension` link name, value pairs that will label the
686 `DatasetRef` within a Collection.
687 datasetType : `DatasetType`
688 The dataset type associated with this dataId. Required to
689 determine the relevant dimensions.
690 **kwargs
691 Additional keyword arguments used to augment or construct a
692 `DataId`. See `DataId` parameters.
694 Returns
695 -------
696 dataId : `dict` or `DataCoordinate`
697 The dataId, possibly rewritten. If given a `DataCoordinate` and
698 no keyword arguments, the original dataId will be returned
699 unchanged.
700 **kwargs : `dict`
701 Any unused keyword arguments (would normally be empty dict).
702 """
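# A sketch of the rewriting described above; the dataset type, instrument,
# and data ID values are hypothetical, and the two calls are assumed to
# resolve to the same exposure and detector:
#
#     butler.get("raw", instrument="LSSTCam", exposure=2020111300064, detector=42)
#     butler.get(
#         "raw",
#         {"exposure.day_obs": 20201113, "exposure.seq_num": 64,
#          "detector.full_name": "R11_S20"},
#         instrument="LSSTCam",
#     )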
703 # Do nothing if we have a standalone DataCoordinate.
704 if isinstance(dataId, DataCoordinate) and not kwargs:
705 return dataId, kwargs
707 # Process dimension records that are using record information
708 # rather than ids
709 newDataId: Dict[str, DataIdValue] = {}
710 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
712 # If all of the dataId comes from keyword parameters we do not need
713 # to do anything here: the keys cannot be of the form
714 # exposure.obs_id because a "." is not allowed in a keyword parameter.
715 if dataId:
716 for k, v in dataId.items():
717 # If we have a Dimension we do not need to do anything
718 # because it cannot be a compound key.
719 if isinstance(k, str) and "." in k:
720 # Someone is using a more human-readable dataId
721 dimensionName, record = k.split(".", 1)
722 byRecord[dimensionName][record] = v
723 elif isinstance(k, Dimension):
724 newDataId[k.name] = v
725 else:
726 newDataId[k] = v
728 # Go through the updated dataId and check the type in case someone is
729 # using an alternate key. We have already filtered out the compound
730 # dimension.record keys.
731 not_dimensions = {}
733 # Will need to look in the dataId and the keyword arguments
734 # and will remove them if they need to be fixed or are unrecognized.
735 for dataIdDict in (newDataId, kwargs):
736 # Use a list so we can adjust the dict safely in the loop
737 for dimensionName in list(dataIdDict):
738 value = dataIdDict[dimensionName]
739 try:
740 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
741 except KeyError:
742 # This is not a real dimension
743 not_dimensions[dimensionName] = value
744 del dataIdDict[dimensionName]
745 continue
747 # Convert an integral type to an explicit int to simplify
748 # comparisons here
749 if isinstance(value, numbers.Integral):
750 value = int(value)
752 if not isinstance(value, dimension.primaryKey.getPythonType()):
753 for alternate in dimension.alternateKeys:
754 if isinstance(value, alternate.getPythonType()):
755 byRecord[dimensionName][alternate.name] = value
756 del dataIdDict[dimensionName]
757 log.debug(
758 "Converting dimension %s to %s.%s=%s",
759 dimensionName,
760 dimensionName,
761 alternate.name,
762 value,
763 )
764 break
765 else:
766 log.warning(
767 "Type mismatch found for value '%r' provided for dimension %s. "
768 "Could not find matching alternative (primary key has type %s) "
769 "so attempting to use as-is.",
770 value,
771 dimensionName,
772 dimension.primaryKey.getPythonType(),
773 )
775 # By this point kwargs and newDataId should only include valid
776 # dimensions. Merge kwargs into the new dataId and log if there
777 # are dimensions in both (rather than calling update).
778 for k, v in kwargs.items():
779 if k in newDataId and newDataId[k] != v:
780 log.debug(
781 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
782 )
783 newDataId[k] = v
784 # No need to retain any values in kwargs now.
785 kwargs = {}
787 # If we have some unrecognized dimensions we have to try to connect
788 # them to records in other dimensions. This is made more complicated
789 # by some dimensions having records with clashing names. A mitigation
790 # is that we can tell by this point which dimensions are missing
791 # for the DatasetType but this does not work for calibrations
792 # where additional dimensions can be used to constrain the temporal
793 # axis.
794 if not_dimensions:
795 # Search for all dimensions even if we have been given a value
796 # explicitly. In some cases records are given as well as the
797 # actual dimension and this should not be an error if they
798 # match.
799 mandatoryDimensions = datasetType.dimensions.names # - provided
801 candidateDimensions: Set[str] = set()
802 candidateDimensions.update(mandatoryDimensions)
804 # For calibrations we may well be needing temporal dimensions
805 # so rather than always including all dimensions in the scan
806 # restrict things a little. It is still possible for there
807 # to be confusion over day_obs in visit vs exposure for example.
808 # If we are not searching calibration collections things may
809 # fail but they are going to fail anyway because of the
810 # ambiguity of the dataId...
811 if datasetType.isCalibration():
812 for dim in self.registry.dimensions.getStaticDimensions():
813 if dim.temporal:
814 candidateDimensions.add(str(dim))
816 # Look up table for the first association with a dimension
817 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
819 # Keep track of whether an item is associated with multiple
820 # dimensions.
821 counter: Counter[str] = Counter()
822 assigned: Dict[str, Set[str]] = defaultdict(set)
824 # Go through the missing dimensions and associate the
825 # given names with records within those dimensions
826 matched_dims = set()
827 for dimensionName in candidateDimensions:
828 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
829 fields = dimension.metadata.names | dimension.uniqueKeys.names
830 for field in not_dimensions:
831 if field in fields:
832 guessedAssociation[dimensionName][field] = not_dimensions[field]
833 counter[dimensionName] += 1
834 assigned[field].add(dimensionName)
835 matched_dims.add(field)
837 # Calculate the fields that matched nothing.
838 never_found = set(not_dimensions) - matched_dims
840 if never_found:
841 raise ValueError(f"Unrecognized keyword args given: {never_found}")
843 # There is a chance we have allocated a single dataId item
844 # to multiple dimensions. Need to decide which should be retained.
845 # For now assume that the most popular alternative wins.
846 # This means that day_obs with seq_num will result in
847 # exposure.day_obs and not visit.day_obs
848 # Also prefer an explicitly missing dimension over an inferred
849 # temporal dimension.
850 for fieldName, assignedDimensions in assigned.items():
851 if len(assignedDimensions) > 1:
852 # Pick the most popular (preferring mandatory dimensions)
853 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
854 if requiredButMissing:
855 candidateDimensions = requiredButMissing
856 else:
857 candidateDimensions = assignedDimensions
859 # If this is a choice between visit and exposure and
860 # neither was a required part of the dataset type,
861 # (hence in this branch) always prefer exposure over
862 # visit since exposures are always defined and visits
863 # are defined from exposures.
864 if candidateDimensions == {"exposure", "visit"}:
865 candidateDimensions = {"exposure"}
867 # Select the relevant items and get a new restricted
868 # counter.
869 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
870 duplicatesCounter: Counter[str] = Counter()
871 duplicatesCounter.update(theseCounts)
873 # Choose the most common. If they are equally common
874 # we will pick the one that was found first.
875 # Returns a list of tuples
876 selected = duplicatesCounter.most_common(1)[0][0]
878 log.debug(
879 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
880 " Removed ambiguity by choosing dimension %s.",
881 fieldName,
882 ", ".join(assignedDimensions),
883 selected,
884 )
886 for candidateDimension in assignedDimensions:
887 if candidateDimension != selected:
888 del guessedAssociation[candidateDimension][fieldName]
890 # Update the record look up dict with the new associations
891 for dimensionName, values in guessedAssociation.items():
892 if values: # A dict might now be empty
893 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
894 byRecord[dimensionName].update(values)
896 if byRecord:
897 # Some record specifiers were found so we need to convert
898 # them to the Id form
899 for dimensionName, values in byRecord.items():
900 if dimensionName in newDataId:
901 log.debug(
902 "DataId specified explicit %s dimension value of %s in addition to"
903 " general record specifiers for it of %s. Ignoring record information.",
904 dimensionName,
905 newDataId[dimensionName],
906 str(values),
907 )
908 # Get the actual record and compare with these values.
909 try:
910 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
911 except DataIdError:
912 raise ValueError(
913 f"Could not find dimension '{dimensionName}'"
914 f" with dataId {newDataId} as part of comparing with"
915 f" record values {byRecord[dimensionName]}"
916 ) from None
917 if len(recs) == 1:
918 errmsg: List[str] = []
919 for k, v in values.items():
920 if (recval := getattr(recs[0], k)) != v:
921 errmsg.append(f"{k}({recval} != {v})")
922 if errmsg:
923 raise ValueError(
924 f"Dimension {dimensionName} in dataId has explicit value"
925 " inconsistent with records: " + ", ".join(errmsg)
926 )
927 else:
928 # Multiple matches for an explicit dimension
929 # should never happen but let downstream complain.
930 pass
931 continue
933 # Build up a WHERE expression
934 bind = {k: v for k, v in values.items()}
935 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
937 # Hopefully we get a single record that matches
938 records = set(
939 self.registry.queryDimensionRecords(
940 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
941 )
942 )
944 if len(records) != 1:
945 if len(records) > 1:
946 # visit can have an ambiguous answer without involving
947 # visit_system. The default visit_system is defined
948 # by the instrument.
949 if (
950 dimensionName == "visit"
951 and "visit_system_membership" in self.registry.dimensions
952 and "visit_system" in self.registry.dimensions["instrument"].metadata
953 ):
954 instrument_records = list(
955 self.registry.queryDimensionRecords(
956 "instrument",
957 dataId=newDataId,
958 **kwargs,
959 )
960 )
961 if len(instrument_records) == 1:
962 visit_system = instrument_records[0].visit_system
963 if visit_system is None:
964 # Set to a value that will never match.
965 visit_system = -1
967 # Look up each visit in the
968 # visit_system_membership records.
969 for rec in records:
970 membership = list(
971 self.registry.queryDimensionRecords(
972 # Use bind to allow zero results.
973 # This is a fully-specified query.
974 "visit_system_membership",
975 where="instrument = inst AND visit_system = system AND visit = v",
976 bind=dict(
977 inst=instrument_records[0].name, system=visit_system, v=rec.id
978 ),
979 )
980 )
981 if membership:
982 # This record is the right answer.
983 records = set([rec])
984 break
986 # The ambiguity may have been resolved so check again.
987 if len(records) > 1:
988 log.debug("Received %d records from constraints of %s", len(records), str(values))
989 for r in records:
990 log.debug("- %s", str(r))
991 raise ValueError(
992 f"DataId specification for dimension {dimensionName} is not"
993 f" uniquely constrained to a single dataset by {values}."
994 f" Got {len(records)} results."
995 )
996 else:
997 raise ValueError(
998 f"DataId specification for dimension {dimensionName} matched no"
999 f" records when constrained by {values}"
1000 )
1002 # Get the primary key from the real dimension object
1003 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
1004 if not isinstance(dimension, Dimension):
1005 raise RuntimeError(
1006 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
1007 )
1008 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
1010 return newDataId, kwargs
1012 def _findDatasetRef(
1013 self,
1014 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1015 dataId: Optional[DataId] = None,
1016 *,
1017 collections: Any = None,
1018 allowUnresolved: bool = False,
1019 **kwargs: Any,
1020 ) -> DatasetRef:
1021 """Shared logic for methods that start with a search for a dataset in
1022 the registry.
1024 Parameters
1025 ----------
1026 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1027 When `DatasetRef` the `dataId` should be `None`.
1028 Otherwise the `DatasetType` or name thereof.
1029 dataId : `dict` or `DataCoordinate`, optional
1030 A `dict` of `Dimension` link name, value pairs that label the
1031 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1032 should be provided as the first argument.
1033 collections : Any, optional
1034 Collections to be searched, overriding ``self.collections``.
1035 Can be any of the types supported by the ``collections`` argument
1036 to butler construction.
1037 allowUnresolved : `bool`, optional
1038 If `True`, return an unresolved `DatasetRef` if finding a resolved
1039 one in the `Registry` fails. Defaults to `False`.
1040 **kwargs
1041 Additional keyword arguments used to augment or construct a
1042 `DataId`. See `DataId` parameters.
1044 Returns
1045 -------
1046 ref : `DatasetRef`
1047 A reference to the dataset identified by the given arguments.
1048 This can be the same dataset reference as given if it was
1049 resolved.
1051 Raises
1052 ------
1053 LookupError
1054 Raised if no matching dataset exists in the `Registry` (and
1055 ``allowUnresolved is False``).
1056 ValueError
1057 Raised if a resolved `DatasetRef` was passed as an input, but it
1058 differs from the one found in the registry.
1059 TypeError
1060 Raised if no collections were provided.
1061 """
1062 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1063 if isinstance(datasetRefOrType, DatasetRef):
1064 idNumber = datasetRefOrType.id
1065 # This is a resolved ref, return it immediately.
1066 if idNumber:
1067 return datasetRefOrType
1068 else:
1069 idNumber = None
1070 timespan: Optional[Timespan] = None
1072 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1074 if datasetType.isCalibration():
1075 # Because this is a calibration dataset, first try to
1076 # standardize the data ID without restricting the dimensions to
1077 # those of the dataset type requested, because there may be extra
1078 # dimensions that provide temporal information for a validity-range
1079 # lookup.
1080 dataId = DataCoordinate.standardize(
1081 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1082 )
1083 if dataId.graph.temporal:
1084 dataId = self.registry.expandDataId(dataId)
1085 timespan = dataId.timespan
1086 else:
1087 # Standardize the data ID to just the dimensions of the dataset
1088 # type instead of letting registry.findDataset do it, so we get the
1089 # result even if no dataset is found.
1090 dataId = DataCoordinate.standardize(
1091 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1092 )
1093 # Always lookup the DatasetRef, even if one is given, to ensure it is
1094 # present in the current collection.
1095 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1096 if ref is None:
1097 if allowUnresolved:
1098 with warnings.catch_warnings():
1099 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
1100 return DatasetRef(datasetType, dataId)
1101 else:
1102 if collections is None:
1103 collections = self.registry.defaults.collections
1104 raise LookupError(
1105 f"Dataset {datasetType.name} with data ID {dataId} "
1106 f"could not be found in collections {collections}."
1107 )
1108 if idNumber is not None and idNumber != ref.id:
1109 if collections is None:
1110 collections = self.registry.defaults.collections
1111 raise ValueError(
1112 f"DatasetRef.id provided ({idNumber}) does not match "
1113 f"id ({ref.id}) in registry in collections {collections}."
1114 )
1115 if datasetType != ref.datasetType:
1116 # If they differ it is because the user explicitly specified
1117 # a compatible dataset type to this call rather than using the
1118 # registry definition. The DatasetRef must therefore be recreated
1119 # using the user definition such that the expected type is
1120 # returned.
1121 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1123 return ref
1125 @transactional
1126 @deprecated(
1127 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
1128 " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
1129 " were relying on the run parameter to determine the run."
1130 " Will be removed after v27.0.",
1131 version="v26.0",
1132 category=FutureWarning,
1133 )
1134 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
1135 # Docstring inherited.
1136 return self.put(obj, ref)
1138 @transactional
1139 def put(
1140 self,
1141 obj: Any,
1142 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1143 /,
1144 dataId: Optional[DataId] = None,
1145 *,
1146 run: Optional[str] = None,
1147 **kwargs: Any,
1148 ) -> DatasetRef:
1149 """Store and register a dataset.
1151 Parameters
1152 ----------
1153 obj : `object`
1154 The dataset.
1155 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1156 When `DatasetRef` is provided, ``dataId`` should be `None`.
1157 Otherwise the `DatasetType` or name thereof. If a fully resolved
1158 `DatasetRef` is given the run and ID are used directly.
1159 dataId : `dict` or `DataCoordinate`
1160 A `dict` of `Dimension` link name, value pairs that label the
1161 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1162 should be provided as the second argument.
1163 run : `str`, optional
1164 The name of the run the dataset should be added to, overriding
1165 ``self.run``. Not used if a resolved `DatasetRef` is provided.
1166 **kwargs
1167 Additional keyword arguments used to augment or construct a
1168 `DataCoordinate`. See `DataCoordinate.standardize`
1169 parameters. Not used if a resolved `DatasetRef` is provided.
1171 Returns
1172 -------
1173 ref : `DatasetRef`
1174 A reference to the stored dataset, updated with the correct id if
1175 given.
1177 Raises
1178 ------
1179 TypeError
1180 Raised if the butler is read-only or if no run has been provided.
1181 """
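# A minimal sketch of the two supported call styles for put (dataset type
# name, data ID values, and run are hypothetical):
#
#     ref = butler.put(exposure, "calexp", instrument="HSC", visit=903334,
#                      detector=20, run="u/alice/DM-50000/a")
#     butler.put(exposure, resolved_ref)   # run and dataset ID taken from the ref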
1182 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1183 # This is a direct put of predefined DatasetRef.
1184 log.debug("Butler put direct: %s", datasetRefOrType)
1185 (imported_ref,) = self.registry._importDatasets(
1186 [datasetRefOrType],
1187 expand=True,
1188 )
1189 if imported_ref.id != datasetRefOrType.getCheckedId():
1190 raise RuntimeError("This registry configuration does not support direct put of ref.")
1191 self.datastore.put(obj, datasetRefOrType)
1192 return datasetRefOrType
1194 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1195 if not self.isWriteable():
1196 raise TypeError("Butler is read-only.")
1197 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1198 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1199 raise ValueError("DatasetRef must not be in registry, must have None id")
1201 # Handle dimension records in dataId
1202 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1204 # Add Registry Dataset entry.
1205 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1207 # For an execution butler the datasets will be pre-defined.
1208 # If the butler is configured that way datasets should only be inserted
1209 # if they do not already exist in registry. Trying and catching
1210 # ConflictingDefinitionError will not work because the transaction
1211 # will be corrupted. Instead, in this mode always check first.
1212 ref = None
1213 ref_is_predefined = False
1214 if self._allow_put_of_predefined_dataset:
1215 # Get the matching ref for this run.
1216 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1218 if ref:
1219 # Must be expanded form for datastore templating
1220 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1221 ref = ref.expanded(dataId)
1222 ref_is_predefined = True
1224 if not ref:
1225 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1227 # If the ref is predefined it is possible that the datastore also
1228 # has the record. Asking datastore to put it again will result in
1229 the artifact being recreated, overwriting the previous one, and then
1230 a failure when writing the record, which will cause the artifact
1231 # to be removed. Much safer to ask first before attempting to
1232 # overwrite. Race conditions should not be an issue for the
1233 # execution butler environment.
1234 if ref_is_predefined:
1235 if self.datastore.knows(ref):
1236 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
1238 self.datastore.put(obj, ref)
1240 return ref
1242 @deprecated(
1243 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
1244 " Please use Butler.get(). Will be removed after v27.0.",
1245 version="v26.0",
1246 category=FutureWarning,
1247 )
1248 def getDirect(
1249 self,
1250 ref: DatasetRef,
1251 *,
1252 parameters: Optional[Dict[str, Any]] = None,
1253 storageClass: Optional[Union[StorageClass, str]] = None,
1254 ) -> Any:
1255 """Retrieve a stored dataset.
1257 Parameters
1258 ----------
1259 ref : `DatasetRef`
1260 Resolved reference to an already stored dataset.
1261 parameters : `dict`
1262 Additional StorageClass-defined options to control reading,
1263 typically used to efficiently read only a subset of the dataset.
1264 storageClass : `StorageClass` or `str`, optional
1265 The storage class to be used to override the Python type
1266 returned by this method. By default the returned type matches
1267 the dataset type definition for this dataset. Specifying a
1268 read `StorageClass` can force a different type to be returned.
1269 This type must be compatible with the original type.
1271 Returns
1272 -------
1273 obj : `object`
1274 The dataset.
1275 """
1276 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1278 @deprecated(
1279 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
1280 "Please use Butler.getDeferred(). Will be removed after v27.0.",
1281 version="v26.0",
1282 category=FutureWarning,
1283 )
1284 def getDirectDeferred(
1285 self,
1286 ref: DatasetRef,
1287 *,
1288 parameters: Union[dict, None] = None,
1289 storageClass: str | StorageClass | None = None,
1290 ) -> DeferredDatasetHandle:
1291 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
1292 from a resolved `DatasetRef`.
1294 Parameters
1295 ----------
1296 ref : `DatasetRef`
1297 Resolved reference to an already stored dataset.
1298 parameters : `dict`
1299 Additional StorageClass-defined options to control reading,
1300 typically used to efficiently read only a subset of the dataset.
1301 storageClass : `StorageClass` or `str`, optional
1302 The storage class to be used to override the Python type
1303 returned by this method. By default the returned type matches
1304 the dataset type definition for this dataset. Specifying a
1305 read `StorageClass` can force a different type to be returned.
1306 This type must be compatible with the original type.
1308 Returns
1309 -------
1310 obj : `DeferredDatasetHandle`
1311 A handle which can be used to retrieve a dataset at a later time.
1313 Raises
1314 ------
1315 AmbiguousDatasetError
1316 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1317 """
1318 if ref.id is None:
1319 raise AmbiguousDatasetError(
1320 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1321 )
1322 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1324 def getDeferred(
1325 self,
1326 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1327 /,
1328 dataId: Optional[DataId] = None,
1329 *,
1330 parameters: Union[dict, None] = None,
1331 collections: Any = None,
1332 storageClass: str | StorageClass | None = None,
1333 **kwargs: Any,
1334 ) -> DeferredDatasetHandle:
1335 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
1336 after an immediate registry lookup.
1338 Parameters
1339 ----------
1340 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1341 When `DatasetRef` the `dataId` should be `None`.
1342 Otherwise the `DatasetType` or name thereof.
1343 dataId : `dict` or `DataCoordinate`, optional
1344 A `dict` of `Dimension` link name, value pairs that label the
1345 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1346 should be provided as the first argument.
1347 parameters : `dict`
1348 Additional StorageClass-defined options to control reading,
1349 typically used to efficiently read only a subset of the dataset.
1350 collections : Any, optional
1351 Collections to be searched, overriding ``self.collections``.
1352 Can be any of the types supported by the ``collections`` argument
1353 to butler construction.
1354 storageClass : `StorageClass` or `str`, optional
1355 The storage class to be used to override the Python type
1356 returned by this method. By default the returned type matches
1357 the dataset type definition for this dataset. Specifying a
1358 read `StorageClass` can force a different type to be returned.
1359 This type must be compatible with the original type.
1360 **kwargs
1361 Additional keyword arguments used to augment or construct a
1362 `DataId`. See `DataId` parameters.
1364 Returns
1365 -------
1366 obj : `DeferredDatasetHandle`
1367 A handle which can be used to retrieve a dataset at a later time.
1369 Raises
1370 ------
1371 LookupError
1372 Raised if no matching dataset exists in the `Registry`.
1374 ValueError
1375 Raised if a resolved `DatasetRef` was passed as an input, but it
1376 differs from the one found in the registry.
1377 TypeError
1378 Raised if no collections were provided.
1379 """
1380 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1381 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
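# A small sketch of deferred retrieval with the method above: the registry
# lookup happens immediately, but the datastore read is delayed until the
# handle is used (dataset type and data ID values are hypothetical):
#
#     handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=20)
#     ...                       # do other work; nothing has been read yet
#     image = handle.get()      # datastore I/O happens here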
1383 def get(
1384 self,
1385 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1386 /,
1387 dataId: Optional[DataId] = None,
1388 *,
1389 parameters: Optional[Dict[str, Any]] = None,
1390 collections: Any = None,
1391 storageClass: Optional[Union[StorageClass, str]] = None,
1392 **kwargs: Any,
1393 ) -> Any:
1394 """Retrieve a stored dataset.
1396 Parameters
1397 ----------
1398 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1399 When `DatasetRef` the `dataId` should be `None`.
1400 Otherwise the `DatasetType` or name thereof.
1401 If a resolved `DatasetRef`, the associated dataset
1402 is returned directly without additional querying.
1403 dataId : `dict` or `DataCoordinate`
1404 A `dict` of `Dimension` link name, value pairs that label the
1405 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1406 should be provided as the first argument.
1407 parameters : `dict`
1408 Additional StorageClass-defined options to control reading,
1409 typically used to efficiently read only a subset of the dataset.
1410 collections : Any, optional
1411 Collections to be searched, overriding ``self.collections``.
1412 Can be any of the types supported by the ``collections`` argument
1413 to butler construction.
1414 storageClass : `StorageClass` or `str`, optional
1415 The storage class to be used to override the Python type
1416 returned by this method. By default the returned type matches
1417 the dataset type definition for this dataset. Specifying a
1418 read `StorageClass` can force a different type to be returned.
1419 This type must be compatible with the original type.
1420 **kwargs
1421 Additional keyword arguments used to augment or construct a
1422 `DataCoordinate`. See `DataCoordinate.standardize`
1423 parameters.
1425 Returns
1426 -------
1427 obj : `object`
1428 The dataset.
1430 Raises
1431 ------
1432 LookupError
1433 Raised if no matching dataset exists in the `Registry`.
1434 TypeError
1435 Raised if no collections were provided.
1437 Notes
1438 -----
1439 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1440 this method requires that the given data ID include temporal dimensions
1441 beyond the dimensions of the dataset type itself, in order to find the
1442 dataset with the appropriate validity range. For example, a "bias"
1443 dataset with native dimensions ``{instrument, detector}`` could be
1444 fetched with a ``{instrument, detector, exposure}`` data ID, because
1445 ``exposure`` is a temporal dimension.
1446 """
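# A sketch of the calibration lookup described in the Notes above: the data
# ID carries an exposure purely to select the validity range (dataset type,
# collection, and data ID values are hypothetical):
#
#     bias = butler.get(
#         "bias",
#         instrument="HSC",
#         detector=20,
#         exposure=903334,
#         collections="HSC/calib",
#     )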
1447 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1448 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1449 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1451 def getURIs(
1452 self,
1453 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1454 /,
1455 dataId: Optional[DataId] = None,
1456 *,
1457 predict: bool = False,
1458 collections: Any = None,
1459 run: Optional[str] = None,
1460 **kwargs: Any,
1461 ) -> DatasetRefURIs:
1462 """Return the URIs associated with the dataset.
1464 Parameters
1465 ----------
1466 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1467 When `DatasetRef` the `dataId` should be `None`.
1468 Otherwise the `DatasetType` or name thereof.
1469 dataId : `dict` or `DataCoordinate`
1470 A `dict` of `Dimension` link name, value pairs that label the
1471 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1472 should be provided as the first argument.
1473 predict : `bool`
1474 If `True`, allow URIs to be returned of datasets that have not
1475 been written.
1476 collections : Any, optional
1477 Collections to be searched, overriding ``self.collections``.
1478 Can be any of the types supported by the ``collections`` argument
1479 to butler construction.
1480 run : `str`, optional
1481 Run to use for predictions, overriding ``self.run``.
1482 **kwargs
1483 Additional keyword arguments used to augment or construct a
1484 `DataCoordinate`. See `DataCoordinate.standardize`
1485 parameters.
1487 Returns
1488 -------
1489 uris : `DatasetRefURIs`
1490 The URI to the primary artifact associated with this dataset (if
1491 the dataset was disassembled within the datastore this may be
1492 `None`), and the URIs to any components associated with the dataset
1493 artifact (this can be empty if there are no components).
1494 """
1495 ref = self._findDatasetRef(
1496 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1497 )
1498 if ref.id is None: # only possible if predict is True
1499 if run is None:
1500 run = self.run
1501 if run is None:
1502 raise TypeError("Cannot predict location with run=None.")
1503 # Lie about ID, because we can't guess it, and only
1504 # Datastore.getURIs() will ever see it (and it doesn't use it).
1505 with warnings.catch_warnings():
1506 warnings.simplefilter("ignore", category=UnresolvedRefWarning)
1507 ref = ref.resolved(id=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), run=run)
1508 return self.datastore.getURIs(ref, predict)
1510 def getURI(
1511 self,
1512 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1513 /,
1514 dataId: Optional[DataId] = None,
1515 *,
1516 predict: bool = False,
1517 collections: Any = None,
1518 run: Optional[str] = None,
1519 **kwargs: Any,
1520 ) -> ResourcePath:
1521 """Return the URI to the Dataset.
1523 Parameters
1524 ----------
1525 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1526 When `DatasetRef` the `dataId` should be `None`.
1527 Otherwise the `DatasetType` or name thereof.
1528 dataId : `dict` or `DataCoordinate`
1529 A `dict` of `Dimension` link name, value pairs that label the
1530 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1531 should be provided as the first argument.
1532 predict : `bool`, optional
1533 If `True`, allow URIs to be returned for datasets that have not
1534 yet been written.
1535 collections : Any, optional
1536 Collections to be searched, overriding ``self.collections``.
1537 Can be any of the types supported by the ``collections`` argument
1538 to butler construction.
1539 run : `str`, optional
1540 Run to use for predictions, overriding ``self.run``.
1541 **kwargs
1542 Additional keyword arguments used to augment or construct a
1543 `DataCoordinate`. See `DataCoordinate.standardize`
1544 parameters.
1546 Returns
1547 -------
1548 uri : `lsst.resources.ResourcePath`
1549 URI pointing to the Dataset within the datastore. If the
1550 Dataset does not exist in the datastore, and if ``predict`` is
1551 `True`, the URI will be a prediction and will include a URI
1552 fragment "#predicted".
1553 If the datastore does not have entities that relate well
1554 to the concept of a URI, the returned URI string will be
1555 descriptive. The returned URI is not guaranteed to be obtainable.
1557 Raises
1558 ------
1559 LookupError
1560 Raised if a URI has been requested for a dataset that does not
1561 exist and guessing is not allowed.
1562 ValueError
1563 Raised if a resolved `DatasetRef` was passed as an input, but it
1564 differs from the one found in the registry.
1565 TypeError
1566 Raised if no collections were provided.
1567 RuntimeError
1568 Raised if a URI is requested for a dataset that consists of
1569 multiple artifacts.
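Examples
--------
A minimal sketch, given a `Butler` instance ``butler``; the dataset
type, data ID, and collection names are hypothetical::
    uri = butler.getURI(
        "raw", instrument="HSC", detector=10, exposure=903334,
        collections="HSC/raw/all"
    )
    print(uri.geturl())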
1570 """
1571 primary, components = self.getURIs(
1572 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1573 )
1575 if primary is None or components:
1576 raise RuntimeError(
1577 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1578 "Use Butler.getURIs() instead."
1579 )
1580 return primary
1582 def retrieveArtifacts(
1583 self,
1584 refs: Iterable[DatasetRef],
1585 destination: ResourcePathExpression,
1586 transfer: str = "auto",
1587 preserve_path: bool = True,
1588 overwrite: bool = False,
1589 ) -> List[ResourcePath]:
1590 """Retrieve the artifacts associated with the supplied refs.
1592 Parameters
1593 ----------
1594 refs : iterable of `DatasetRef`
1595 The datasets for which artifacts are to be retrieved.
1596 A single ref can result in multiple artifacts. The refs must
1597 be resolved.
1598 destination : `lsst.resources.ResourcePath` or `str`
1599 Location to write the artifacts.
1600 transfer : `str`, optional
1601 Method to use to transfer the artifacts. Must be one of the options
1602 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1603 "move" is not allowed.
1604 preserve_path : `bool`, optional
1605 If `True` the full path of the artifact within the datastore
1606 is preserved. If `False` the final file component of the path
1607 is used.
1608 overwrite : `bool`, optional
1609 If `True` allow transfers to overwrite existing files at the
1610 destination.
1612 Returns
1613 -------
1614 targets : `list` of `lsst.resources.ResourcePath`
1615 URIs of file artifacts in destination location. Order is not
1616 preserved.
1618 Notes
1619 -----
1620 For non-file datastores the artifacts written to the destination
1621 may not match the representation inside the datastore. For example
1622 a hierarchical data structure in a NoSQL database may well be stored
1623 as a JSON file.
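Examples
--------
A sketch of copying all artifacts for a query result out of the
datastore, given a `Butler` instance ``butler``; the dataset type,
collection, and destination are hypothetical::
    refs = butler.registry.queryDatasets("raw", collections="HSC/raw/all")
    paths = butler.retrieveArtifacts(refs, destination="/tmp/extracted",
                                     transfer="copy")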
1624 """
1625 return self.datastore.retrieveArtifacts(
1626 refs,
1627 ResourcePath(destination),
1628 transfer=transfer,
1629 preserve_path=preserve_path,
1630 overwrite=overwrite,
1631 )
1633 def datasetExists(
1634 self,
1635 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1636 dataId: Optional[DataId] = None,
1637 *,
1638 collections: Any = None,
1639 **kwargs: Any,
1640 ) -> bool:
1641 """Return True if the Dataset is actually present in the Datastore.
1643 Parameters
1644 ----------
1645 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1646 When `DatasetRef` the `dataId` should be `None`.
1647 Otherwise the `DatasetType` or name thereof.
1648 dataId : `dict` or `DataCoordinate`
1649 A `dict` of `Dimension` link name, value pairs that label the
1650 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1651 should be provided as the first argument.
1652 collections : Any, optional
1653 Collections to be searched, overriding ``self.collections``.
1654 Can be any of the types supported by the ``collections`` argument
1655 to butler construction.
1656 **kwargs
1657 Additional keyword arguments used to augment or construct a
1658 `DataCoordinate`. See `DataCoordinate.standardize`
1659 parameters.
1661 Raises
1662 ------
1663 LookupError
1664 Raised if the dataset is not even present in the Registry.
1665 ValueError
1666 Raised if a resolved `DatasetRef` was passed as an input, but it
1667 differs from the one found in the registry.
1668 TypeError
1669 Raised if no collections were provided.
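Examples
--------
A minimal sketch, given a `Butler` instance ``butler``; the dataset
type, data ID, and collection names are hypothetical::
    known_to_datastore = butler.datasetExists(
        "raw", instrument="HSC", detector=10, exposure=903334,
        collections="HSC/raw/all"
    )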
1670 """
1671 # A resolved ref may be given that is not known to this butler.
1672 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1673 ref = self.registry.getDataset(datasetRefOrType.id)
1674 if ref is None:
1675 raise LookupError(
1676 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
1677 )
1678 else:
1679 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1680 return self.datastore.exists(ref)
1682 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1683 """Remove one or more `~CollectionType.RUN` collections and the
1684 datasets within them.
1686 Parameters
1687 ----------
1688 names : `Iterable` [ `str` ]
1689 The names of the collections to remove.
1690 unstore : `bool`, optional
1691 If `True` (default), delete datasets from all datastores in which
1692 they are present, and attempt to roll back the registry deletions if
1693 datastore deletions fail (which may not always be possible). If
1694 `False`, datastore records for these datasets are still removed,
1695 but any artifacts (e.g. files) will not be.
1697 Raises
1698 ------
1699 TypeError
1700 Raised if one or more collections are not of type
1701 `~CollectionType.RUN`.
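Examples
--------
A sketch of deleting a scratch run and its file artifacts; the
repository path and run name are hypothetical::
    butler = Butler("/path/to/repo", writeable=True)
    butler.removeRuns(["u/someone/scratch"], unstore=True)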
1702 """
1703 if not self.isWriteable():
1704 raise TypeError("Butler is read-only.")
1705 names = list(names)
1706 refs: List[DatasetRef] = []
1707 for name in names:
1708 collectionType = self.registry.getCollectionType(name)
1709 if collectionType is not CollectionType.RUN:
1710 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1711 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1712 with self.datastore.transaction():
1713 with self.registry.transaction():
1714 if unstore:
1715 self.datastore.trash(refs)
1716 else:
1717 self.datastore.forget(refs)
1718 for name in names:
1719 self.registry.removeCollection(name)
1720 if unstore:
1721 # Point of no return for removing artifacts
1722 self.datastore.emptyTrash()
1724 def pruneDatasets(
1725 self,
1726 refs: Iterable[DatasetRef],
1727 *,
1728 disassociate: bool = True,
1729 unstore: bool = False,
1730 tags: Iterable[str] = (),
1731 purge: bool = False,
1732 ) -> None:
1733 # docstring inherited from LimitedButler
1735 if not self.isWriteable():
1736 raise TypeError("Butler is read-only.")
1737 if purge:
1738 if not disassociate:
1739 raise TypeError("Cannot pass purge=True without disassociate=True.")
1740 if not unstore:
1741 raise TypeError("Cannot pass purge=True without unstore=True.")
1742 elif disassociate:
1743 tags = tuple(tags)
1744 if not tags:
1745 raise TypeError("No tags provided but disassociate=True.")
1746 for tag in tags:
1747 collectionType = self.registry.getCollectionType(tag)
1748 if collectionType is not CollectionType.TAGGED:
1749 raise TypeError(
1750 f"Cannot disassociate from collection '{tag}' "
1751 f"of non-TAGGED type {collectionType.name}."
1752 )
1753 # For an execution butler we want to keep existing UUIDs for the
1754 # datasets, for that we need to keep them in the collections but
1755 # remove from datastore.
1756 if self._allow_put_of_predefined_dataset and purge:
1757 purge = False
1758 disassociate = False
1759 # Transform possibly-single-pass iterable into something we can iterate
1760 # over multiple times.
1761 refs = list(refs)
1762 # Pruning a component of a DatasetRef makes no sense since registry
1763 # doesn't know about components and datastore might not store
1764 # components in a separate file
1765 for ref in refs:
1766 if ref.datasetType.component():
1767 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1768 # We don't need an unreliable Datastore transaction for this, because
1769 # we've been extra careful to ensure that Datastore.trash only involves
1770 # mutating the Registry (it can _look_ at Datastore-specific things,
1771 # but shouldn't change them), and hence all operations here are
1772 # Registry operations.
1773 with self.datastore.transaction():
1774 with self.registry.transaction():
1775 if unstore:
1776 self.datastore.trash(refs)
1777 if purge:
1778 self.registry.removeDatasets(refs)
1779 elif disassociate:
1780 assert tags, "Guaranteed by earlier logic in this function."
1781 for tag in tags:
1782 self.registry.disassociate(tag, refs)
1783 # We've exited the Registry transaction, and apparently committed.
1784 # (if there was an exception, everything rolled back, and it's as if
1785 # nothing happened - and we never get here).
1786 # Datastore artifacts are not yet gone, but they're clearly marked
1787 # as trash, so if we fail to delete now because of (e.g.) filesystem
1788 # problems we can try again later, and if manual administrative
1789 # intervention is required, it's pretty clear what that should entail:
1790 # deleting everything on disk and in private Datastore tables that is
1791 # in the dataset_location_trash table.
1792 if unstore:
1793 # Point of no return for removing artifacts
1794 self.datastore.emptyTrash()
1796 @transactional
1797 def ingest(
1798 self,
1799 *datasets: FileDataset,
1800 transfer: Optional[str] = "auto",
1801 run: Optional[str] = None,
1802 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1803 record_validation_info: bool = True,
1804 ) -> None:
1805 """Store and register one or more datasets that already exist on disk.
1807 Parameters
1808 ----------
1809 datasets : `FileDataset`
1810 Each positional argument is a struct containing information about
1811 a file to be ingested, including its URI (either absolute or
1812 relative to the datastore root, if applicable), a resolved
1813 `DatasetRef`, and optionally a formatter class or its
1814 fully-qualified string name. If a formatter is not provided, the
1815 formatter that would be used for `put` is assumed. On successful
1816 ingest all `FileDataset.formatter` attributes will be set to the
1817 formatter class used. `FileDataset.path` attributes may be modified
1818 to put paths in whatever the datastore considers a standardized
1819 form.
1820 transfer : `str`, optional
1821 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1822 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1823 transfer the file.
1824 run : `str`, optional
1825 The name of the run ingested datasets should be added to,
1826 overriding ``self.run``. This parameter is now deprecated since
1827 the run is encoded in the ``FileDataset``.
1828 idGenerationMode : `DatasetIdGenEnum`, optional
1829 Specifies option for generating dataset IDs. By default unique IDs
1830 are generated for each inserted dataset.
1831 record_validation_info : `bool`, optional
1832 If `True`, the default, the datastore can record validation
1833 information associated with the file. If `False` the datastore
1834 will not attempt to track any information such as checksums
1835 or file sizes. This can be useful if such information is tracked
1836 in an external system or if the file is to be compressed in place.
1837 It is up to the datastore whether this parameter is relevant.
1839 Raises
1840 ------
1841 TypeError
1842 Raised if the butler is read-only or if no run was provided.
1843 NotImplementedError
1844 Raised if the `Datastore` does not support the given transfer mode.
1845 DatasetTypeNotSupportedError
1846 Raised if one or more files to be ingested have a dataset type that
1847 is not supported by the `Datastore`.
1848 FileNotFoundError
1849 Raised if one of the given files does not exist.
1850 FileExistsError
1851 Raised if transfer is not `None` but the (internal) location the
1852 file would be moved to is already occupied.
1854 Notes
1855 -----
1856 This operation is not fully exception safe: if a database operation
1857 fails, the given `FileDataset` instances may be only partially updated.
1859 It is atomic in terms of database operations (they will either all
1860 succeed or all fail) provided the database engine implements
1861 transactions correctly. It will attempt to be atomic in terms of
1862 filesystem operations as well, but this cannot be implemented
1863 rigorously for most datastores.
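Examples
--------
A sketch only, given a writeable `Butler` instance ``butler``; it
assumes ``ref`` is a resolved `DatasetRef` describing the file being
ingested (for example one reconstructed from an export) and that the
file path is valid::
    from lsst.daf.butler import FileDataset
    dataset = FileDataset(path="/data/HSC-903334-10.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy")
    # On success the datastore updates dataset.formatter and may
    # standardize dataset.path.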
1864 """
1865 if not self.isWriteable():
1866 raise TypeError("Butler is read-only.")
1868 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1869 if not datasets:
1870 return
1872 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1874 # We need to reorganize all the inputs so that they are grouped
1875 # by dataset type and run. Multiple refs in a single FileDataset
1876 # are required to share the run and dataset type.
1877 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]]
1878 groupedData: GroupedData = defaultdict(list)
1880 # Track DataIDs that are being ingested so we can spot issues early
1881 # with duplication. Retain previous FileDataset so we can report it.
1882 groupedDataIds: MutableMapping[
1883 tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
1884 ] = defaultdict(dict)
1886 logged_resolving = False
1887 used_run = False
1888 default_run = run or self.run
1890 # And the nested loop that populates it:
1891 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1892 # Somewhere to store pre-existing refs if we have an
1893 # execution butler.
1894 existingRefs: List[DatasetRef] = []
1896 # Any newly-resolved refs.
1897 resolvedRefs: list[DatasetRef] = []
1899 for ref in dataset.refs:
1900 if ref.id is None:
1901 # Eventually this will be impossible. For now we must
1902 # resolve this ref.
1903 if default_run is None:
1904 raise ValueError("Unresolved DatasetRef used for ingest but no run specified.")
1905 expanded_dataId = self.registry.expandDataId(ref.dataId)
1906 if not logged_resolving:
1907 log.info("ingest() given unresolved refs. Resolving them into run %r", default_run)
1908 logged_resolving = True
1909 resolved = DatasetIdFactory().resolveRef(ref, default_run, idGenerationMode)
1910 ref = resolved.expanded(expanded_dataId)
1911 resolvedRefs.append(ref)
1912 used_run = True
1914 assert ref.run is not None # For mypy
1915 group_key = (ref.datasetType, ref.run)
1917 if ref.dataId in groupedDataIds[group_key]:
1918 raise ConflictingDefinitionError(
1919 f"Ingest conflict. Dataset {dataset.path} has same"
1920 " DataId as other ingest dataset"
1921 f" {groupedDataIds[group_key][ref.dataId].path} "
1922 f" ({ref.dataId})"
1923 )
1924 if self._allow_put_of_predefined_dataset:
1925 existing_ref = self.registry.findDataset(
1926 ref.datasetType, dataId=ref.dataId, collections=ref.run
1927 )
1928 if existing_ref:
1929 if existing_ref.id != ref.id:
1930 raise ConflictingDefinitionError(
1931 f"Registry has registered dataset {existing_ref!r} which has differing ID "
1932 f"from that being ingested ({ref!r})."
1933 )
1934 if self.datastore.knows(existing_ref):
1935 raise ConflictingDefinitionError(
1936 f"Dataset associated with path {dataset.path}"
1937 f" already exists as {existing_ref}."
1938 )
1939 # Datastore will need expanded data coordinate
1940 # so this has to be attached to the FileDataset
1941 # if necessary.
1942 if not ref.dataId.hasRecords():
1943 expanded_dataId = self.registry.expandDataId(ref.dataId)
1944 existing_ref = existing_ref.expanded(expanded_dataId)
1945 else:
1946 # Both refs are identical but we want to
1947 # keep the expanded one.
1948 existing_ref = ref
1950 # Store this ref elsewhere since it already exists
1951 # and we do not want to remake it but we do want
1952 # to store it in the datastore.
1953 existingRefs.append(existing_ref)
1955 # Nothing else to do until we have finished
1956 # iterating.
1957 continue
1959 groupedDataIds[group_key][ref.dataId] = dataset
1961 if existingRefs:
1962 if len(dataset.refs) != len(existingRefs):
1963 # Keeping track of partially pre-existing datasets is hard
1964 # and should generally never happen. For now don't allow
1965 # it.
1966 raise ConflictingDefinitionError(
1967 f"For dataset {dataset.path} some dataIds already exist"
1968 " in registry but others do not. This is not supported."
1969 )
1971 # Store expanded form in the original FileDataset.
1972 dataset.refs = existingRefs
1973 elif resolvedRefs:
1974 if len(dataset.refs) != len(resolvedRefs):
1975 raise ConflictingDefinitionError(
1976 f"For dataset {dataset.path} some DatasetRef were "
1977 "resolved and others were not. This is not supported."
1978 )
1979 dataset.refs = resolvedRefs
1981 # These datasets have to be registered.
1982 self.registry._importDatasets(resolvedRefs)
1983 else:
1984 groupedData[group_key].append(dataset)
1986 if not used_run and run is not None:
1987 warnings.warn(
1988 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the "
1989 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.",
1990 category=FutureWarning,
1991 stacklevel=3, # Take into account the @transactional decorator.
1992 )
1994 # Now we can bulk-insert into Registry for each DatasetType.
1995 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1996 groupedData.items(), desc="Bulk-inserting datasets by type"
1997 ):
1998 refs_to_import = []
1999 for dataset in grouped_datasets:
2000 refs_to_import.extend(dataset.refs)
2002 n_refs = len(refs_to_import)
2003 log.verbose(
2004 "Importing %d ref%s of dataset type %r into run %r",
2005 n_refs,
2006 "" if n_refs == 1 else "s",
2007 datasetType.name,
2008 this_run,
2009 )
2011 # Import the refs and expand the DataCoordinates since we can't
2012 # guarantee that they are expanded and Datastore will need
2013 # the records.
2014 imported_refs = self.registry._importDatasets(refs_to_import, expand=True)
2015 assert set(imported_refs) == set(refs_to_import)
2017 # Replace all the refs in the FileDataset with expanded versions.
2018 # Pull them off in the order we put them on the list.
2019 for dataset in grouped_datasets:
2020 n_dataset_refs = len(dataset.refs)
2021 dataset.refs = imported_refs[:n_dataset_refs]
2022 del imported_refs[:n_dataset_refs]
2024 # Bulk-insert everything into Datastore.
2025 # We do not know if any of the registry entries already existed
2026 # (_importDatasets only complains if they exist but differ) so
2027 # we have to catch IntegrityError explicitly.
2028 try:
2029 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
2030 except IntegrityError as e:
2031 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}")
2033 @contextlib.contextmanager
2034 def export(
2035 self,
2036 *,
2037 directory: Optional[str] = None,
2038 filename: Optional[str] = None,
2039 format: Optional[str] = None,
2040 transfer: Optional[str] = None,
2041 ) -> Iterator[RepoExportContext]:
2042 """Export datasets from the repository represented by this `Butler`.
2044 This method is a context manager that returns a helper object
2045 (`RepoExportContext`) that is used to indicate what information from
2046 the repository should be exported.
2048 Parameters
2049 ----------
2050 directory : `str`, optional
2051 Directory dataset files should be written to if ``transfer`` is not
2052 `None`.
2053 filename : `str`, optional
2054 Name for the file that will include database information associated
2055 with the exported datasets. If this is not an absolute path and
2056 ``directory`` is not `None`, it will be written to ``directory``
2057 instead of the current working directory. Defaults to
2058 "export.{format}".
2059 format : `str`, optional
2060 File format for the database information file. If `None`, the
2061 extension of ``filename`` will be used.
2062 transfer : `str`, optional
2063 Transfer mode passed to `Datastore.export`.
2065 Raises
2066 ------
2067 TypeError
2068 Raised if the set of arguments passed is inconsistent.
2070 Examples
2071 --------
2072 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
2073 methods are used to provide the iterables over data IDs and/or datasets
2074 to be exported::
2076 with butler.export("exports.yaml") as export:
2077 # Export all flats, but none of the dimension element rows
2078 # (i.e. data ID information) associated with them.
2079 export.saveDatasets(butler.registry.queryDatasets("flat"),
2080 elements=())
2081 # Export all datasets that start with "deepCoadd_" and all of
2082 # their associated data ID information.
2083 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
2084 """
2085 if directory is None and transfer is not None:
2086 raise TypeError("Cannot transfer without providing a directory.")
2087 if transfer == "move":
2088 raise TypeError("Transfer may not be 'move': export is read-only")
2089 if format is None:
2090 if filename is None:
2091 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2092 else:
2093 _, format = os.path.splitext(filename)
2094 if not format:
2095 raise ValueError("Please specify a file extension to determine export format.")
2096 format = format[1:] # Strip leading "."
2097 elif filename is None:
2098 filename = f"export.{format}"
2099 if directory is not None:
2100 filename = os.path.join(directory, filename)
2101 formats = self._config["repo_transfer_formats"]
2102 if format not in formats:
2103 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
2104 BackendClass = get_class_of(formats[format, "export"])
2105 with open(filename, "w") as stream:
2106 backend = BackendClass(stream, universe=self.registry.dimensions)
2107 try:
2108 helper = RepoExportContext(
2109 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2110 )
2111 yield helper
2112 except BaseException:
2113 raise
2114 else:
2115 helper._finish()
2117 def import_(
2118 self,
2119 *,
2120 directory: Optional[ResourcePathExpression] = None,
2121 filename: Union[ResourcePathExpression, TextIO, None] = None,
2122 format: Optional[str] = None,
2123 transfer: Optional[str] = None,
2124 skip_dimensions: Optional[Set] = None,
2125 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2126 reuseIds: bool = False,
2127 ) -> None:
2128 """Import datasets into this repository that were exported from a
2129 different butler repository via `~lsst.daf.butler.Butler.export`.
2131 Parameters
2132 ----------
2133 directory : `~lsst.resources.ResourcePathExpression`, optional
2134 Directory containing dataset files to import from. If `None`,
2135 ``filename`` and all dataset file paths specified therein must
2136 be absolute.
2137 filename : `~lsst.resources.ResourcePathExpression` or `TextIO`
2138 A stream or name of file that contains database information
2139 associated with the exported datasets, typically generated by
2140 `~lsst.daf.butler.Butler.export`. If this is a string (name) or
2141 `~lsst.resources.ResourcePath` and is not an absolute path,
2142 it will first be looked for relative to ``directory`` and if not
2143 found there it will be looked for in the current working
2144 directory. Defaults to "export.{format}".
2145 format : `str`, optional
2146 File format for ``filename``. If `None`, the extension of
2147 ``filename`` will be used.
2148 transfer : `str`, optional
2149 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2150 skip_dimensions : `set`, optional
2151 Names of dimensions that should be skipped and not imported.
2152 idGenerationMode : `DatasetIdGenEnum`, optional
2153 Specifies option for generating dataset IDs when IDs are not
2154 provided or their type does not match backend type. By default
2155 unique IDs are generated for each inserted dataset.
2156 reuseIds : `bool`, optional
2157 If `True` then force re-use of imported dataset IDs for integer
2158 IDs, which are normally generated as auto-incremented; an
2159 exception will be raised if imported IDs clash with existing ones.
2160 This option has no effect on globally-unique IDs, which are
2161 always re-used (or generated if integer IDs are being imported).
2163 Raises
2164 ------
2165 TypeError
2166 Raised if the set of arguments passed is inconsistent, or if the
2167 butler is read-only.
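Examples
--------
A sketch of importing a previously exported repository subset; the
repository path, directory, and file names are hypothetical::
    butler = Butler("/path/to/repo", writeable=True)
    butler.import_(directory="/path/to/exports", filename="export.yaml",
                   transfer="symlink")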
2168 """
2169 if not self.isWriteable():
2170 raise TypeError("Butler is read-only.")
2171 if format is None:
2172 if filename is None:
2173 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2174 else:
2175 _, format = os.path.splitext(filename) # type: ignore
2176 elif filename is None:
2177 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
2178 if directory is not None:
2179 directory = ResourcePath(directory, forceDirectory=True)
2180 # mypy doesn't think this will work but it does in python >= 3.10.
2181 if isinstance(filename, ResourcePathExpression): # type: ignore
2182 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
2183 if not filename.isabs() and directory is not None:
2184 potential = directory.join(filename)
2185 exists_in_cwd = filename.exists()
2186 exists_in_dir = potential.exists()
2187 if exists_in_cwd and exists_in_dir:
2188 log.warning(
2189 "A relative path for filename was specified (%s) which exists relative to cwd. "
2190 "Additionally, the file exists relative to the given search directory (%s). "
2191 "Using the export file in the given directory.",
2192 filename,
2193 potential,
2194 )
2195 # Given they specified an explicit directory and that
2196 # directory has the export file in it, assume that that
2197 # is what was meant despite the file in cwd.
2198 filename = potential
2199 elif exists_in_dir:
2200 filename = potential
2201 elif not exists_in_cwd and not exists_in_dir:
2202 # Raise early.
2203 raise FileNotFoundError(
2204 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
2205 )
2206 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2208 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
2209 backend = BackendClass(importStream, self.registry)
2210 backend.register()
2211 with self.transaction():
2212 backend.load(
2213 self.datastore,
2214 directory=directory,
2215 transfer=transfer,
2216 skip_dimensions=skip_dimensions,
2217 idGenerationMode=idGenerationMode,
2218 reuseIds=reuseIds,
2219 )
2221 if isinstance(filename, ResourcePath):
2222 # We can not use open() here at the moment because of
2223 # DM-38589 since yaml does stream.read(8192) in a loop.
2224 stream = io.StringIO(filename.read().decode())
2225 doImport(stream)
2226 else:
2227 doImport(filename) # type: ignore
2229 def transfer_from(
2230 self,
2231 source_butler: LimitedButler,
2232 source_refs: Iterable[DatasetRef],
2233 transfer: str = "auto",
2234 skip_missing: bool = True,
2235 register_dataset_types: bool = False,
2236 transfer_dimensions: bool = False,
2237 ) -> collections.abc.Collection[DatasetRef]:
2238 """Transfer datasets to this Butler from a run in another Butler.
2240 Parameters
2241 ----------
2242 source_butler : `LimitedButler`
2243 Butler from which the datasets are to be transferred. If data IDs
2244 in ``source_refs`` are not expanded then this has to be a full
2245 `Butler` whose registry will be used to expand data IDs.
2246 source_refs : iterable of `DatasetRef`
2247 Datasets defined in the source butler that should be transferred to
2248 this butler.
2249 transfer : `str`, optional
2250 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2251 skip_missing : `bool`
2252 If `True`, datasets with no datastore artifact associated with
2253 them are not transferred. If `False` a registry entry will be
2254 created even if no datastore record is created (and so will
2255 look equivalent to the dataset being unstored).
2256 register_dataset_types : `bool`
2257 If `True` any missing dataset types are registered. Otherwise
2258 an exception is raised.
2259 transfer_dimensions : `bool`, optional
2260 If `True`, dimension record data associated with the new datasets
2261 will be transferred.
2263 Returns
2264 -------
2265 refs : `list` of `DatasetRef`
2266 The refs added to this Butler.
2268 Notes
2269 -----
2270 The datastore artifact has to exist for a transfer
2271 to be made but non-existence is not an error.
2273 Datasets that already exist in this run will be skipped.
2275 The datasets are imported as part of a transaction, although
2276 dataset types are registered before the transaction is started.
2277 This means that it is possible for a dataset type to be registered
2278 even though transfer has failed.
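Examples
--------
A sketch of copying the results of a query from one repository to
another; the repository paths, dataset type, and collection are
hypothetical::
    source = Butler("/path/to/source-repo")
    target = Butler("/path/to/target-repo", writeable=True)
    refs = source.registry.queryDatasets("calexp",
                                         collections="HSC/runs/test")
    target.transfer_from(source, refs, transfer="copy",
                         register_dataset_types=True,
                         transfer_dimensions=True)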
2279 """
2280 if not self.isWriteable():
2281 raise TypeError("Butler is read-only.")
2282 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2284 # Will iterate through the refs multiple times so need to convert
2285 # to a list if this isn't a collection.
2286 if not isinstance(source_refs, collections.abc.Collection):
2287 source_refs = list(source_refs)
2289 original_count = len(source_refs)
2290 log.info("Transferring %d datasets into %s", original_count, str(self))
2292 # In some situations the datastore artifact may be missing
2293 # and we do not want that registry entry to be imported.
2294 # Asking datastore is not sufficient, the records may have been
2295 # purged, we have to ask for the (predicted) URI and check
2296 # existence explicitly. Execution butler is set up exactly like
2297 # this with no datastore records.
2298 artifact_existence: Dict[ResourcePath, bool] = {}
2299 if skip_missing:
2300 dataset_existence = source_butler.datastore.mexists(
2301 source_refs, artifact_existence=artifact_existence
2302 )
2303 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2304 filtered_count = len(source_refs)
2305 n_missing = original_count - filtered_count
2306 log.verbose(
2307 "%d dataset%s removed because the artifact does not exist. Now have %d.",
2308 n_missing,
2309 "" if n_missing == 1 else "s",
2310 filtered_count,
2311 )
2313 # Importing requires that we group the refs by dataset type and run
2314 # before doing the import.
2315 source_dataset_types = set()
2316 grouped_refs = defaultdict(list)
2317 for ref in source_refs:
2318 grouped_refs[ref.datasetType, ref.run].append(ref)
2319 source_dataset_types.add(ref.datasetType)
2321 # Check to see if the dataset type in the source butler has
2322 # the same definition in the target butler and register missing
2323 # ones if requested. Registration must happen outside a transaction.
2324 newly_registered_dataset_types = set()
2325 for datasetType in source_dataset_types:
2326 if register_dataset_types:
2327 # Let this raise immediately if inconsistent. Continuing
2328 # on to find additional inconsistent dataset types
2329 # might result in additional unwanted dataset types being
2330 # registered.
2331 if self.registry.registerDatasetType(datasetType):
2332 newly_registered_dataset_types.add(datasetType)
2333 else:
2334 # If the dataset type is missing, let it fail immediately.
2335 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2336 if target_dataset_type != datasetType:
2337 raise ConflictingDefinitionError(
2338 "Source butler dataset type differs from definition"
2339 f" in target butler: {datasetType} !="
2340 f" {target_dataset_type}"
2341 )
2342 if newly_registered_dataset_types:
2343 # We may have registered some even if there were inconsistencies
2344 # but should let people know (or else remove them again).
2345 log.log(
2346 VERBOSE,
2347 "Registered the following dataset types in the target Butler: %s",
2348 ", ".join(d.name for d in newly_registered_dataset_types),
2349 )
2350 else:
2351 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2353 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2354 if transfer_dimensions:
2355 # Collect all the dimension records for these refs.
2356 # All dimensions are to be copied but the list of valid dimensions
2357 # come from this butler's universe.
2358 elements = frozenset(
2359 element
2360 for element in self.registry.dimensions.getStaticElements()
2361 if element.hasTable() and element.viewOf is None
2362 )
2363 dataIds = set(ref.dataId for ref in source_refs)
2364 # This logic comes from saveDataIds.
2365 for dataId in dataIds:
2366 # Need an expanded record, if not expanded that we need a full
2367 # butler with registry (allow mocks with registry too).
2368 if not dataId.hasRecords():
2369 if registry := getattr(source_butler, "registry", None):
2370 dataId = registry.expandDataId(dataId)
2371 else:
2372 raise TypeError("Input butler needs to be a full butler to expand DataId.")
2373 # If this butler doesn't know about a dimension in the source
2374 # butler things will break later.
2375 for record in dataId.records.values():
2376 if record is not None and record.definition in elements:
2377 dimension_records[record.definition].setdefault(record.dataId, record)
2379 handled_collections: Set[str] = set()
2381 # Do all the importing in a single transaction.
2382 with self.transaction():
2383 if dimension_records:
2384 log.verbose("Ensuring that dimension records exist for transferred datasets.")
2385 for element, r in dimension_records.items():
2386 records = [r[dataId] for dataId in r]
2387 # Assume that if the record is already present that we can
2388 # use it without having to check that the record metadata
2389 # is consistent.
2390 self.registry.insertDimensionData(element, *records, skip_existing=True)
2392 n_imported = 0
2393 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2394 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2395 ):
2396 if run not in handled_collections:
2397 # May need to create output collection. If source butler
2398 # has a registry, ask for documentation string.
2399 run_doc = None
2400 if registry := getattr(source_butler, "registry", None):
2401 run_doc = registry.getCollectionDocumentation(run)
2402 registered = self.registry.registerRun(run, doc=run_doc)
2403 handled_collections.add(run)
2404 if registered:
2405 log.log(VERBOSE, "Creating output run %s", run)
2407 n_refs = len(refs_to_import)
2408 log.verbose(
2409 "Importing %d ref%s of dataset type %s into run %s",
2410 n_refs,
2411 "" if n_refs == 1 else "s",
2412 datasetType.name,
2413 run,
2414 )
2416 # Assume we are using UUIDs and the source refs will match
2417 # those imported.
2418 imported_refs = self.registry._importDatasets(refs_to_import, expand=False)
2419 assert set(imported_refs) == set(refs_to_import)
2420 n_imported += len(imported_refs)
2422 assert len(source_refs) == n_imported
2423 log.verbose("Imported %d datasets into destination butler", n_imported)
2425 # Ask the datastore to transfer. The datastore has to check that
2426 # the source datastore is compatible with the target datastore.
2427 accepted, rejected = self.datastore.transfer_from(
2428 source_butler.datastore,
2429 source_refs,
2430 transfer=transfer,
2431 artifact_existence=artifact_existence,
2432 )
2433 if rejected:
2434 # For now, accept the registry entries but not the files.
2435 log.warning(
2436 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
2437 len(rejected),
2438 len(accepted),
2439 datasetType,
2440 run,
2441 )
2443 return source_refs
2445 def validateConfiguration(
2446 self,
2447 logFailures: bool = False,
2448 datasetTypeNames: Optional[Iterable[str]] = None,
2449 ignore: Iterable[str] | None = None,
2450 ) -> None:
2451 """Validate butler configuration.
2453 Checks that each `DatasetType` can be stored in the `Datastore`.
2455 Parameters
2456 ----------
2457 logFailures : `bool`, optional
2458 If `True`, output a log message for every validation error
2459 detected.
2460 datasetTypeNames : iterable of `str`, optional
2461 The `DatasetType` names that should be checked. This allows
2462 only a subset to be selected.
2463 ignore : iterable of `str`, optional
2464 Names of DatasetTypes to skip over. This can be used to skip
2465 known problems. If a named `DatasetType` corresponds to a
2466 composite, all components of that `DatasetType` will also be
2467 ignored.
2469 Raises
2470 ------
2471 ButlerValidationError
2472 Raised if there is some inconsistency with how this Butler
2473 is configured.
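Examples
--------
A minimal sketch, given a `Butler` instance ``butler``, that checks
every registered dataset type and logs each problem found::
    butler.validateConfiguration(logFailures=True)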
2474 """
2475 if datasetTypeNames:
2476 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2477 else:
2478 datasetTypes = list(self.registry.queryDatasetTypes())
2480 # filter out anything from the ignore list
2481 if ignore:
2482 ignore = set(ignore)
2483 datasetTypes = [
2484 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2485 ]
2486 else:
2487 ignore = set()
2489 # Find all the registered instruments
2490 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2492 # For each datasetType that has an instrument dimension, create
2493 # a DatasetRef for each defined instrument
2494 datasetRefs = []
2496 for datasetType in datasetTypes:
2497 if "instrument" in datasetType.dimensions:
2498 for instrument in instruments:
2499 datasetRef = DatasetRef(
2500 datasetType,
2501 {"instrument": instrument}, # type: ignore
2502 conform=False,
2503 run="validate",
2504 )
2505 datasetRefs.append(datasetRef)
2507 entities: List[Union[DatasetType, DatasetRef]] = []
2508 entities.extend(datasetTypes)
2509 entities.extend(datasetRefs)
2511 datastoreErrorStr = None
2512 try:
2513 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2514 except ValidationError as e:
2515 datastoreErrorStr = str(e)
2517 # Also check that the LookupKeys used by the datastores match
2518 # registry and storage class definitions
2519 keys = self.datastore.getLookupKeys()
2521 failedNames = set()
2522 failedDataId = set()
2523 for key in keys:
2524 if key.name is not None:
2525 if key.name in ignore:
2526 continue
2528 # skip if specific datasetType names were requested and this
2529 # name does not match
2530 if datasetTypeNames and key.name not in datasetTypeNames:
2531 continue
2533 # See if it is a StorageClass or a DatasetType
2534 if key.name in self.storageClasses:
2535 pass
2536 else:
2537 try:
2538 self.registry.getDatasetType(key.name)
2539 except KeyError:
2540 if logFailures:
2541 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2542 failedNames.add(key)
2543 else:
2544 # Dimensions are checked for consistency when the Butler
2545 # is created and rendezvoused with a universe.
2546 pass
2548 # Check that the instrument is a valid instrument
2549 # Currently only support instrument so check for that
2550 if key.dataId:
2551 dataIdKeys = set(key.dataId)
2552 if set(["instrument"]) != dataIdKeys:
2553 if logFailures:
2554 log.critical("Key '%s' has unsupported DataId override", key)
2555 failedDataId.add(key)
2556 elif key.dataId["instrument"] not in instruments:
2557 if logFailures:
2558 log.critical("Key '%s' has unknown instrument", key)
2559 failedDataId.add(key)
2561 messages = []
2563 if datastoreErrorStr:
2564 messages.append(datastoreErrorStr)
2566 for failed, msg in (
2567 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2568 (failedDataId, "Keys with bad DataId entries: "),
2569 ):
2570 if failed:
2571 msg += ", ".join(str(k) for k in failed)
2572 messages.append(msg)
2574 if messages:
2575 raise ValidationError(";\n".join(messages))
2577 @property
2578 def collections(self) -> Sequence[str]:
2579 """The collections to search by default, in order
2580 (`Sequence` [ `str` ]).
2582 This is an alias for ``self.registry.defaults.collections``. It cannot
2583 be set directly in isolation, but all defaults may be changed together
2584 by assigning a new `RegistryDefaults` instance to
2585 ``self.registry.defaults``.
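For example, the defaults may be replaced wholesale; the collection and
run names here are hypothetical::
    from lsst.daf.butler.registry import RegistryDefaults
    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/someone/scratch"
    )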
2586 """
2587 return self.registry.defaults.collections
2589 @property
2590 def run(self) -> Optional[str]:
2591 """Name of the run this butler writes outputs to by default (`str` or
2592 `None`).
2594 This is an alias for ``self.registry.defaults.run``. It cannot be set
2595 directly in isolation, but all defaults may be changed together by
2596 assigning a new `RegistryDefaults` instance to
2597 ``self.registry.defaults``.
2598 """
2599 return self.registry.defaults.run
2601 @property
2602 def dimensions(self) -> DimensionUniverse:
2603 # Docstring inherited.
2604 return self.registry.dimensions
2606 registry: Registry
2607 """The object that manages dataset metadata and relationships (`Registry`).
2609 Most operations that don't involve reading or writing butler datasets are
2610 accessible only via `Registry` methods.
2611 """
2613 datastore: Datastore
2614 """The object that manages actual dataset storage (`Datastore`).
2616 Direct user access to the datastore should rarely be necessary; the primary
2617 exception is the case where a `Datastore` implementation provides extra
2618 functionality beyond what the base class defines.
2619 """
2621 storageClasses: StorageClassFactory
2622 """An object that maps known storage class names to objects that fully
2623 describe them (`StorageClassFactory`).
2624 """
2626 _allow_put_of_predefined_dataset: bool
2627 """Allow a put to succeed even if there is already a registry entry for it
2628 but not a datastore record. (`bool`)."""