Coverage for python/lsst/daf/butler/_butler.py: 8%
699 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-03-23 02:06 -0700
« prev ^ index » next coverage.py v6.5.0, created at 2023-03-23 02:06 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41import uuid
42from collections import defaultdict
43from typing import (
44 Any,
45 ClassVar,
46 Counter,
47 Dict,
48 Iterable,
49 Iterator,
50 List,
51 MutableMapping,
52 Optional,
53 Sequence,
54 Set,
55 TextIO,
56 Tuple,
57 Type,
58 Union,
59)
61from lsst.resources import ResourcePath, ResourcePathExpression
62from lsst.utils import doImportType
63from lsst.utils.introspection import get_class_of
64from lsst.utils.logging import VERBOSE, getLogger
66from ._butlerConfig import ButlerConfig
67from ._butlerRepoIndex import ButlerRepoIndex
68from ._deferredDatasetHandle import DeferredDatasetHandle
69from ._limited_butler import LimitedButler
70from .core import (
71 AmbiguousDatasetError,
72 Config,
73 ConfigSubset,
74 DataCoordinate,
75 DataId,
76 DataIdValue,
77 DatasetRef,
78 DatasetRefURIs,
79 DatasetType,
80 Datastore,
81 Dimension,
82 DimensionConfig,
83 DimensionElement,
84 DimensionRecord,
85 DimensionUniverse,
86 FileDataset,
87 Progress,
88 StorageClass,
89 StorageClassFactory,
90 Timespan,
91 ValidationError,
92)
93from .core.repoRelocation import BUTLER_ROOT_TAG
94from .core.utils import transactional
95from .registry import (
96 CollectionType,
97 ConflictingDefinitionError,
98 DataIdError,
99 DatasetIdGenEnum,
100 MissingDatasetTypeError,
101 Registry,
102 RegistryConfig,
103 RegistryDefaults,
104)
105from .transfers import RepoExportContext
107log = getLogger(__name__)
110class ButlerValidationError(ValidationError):
111 """There is a problem with the Butler configuration."""
113 pass
116class PruneCollectionsArgsError(TypeError):
117 """Base class for errors relating to Butler.pruneCollections input
118 arguments.
119 """
121 pass
124class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
125 """Raised when purge and unstore are both required to be True, and
126 purge is True but unstore is False.
127 """
129 def __init__(self) -> None:
130 super().__init__("Cannot pass purge=True without unstore=True.")
133class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
134 """Raised when pruning a RUN collection but purge is False."""
136 def __init__(self, collectionType: CollectionType):
137 self.collectionType = collectionType
138 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
141class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
142 """Raised when purge is True but is not supported for the given
143 collection."""
145 def __init__(self, collectionType: CollectionType):
146 self.collectionType = collectionType
147 super().__init__(
148 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
149 )
152class Butler(LimitedButler):
153 """Main entry point for the data access system.
155 Parameters
156 ----------
157 config : `ButlerConfig`, `Config` or `str`, optional.
158 Configuration. Anything acceptable to the
159 `ButlerConfig` constructor. If a directory path
160 is given the configuration will be read from a ``butler.yaml`` file in
161 that location. If `None` is given default values will be used.
162 butler : `Butler`, optional.
163 If provided, construct a new Butler that uses the same registry and
164 datastore as the given one, but with the given collection and run.
165 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
166 arguments.
167 collections : `str` or `Iterable` [ `str` ], optional
168 An expression specifying the collections to be searched (in order) when
169 reading datasets.
170 This may be a `str` collection name or an iterable thereof.
171 See :ref:`daf_butler_collection_expressions` for more information.
172 These collections are not registered automatically and must be
173 manually registered before they are used by any method, but they may be
174 manually registered after the `Butler` is initialized.
175 run : `str`, optional
176 Name of the `~CollectionType.RUN` collection new datasets should be
177 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
178 ``collections`` will be set to ``[run]``. If not `None`, this
179 collection will automatically be registered. If this is not set (and
180 ``writeable`` is not set either), a read-only butler will be created.
181 searchPaths : `list` of `str`, optional
182 Directory paths to search when calculating the full Butler
183 configuration. Not used if the supplied config is already a
184 `ButlerConfig`.
185 writeable : `bool`, optional
186 Explicitly sets whether the butler supports write operations. If not
187 provided, a read-write butler is created if any of ``run``, ``tags``,
188 or ``chains`` is non-empty.
189 inferDefaults : `bool`, optional
190 If `True` (default) infer default data ID values from the values
191 present in the datasets in ``collections``: if all collections have the
192 same value (or no value) for a governor dimension, that value will be
193 the default for that dimension. Nonexistent collections are ignored.
194 If a default value is provided explicitly for a governor dimension via
195 ``**kwargs``, no default will be inferred for that dimension.
196 **kwargs : `str`
197 Default data ID key-value pairs. These may only identify "governor"
198 dimensions like ``instrument`` and ``skymap``.
200 Examples
201 --------
202 While there are many ways to control exactly how a `Butler` interacts with
203 the collections in its `Registry`, the most common cases are still simple.
205 For a read-only `Butler` that searches one collection, do::
207 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
209 For a read-write `Butler` that writes to and reads from a
210 `~CollectionType.RUN` collection::
212 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
214 The `Butler` passed to a ``PipelineTask`` is often much more complex,
215 because we want to write to one `~CollectionType.RUN` collection but read
216 from several others (as well)::
218 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
219 collections=["u/alice/DM-50000/a",
220 "u/bob/DM-49998",
221 "HSC/defaults"])
223 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
224 Datasets will be read first from that run (since it appears first in the
225 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
227 Finally, one can always create a `Butler` with no collections::
229 butler = Butler("/path/to/repo", writeable=True)
231 This can be extremely useful when you just want to use ``butler.registry``,
232 e.g. for inserting dimension data or managing collections, or when the
233 collections you want to use with the butler are not consistent.
234 Passing ``writeable`` explicitly here is only necessary if you want to be
235 able to make changes to the repo - usually the value for ``writeable`` can
236 be guessed from the collection arguments provided, but it defaults to
237 `False` when there are not collection arguments.
238 """
240 def __init__(
241 self,
242 config: Union[Config, str, None] = None,
243 *,
244 butler: Optional[Butler] = None,
245 collections: Any = None,
246 run: Optional[str] = None,
247 searchPaths: Optional[List[str]] = None,
248 writeable: Optional[bool] = None,
249 inferDefaults: bool = True,
250 **kwargs: str,
251 ):
252 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
253 # Load registry, datastore, etc. from config or existing butler.
254 if butler is not None:
255 if config is not None or searchPaths is not None or writeable is not None:
256 raise TypeError(
257 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
258 )
259 self.registry = butler.registry.copy(defaults)
260 self.datastore = butler.datastore
261 self.storageClasses = butler.storageClasses
262 self._config: ButlerConfig = butler._config
263 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
264 else:
265 # Can only look for strings in the known repos list.
266 if isinstance(config, str) and config in self.get_known_repos():
267 config = str(self.get_repo_uri(config))
268 try:
269 self._config = ButlerConfig(config, searchPaths=searchPaths)
270 except FileNotFoundError as e:
271 if known := self.get_known_repos():
272 aliases = f"(known aliases: {', '.join(known)})"
273 else:
274 aliases = "(no known aliases)"
275 raise FileNotFoundError(f"{e} {aliases}") from e
276 self._config = ButlerConfig(config, searchPaths=searchPaths)
277 try:
278 if "root" in self._config:
279 butlerRoot = self._config["root"]
280 else:
281 butlerRoot = self._config.configDir
282 if writeable is None:
283 writeable = run is not None
284 self.registry = Registry.fromConfig(
285 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
286 )
287 self.datastore = Datastore.fromConfig(
288 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
289 )
290 self.storageClasses = StorageClassFactory()
291 self.storageClasses.addFromConfig(self._config)
292 self._allow_put_of_predefined_dataset = self._config.get(
293 "allow_put_of_predefined_dataset", False
294 )
295 except Exception:
296 # Failures here usually mean that configuration is incomplete,
297 # just issue an error message which includes config file URI.
298 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
299 raise
301 # For execution butler the datastore needs a special
302 # dependency-inversion trick. This is not used by regular butler,
303 # but we do not have a way to distinguish regular butler from execution
304 # butler.
305 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
307 if "run" in self._config or "collection" in self._config:
308 raise ValueError("Passing a run or collection via configuration is no longer supported.")
310 GENERATION: ClassVar[int] = 3
311 """This is a Generation 3 Butler.
313 This attribute may be removed in the future, once the Generation 2 Butler
314 interface has been fully retired; it should only be used in transitional
315 code.
316 """
318 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
319 """Return DatasetType defined in registry given dataset type name."""
320 try:
321 return self.registry.getDatasetType(name)
322 except MissingDatasetTypeError:
323 return None
325 @classmethod
326 def get_repo_uri(cls, label: str) -> ResourcePath:
327 """Look up the label in a butler repository index.
329 Parameters
330 ----------
331 label : `str`
332 Label of the Butler repository to look up.
334 Returns
335 -------
336 uri : `lsst.resources.ResourcePath`
337 URI to the Butler repository associated with the given label.
339 Raises
340 ------
341 KeyError
342 Raised if the label is not found in the index, or if an index
343 can not be found at all.
345 Notes
346 -----
347 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
348 information is discovered.
349 """
350 return ButlerRepoIndex.get_repo_uri(label)
352 @classmethod
353 def get_known_repos(cls) -> Set[str]:
354 """Retrieve the list of known repository labels.
356 Returns
357 -------
358 repos : `set` of `str`
359 All the known labels. Can be empty if no index can be found.
361 Notes
362 -----
363 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
364 information is discovered.
365 """
366 return ButlerRepoIndex.get_known_repos()
368 @staticmethod
369 def makeRepo(
370 root: ResourcePathExpression,
371 config: Union[Config, str, None] = None,
372 dimensionConfig: Union[Config, str, None] = None,
373 standalone: bool = False,
374 searchPaths: Optional[List[str]] = None,
375 forceConfigRoot: bool = True,
376 outfile: Optional[ResourcePathExpression] = None,
377 overwrite: bool = False,
378 ) -> Config:
379 """Create an empty data repository by adding a butler.yaml config
380 to a repository root directory.
382 Parameters
383 ----------
384 root : `lsst.resources.ResourcePathExpression`
385 Path or URI to the root location of the new repository. Will be
386 created if it does not exist.
387 config : `Config` or `str`, optional
388 Configuration to write to the repository, after setting any
389 root-dependent Registry or Datastore config options. Can not
390 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
391 configuration will be used. Root-dependent config options
392 specified in this config are overwritten if ``forceConfigRoot``
393 is `True`.
394 dimensionConfig : `Config` or `str`, optional
395 Configuration for dimensions, will be used to initialize registry
396 database.
397 standalone : `bool`
398 If True, write all expanded defaults, not just customized or
399 repository-specific settings.
400 This (mostly) decouples the repository from the default
401 configuration, insulating it from changes to the defaults (which
402 may be good or bad, depending on the nature of the changes).
403 Future *additions* to the defaults will still be picked up when
404 initializing `Butlers` to repos created with ``standalone=True``.
405 searchPaths : `list` of `str`, optional
406 Directory paths to search when calculating the full butler
407 configuration.
408 forceConfigRoot : `bool`, optional
409 If `False`, any values present in the supplied ``config`` that
410 would normally be reset are not overridden and will appear
411 directly in the output config. This allows non-standard overrides
412 of the root directory for a datastore or registry to be given.
413 If this parameter is `True` the values for ``root`` will be
414 forced into the resulting config if appropriate.
415 outfile : `lss.resources.ResourcePathExpression`, optional
416 If not-`None`, the output configuration will be written to this
417 location rather than into the repository itself. Can be a URI
418 string. Can refer to a directory that will be used to write
419 ``butler.yaml``.
420 overwrite : `bool`, optional
421 Create a new configuration file even if one already exists
422 in the specified output location. Default is to raise
423 an exception.
425 Returns
426 -------
427 config : `Config`
428 The updated `Config` instance written to the repo.
430 Raises
431 ------
432 ValueError
433 Raised if a ButlerConfig or ConfigSubset is passed instead of a
434 regular Config (as these subclasses would make it impossible to
435 support ``standalone=False``).
436 FileExistsError
437 Raised if the output config file already exists.
438 os.error
439 Raised if the directory does not exist, exists but is not a
440 directory, or cannot be created.
442 Notes
443 -----
444 Note that when ``standalone=False`` (the default), the configuration
445 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
446 construct the repository should also be used to construct any Butlers
447 to avoid configuration inconsistencies.
448 """
449 if isinstance(config, (ButlerConfig, ConfigSubset)):
450 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
452 # Ensure that the root of the repository exists or can be made
453 root_uri = ResourcePath(root, forceDirectory=True)
454 root_uri.mkdir()
456 config = Config(config)
458 # If we are creating a new repo from scratch with relative roots,
459 # do not propagate an explicit root from the config file
460 if "root" in config:
461 del config["root"]
463 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
464 imported_class = doImportType(full["datastore", "cls"])
465 if not issubclass(imported_class, Datastore):
466 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
467 datastoreClass: Type[Datastore] = imported_class
468 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
470 # if key exists in given config, parse it, otherwise parse the defaults
471 # in the expanded config
472 if config.get(("registry", "db")):
473 registryConfig = RegistryConfig(config)
474 else:
475 registryConfig = RegistryConfig(full)
476 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
477 if defaultDatabaseUri is not None:
478 Config.updateParameters(
479 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
480 )
481 else:
482 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
484 if standalone:
485 config.merge(full)
486 else:
487 # Always expand the registry.managers section into the per-repo
488 # config, because after the database schema is created, it's not
489 # allowed to change anymore. Note that in the standalone=True
490 # branch, _everything_ in the config is expanded, so there's no
491 # need to special case this.
492 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
493 configURI: ResourcePathExpression
494 if outfile is not None:
495 # When writing to a separate location we must include
496 # the root of the butler repo in the config else it won't know
497 # where to look.
498 config["root"] = root_uri.geturl()
499 configURI = outfile
500 else:
501 configURI = root_uri
502 # Strip obscore configuration, if it is present, before writing config
503 # to a file, obscore config will be stored in registry.
504 config_to_write = config
505 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
506 config_to_write = config.copy()
507 del config_to_write[obscore_config_key]
508 config_to_write.dumpToUri(configURI, overwrite=overwrite)
510 # Create Registry and populate tables
511 registryConfig = RegistryConfig(config.get("registry"))
512 dimensionConfig = DimensionConfig(dimensionConfig)
513 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
515 log.verbose("Wrote new Butler configuration file to %s", configURI)
517 return config
519 @classmethod
520 def _unpickle(
521 cls,
522 config: ButlerConfig,
523 collections: Optional[tuple[str, ...]],
524 run: Optional[str],
525 defaultDataId: Dict[str, str],
526 writeable: bool,
527 ) -> Butler:
528 """Callable used to unpickle a Butler.
530 We prefer not to use ``Butler.__init__`` directly so we can force some
531 of its many arguments to be keyword-only (note that ``__reduce__``
532 can only invoke callables with positional arguments).
534 Parameters
535 ----------
536 config : `ButlerConfig`
537 Butler configuration, already coerced into a true `ButlerConfig`
538 instance (and hence after any search paths for overrides have been
539 utilized).
540 collections : `tuple` [ `str` ]
541 Names of the default collections to read from.
542 run : `str`, optional
543 Name of the default `~CollectionType.RUN` collection to write to.
544 defaultDataId : `dict` [ `str`, `str` ]
545 Default data ID values.
546 writeable : `bool`
547 Whether the Butler should support write operations.
549 Returns
550 -------
551 butler : `Butler`
552 A new `Butler` instance.
553 """
554 # MyPy doesn't recognize that the kwargs below are totally valid; it
555 # seems to think '**defaultDataId* is a _positional_ argument!
556 return cls(
557 config=config,
558 collections=collections,
559 run=run,
560 writeable=writeable,
561 **defaultDataId, # type: ignore
562 )
564 def __reduce__(self) -> tuple:
565 """Support pickling."""
566 return (
567 Butler._unpickle,
568 (
569 self._config,
570 self.collections,
571 self.run,
572 self.registry.defaults.dataId.byName(),
573 self.registry.isWriteable(),
574 ),
575 )
577 def __str__(self) -> str:
578 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
579 self.collections, self.run, self.datastore, self.registry
580 )
582 def isWriteable(self) -> bool:
583 """Return `True` if this `Butler` supports write operations."""
584 return self.registry.isWriteable()
586 @contextlib.contextmanager
587 def transaction(self) -> Iterator[None]:
588 """Context manager supporting `Butler` transactions.
590 Transactions can be nested.
591 """
592 with self.registry.transaction():
593 with self.datastore.transaction():
594 yield
596 def _standardizeArgs(
597 self,
598 datasetRefOrType: Union[DatasetRef, DatasetType, str],
599 dataId: Optional[DataId] = None,
600 for_put: bool = True,
601 **kwargs: Any,
602 ) -> Tuple[DatasetType, Optional[DataId]]:
603 """Standardize the arguments passed to several Butler APIs.
605 Parameters
606 ----------
607 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
608 When `DatasetRef` the `dataId` should be `None`.
609 Otherwise the `DatasetType` or name thereof.
610 dataId : `dict` or `DataCoordinate`
611 A `dict` of `Dimension` link name, value pairs that label the
612 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
613 should be provided as the second argument.
614 for_put : `bool`, optional
615 If `True` this call is invoked as part of a `Butler.put()`.
616 Otherwise it is assumed to be part of a `Butler.get()`. This
617 parameter is only relevant if there is dataset type
618 inconsistency.
619 **kwargs
620 Additional keyword arguments used to augment or construct a
621 `DataCoordinate`. See `DataCoordinate.standardize`
622 parameters.
624 Returns
625 -------
626 datasetType : `DatasetType`
627 A `DatasetType` instance extracted from ``datasetRefOrType``.
628 dataId : `dict` or `DataId`, optional
629 Argument that can be used (along with ``kwargs``) to construct a
630 `DataId`.
632 Notes
633 -----
634 Butler APIs that conceptually need a DatasetRef also allow passing a
635 `DatasetType` (or the name of one) and a `DataId` (or a dict and
636 keyword arguments that can be used to construct one) separately. This
637 method accepts those arguments and always returns a true `DatasetType`
638 and a `DataId` or `dict`.
640 Standardization of `dict` vs `DataId` is best handled by passing the
641 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
642 generally similarly flexible.
643 """
644 externalDatasetType: Optional[DatasetType] = None
645 internalDatasetType: Optional[DatasetType] = None
646 if isinstance(datasetRefOrType, DatasetRef):
647 if dataId is not None or kwargs:
648 raise ValueError("DatasetRef given, cannot use dataId as well")
649 externalDatasetType = datasetRefOrType.datasetType
650 dataId = datasetRefOrType.dataId
651 else:
652 # Don't check whether DataId is provided, because Registry APIs
653 # can usually construct a better error message when it wasn't.
654 if isinstance(datasetRefOrType, DatasetType):
655 externalDatasetType = datasetRefOrType
656 else:
657 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
659 # Check that they are self-consistent
660 if externalDatasetType is not None:
661 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
662 if externalDatasetType != internalDatasetType:
663 # We can allow differences if they are compatible, depending
664 # on whether this is a get or a put. A get requires that
665 # the python type associated with the datastore can be
666 # converted to the user type. A put requires that the user
667 # supplied python type can be converted to the internal
668 # type expected by registry.
669 relevantDatasetType = internalDatasetType
670 if for_put:
671 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
672 else:
673 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
674 relevantDatasetType = externalDatasetType
675 if not is_compatible:
676 raise ValueError(
677 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
678 f"registry definition ({internalDatasetType})"
679 )
680 # Override the internal definition.
681 internalDatasetType = relevantDatasetType
683 assert internalDatasetType is not None
684 return internalDatasetType, dataId
686 def _rewrite_data_id(
687 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
688 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
689 """Rewrite a data ID taking into account dimension records.
691 Take a Data ID and keyword args and rewrite it if necessary to
692 allow the user to specify dimension records rather than dimension
693 primary values.
695 This allows a user to include a dataId dict with keys of
696 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
697 the integer exposure ID. It also allows a string to be given
698 for a dimension value rather than the integer ID if that is more
699 convenient. For example, rather than having to specifyin the
700 detector with ``detector.full_name``, a string given for ``detector``
701 will be interpreted as the full name and converted to the integer
702 value.
704 Keyword arguments can also use strings for dimensions like detector
705 and exposure but python does not allow them to include ``.`` and
706 so the ``exposure.day_obs`` syntax can not be used in a keyword
707 argument.
709 Parameters
710 ----------
711 dataId : `dict` or `DataCoordinate`
712 A `dict` of `Dimension` link name, value pairs that will label the
713 `DatasetRef` within a Collection.
714 datasetType : `DatasetType`
715 The dataset type associated with this dataId. Required to
716 determine the relevant dimensions.
717 **kwargs
718 Additional keyword arguments used to augment or construct a
719 `DataId`. See `DataId` parameters.
721 Returns
722 -------
723 dataId : `dict` or `DataCoordinate`
724 The, possibly rewritten, dataId. If given a `DataCoordinate` and
725 no keyword arguments, the original dataId will be returned
726 unchanged.
727 **kwargs : `dict`
728 Any unused keyword arguments (would normally be empty dict).
729 """
730 # Do nothing if we have a standalone DataCoordinate.
731 if isinstance(dataId, DataCoordinate) and not kwargs:
732 return dataId, kwargs
734 # Process dimension records that are using record information
735 # rather than ids
736 newDataId: Dict[str, DataIdValue] = {}
737 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
739 # if all the dataId comes from keyword parameters we do not need
740 # to do anything here because they can't be of the form
741 # exposure.obs_id because a "." is not allowed in a keyword parameter.
742 if dataId:
743 for k, v in dataId.items():
744 # If we have a Dimension we do not need to do anything
745 # because it cannot be a compound key.
746 if isinstance(k, str) and "." in k:
747 # Someone is using a more human-readable dataId
748 dimensionName, record = k.split(".", 1)
749 byRecord[dimensionName][record] = v
750 elif isinstance(k, Dimension):
751 newDataId[k.name] = v
752 else:
753 newDataId[k] = v
755 # Go through the updated dataId and check the type in case someone is
756 # using an alternate key. We have already filtered out the compound
757 # keys dimensions.record format.
758 not_dimensions = {}
760 # Will need to look in the dataId and the keyword arguments
761 # and will remove them if they need to be fixed or are unrecognized.
762 for dataIdDict in (newDataId, kwargs):
763 # Use a list so we can adjust the dict safely in the loop
764 for dimensionName in list(dataIdDict):
765 value = dataIdDict[dimensionName]
766 try:
767 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
768 except KeyError:
769 # This is not a real dimension
770 not_dimensions[dimensionName] = value
771 del dataIdDict[dimensionName]
772 continue
774 # Convert an integral type to an explicit int to simplify
775 # comparisons here
776 if isinstance(value, numbers.Integral):
777 value = int(value)
779 if not isinstance(value, dimension.primaryKey.getPythonType()):
780 for alternate in dimension.alternateKeys:
781 if isinstance(value, alternate.getPythonType()):
782 byRecord[dimensionName][alternate.name] = value
783 del dataIdDict[dimensionName]
784 log.debug(
785 "Converting dimension %s to %s.%s=%s",
786 dimensionName,
787 dimensionName,
788 alternate.name,
789 value,
790 )
791 break
792 else:
793 log.warning(
794 "Type mismatch found for value '%r' provided for dimension %s. "
795 "Could not find matching alternative (primary key has type %s) "
796 "so attempting to use as-is.",
797 value,
798 dimensionName,
799 dimension.primaryKey.getPythonType(),
800 )
802 # By this point kwargs and newDataId should only include valid
803 # dimensions. Merge kwargs in to the new dataId and log if there
804 # are dimensions in both (rather than calling update).
805 for k, v in kwargs.items():
806 if k in newDataId and newDataId[k] != v:
807 log.debug(
808 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
809 )
810 newDataId[k] = v
811 # No need to retain any values in kwargs now.
812 kwargs = {}
814 # If we have some unrecognized dimensions we have to try to connect
815 # them to records in other dimensions. This is made more complicated
816 # by some dimensions having records with clashing names. A mitigation
817 # is that we can tell by this point which dimensions are missing
818 # for the DatasetType but this does not work for calibrations
819 # where additional dimensions can be used to constrain the temporal
820 # axis.
821 if not_dimensions:
822 # Search for all dimensions even if we have been given a value
823 # explicitly. In some cases records are given as well as the
824 # actually dimension and this should not be an error if they
825 # match.
826 mandatoryDimensions = datasetType.dimensions.names # - provided
828 candidateDimensions: Set[str] = set()
829 candidateDimensions.update(mandatoryDimensions)
831 # For calibrations we may well be needing temporal dimensions
832 # so rather than always including all dimensions in the scan
833 # restrict things a little. It is still possible for there
834 # to be confusion over day_obs in visit vs exposure for example.
835 # If we are not searching calibration collections things may
836 # fail but they are going to fail anyway because of the
837 # ambiguousness of the dataId...
838 if datasetType.isCalibration():
839 for dim in self.registry.dimensions.getStaticDimensions():
840 if dim.temporal:
841 candidateDimensions.add(str(dim))
843 # Look up table for the first association with a dimension
844 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
846 # Keep track of whether an item is associated with multiple
847 # dimensions.
848 counter: Counter[str] = Counter()
849 assigned: Dict[str, Set[str]] = defaultdict(set)
851 # Go through the missing dimensions and associate the
852 # given names with records within those dimensions
853 matched_dims = set()
854 for dimensionName in candidateDimensions:
855 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
856 fields = dimension.metadata.names | dimension.uniqueKeys.names
857 for field in not_dimensions:
858 if field in fields:
859 guessedAssociation[dimensionName][field] = not_dimensions[field]
860 counter[dimensionName] += 1
861 assigned[field].add(dimensionName)
862 matched_dims.add(field)
864 # Calculate the fields that matched nothing.
865 never_found = set(not_dimensions) - matched_dims
867 if never_found:
868 raise ValueError(f"Unrecognized keyword args given: {never_found}")
870 # There is a chance we have allocated a single dataId item
871 # to multiple dimensions. Need to decide which should be retained.
872 # For now assume that the most popular alternative wins.
873 # This means that day_obs with seq_num will result in
874 # exposure.day_obs and not visit.day_obs
875 # Also prefer an explicitly missing dimension over an inferred
876 # temporal dimension.
877 for fieldName, assignedDimensions in assigned.items():
878 if len(assignedDimensions) > 1:
879 # Pick the most popular (preferring mandatory dimensions)
880 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
881 if requiredButMissing:
882 candidateDimensions = requiredButMissing
883 else:
884 candidateDimensions = assignedDimensions
886 # If this is a choice between visit and exposure and
887 # neither was a required part of the dataset type,
888 # (hence in this branch) always prefer exposure over
889 # visit since exposures are always defined and visits
890 # are defined from exposures.
891 if candidateDimensions == {"exposure", "visit"}:
892 candidateDimensions = {"exposure"}
894 # Select the relevant items and get a new restricted
895 # counter.
896 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
897 duplicatesCounter: Counter[str] = Counter()
898 duplicatesCounter.update(theseCounts)
900 # Choose the most common. If they are equally common
901 # we will pick the one that was found first.
902 # Returns a list of tuples
903 selected = duplicatesCounter.most_common(1)[0][0]
905 log.debug(
906 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
907 " Removed ambiguity by choosing dimension %s.",
908 fieldName,
909 ", ".join(assignedDimensions),
910 selected,
911 )
913 for candidateDimension in assignedDimensions:
914 if candidateDimension != selected:
915 del guessedAssociation[candidateDimension][fieldName]
917 # Update the record look up dict with the new associations
918 for dimensionName, values in guessedAssociation.items():
919 if values: # A dict might now be empty
920 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
921 byRecord[dimensionName].update(values)
923 if byRecord:
924 # Some record specifiers were found so we need to convert
925 # them to the Id form
926 for dimensionName, values in byRecord.items():
927 if dimensionName in newDataId:
928 log.debug(
929 "DataId specified explicit %s dimension value of %s in addition to"
930 " general record specifiers for it of %s. Ignoring record information.",
931 dimensionName,
932 newDataId[dimensionName],
933 str(values),
934 )
935 # Get the actual record and compare with these values.
936 try:
937 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
938 except DataIdError:
939 raise ValueError(
940 f"Could not find dimension '{dimensionName}'"
941 f" with dataId {newDataId} as part of comparing with"
942 f" record values {byRecord[dimensionName]}"
943 ) from None
944 if len(recs) == 1:
945 errmsg: List[str] = []
946 for k, v in values.items():
947 if (recval := getattr(recs[0], k)) != v:
948 errmsg.append(f"{k}({recval} != {v})")
949 if errmsg:
950 raise ValueError(
951 f"Dimension {dimensionName} in dataId has explicit value"
952 " inconsistent with records: " + ", ".join(errmsg)
953 )
954 else:
955 # Multiple matches for an explicit dimension
956 # should never happen but let downstream complain.
957 pass
958 continue
960 # Build up a WHERE expression
961 bind = {k: v for k, v in values.items()}
962 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
964 # Hopefully we get a single record that matches
965 records = set(
966 self.registry.queryDimensionRecords(
967 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
968 )
969 )
971 if len(records) != 1:
972 if len(records) > 1:
973 # visit can have an ambiguous answer without involving
974 # visit_system. The default visit_system is defined
975 # by the instrument.
976 if (
977 dimensionName == "visit"
978 and "visit_system_membership" in self.registry.dimensions
979 and "visit_system" in self.registry.dimensions["instrument"].metadata
980 ):
981 instrument_records = list(
982 self.registry.queryDimensionRecords(
983 "instrument",
984 dataId=newDataId,
985 **kwargs,
986 )
987 )
988 if len(instrument_records) == 1:
989 visit_system = instrument_records[0].visit_system
990 if visit_system is None:
991 # Set to a value that will never match.
992 visit_system = -1
994 # Look up each visit in the
995 # visit_system_membership records.
996 for rec in records:
997 membership = list(
998 self.registry.queryDimensionRecords(
999 # Use bind to allow zero results.
1000 # This is a fully-specified query.
1001 "visit_system_membership",
1002 where="instrument = inst AND visit_system = system AND visit = v",
1003 bind=dict(
1004 inst=instrument_records[0].name, system=visit_system, v=rec.id
1005 ),
1006 )
1007 )
1008 if membership:
1009 # This record is the right answer.
1010 records = set([rec])
1011 break
1013 # The ambiguity may have been resolved so check again.
1014 if len(records) > 1:
1015 log.debug("Received %d records from constraints of %s", len(records), str(values))
1016 for r in records:
1017 log.debug("- %s", str(r))
1018 raise ValueError(
1019 f"DataId specification for dimension {dimensionName} is not"
1020 f" uniquely constrained to a single dataset by {values}."
1021 f" Got {len(records)} results."
1022 )
1023 else:
1024 raise ValueError(
1025 f"DataId specification for dimension {dimensionName} matched no"
1026 f" records when constrained by {values}"
1027 )
1029 # Get the primary key from the real dimension object
1030 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
1031 if not isinstance(dimension, Dimension):
1032 raise RuntimeError(
1033 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
1034 )
1035 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
1037 return newDataId, kwargs
1039 def _findDatasetRef(
1040 self,
1041 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1042 dataId: Optional[DataId] = None,
1043 *,
1044 collections: Any = None,
1045 allowUnresolved: bool = False,
1046 **kwargs: Any,
1047 ) -> DatasetRef:
1048 """Shared logic for methods that start with a search for a dataset in
1049 the registry.
1051 Parameters
1052 ----------
1053 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1054 When `DatasetRef` the `dataId` should be `None`.
1055 Otherwise the `DatasetType` or name thereof.
1056 dataId : `dict` or `DataCoordinate`, optional
1057 A `dict` of `Dimension` link name, value pairs that label the
1058 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1059 should be provided as the first argument.
1060 collections : Any, optional
1061 Collections to be searched, overriding ``self.collections``.
1062 Can be any of the types supported by the ``collections`` argument
1063 to butler construction.
1064 allowUnresolved : `bool`, optional
1065 If `True`, return an unresolved `DatasetRef` if finding a resolved
1066 one in the `Registry` fails. Defaults to `False`.
1067 **kwargs
1068 Additional keyword arguments used to augment or construct a
1069 `DataId`. See `DataId` parameters.
1071 Returns
1072 -------
1073 ref : `DatasetRef`
1074 A reference to the dataset identified by the given arguments.
1076 Raises
1077 ------
1078 LookupError
1079 Raised if no matching dataset exists in the `Registry` (and
1080 ``allowUnresolved is False``).
1081 ValueError
1082 Raised if a resolved `DatasetRef` was passed as an input, but it
1083 differs from the one found in the registry.
1084 TypeError
1085 Raised if no collections were provided.
1086 """
1087 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1088 if isinstance(datasetRefOrType, DatasetRef):
1089 idNumber = datasetRefOrType.id
1090 else:
1091 idNumber = None
1092 timespan: Optional[Timespan] = None
1094 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1096 if datasetType.isCalibration():
1097 # Because this is a calibration dataset, first try to make a
1098 # standardize the data ID without restricting the dimensions to
1099 # those of the dataset type requested, because there may be extra
1100 # dimensions that provide temporal information for a validity-range
1101 # lookup.
1102 dataId = DataCoordinate.standardize(
1103 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1104 )
1105 if dataId.graph.temporal:
1106 dataId = self.registry.expandDataId(dataId)
1107 timespan = dataId.timespan
1108 else:
1109 # Standardize the data ID to just the dimensions of the dataset
1110 # type instead of letting registry.findDataset do it, so we get the
1111 # result even if no dataset is found.
1112 dataId = DataCoordinate.standardize(
1113 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1114 )
1115 # Always lookup the DatasetRef, even if one is given, to ensure it is
1116 # present in the current collection.
1117 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1118 if ref is None:
1119 if allowUnresolved:
1120 return DatasetRef(datasetType, dataId)
1121 else:
1122 if collections is None:
1123 collections = self.registry.defaults.collections
1124 raise LookupError(
1125 f"Dataset {datasetType.name} with data ID {dataId} "
1126 f"could not be found in collections {collections}."
1127 )
1128 if idNumber is not None and idNumber != ref.id:
1129 if collections is None:
1130 collections = self.registry.defaults.collections
1131 raise ValueError(
1132 f"DatasetRef.id provided ({idNumber}) does not match "
1133 f"id ({ref.id}) in registry in collections {collections}."
1134 )
1135 if datasetType != ref.datasetType:
1136 # If they differ it is because the user explicitly specified
1137 # a compatible dataset type to this call rather than using the
1138 # registry definition. The DatasetRef must therefore be recreated
1139 # using the user definition such that the expected type is
1140 # returned.
1141 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1143 return ref
1145 @transactional
1146 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
1147 # Docstring inherited.
1148 (imported_ref,) = self.registry._importDatasets(
1149 [ref],
1150 expand=True,
1151 )
1152 if imported_ref.id != ref.getCheckedId():
1153 raise RuntimeError("This registry configuration does not support putDirect.")
1154 self.datastore.put(obj, ref)
1155 return ref
1157 @transactional
1158 def put(
1159 self,
1160 obj: Any,
1161 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1162 dataId: Optional[DataId] = None,
1163 *,
1164 run: Optional[str] = None,
1165 **kwargs: Any,
1166 ) -> DatasetRef:
1167 """Store and register a dataset.
1169 Parameters
1170 ----------
1171 obj : `object`
1172 The dataset.
1173 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1174 When `DatasetRef` is provided, ``dataId`` should be `None`.
1175 Otherwise the `DatasetType` or name thereof.
1176 dataId : `dict` or `DataCoordinate`
1177 A `dict` of `Dimension` link name, value pairs that label the
1178 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1179 should be provided as the second argument.
1180 run : `str`, optional
1181 The name of the run the dataset should be added to, overriding
1182 ``self.run``.
1183 **kwargs
1184 Additional keyword arguments used to augment or construct a
1185 `DataCoordinate`. See `DataCoordinate.standardize`
1186 parameters.
1188 Returns
1189 -------
1190 ref : `DatasetRef`
1191 A reference to the stored dataset, updated with the correct id if
1192 given.
1194 Raises
1195 ------
1196 TypeError
1197 Raised if the butler is read-only or if no run has been provided.
1198 """
1199 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1200 if not self.isWriteable():
1201 raise TypeError("Butler is read-only.")
1202 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1203 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1204 raise ValueError("DatasetRef must not be in registry, must have None id")
1206 # Handle dimension records in dataId
1207 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1209 # Add Registry Dataset entry.
1210 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1212 # For an execution butler the datasets will be pre-defined.
1213 # If the butler is configured that way datasets should only be inserted
1214 # if they do not already exist in registry. Trying and catching
1215 # ConflictingDefinitionError will not work because the transaction
1216 # will be corrupted. Instead, in this mode always check first.
1217 ref = None
1218 ref_is_predefined = False
1219 if self._allow_put_of_predefined_dataset:
1220 # Get the matching ref for this run.
1221 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1223 if ref:
1224 # Must be expanded form for datastore templating
1225 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1226 ref = ref.expanded(dataId)
1227 ref_is_predefined = True
1229 if not ref:
1230 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1232 # If the ref is predefined it is possible that the datastore also
1233 # has the record. Asking datastore to put it again will result in
1234 # the artifact being recreated, overwriting previous, then will cause
1235 # a failure in writing the record which will cause the artifact
1236 # to be removed. Much safer to ask first before attempting to
1237 # overwrite. Race conditions should not be an issue for the
1238 # execution butler environment.
1239 if ref_is_predefined:
1240 if self.datastore.knows(ref):
1241 raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.")
1243 self.datastore.put(obj, ref)
1245 return ref
1247 def getDirect(
1248 self,
1249 ref: DatasetRef,
1250 *,
1251 parameters: Optional[Dict[str, Any]] = None,
1252 storageClass: Optional[Union[StorageClass, str]] = None,
1253 ) -> Any:
1254 """Retrieve a stored dataset.
1256 Unlike `Butler.get`, this method allows datasets outside the Butler's
1257 collection to be read as long as the `DatasetRef` that identifies them
1258 can be obtained separately.
1260 Parameters
1261 ----------
1262 ref : `DatasetRef`
1263 Resolved reference to an already stored dataset.
1264 parameters : `dict`
1265 Additional StorageClass-defined options to control reading,
1266 typically used to efficiently read only a subset of the dataset.
1267 storageClass : `StorageClass` or `str`, optional
1268 The storage class to be used to override the Python type
1269 returned by this method. By default the returned type matches
1270 the dataset type definition for this dataset. Specifying a
1271 read `StorageClass` can force a different type to be returned.
1272 This type must be compatible with the original type.
1274 Returns
1275 -------
1276 obj : `object`
1277 The dataset.
1278 """
1279 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1281 def getDirectDeferred(
1282 self,
1283 ref: DatasetRef,
1284 *,
1285 parameters: Union[dict, None] = None,
1286 storageClass: str | StorageClass | None = None,
1287 ) -> DeferredDatasetHandle:
1288 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1289 from a resolved `DatasetRef`.
1291 Parameters
1292 ----------
1293 ref : `DatasetRef`
1294 Resolved reference to an already stored dataset.
1295 parameters : `dict`
1296 Additional StorageClass-defined options to control reading,
1297 typically used to efficiently read only a subset of the dataset.
1298 storageClass : `StorageClass` or `str`, optional
1299 The storage class to be used to override the Python type
1300 returned by this method. By default the returned type matches
1301 the dataset type definition for this dataset. Specifying a
1302 read `StorageClass` can force a different type to be returned.
1303 This type must be compatible with the original type.
1305 Returns
1306 -------
1307 obj : `DeferredDatasetHandle`
1308 A handle which can be used to retrieve a dataset at a later time.
1310 Raises
1311 ------
1312 AmbiguousDatasetError
1313 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1314 """
1315 if ref.id is None:
1316 raise AmbiguousDatasetError(
1317 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1318 )
1319 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1321 def getDeferred(
1322 self,
1323 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1324 dataId: Optional[DataId] = None,
1325 *,
1326 parameters: Union[dict, None] = None,
1327 collections: Any = None,
1328 storageClass: str | StorageClass | None = None,
1329 **kwargs: Any,
1330 ) -> DeferredDatasetHandle:
1331 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1332 after an immediate registry lookup.
1334 Parameters
1335 ----------
1336 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1337 When `DatasetRef` the `dataId` should be `None`.
1338 Otherwise the `DatasetType` or name thereof.
1339 dataId : `dict` or `DataCoordinate`, optional
1340 A `dict` of `Dimension` link name, value pairs that label the
1341 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1342 should be provided as the first argument.
1343 parameters : `dict`
1344 Additional StorageClass-defined options to control reading,
1345 typically used to efficiently read only a subset of the dataset.
1346 collections : Any, optional
1347 Collections to be searched, overriding ``self.collections``.
1348 Can be any of the types supported by the ``collections`` argument
1349 to butler construction.
1350 storageClass : `StorageClass` or `str`, optional
1351 The storage class to be used to override the Python type
1352 returned by this method. By default the returned type matches
1353 the dataset type definition for this dataset. Specifying a
1354 read `StorageClass` can force a different type to be returned.
1355 This type must be compatible with the original type.
1356 **kwargs
1357 Additional keyword arguments used to augment or construct a
1358 `DataId`. See `DataId` parameters.
1360 Returns
1361 -------
1362 obj : `DeferredDatasetHandle`
1363 A handle which can be used to retrieve a dataset at a later time.
1365 Raises
1366 ------
1367 LookupError
1368 Raised if no matching dataset exists in the `Registry` (and
1369 ``allowUnresolved is False``).
1370 ValueError
1371 Raised if a resolved `DatasetRef` was passed as an input, but it
1372 differs from the one found in the registry.
1373 TypeError
1374 Raised if no collections were provided.
1375 """
1376 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1377 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1379 def get(
1380 self,
1381 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1382 dataId: Optional[DataId] = None,
1383 *,
1384 parameters: Optional[Dict[str, Any]] = None,
1385 collections: Any = None,
1386 storageClass: Optional[Union[StorageClass, str]] = None,
1387 **kwargs: Any,
1388 ) -> Any:
1389 """Retrieve a stored dataset.
1391 Parameters
1392 ----------
1393 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1394 When `DatasetRef` the `dataId` should be `None`.
1395 Otherwise the `DatasetType` or name thereof.
1396 dataId : `dict` or `DataCoordinate`
1397 A `dict` of `Dimension` link name, value pairs that label the
1398 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1399 should be provided as the first argument.
1400 parameters : `dict`
1401 Additional StorageClass-defined options to control reading,
1402 typically used to efficiently read only a subset of the dataset.
1403 collections : Any, optional
1404 Collections to be searched, overriding ``self.collections``.
1405 Can be any of the types supported by the ``collections`` argument
1406 to butler construction.
1407 storageClass : `StorageClass` or `str`, optional
1408 The storage class to be used to override the Python type
1409 returned by this method. By default the returned type matches
1410 the dataset type definition for this dataset. Specifying a
1411 read `StorageClass` can force a different type to be returned.
1412 This type must be compatible with the original type.
1413 **kwargs
1414 Additional keyword arguments used to augment or construct a
1415 `DataCoordinate`. See `DataCoordinate.standardize`
1416 parameters.
1418 Returns
1419 -------
1420 obj : `object`
1421 The dataset.
1423 Raises
1424 ------
1425 ValueError
1426 Raised if a resolved `DatasetRef` was passed as an input, but it
1427 differs from the one found in the registry.
1428 LookupError
1429 Raised if no matching dataset exists in the `Registry`.
1430 TypeError
1431 Raised if no collections were provided.
1433 Notes
1434 -----
1435 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1436 this method requires that the given data ID include temporal dimensions
1437 beyond the dimensions of the dataset type itself, in order to find the
1438 dataset with the appropriate validity range. For example, a "bias"
1439 dataset with native dimensions ``{instrument, detector}`` could be
1440 fetched with a ``{instrument, detector, exposure}`` data ID, because
1441 ``exposure`` is a temporal dimension.
1442 """
1443 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1444 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1445 return self.getDirect(ref, parameters=parameters, storageClass=storageClass)
1447 def getURIs(
1448 self,
1449 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1450 dataId: Optional[DataId] = None,
1451 *,
1452 predict: bool = False,
1453 collections: Any = None,
1454 run: Optional[str] = None,
1455 **kwargs: Any,
1456 ) -> DatasetRefURIs:
1457 """Returns the URIs associated with the dataset.
1459 Parameters
1460 ----------
1461 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1462 When `DatasetRef` the `dataId` should be `None`.
1463 Otherwise the `DatasetType` or name thereof.
1464 dataId : `dict` or `DataCoordinate`
1465 A `dict` of `Dimension` link name, value pairs that label the
1466 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1467 should be provided as the first argument.
1468 predict : `bool`
1469 If `True`, allow URIs to be returned of datasets that have not
1470 been written.
1471 collections : Any, optional
1472 Collections to be searched, overriding ``self.collections``.
1473 Can be any of the types supported by the ``collections`` argument
1474 to butler construction.
1475 run : `str`, optional
1476 Run to use for predictions, overriding ``self.run``.
1477 **kwargs
1478 Additional keyword arguments used to augment or construct a
1479 `DataCoordinate`. See `DataCoordinate.standardize`
1480 parameters.
1482 Returns
1483 -------
1484 uris : `DatasetRefURIs`
1485 The URI to the primary artifact associated with this dataset (if
1486 the dataset was disassembled within the datastore this may be
1487 `None`), and the URIs to any components associated with the dataset
1488 artifact. (can be empty if there are no components).
1489 """
1490 ref = self._findDatasetRef(
1491 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1492 )
1493 if ref.id is None: # only possible if predict is True
1494 if run is None:
1495 run = self.run
1496 if run is None:
1497 raise TypeError("Cannot predict location with run=None.")
1498 # Lie about ID, because we can't guess it, and only
1499 # Datastore.getURIs() will ever see it (and it doesn't use it).
1500 ref = ref.resolved(id=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), run=run)
1501 return self.datastore.getURIs(ref, predict)
1503 def getURI(
1504 self,
1505 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1506 dataId: Optional[DataId] = None,
1507 *,
1508 predict: bool = False,
1509 collections: Any = None,
1510 run: Optional[str] = None,
1511 **kwargs: Any,
1512 ) -> ResourcePath:
1513 """Return the URI to the Dataset.
1515 Parameters
1516 ----------
1517 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1518 When `DatasetRef` the `dataId` should be `None`.
1519 Otherwise the `DatasetType` or name thereof.
1520 dataId : `dict` or `DataCoordinate`
1521 A `dict` of `Dimension` link name, value pairs that label the
1522 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1523 should be provided as the first argument.
1524 predict : `bool`
1525 If `True`, allow URIs to be returned of datasets that have not
1526 been written.
1527 collections : Any, optional
1528 Collections to be searched, overriding ``self.collections``.
1529 Can be any of the types supported by the ``collections`` argument
1530 to butler construction.
1531 run : `str`, optional
1532 Run to use for predictions, overriding ``self.run``.
1533 **kwargs
1534 Additional keyword arguments used to augment or construct a
1535 `DataCoordinate`. See `DataCoordinate.standardize`
1536 parameters.
1538 Returns
1539 -------
1540 uri : `lsst.resources.ResourcePath`
1541 URI pointing to the Dataset within the datastore. If the
1542 Dataset does not exist in the datastore, and if ``predict`` is
1543 `True`, the URI will be a prediction and will include a URI
1544 fragment "#predicted".
1545 If the datastore does not have entities that relate well
1546 to the concept of a URI the returned URI string will be
1547 descriptive. The returned URI is not guaranteed to be obtainable.
1549 Raises
1550 ------
1551 LookupError
1552 A URI has been requested for a dataset that does not exist and
1553 guessing is not allowed.
1554 ValueError
1555 Raised if a resolved `DatasetRef` was passed as an input, but it
1556 differs from the one found in the registry.
1557 TypeError
1558 Raised if no collections were provided.
1559 RuntimeError
1560 Raised if a URI is requested for a dataset that consists of
1561 multiple artifacts.
1562 """
1563 primary, components = self.getURIs(
1564 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1565 )
1567 if primary is None or components:
1568 raise RuntimeError(
1569 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1570 "Use Butler.getURIs() instead."
1571 )
1572 return primary
1574 def retrieveArtifacts(
1575 self,
1576 refs: Iterable[DatasetRef],
1577 destination: ResourcePathExpression,
1578 transfer: str = "auto",
1579 preserve_path: bool = True,
1580 overwrite: bool = False,
1581 ) -> List[ResourcePath]:
1582 """Retrieve the artifacts associated with the supplied refs.
1584 Parameters
1585 ----------
1586 refs : iterable of `DatasetRef`
1587 The datasets for which artifacts are to be retrieved.
1588 A single ref can result in multiple artifacts. The refs must
1589 be resolved.
1590 destination : `lsst.resources.ResourcePath` or `str`
1591 Location to write the artifacts.
1592 transfer : `str`, optional
1593 Method to use to transfer the artifacts. Must be one of the options
1594 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1595 "move" is not allowed.
1596 preserve_path : `bool`, optional
1597 If `True` the full path of the artifact within the datastore
1598 is preserved. If `False` the final file component of the path
1599 is used.
1600 overwrite : `bool`, optional
1601 If `True` allow transfers to overwrite existing files at the
1602 destination.
1604 Returns
1605 -------
1606 targets : `list` of `lsst.resources.ResourcePath`
1607 URIs of file artifacts in destination location. Order is not
1608 preserved.
1610 Notes
1611 -----
1612 For non-file datastores the artifacts written to the destination
1613 may not match the representation inside the datastore. For example
1614 a hierarchical data structure in a NoSQL database may well be stored
1615 as a JSON file.
1616 """
1617 return self.datastore.retrieveArtifacts(
1618 refs,
1619 ResourcePath(destination),
1620 transfer=transfer,
1621 preserve_path=preserve_path,
1622 overwrite=overwrite,
1623 )
1625 def datasetExists(
1626 self,
1627 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1628 dataId: Optional[DataId] = None,
1629 *,
1630 collections: Any = None,
1631 **kwargs: Any,
1632 ) -> bool:
1633 """Return True if the Dataset is actually present in the Datastore.
1635 Parameters
1636 ----------
1637 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1638 When `DatasetRef` the `dataId` should be `None`.
1639 Otherwise the `DatasetType` or name thereof.
1640 dataId : `dict` or `DataCoordinate`
1641 A `dict` of `Dimension` link name, value pairs that label the
1642 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1643 should be provided as the first argument.
1644 collections : Any, optional
1645 Collections to be searched, overriding ``self.collections``.
1646 Can be any of the types supported by the ``collections`` argument
1647 to butler construction.
1648 **kwargs
1649 Additional keyword arguments used to augment or construct a
1650 `DataCoordinate`. See `DataCoordinate.standardize`
1651 parameters.
1653 Raises
1654 ------
1655 LookupError
1656 Raised if the dataset is not even present in the Registry.
1657 ValueError
1658 Raised if a resolved `DatasetRef` was passed as an input, but it
1659 differs from the one found in the registry.
1660 TypeError
1661 Raised if no collections were provided.
1662 """
1663 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1664 return self.datastore.exists(ref)
1666 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1667 """Remove one or more `~CollectionType.RUN` collections and the
1668 datasets within them.
1670 Parameters
1671 ----------
1672 names : `Iterable` [ `str` ]
1673 The names of the collections to remove.
1674 unstore : `bool`, optional
1675 If `True` (default), delete datasets from all datastores in which
1676 they are present, and attempt to rollback the registry deletions if
1677 datastore deletions fail (which may not always be possible). If
1678 `False`, datastore records for these datasets are still removed,
1679 but any artifacts (e.g. files) will not be.
1681 Raises
1682 ------
1683 TypeError
1684 Raised if one or more collections are not of type
1685 `~CollectionType.RUN`.
1686 """
1687 if not self.isWriteable():
1688 raise TypeError("Butler is read-only.")
1689 names = list(names)
1690 refs: List[DatasetRef] = []
1691 for name in names:
1692 collectionType = self.registry.getCollectionType(name)
1693 if collectionType is not CollectionType.RUN:
1694 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1695 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1696 with self.datastore.transaction():
1697 with self.registry.transaction():
1698 if unstore:
1699 self.datastore.trash(refs)
1700 else:
1701 self.datastore.forget(refs)
1702 for name in names:
1703 self.registry.removeCollection(name)
1704 if unstore:
1705 # Point of no return for removing artifacts
1706 self.datastore.emptyTrash()
1708 def pruneCollection(
1709 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1710 ) -> None:
1711 """Remove a collection and possibly prune datasets within it.
1713 Parameters
1714 ----------
1715 name : `str`
1716 Name of the collection to remove. If this is a
1717 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1718 datasets within the collection are not modified unless ``unstore``
1719 is `True`. If this is a `~CollectionType.RUN` collection,
1720 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1721 are fully removed from the data repository.
1722 purge : `bool`, optional
1723 If `True`, permit `~CollectionType.RUN` collections to be removed,
1724 fully removing datasets within them. Requires ``unstore=True`` as
1725 well as an added precaution against accidental deletion. Must be
1726 `False` (default) if the collection is not a ``RUN``.
1727 unstore: `bool`, optional
1728 If `True`, remove all datasets in the collection from all
1729 datastores in which they appear.
1730 unlink: `list` [`str`], optional
1731 Before removing the given `collection` unlink it from from these
1732 parent collections.
1734 Raises
1735 ------
1736 TypeError
1737 Raised if the butler is read-only or arguments are mutually
1738 inconsistent.
1739 """
1740 # See pruneDatasets comments for more information about the logic here;
1741 # the cases are almost the same, but here we can rely on Registry to
1742 # take care everything but Datastore deletion when we remove the
1743 # collection.
1744 if not self.isWriteable():
1745 raise TypeError("Butler is read-only.")
1746 collectionType = self.registry.getCollectionType(name)
1747 if purge and not unstore:
1748 raise PurgeWithoutUnstorePruneCollectionsError()
1749 if collectionType is CollectionType.RUN and not purge:
1750 raise RunWithoutPurgePruneCollectionsError(collectionType)
1751 if collectionType is not CollectionType.RUN and purge:
1752 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1754 def remove(child: str, parent: str) -> None:
1755 """Remove a child collection from a parent collection."""
1756 # Remove child from parent.
1757 chain = list(self.registry.getCollectionChain(parent))
1758 try:
1759 chain.remove(name)
1760 except ValueError as e:
1761 raise RuntimeError(f"{name} is not a child of {parent}") from e
1762 self.registry.setCollectionChain(parent, chain)
1764 with self.datastore.transaction():
1765 with self.registry.transaction():
1766 if unlink:
1767 for parent in unlink:
1768 remove(name, parent)
1769 if unstore:
1770 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1771 self.datastore.trash(refs)
1772 self.registry.removeCollection(name)
1774 if unstore:
1775 # Point of no return for removing artifacts
1776 self.datastore.emptyTrash()
1778 def pruneDatasets(
1779 self,
1780 refs: Iterable[DatasetRef],
1781 *,
1782 disassociate: bool = True,
1783 unstore: bool = False,
1784 tags: Iterable[str] = (),
1785 purge: bool = False,
1786 ) -> None:
1787 # docstring inherited from LimitedButler
1789 if not self.isWriteable():
1790 raise TypeError("Butler is read-only.")
1791 if purge:
1792 if not disassociate:
1793 raise TypeError("Cannot pass purge=True without disassociate=True.")
1794 if not unstore:
1795 raise TypeError("Cannot pass purge=True without unstore=True.")
1796 elif disassociate:
1797 tags = tuple(tags)
1798 if not tags:
1799 raise TypeError("No tags provided but disassociate=True.")
1800 for tag in tags:
1801 collectionType = self.registry.getCollectionType(tag)
1802 if collectionType is not CollectionType.TAGGED:
1803 raise TypeError(
1804 f"Cannot disassociate from collection '{tag}' "
1805 f"of non-TAGGED type {collectionType.name}."
1806 )
1807 # For an execution butler we want to keep existing UUIDs for the
1808 # datasets, for that we need to keep them in the collections but
1809 # remove from datastore.
1810 if self._allow_put_of_predefined_dataset and purge:
1811 purge = False
1812 disassociate = False
1813 # Transform possibly-single-pass iterable into something we can iterate
1814 # over multiple times.
1815 refs = list(refs)
1816 # Pruning a component of a DatasetRef makes no sense since registry
1817 # doesn't know about components and datastore might not store
1818 # components in a separate file
1819 for ref in refs:
1820 if ref.datasetType.component():
1821 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1822 # We don't need an unreliable Datastore transaction for this, because
1823 # we've been extra careful to ensure that Datastore.trash only involves
1824 # mutating the Registry (it can _look_ at Datastore-specific things,
1825 # but shouldn't change them), and hence all operations here are
1826 # Registry operations.
1827 with self.datastore.transaction():
1828 with self.registry.transaction():
1829 if unstore:
1830 self.datastore.trash(refs)
1831 if purge:
1832 self.registry.removeDatasets(refs)
1833 elif disassociate:
1834 assert tags, "Guaranteed by earlier logic in this function."
1835 for tag in tags:
1836 self.registry.disassociate(tag, refs)
1837 # We've exited the Registry transaction, and apparently committed.
1838 # (if there was an exception, everything rolled back, and it's as if
1839 # nothing happened - and we never get here).
1840 # Datastore artifacts are not yet gone, but they're clearly marked
1841 # as trash, so if we fail to delete now because of (e.g.) filesystem
1842 # problems we can try again later, and if manual administrative
1843 # intervention is required, it's pretty clear what that should entail:
1844 # deleting everything on disk and in private Datastore tables that is
1845 # in the dataset_location_trash table.
1846 if unstore:
1847 # Point of no return for removing artifacts
1848 self.datastore.emptyTrash()
1850 @transactional
1851 def ingest(
1852 self,
1853 *datasets: FileDataset,
1854 transfer: Optional[str] = "auto",
1855 run: Optional[str] = None,
1856 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1857 record_validation_info: bool = True,
1858 ) -> None:
1859 """Store and register one or more datasets that already exist on disk.
1861 Parameters
1862 ----------
1863 datasets : `FileDataset`
1864 Each positional argument is a struct containing information about
1865 a file to be ingested, including its URI (either absolute or
1866 relative to the datastore root, if applicable), a `DatasetRef`,
1867 and optionally a formatter class or its fully-qualified string
1868 name. If a formatter is not provided, the formatter that would be
1869 used for `put` is assumed. On successful return, all
1870 `FileDataset.ref` attributes will have their `DatasetRef.id`
1871 attribute populated and all `FileDataset.formatter` attributes will
1872 be set to the formatter class used. `FileDataset.path` attributes
1873 may be modified to put paths in whatever the datastore considers a
1874 standardized form.
1875 transfer : `str`, optional
1876 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1877 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1878 transfer the file.
1879 run : `str`, optional
1880 The name of the run ingested datasets should be added to,
1881 overriding ``self.run``.
1882 idGenerationMode : `DatasetIdGenEnum`, optional
1883 Specifies option for generating dataset IDs. By default unique IDs
1884 are generated for each inserted dataset.
1885 record_validation_info : `bool`, optional
1886 If `True`, the default, the datastore can record validation
1887 information associated with the file. If `False` the datastore
1888 will not attempt to track any information such as checksums
1889 or file sizes. This can be useful if such information is tracked
1890 in an external system or if the file is to be compressed in place.
1891 It is up to the datastore whether this parameter is relevant.
1893 Raises
1894 ------
1895 TypeError
1896 Raised if the butler is read-only or if no run was provided.
1897 NotImplementedError
1898 Raised if the `Datastore` does not support the given transfer mode.
1899 DatasetTypeNotSupportedError
1900 Raised if one or more files to be ingested have a dataset type that
1901 is not supported by the `Datastore`..
1902 FileNotFoundError
1903 Raised if one of the given files does not exist.
1904 FileExistsError
1905 Raised if transfer is not `None` but the (internal) location the
1906 file would be moved to is already occupied.
1908 Notes
1909 -----
1910 This operation is not fully exception safe: if a database operation
1911 fails, the given `FileDataset` instances may be only partially updated.
1913 It is atomic in terms of database operations (they will either all
1914 succeed or all fail) providing the database engine implements
1915 transactions correctly. It will attempt to be atomic in terms of
1916 filesystem operations as well, but this cannot be implemented
1917 rigorously for most datastores.
1918 """
1919 if not self.isWriteable():
1920 raise TypeError("Butler is read-only.")
1921 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1922 # Reorganize the inputs so they're grouped by DatasetType and then
1923 # data ID. We also include a list of DatasetRefs for each FileDataset
1924 # to hold the resolved DatasetRefs returned by the Registry, before
1925 # it's safe to swap them into FileDataset.refs.
1926 # Some type annotation aliases to make that clearer:
1927 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1928 GroupedData = MutableMapping[DatasetType, GroupForType]
1929 # The actual data structure:
1930 groupedData: GroupedData = defaultdict(dict)
1931 # And the nested loop that populates it:
1932 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1933 # This list intentionally shared across the inner loop, since it's
1934 # associated with `dataset`.
1935 resolvedRefs: List[DatasetRef] = []
1937 # Somewhere to store pre-existing refs if we have an
1938 # execution butler.
1939 existingRefs: List[DatasetRef] = []
1941 for ref in dataset.refs:
1942 if ref.dataId in groupedData[ref.datasetType]:
1943 raise ConflictingDefinitionError(
1944 f"Ingest conflict. Dataset {dataset.path} has same"
1945 " DataId as other ingest dataset"
1946 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1947 f" ({ref.dataId})"
1948 )
1949 if self._allow_put_of_predefined_dataset:
1950 existing_ref = self.registry.findDataset(
1951 ref.datasetType, dataId=ref.dataId, collections=run
1952 )
1953 if existing_ref:
1954 if self.datastore.knows(existing_ref):
1955 raise ConflictingDefinitionError(
1956 f"Dataset associated with path {dataset.path}"
1957 f" already exists as {existing_ref}."
1958 )
1959 # Store this ref elsewhere since it already exists
1960 # and we do not want to remake it but we do want
1961 # to store it in the datastore.
1962 existingRefs.append(existing_ref)
1964 # Nothing else to do until we have finished
1965 # iterating.
1966 continue
1968 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1970 if existingRefs:
1971 if len(dataset.refs) != len(existingRefs):
1972 # Keeping track of partially pre-existing datasets is hard
1973 # and should generally never happen. For now don't allow
1974 # it.
1975 raise ConflictingDefinitionError(
1976 f"For dataset {dataset.path} some dataIds already exist"
1977 " in registry but others do not. This is not supported."
1978 )
1980 # Attach the resolved refs if we found them.
1981 dataset.refs = existingRefs
1983 # Now we can bulk-insert into Registry for each DatasetType.
1984 for datasetType, groupForType in progress.iter_item_chunks(
1985 groupedData.items(), desc="Bulk-inserting datasets by type"
1986 ):
1987 refs = self.registry.insertDatasets(
1988 datasetType,
1989 dataIds=groupForType.keys(),
1990 run=run,
1991 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1992 idGenerationMode=idGenerationMode,
1993 )
1994 # Append those resolved DatasetRefs to the new lists we set up for
1995 # them.
1996 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1997 resolvedRefs.append(ref)
1999 # Go back to the original FileDatasets to replace their refs with the
2000 # new resolved ones.
2001 for groupForType in progress.iter_chunks(
2002 groupedData.values(), desc="Reassociating resolved dataset refs with files"
2003 ):
2004 for dataset, resolvedRefs in groupForType.values():
2005 dataset.refs = resolvedRefs
2007 # Bulk-insert everything into Datastore.
2008 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
2010 @contextlib.contextmanager
2011 def export(
2012 self,
2013 *,
2014 directory: Optional[str] = None,
2015 filename: Optional[str] = None,
2016 format: Optional[str] = None,
2017 transfer: Optional[str] = None,
2018 ) -> Iterator[RepoExportContext]:
2019 """Export datasets from the repository represented by this `Butler`.
2021 This method is a context manager that returns a helper object
2022 (`RepoExportContext`) that is used to indicate what information from
2023 the repository should be exported.
2025 Parameters
2026 ----------
2027 directory : `str`, optional
2028 Directory dataset files should be written to if ``transfer`` is not
2029 `None`.
2030 filename : `str`, optional
2031 Name for the file that will include database information associated
2032 with the exported datasets. If this is not an absolute path and
2033 ``directory`` is not `None`, it will be written to ``directory``
2034 instead of the current working directory. Defaults to
2035 "export.{format}".
2036 format : `str`, optional
2037 File format for the database information file. If `None`, the
2038 extension of ``filename`` will be used.
2039 transfer : `str`, optional
2040 Transfer mode passed to `Datastore.export`.
2042 Raises
2043 ------
2044 TypeError
2045 Raised if the set of arguments passed is inconsistent.
2047 Examples
2048 --------
2049 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
2050 methods are used to provide the iterables over data IDs and/or datasets
2051 to be exported::
2053 with butler.export("exports.yaml") as export:
2054 # Export all flats, but none of the dimension element rows
2055 # (i.e. data ID information) associated with them.
2056 export.saveDatasets(butler.registry.queryDatasets("flat"),
2057 elements=())
2058 # Export all datasets that start with "deepCoadd_" and all of
2059 # their associated data ID information.
2060 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
2061 """
2062 if directory is None and transfer is not None:
2063 raise TypeError("Cannot transfer without providing a directory.")
2064 if transfer == "move":
2065 raise TypeError("Transfer may not be 'move': export is read-only")
2066 if format is None:
2067 if filename is None:
2068 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2069 else:
2070 _, format = os.path.splitext(filename)
2071 if not format:
2072 raise ValueError("Please specify a file extension to determine export format.")
2073 format = format[1:] # Strip leading ".""
2074 elif filename is None:
2075 filename = f"export.{format}"
2076 if directory is not None:
2077 filename = os.path.join(directory, filename)
2078 formats = self._config["repo_transfer_formats"]
2079 if format not in formats:
2080 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
2081 BackendClass = get_class_of(formats[format, "export"])
2082 with open(filename, "w") as stream:
2083 backend = BackendClass(stream, universe=self.registry.dimensions)
2084 try:
2085 helper = RepoExportContext(
2086 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2087 )
2088 yield helper
2089 except BaseException:
2090 raise
2091 else:
2092 helper._finish()
2094 def import_(
2095 self,
2096 *,
2097 directory: Optional[str] = None,
2098 filename: Union[str, TextIO, None] = None,
2099 format: Optional[str] = None,
2100 transfer: Optional[str] = None,
2101 skip_dimensions: Optional[Set] = None,
2102 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2103 reuseIds: bool = False,
2104 ) -> None:
2105 """Import datasets into this repository that were exported from a
2106 different butler repository via `~lsst.daf.butler.Butler.export`.
2108 Parameters
2109 ----------
2110 directory : `str`, optional
2111 Directory containing dataset files to import from. If `None`,
2112 ``filename`` and all dataset file paths specified therein must
2113 be absolute.
2114 filename : `str` or `TextIO`, optional
2115 A stream or name of file that contains database information
2116 associated with the exported datasets, typically generated by
2117 `~lsst.daf.butler.Butler.export`. If this a string (name) and
2118 is not an absolute path, does not exist in the current working
2119 directory, and ``directory`` is not `None`, it is assumed to be in
2120 ``directory``. Defaults to "export.{format}".
2121 format : `str`, optional
2122 File format for ``filename``. If `None`, the extension of
2123 ``filename`` will be used.
2124 transfer : `str`, optional
2125 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2126 skip_dimensions : `set`, optional
2127 Names of dimensions that should be skipped and not imported.
2128 idGenerationMode : `DatasetIdGenEnum`, optional
2129 Specifies option for generating dataset IDs when IDs are not
2130 provided or their type does not match backend type. By default
2131 unique IDs are generated for each inserted dataset.
2132 reuseIds : `bool`, optional
2133 If `True` then forces re-use of imported dataset IDs for integer
2134 IDs which are normally generated as auto-incremented; exception
2135 will be raised if imported IDs clash with existing ones. This
2136 option has no effect on the use of globally-unique IDs which are
2137 always re-used (or generated if integer IDs are being imported).
2139 Raises
2140 ------
2141 TypeError
2142 Raised if the set of arguments passed is inconsistent, or if the
2143 butler is read-only.
2144 """
2145 if not self.isWriteable():
2146 raise TypeError("Butler is read-only.")
2147 if format is None:
2148 if filename is None:
2149 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2150 else:
2151 _, format = os.path.splitext(filename) # type: ignore
2152 elif filename is None:
2153 filename = f"export.{format}"
2154 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2155 filename = os.path.join(directory, filename)
2156 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2158 def doImport(importStream: TextIO) -> None:
2159 backend = BackendClass(importStream, self.registry)
2160 backend.register()
2161 with self.transaction():
2162 backend.load(
2163 self.datastore,
2164 directory=directory,
2165 transfer=transfer,
2166 skip_dimensions=skip_dimensions,
2167 idGenerationMode=idGenerationMode,
2168 reuseIds=reuseIds,
2169 )
2171 if isinstance(filename, str):
2172 with open(filename, "r") as stream:
2173 doImport(stream)
2174 else:
2175 doImport(filename)
2177 def transfer_from(
2178 self,
2179 source_butler: LimitedButler,
2180 source_refs: Iterable[DatasetRef],
2181 transfer: str = "auto",
2182 skip_missing: bool = True,
2183 register_dataset_types: bool = False,
2184 transfer_dimensions: bool = False,
2185 ) -> collections.abc.Collection[DatasetRef]:
2186 """Transfer datasets to this Butler from a run in another Butler.
2188 Parameters
2189 ----------
2190 source_butler : `LimitedButler`
2191 Butler from which the datasets are to be transferred. If data IDs
2192 in ``source_refs`` are not expanded then this has to be a full
2193 `Butler` whose registry will be used to expand data IDs.
2194 source_refs : iterable of `DatasetRef`
2195 Datasets defined in the source butler that should be transferred to
2196 this butler.
2197 transfer : `str`, optional
2198 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2199 skip_missing : `bool`
2200 If `True`, datasets with no datastore artifact associated with
2201 them are not transferred. If `False` a registry entry will be
2202 created even if no datastore record is created (and so will
2203 look equivalent to the dataset being unstored).
2204 register_dataset_types : `bool`
2205 If `True` any missing dataset types are registered. Otherwise
2206 an exception is raised.
2207 transfer_dimensions : `bool`, optional
2208 If `True`, dimension record data associated with the new datasets
2209 will be transferred.
2211 Returns
2212 -------
2213 refs : `list` of `DatasetRef`
2214 The refs added to this Butler.
2216 Notes
2217 -----
2218 The datastore artifact has to exist for a transfer
2219 to be made but non-existence is not an error.
2221 Datasets that already exist in this run will be skipped.
2223 The datasets are imported as part of a transaction, although
2224 dataset types are registered before the transaction is started.
2225 This means that it is possible for a dataset type to be registered
2226 even though transfer has failed.
2227 """
2228 if not self.isWriteable():
2229 raise TypeError("Butler is read-only.")
2230 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2232 # Will iterate through the refs multiple times so need to convert
2233 # to a list if this isn't a collection.
2234 if not isinstance(source_refs, collections.abc.Collection):
2235 source_refs = list(source_refs)
2237 original_count = len(source_refs)
2238 log.info("Transferring %d datasets into %s", original_count, str(self))
2240 # In some situations the datastore artifact may be missing
2241 # and we do not want that registry entry to be imported.
2242 # Asking datastore is not sufficient, the records may have been
2243 # purged, we have to ask for the (predicted) URI and check
2244 # existence explicitly. Execution butler is set up exactly like
2245 # this with no datastore records.
2246 artifact_existence: Dict[ResourcePath, bool] = {}
2247 if skip_missing:
2248 dataset_existence = source_butler.datastore.mexists(
2249 source_refs, artifact_existence=artifact_existence
2250 )
2251 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2252 filtered_count = len(source_refs)
2253 n_missing = original_count - filtered_count
2254 log.verbose(
2255 "%d dataset%s removed because the artifact does not exist. Now have %d.",
2256 n_missing,
2257 "" if n_missing == 1 else "s",
2258 filtered_count,
2259 )
2261 # Importing requires that we group the refs by dataset type and run
2262 # before doing the import.
2263 source_dataset_types = set()
2264 grouped_refs = defaultdict(list)
2265 for ref in source_refs:
2266 grouped_refs[ref.datasetType, ref.run].append(ref)
2267 source_dataset_types.add(ref.datasetType)
2269 # Check to see if the dataset type in the source butler has
2270 # the same definition in the target butler and register missing
2271 # ones if requested. Registration must happen outside a transaction.
2272 newly_registered_dataset_types = set()
2273 for datasetType in source_dataset_types:
2274 if register_dataset_types:
2275 # Let this raise immediately if inconsistent. Continuing
2276 # on to find additional inconsistent dataset types
2277 # might result in additional unwanted dataset types being
2278 # registered.
2279 if self.registry.registerDatasetType(datasetType):
2280 newly_registered_dataset_types.add(datasetType)
2281 else:
2282 # If the dataset type is missing, let it fail immediately.
2283 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2284 if target_dataset_type != datasetType:
2285 raise ConflictingDefinitionError(
2286 "Source butler dataset type differs from definition"
2287 f" in target butler: {datasetType} !="
2288 f" {target_dataset_type}"
2289 )
2290 if newly_registered_dataset_types:
2291 # We may have registered some even if there were inconsistencies
2292 # but should let people know (or else remove them again).
2293 log.log(
2294 VERBOSE,
2295 "Registered the following dataset types in the target Butler: %s",
2296 ", ".join(d.name for d in newly_registered_dataset_types),
2297 )
2298 else:
2299 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2301 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2302 if transfer_dimensions:
2303 # Collect all the dimension records for these refs.
2304 # All dimensions are to be copied but the list of valid dimensions
2305 # come from this butler's universe.
2306 elements = frozenset(
2307 element
2308 for element in self.registry.dimensions.getStaticElements()
2309 if element.hasTable() and element.viewOf is None
2310 )
2311 dataIds = set(ref.dataId for ref in source_refs)
2312 # This logic comes from saveDataIds.
2313 for dataId in dataIds:
2314 # Need an expanded record, if not expanded that we need a full
2315 # butler with registry (allow mocks with registry too).
2316 if not dataId.hasRecords():
2317 if registry := getattr(source_butler, "registry", None):
2318 dataId = registry.expandDataId(dataId)
2319 else:
2320 raise TypeError("Input butler needs to be a full butler to expand DataId.")
2321 # If this butler doesn't know about a dimension in the source
2322 # butler things will break later.
2323 for record in dataId.records.values():
2324 if record is not None and record.definition in elements:
2325 dimension_records[record.definition].setdefault(record.dataId, record)
2327 handled_collections: Set[str] = set()
2329 # Do all the importing in a single transaction.
2330 with self.transaction():
2331 if dimension_records:
2332 log.verbose("Ensuring that dimension records exist for transferred datasets.")
2333 for element, r in dimension_records.items():
2334 records = [r[dataId] for dataId in r]
2335 # Assume that if the record is already present that we can
2336 # use it without having to check that the record metadata
2337 # is consistent.
2338 self.registry.insertDimensionData(element, *records, skip_existing=True)
2340 n_imported = 0
2341 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2342 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2343 ):
2344 if run not in handled_collections:
2345 # May need to create output collection. If source butler
2346 # has a registry, ask for documentation string.
2347 run_doc = None
2348 if registry := getattr(source_butler, "registry", None):
2349 run_doc = registry.getCollectionDocumentation(run)
2350 registered = self.registry.registerRun(run, doc=run_doc)
2351 handled_collections.add(run)
2352 if registered:
2353 log.log(VERBOSE, "Creating output run %s", run)
2355 n_refs = len(refs_to_import)
2356 log.verbose(
2357 "Importing %d ref%s of dataset type %s into run %s",
2358 n_refs,
2359 "" if n_refs == 1 else "s",
2360 datasetType.name,
2361 run,
2362 )
2364 # Assume we are using UUIDs and the source refs will match
2365 # those imported.
2366 imported_refs = self.registry._importDatasets(refs_to_import, expand=False)
2367 assert set(imported_refs) == set(refs_to_import)
2368 n_imported += len(imported_refs)
2370 assert len(source_refs) == n_imported
2371 log.verbose("Imported %d datasets into destination butler", n_imported)
2373 # Ask the datastore to transfer. The datastore has to check that
2374 # the source datastore is compatible with the target datastore.
2375 accepted, rejected = self.datastore.transfer_from(
2376 source_butler.datastore,
2377 source_refs,
2378 transfer=transfer,
2379 artifact_existence=artifact_existence,
2380 )
2381 if rejected:
2382 # For now, accept the registry entries but not the files.
2383 log.warning(
2384 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
2385 len(rejected),
2386 len(accepted),
2387 datasetType,
2388 run,
2389 )
2391 return source_refs
2393 def validateConfiguration(
2394 self,
2395 logFailures: bool = False,
2396 datasetTypeNames: Optional[Iterable[str]] = None,
2397 ignore: Iterable[str] | None = None,
2398 ) -> None:
2399 """Validate butler configuration.
2401 Checks that each `DatasetType` can be stored in the `Datastore`.
2403 Parameters
2404 ----------
2405 logFailures : `bool`, optional
2406 If `True`, output a log message for every validation error
2407 detected.
2408 datasetTypeNames : iterable of `str`, optional
2409 The `DatasetType` names that should be checked. This allows
2410 only a subset to be selected.
2411 ignore : iterable of `str`, optional
2412 Names of DatasetTypes to skip over. This can be used to skip
2413 known problems. If a named `DatasetType` corresponds to a
2414 composite, all components of that `DatasetType` will also be
2415 ignored.
2417 Raises
2418 ------
2419 ButlerValidationError
2420 Raised if there is some inconsistency with how this Butler
2421 is configured.
2422 """
2423 if datasetTypeNames:
2424 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2425 else:
2426 datasetTypes = list(self.registry.queryDatasetTypes())
2428 # filter out anything from the ignore list
2429 if ignore:
2430 ignore = set(ignore)
2431 datasetTypes = [
2432 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2433 ]
2434 else:
2435 ignore = set()
2437 # Find all the registered instruments
2438 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2440 # For each datasetType that has an instrument dimension, create
2441 # a DatasetRef for each defined instrument
2442 datasetRefs = []
2444 for datasetType in datasetTypes:
2445 if "instrument" in datasetType.dimensions:
2446 for instrument in instruments:
2447 datasetRef = DatasetRef(
2448 datasetType, {"instrument": instrument}, conform=False # type: ignore
2449 )
2450 datasetRefs.append(datasetRef)
2452 entities: List[Union[DatasetType, DatasetRef]] = []
2453 entities.extend(datasetTypes)
2454 entities.extend(datasetRefs)
2456 datastoreErrorStr = None
2457 try:
2458 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2459 except ValidationError as e:
2460 datastoreErrorStr = str(e)
2462 # Also check that the LookupKeys used by the datastores match
2463 # registry and storage class definitions
2464 keys = self.datastore.getLookupKeys()
2466 failedNames = set()
2467 failedDataId = set()
2468 for key in keys:
2469 if key.name is not None:
2470 if key.name in ignore:
2471 continue
2473 # skip if specific datasetType names were requested and this
2474 # name does not match
2475 if datasetTypeNames and key.name not in datasetTypeNames:
2476 continue
2478 # See if it is a StorageClass or a DatasetType
2479 if key.name in self.storageClasses:
2480 pass
2481 else:
2482 try:
2483 self.registry.getDatasetType(key.name)
2484 except KeyError:
2485 if logFailures:
2486 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2487 failedNames.add(key)
2488 else:
2489 # Dimensions are checked for consistency when the Butler
2490 # is created and rendezvoused with a universe.
2491 pass
2493 # Check that the instrument is a valid instrument
2494 # Currently only support instrument so check for that
2495 if key.dataId:
2496 dataIdKeys = set(key.dataId)
2497 if set(["instrument"]) != dataIdKeys:
2498 if logFailures:
2499 log.critical("Key '%s' has unsupported DataId override", key)
2500 failedDataId.add(key)
2501 elif key.dataId["instrument"] not in instruments:
2502 if logFailures:
2503 log.critical("Key '%s' has unknown instrument", key)
2504 failedDataId.add(key)
2506 messages = []
2508 if datastoreErrorStr:
2509 messages.append(datastoreErrorStr)
2511 for failed, msg in (
2512 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2513 (failedDataId, "Keys with bad DataId entries: "),
2514 ):
2515 if failed:
2516 msg += ", ".join(str(k) for k in failed)
2517 messages.append(msg)
2519 if messages:
2520 raise ValidationError(";\n".join(messages))
2522 @property
2523 def collections(self) -> Sequence[str]:
2524 """The collections to search by default, in order
2525 (`Sequence` [ `str` ]).
2527 This is an alias for ``self.registry.defaults.collections``. It cannot
2528 be set directly in isolation, but all defaults may be changed together
2529 by assigning a new `RegistryDefaults` instance to
2530 ``self.registry.defaults``.
2531 """
2532 return self.registry.defaults.collections
2534 @property
2535 def run(self) -> Optional[str]:
2536 """Name of the run this butler writes outputs to by default (`str` or
2537 `None`).
2539 This is an alias for ``self.registry.defaults.run``. It cannot be set
2540 directly in isolation, but all defaults may be changed together by
2541 assigning a new `RegistryDefaults` instance to
2542 ``self.registry.defaults``.
2543 """
2544 return self.registry.defaults.run
2546 @property
2547 def dimensions(self) -> DimensionUniverse:
2548 # Docstring inherited.
2549 return self.registry.dimensions
2551 registry: Registry
2552 """The object that manages dataset metadata and relationships (`Registry`).
2554 Most operations that don't involve reading or writing butler datasets are
2555 accessible only via `Registry` methods.
2556 """
2558 datastore: Datastore
2559 """The object that manages actual dataset storage (`Datastore`).
2561 Direct user access to the datastore should rarely be necessary; the primary
2562 exception is the case where a `Datastore` implementation provides extra
2563 functionality beyond what the base class defines.
2564 """
2566 storageClasses: StorageClassFactory
2567 """An object that maps known storage class names to objects that fully
2568 describe them (`StorageClassFactory`).
2569 """
2571 _allow_put_of_predefined_dataset: bool
2572 """Allow a put to succeed even if there is already a registry entry for it
2573 but not a datastore record. (`bool`)."""