Coverage for python/lsst/daf/butler/_butler.py: 8%
666 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-02 18:18 -0700
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-02 18:18 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41from collections import defaultdict
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_class_of
62from lsst.utils.logging import VERBOSE, getLogger
64from ._butlerConfig import ButlerConfig
65from ._butlerRepoIndex import ButlerRepoIndex
66from ._deferredDatasetHandle import DeferredDatasetHandle
67from ._limited_butler import LimitedButler
68from .core import (
69 AmbiguousDatasetError,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetRefURIs,
77 DatasetType,
78 Datastore,
79 Dimension,
80 DimensionConfig,
81 DimensionUniverse,
82 FileDataset,
83 Progress,
84 StorageClassFactory,
85 Timespan,
86 ValidationError,
87)
88from .core.repoRelocation import BUTLER_ROOT_TAG
89from .core.utils import transactional
90from .registry import (
91 CollectionSearch,
92 CollectionType,
93 ConflictingDefinitionError,
94 DataIdError,
95 DatasetIdGenEnum,
96 Registry,
97 RegistryConfig,
98 RegistryDefaults,
99)
100from .transfers import RepoExportContext
102log = getLogger(__name__)
105class ButlerValidationError(ValidationError):
106 """There is a problem with the Butler configuration."""
108 pass
111class PruneCollectionsArgsError(TypeError):
112 """Base class for errors relating to Butler.pruneCollections input
113 arguments.
114 """
116 pass
119class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
120 """Raised when purge and unstore are both required to be True, and
121 purge is True but unstore is False.
122 """
124 def __init__(self) -> None:
125 super().__init__("Cannot pass purge=True without unstore=True.")
128class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
129 """Raised when pruning a RUN collection but purge is False."""
131 def __init__(self, collectionType: CollectionType):
132 self.collectionType = collectionType
133 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
136class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
137 """Raised when purge is True but is not supported for the given
138 collection."""
140 def __init__(self, collectionType: CollectionType):
141 self.collectionType = collectionType
142 super().__init__(
143 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
144 )
147class Butler(LimitedButler):
148 """Main entry point for the data access system.
150 Parameters
151 ----------
152 config : `ButlerConfig`, `Config` or `str`, optional.
153 Configuration. Anything acceptable to the
154 `ButlerConfig` constructor. If a directory path
155 is given the configuration will be read from a ``butler.yaml`` file in
156 that location. If `None` is given default values will be used.
157 butler : `Butler`, optional.
158 If provided, construct a new Butler that uses the same registry and
159 datastore as the given one, but with the given collection and run.
160 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
161 arguments.
162 collections : `str` or `Iterable` [ `str` ], optional
163 An expression specifying the collections to be searched (in order) when
164 reading datasets.
165 This may be a `str` collection name or an iterable thereof.
166 See :ref:`daf_butler_collection_expressions` for more information.
167 These collections are not registered automatically and must be
168 manually registered before they are used by any method, but they may be
169 manually registered after the `Butler` is initialized.
170 run : `str`, optional
171 Name of the `~CollectionType.RUN` collection new datasets should be
172 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
173 ``collections`` will be set to ``[run]``. If not `None`, this
174 collection will automatically be registered. If this is not set (and
175 ``writeable`` is not set either), a read-only butler will be created.
176 searchPaths : `list` of `str`, optional
177 Directory paths to search when calculating the full Butler
178 configuration. Not used if the supplied config is already a
179 `ButlerConfig`.
180 writeable : `bool`, optional
181 Explicitly sets whether the butler supports write operations. If not
182 provided, a read-write butler is created if any of ``run``, ``tags``,
183 or ``chains`` is non-empty.
184 inferDefaults : `bool`, optional
185 If `True` (default) infer default data ID values from the values
186 present in the datasets in ``collections``: if all collections have the
187 same value (or no value) for a governor dimension, that value will be
188 the default for that dimension. Nonexistent collections are ignored.
189 If a default value is provided explicitly for a governor dimension via
190 ``**kwargs``, no default will be inferred for that dimension.
191 **kwargs : `str`
192 Default data ID key-value pairs. These may only identify "governor"
193 dimensions like ``instrument`` and ``skymap``.
195 Examples
196 --------
197 While there are many ways to control exactly how a `Butler` interacts with
198 the collections in its `Registry`, the most common cases are still simple.
200 For a read-only `Butler` that searches one collection, do::
202 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
204 For a read-write `Butler` that writes to and reads from a
205 `~CollectionType.RUN` collection::
207 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
209 The `Butler` passed to a ``PipelineTask`` is often much more complex,
210 because we want to write to one `~CollectionType.RUN` collection but read
211 from several others (as well)::
213 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
214 collections=["u/alice/DM-50000/a",
215 "u/bob/DM-49998",
216 "HSC/defaults"])
218 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
219 Datasets will be read first from that run (since it appears first in the
220 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
222 Finally, one can always create a `Butler` with no collections::
224 butler = Butler("/path/to/repo", writeable=True)
226 This can be extremely useful when you just want to use ``butler.registry``,
227 e.g. for inserting dimension data or managing collections, or when the
228 collections you want to use with the butler are not consistent.
229 Passing ``writeable`` explicitly here is only necessary if you want to be
230 able to make changes to the repo - usually the value for ``writeable`` can
231 be guessed from the collection arguments provided, but it defaults to
232 `False` when there are not collection arguments.
233 """
235 def __init__(
236 self,
237 config: Union[Config, str, None] = None,
238 *,
239 butler: Optional[Butler] = None,
240 collections: Any = None,
241 run: Optional[str] = None,
242 searchPaths: Optional[List[str]] = None,
243 writeable: Optional[bool] = None,
244 inferDefaults: bool = True,
245 **kwargs: str,
246 ):
247 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
248 # Load registry, datastore, etc. from config or existing butler.
249 if butler is not None:
250 if config is not None or searchPaths is not None or writeable is not None:
251 raise TypeError(
252 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
253 )
254 self.registry = butler.registry.copy(defaults)
255 self.datastore = butler.datastore
256 self.storageClasses = butler.storageClasses
257 self._config: ButlerConfig = butler._config
258 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
259 else:
260 # Can only look for strings in the known repos list.
261 if isinstance(config, str) and config in self.get_known_repos():
262 config = str(self.get_repo_uri(config))
263 try:
264 self._config = ButlerConfig(config, searchPaths=searchPaths)
265 except FileNotFoundError as e:
266 if known := self.get_known_repos():
267 aliases = f"(known aliases: {', '.join(known)})"
268 else:
269 aliases = "(no known aliases)"
270 raise FileNotFoundError(f"{e} {aliases}") from e
271 self._config = ButlerConfig(config, searchPaths=searchPaths)
272 try:
273 if "root" in self._config:
274 butlerRoot = self._config["root"]
275 else:
276 butlerRoot = self._config.configDir
277 if writeable is None:
278 writeable = run is not None
279 self.registry = Registry.fromConfig(
280 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
281 )
282 self.datastore = Datastore.fromConfig(
283 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
284 )
285 self.storageClasses = StorageClassFactory()
286 self.storageClasses.addFromConfig(self._config)
287 self._allow_put_of_predefined_dataset = self._config.get(
288 "allow_put_of_predefined_dataset", False
289 )
290 except Exception:
291 # Failures here usually mean that configuration is incomplete,
292 # just issue an error message which includes config file URI.
293 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
294 raise
296 if "run" in self._config or "collection" in self._config:
297 raise ValueError("Passing a run or collection via configuration is no longer supported.")
299 GENERATION: ClassVar[int] = 3
300 """This is a Generation 3 Butler.
302 This attribute may be removed in the future, once the Generation 2 Butler
303 interface has been fully retired; it should only be used in transitional
304 code.
305 """
307 @classmethod
308 def get_repo_uri(cls, label: str) -> ResourcePath:
309 """Look up the label in a butler repository index.
311 Parameters
312 ----------
313 label : `str`
314 Label of the Butler repository to look up.
316 Returns
317 -------
318 uri : `lsst.resources.ResourcePath`
319 URI to the Butler repository associated with the given label.
321 Raises
322 ------
323 KeyError
324 Raised if the label is not found in the index, or if an index
325 can not be found at all.
327 Notes
328 -----
329 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
330 information is discovered.
331 """
332 return ButlerRepoIndex.get_repo_uri(label)
334 @classmethod
335 def get_known_repos(cls) -> Set[str]:
336 """Retrieve the list of known repository labels.
338 Returns
339 -------
340 repos : `set` of `str`
341 All the known labels. Can be empty if no index can be found.
343 Notes
344 -----
345 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
346 information is discovered.
347 """
348 return ButlerRepoIndex.get_known_repos()
350 @staticmethod
351 def makeRepo(
352 root: ResourcePathExpression,
353 config: Union[Config, str, None] = None,
354 dimensionConfig: Union[Config, str, None] = None,
355 standalone: bool = False,
356 searchPaths: Optional[List[str]] = None,
357 forceConfigRoot: bool = True,
358 outfile: Optional[ResourcePathExpression] = None,
359 overwrite: bool = False,
360 ) -> Config:
361 """Create an empty data repository by adding a butler.yaml config
362 to a repository root directory.
364 Parameters
365 ----------
366 root : `lsst.resources.ResourcePathExpression`
367 Path or URI to the root location of the new repository. Will be
368 created if it does not exist.
369 config : `Config` or `str`, optional
370 Configuration to write to the repository, after setting any
371 root-dependent Registry or Datastore config options. Can not
372 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
373 configuration will be used. Root-dependent config options
374 specified in this config are overwritten if ``forceConfigRoot``
375 is `True`.
376 dimensionConfig : `Config` or `str`, optional
377 Configuration for dimensions, will be used to initialize registry
378 database.
379 standalone : `bool`
380 If True, write all expanded defaults, not just customized or
381 repository-specific settings.
382 This (mostly) decouples the repository from the default
383 configuration, insulating it from changes to the defaults (which
384 may be good or bad, depending on the nature of the changes).
385 Future *additions* to the defaults will still be picked up when
386 initializing `Butlers` to repos created with ``standalone=True``.
387 searchPaths : `list` of `str`, optional
388 Directory paths to search when calculating the full butler
389 configuration.
390 forceConfigRoot : `bool`, optional
391 If `False`, any values present in the supplied ``config`` that
392 would normally be reset are not overridden and will appear
393 directly in the output config. This allows non-standard overrides
394 of the root directory for a datastore or registry to be given.
395 If this parameter is `True` the values for ``root`` will be
396 forced into the resulting config if appropriate.
397 outfile : `lss.resources.ResourcePathExpression`, optional
398 If not-`None`, the output configuration will be written to this
399 location rather than into the repository itself. Can be a URI
400 string. Can refer to a directory that will be used to write
401 ``butler.yaml``.
402 overwrite : `bool`, optional
403 Create a new configuration file even if one already exists
404 in the specified output location. Default is to raise
405 an exception.
407 Returns
408 -------
409 config : `Config`
410 The updated `Config` instance written to the repo.
412 Raises
413 ------
414 ValueError
415 Raised if a ButlerConfig or ConfigSubset is passed instead of a
416 regular Config (as these subclasses would make it impossible to
417 support ``standalone=False``).
418 FileExistsError
419 Raised if the output config file already exists.
420 os.error
421 Raised if the directory does not exist, exists but is not a
422 directory, or cannot be created.
424 Notes
425 -----
426 Note that when ``standalone=False`` (the default), the configuration
427 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
428 construct the repository should also be used to construct any Butlers
429 to avoid configuration inconsistencies.
430 """
431 if isinstance(config, (ButlerConfig, ConfigSubset)):
432 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
434 # Ensure that the root of the repository exists or can be made
435 root_uri = ResourcePath(root, forceDirectory=True)
436 root_uri.mkdir()
438 config = Config(config)
440 # If we are creating a new repo from scratch with relative roots,
441 # do not propagate an explicit root from the config file
442 if "root" in config:
443 del config["root"]
445 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
446 imported_class = doImportType(full["datastore", "cls"])
447 if not issubclass(imported_class, Datastore):
448 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
449 datastoreClass: Type[Datastore] = imported_class
450 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
452 # if key exists in given config, parse it, otherwise parse the defaults
453 # in the expanded config
454 if config.get(("registry", "db")):
455 registryConfig = RegistryConfig(config)
456 else:
457 registryConfig = RegistryConfig(full)
458 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
459 if defaultDatabaseUri is not None:
460 Config.updateParameters(
461 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
462 )
463 else:
464 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
466 if standalone:
467 config.merge(full)
468 else:
469 # Always expand the registry.managers section into the per-repo
470 # config, because after the database schema is created, it's not
471 # allowed to change anymore. Note that in the standalone=True
472 # branch, _everything_ in the config is expanded, so there's no
473 # need to special case this.
474 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
475 configURI: ResourcePathExpression
476 if outfile is not None:
477 # When writing to a separate location we must include
478 # the root of the butler repo in the config else it won't know
479 # where to look.
480 config["root"] = root_uri.geturl()
481 configURI = outfile
482 else:
483 configURI = root_uri
484 config.dumpToUri(configURI, overwrite=overwrite)
486 # Create Registry and populate tables
487 registryConfig = RegistryConfig(config.get("registry"))
488 dimensionConfig = DimensionConfig(dimensionConfig)
489 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
491 log.verbose("Wrote new Butler configuration file to %s", configURI)
493 return config
495 @classmethod
496 def _unpickle(
497 cls,
498 config: ButlerConfig,
499 collections: Optional[CollectionSearch],
500 run: Optional[str],
501 defaultDataId: Dict[str, str],
502 writeable: bool,
503 ) -> Butler:
504 """Callable used to unpickle a Butler.
506 We prefer not to use ``Butler.__init__`` directly so we can force some
507 of its many arguments to be keyword-only (note that ``__reduce__``
508 can only invoke callables with positional arguments).
510 Parameters
511 ----------
512 config : `ButlerConfig`
513 Butler configuration, already coerced into a true `ButlerConfig`
514 instance (and hence after any search paths for overrides have been
515 utilized).
516 collections : `CollectionSearch`
517 Names of the default collections to read from.
518 run : `str`, optional
519 Name of the default `~CollectionType.RUN` collection to write to.
520 defaultDataId : `dict` [ `str`, `str` ]
521 Default data ID values.
522 writeable : `bool`
523 Whether the Butler should support write operations.
525 Returns
526 -------
527 butler : `Butler`
528 A new `Butler` instance.
529 """
530 # MyPy doesn't recognize that the kwargs below are totally valid; it
531 # seems to think '**defaultDataId* is a _positional_ argument!
532 return cls(
533 config=config,
534 collections=collections,
535 run=run,
536 writeable=writeable,
537 **defaultDataId, # type: ignore
538 )
540 def __reduce__(self) -> tuple:
541 """Support pickling."""
542 return (
543 Butler._unpickle,
544 (
545 self._config,
546 self.collections,
547 self.run,
548 self.registry.defaults.dataId.byName(),
549 self.registry.isWriteable(),
550 ),
551 )
553 def __str__(self) -> str:
554 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
555 self.collections, self.run, self.datastore, self.registry
556 )
558 def isWriteable(self) -> bool:
559 """Return `True` if this `Butler` supports write operations."""
560 return self.registry.isWriteable()
562 @contextlib.contextmanager
563 def transaction(self) -> Iterator[None]:
564 """Context manager supporting `Butler` transactions.
566 Transactions can be nested.
567 """
568 with self.registry.transaction():
569 with self.datastore.transaction():
570 yield
572 def _standardizeArgs(
573 self,
574 datasetRefOrType: Union[DatasetRef, DatasetType, str],
575 dataId: Optional[DataId] = None,
576 for_put: bool = True,
577 **kwargs: Any,
578 ) -> Tuple[DatasetType, Optional[DataId]]:
579 """Standardize the arguments passed to several Butler APIs.
581 Parameters
582 ----------
583 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
584 When `DatasetRef` the `dataId` should be `None`.
585 Otherwise the `DatasetType` or name thereof.
586 dataId : `dict` or `DataCoordinate`
587 A `dict` of `Dimension` link name, value pairs that label the
588 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
589 should be provided as the second argument.
590 for_put : `bool`, optional
591 If `True` this call is invoked as part of a `Butler.put()`.
592 Otherwise it is assumed to be part of a `Butler.get()`. This
593 parameter is only relevant if there is dataset type
594 inconsistency.
595 **kwargs
596 Additional keyword arguments used to augment or construct a
597 `DataCoordinate`. See `DataCoordinate.standardize`
598 parameters.
600 Returns
601 -------
602 datasetType : `DatasetType`
603 A `DatasetType` instance extracted from ``datasetRefOrType``.
604 dataId : `dict` or `DataId`, optional
605 Argument that can be used (along with ``kwargs``) to construct a
606 `DataId`.
608 Notes
609 -----
610 Butler APIs that conceptually need a DatasetRef also allow passing a
611 `DatasetType` (or the name of one) and a `DataId` (or a dict and
612 keyword arguments that can be used to construct one) separately. This
613 method accepts those arguments and always returns a true `DatasetType`
614 and a `DataId` or `dict`.
616 Standardization of `dict` vs `DataId` is best handled by passing the
617 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
618 generally similarly flexible.
619 """
620 externalDatasetType: Optional[DatasetType] = None
621 internalDatasetType: Optional[DatasetType] = None
622 if isinstance(datasetRefOrType, DatasetRef):
623 if dataId is not None or kwargs:
624 raise ValueError("DatasetRef given, cannot use dataId as well")
625 externalDatasetType = datasetRefOrType.datasetType
626 dataId = datasetRefOrType.dataId
627 else:
628 # Don't check whether DataId is provided, because Registry APIs
629 # can usually construct a better error message when it wasn't.
630 if isinstance(datasetRefOrType, DatasetType):
631 externalDatasetType = datasetRefOrType
632 else:
633 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
635 # Check that they are self-consistent
636 if externalDatasetType is not None:
637 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
638 if externalDatasetType != internalDatasetType:
639 # We can allow differences if they are compatible, depending
640 # on whether this is a get or a put. A get requires that
641 # the python type associated with the datastore can be
642 # converted to the user type. A put requires that the user
643 # supplied python type can be converted to the internal
644 # type expected by registry.
645 relevantDatasetType = internalDatasetType
646 if for_put:
647 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
648 else:
649 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
650 relevantDatasetType = externalDatasetType
651 if not is_compatible:
652 raise ValueError(
653 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
654 f"registry definition ({internalDatasetType})"
655 )
656 # Override the internal definition.
657 internalDatasetType = relevantDatasetType
659 assert internalDatasetType is not None
660 return internalDatasetType, dataId
662 def _rewrite_data_id(
663 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
664 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
665 """Rewrite a data ID taking into account dimension records.
667 Take a Data ID and keyword args and rewrite it if necessary to
668 allow the user to specify dimension records rather than dimension
669 primary values.
671 This allows a user to include a dataId dict with keys of
672 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
673 the integer exposure ID. It also allows a string to be given
674 for a dimension value rather than the integer ID if that is more
675 convenient. For example, rather than having to specifyin the
676 detector with ``detector.full_name``, a string given for ``detector``
677 will be interpreted as the full name and converted to the integer
678 value.
680 Keyword arguments can also use strings for dimensions like detector
681 and exposure but python does not allow them to include ``.`` and
682 so the ``exposure.day_obs`` syntax can not be used in a keyword
683 argument.
685 Parameters
686 ----------
687 dataId : `dict` or `DataCoordinate`
688 A `dict` of `Dimension` link name, value pairs that will label the
689 `DatasetRef` within a Collection.
690 datasetType : `DatasetType`
691 The dataset type associated with this dataId. Required to
692 determine the relevant dimensions.
693 **kwargs
694 Additional keyword arguments used to augment or construct a
695 `DataId`. See `DataId` parameters.
697 Returns
698 -------
699 dataId : `dict` or `DataCoordinate`
700 The, possibly rewritten, dataId. If given a `DataCoordinate` and
701 no keyword arguments, the original dataId will be returned
702 unchanged.
703 **kwargs : `dict`
704 Any unused keyword arguments (would normally be empty dict).
705 """
706 # Do nothing if we have a standalone DataCoordinate.
707 if isinstance(dataId, DataCoordinate) and not kwargs:
708 return dataId, kwargs
710 # Process dimension records that are using record information
711 # rather than ids
712 newDataId: Dict[str, DataIdValue] = {}
713 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
715 # if all the dataId comes from keyword parameters we do not need
716 # to do anything here because they can't be of the form
717 # exposure.obs_id because a "." is not allowed in a keyword parameter.
718 if dataId:
719 for k, v in dataId.items():
720 # If we have a Dimension we do not need to do anything
721 # because it cannot be a compound key.
722 if isinstance(k, str) and "." in k:
723 # Someone is using a more human-readable dataId
724 dimensionName, record = k.split(".", 1)
725 byRecord[dimensionName][record] = v
726 elif isinstance(k, Dimension):
727 newDataId[k.name] = v
728 else:
729 newDataId[k] = v
731 # Go through the updated dataId and check the type in case someone is
732 # using an alternate key. We have already filtered out the compound
733 # keys dimensions.record format.
734 not_dimensions = {}
736 # Will need to look in the dataId and the keyword arguments
737 # and will remove them if they need to be fixed or are unrecognized.
738 for dataIdDict in (newDataId, kwargs):
739 # Use a list so we can adjust the dict safely in the loop
740 for dimensionName in list(dataIdDict):
741 value = dataIdDict[dimensionName]
742 try:
743 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
744 except KeyError:
745 # This is not a real dimension
746 not_dimensions[dimensionName] = value
747 del dataIdDict[dimensionName]
748 continue
750 # Convert an integral type to an explicit int to simplify
751 # comparisons here
752 if isinstance(value, numbers.Integral):
753 value = int(value)
755 if not isinstance(value, dimension.primaryKey.getPythonType()):
756 for alternate in dimension.alternateKeys:
757 if isinstance(value, alternate.getPythonType()):
758 byRecord[dimensionName][alternate.name] = value
759 del dataIdDict[dimensionName]
760 log.debug(
761 "Converting dimension %s to %s.%s=%s",
762 dimensionName,
763 dimensionName,
764 alternate.name,
765 value,
766 )
767 break
768 else:
769 log.warning(
770 "Type mismatch found for value '%r' provided for dimension %s. "
771 "Could not find matching alternative (primary key has type %s) "
772 "so attempting to use as-is.",
773 value,
774 dimensionName,
775 dimension.primaryKey.getPythonType(),
776 )
778 # By this point kwargs and newDataId should only include valid
779 # dimensions. Merge kwargs in to the new dataId and log if there
780 # are dimensions in both (rather than calling update).
781 for k, v in kwargs.items():
782 if k in newDataId and newDataId[k] != v:
783 log.debug(
784 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
785 )
786 newDataId[k] = v
787 # No need to retain any values in kwargs now.
788 kwargs = {}
790 # If we have some unrecognized dimensions we have to try to connect
791 # them to records in other dimensions. This is made more complicated
792 # by some dimensions having records with clashing names. A mitigation
793 # is that we can tell by this point which dimensions are missing
794 # for the DatasetType but this does not work for calibrations
795 # where additional dimensions can be used to constrain the temporal
796 # axis.
797 if not_dimensions:
798 # Search for all dimensions even if we have been given a value
799 # explicitly. In some cases records are given as well as the
800 # actually dimension and this should not be an error if they
801 # match.
802 mandatoryDimensions = datasetType.dimensions.names # - provided
804 candidateDimensions: Set[str] = set()
805 candidateDimensions.update(mandatoryDimensions)
807 # For calibrations we may well be needing temporal dimensions
808 # so rather than always including all dimensions in the scan
809 # restrict things a little. It is still possible for there
810 # to be confusion over day_obs in visit vs exposure for example.
811 # If we are not searching calibration collections things may
812 # fail but they are going to fail anyway because of the
813 # ambiguousness of the dataId...
814 if datasetType.isCalibration():
815 for dim in self.registry.dimensions.getStaticDimensions():
816 if dim.temporal:
817 candidateDimensions.add(str(dim))
819 # Look up table for the first association with a dimension
820 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
822 # Keep track of whether an item is associated with multiple
823 # dimensions.
824 counter: Counter[str] = Counter()
825 assigned: Dict[str, Set[str]] = defaultdict(set)
827 # Go through the missing dimensions and associate the
828 # given names with records within those dimensions
829 matched_dims = set()
830 for dimensionName in candidateDimensions:
831 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
832 fields = dimension.metadata.names | dimension.uniqueKeys.names
833 for field in not_dimensions:
834 if field in fields:
835 guessedAssociation[dimensionName][field] = not_dimensions[field]
836 counter[dimensionName] += 1
837 assigned[field].add(dimensionName)
838 matched_dims.add(field)
840 # Calculate the fields that matched nothing.
841 never_found = set(not_dimensions) - matched_dims
843 if never_found:
844 raise ValueError(f"Unrecognized keyword args given: {never_found}")
846 # There is a chance we have allocated a single dataId item
847 # to multiple dimensions. Need to decide which should be retained.
848 # For now assume that the most popular alternative wins.
849 # This means that day_obs with seq_num will result in
850 # exposure.day_obs and not visit.day_obs
851 # Also prefer an explicitly missing dimension over an inferred
852 # temporal dimension.
853 for fieldName, assignedDimensions in assigned.items():
854 if len(assignedDimensions) > 1:
855 # Pick the most popular (preferring mandatory dimensions)
856 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
857 if requiredButMissing:
858 candidateDimensions = requiredButMissing
859 else:
860 candidateDimensions = assignedDimensions
862 # If this is a choice between visit and exposure and
863 # neither was a required part of the dataset type,
864 # (hence in this branch) always prefer exposure over
865 # visit since exposures are always defined and visits
866 # are defined from exposures.
867 if candidateDimensions == {"exposure", "visit"}:
868 candidateDimensions = {"exposure"}
870 # Select the relevant items and get a new restricted
871 # counter.
872 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
873 duplicatesCounter: Counter[str] = Counter()
874 duplicatesCounter.update(theseCounts)
876 # Choose the most common. If they are equally common
877 # we will pick the one that was found first.
878 # Returns a list of tuples
879 selected = duplicatesCounter.most_common(1)[0][0]
881 log.debug(
882 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
883 " Removed ambiguity by choosing dimension %s.",
884 fieldName,
885 ", ".join(assignedDimensions),
886 selected,
887 )
889 for candidateDimension in assignedDimensions:
890 if candidateDimension != selected:
891 del guessedAssociation[candidateDimension][fieldName]
893 # Update the record look up dict with the new associations
894 for dimensionName, values in guessedAssociation.items():
895 if values: # A dict might now be empty
896 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
897 byRecord[dimensionName].update(values)
899 if byRecord:
900 # Some record specifiers were found so we need to convert
901 # them to the Id form
902 for dimensionName, values in byRecord.items():
903 if dimensionName in newDataId:
904 log.debug(
905 "DataId specified explicit %s dimension value of %s in addition to"
906 " general record specifiers for it of %s. Ignoring record information.",
907 dimensionName,
908 newDataId[dimensionName],
909 str(values),
910 )
911 # Get the actual record and compare with these values.
912 try:
913 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
914 except DataIdError:
915 raise ValueError(
916 f"Could not find dimension '{dimensionName}'"
917 f" with dataId {newDataId} as part of comparing with"
918 f" record values {byRecord[dimensionName]}"
919 ) from None
920 if len(recs) == 1:
921 errmsg: List[str] = []
922 for k, v in values.items():
923 if (recval := getattr(recs[0], k)) != v:
924 errmsg.append(f"{k}({recval} != {v})")
925 if errmsg:
926 raise ValueError(
927 f"Dimension {dimensionName} in dataId has explicit value"
928 " inconsistent with records: " + ", ".join(errmsg)
929 )
930 else:
931 # Multiple matches for an explicit dimension
932 # should never happen but let downstream complain.
933 pass
934 continue
936 # Build up a WHERE expression
937 bind = {k: v for k, v in values.items()}
938 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
940 # Hopefully we get a single record that matches
941 records = set(
942 self.registry.queryDimensionRecords(
943 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
944 )
945 )
947 if len(records) != 1:
948 if len(records) > 1:
949 # visit can have an ambiguous answer without involving
950 # visit_system. The default visit_system is defined
951 # by the instrument.
952 if (
953 dimensionName == "visit"
954 and "visit_system_membership" in self.registry.dimensions
955 and "visit_system"
956 in self.registry.dimensions["instrument"].metadata # type: ignore
957 ):
958 instrument_records = list(
959 self.registry.queryDimensionRecords(
960 "instrument",
961 dataId=newDataId,
962 **kwargs,
963 )
964 )
965 if len(instrument_records) == 1:
966 visit_system = instrument_records[0].visit_system
967 if visit_system is None:
968 # Set to a value that will never match.
969 visit_system = -1
971 # Look up each visit in the
972 # visit_system_membership records.
973 for rec in records:
974 membership = list(
975 self.registry.queryDimensionRecords(
976 # Use bind to allow zero results.
977 # This is a fully-specified query.
978 "visit_system_membership",
979 where="instrument = inst AND visit_system = system AND visit = v",
980 bind=dict(
981 inst=instrument_records[0].name, system=visit_system, v=rec.id
982 ),
983 )
984 )
985 if membership:
986 # This record is the right answer.
987 records = set([rec])
988 break
990 # The ambiguity may have been resolved so check again.
991 if len(records) > 1:
992 log.debug("Received %d records from constraints of %s", len(records), str(values))
993 for r in records:
994 log.debug("- %s", str(r))
995 raise ValueError(
996 f"DataId specification for dimension {dimensionName} is not"
997 f" uniquely constrained to a single dataset by {values}."
998 f" Got {len(records)} results."
999 )
1000 else:
1001 raise ValueError(
1002 f"DataId specification for dimension {dimensionName} matched no"
1003 f" records when constrained by {values}"
1004 )
1006 # Get the primary key from the real dimension object
1007 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
1008 if not isinstance(dimension, Dimension):
1009 raise RuntimeError(
1010 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
1011 )
1012 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
1014 return newDataId, kwargs
1016 def _findDatasetRef(
1017 self,
1018 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1019 dataId: Optional[DataId] = None,
1020 *,
1021 collections: Any = None,
1022 allowUnresolved: bool = False,
1023 **kwargs: Any,
1024 ) -> DatasetRef:
1025 """Shared logic for methods that start with a search for a dataset in
1026 the registry.
1028 Parameters
1029 ----------
1030 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1031 When `DatasetRef` the `dataId` should be `None`.
1032 Otherwise the `DatasetType` or name thereof.
1033 dataId : `dict` or `DataCoordinate`, optional
1034 A `dict` of `Dimension` link name, value pairs that label the
1035 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1036 should be provided as the first argument.
1037 collections : Any, optional
1038 Collections to be searched, overriding ``self.collections``.
1039 Can be any of the types supported by the ``collections`` argument
1040 to butler construction.
1041 allowUnresolved : `bool`, optional
1042 If `True`, return an unresolved `DatasetRef` if finding a resolved
1043 one in the `Registry` fails. Defaults to `False`.
1044 **kwargs
1045 Additional keyword arguments used to augment or construct a
1046 `DataId`. See `DataId` parameters.
1048 Returns
1049 -------
1050 ref : `DatasetRef`
1051 A reference to the dataset identified by the given arguments.
1053 Raises
1054 ------
1055 LookupError
1056 Raised if no matching dataset exists in the `Registry` (and
1057 ``allowUnresolved is False``).
1058 ValueError
1059 Raised if a resolved `DatasetRef` was passed as an input, but it
1060 differs from the one found in the registry.
1061 TypeError
1062 Raised if no collections were provided.
1063 """
1064 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1065 if isinstance(datasetRefOrType, DatasetRef):
1066 idNumber = datasetRefOrType.id
1067 else:
1068 idNumber = None
1069 timespan: Optional[Timespan] = None
1071 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1073 if datasetType.isCalibration():
1074 # Because this is a calibration dataset, first try to make a
1075 # standardize the data ID without restricting the dimensions to
1076 # those of the dataset type requested, because there may be extra
1077 # dimensions that provide temporal information for a validity-range
1078 # lookup.
1079 dataId = DataCoordinate.standardize(
1080 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1081 )
1082 if dataId.graph.temporal:
1083 dataId = self.registry.expandDataId(dataId)
1084 timespan = dataId.timespan
1085 else:
1086 # Standardize the data ID to just the dimensions of the dataset
1087 # type instead of letting registry.findDataset do it, so we get the
1088 # result even if no dataset is found.
1089 dataId = DataCoordinate.standardize(
1090 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1091 )
1092 # Always lookup the DatasetRef, even if one is given, to ensure it is
1093 # present in the current collection.
1094 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1095 if ref is None:
1096 if allowUnresolved:
1097 return DatasetRef(datasetType, dataId)
1098 else:
1099 if collections is None:
1100 collections = self.registry.defaults.collections
1101 raise LookupError(
1102 f"Dataset {datasetType.name} with data ID {dataId} "
1103 f"could not be found in collections {collections}."
1104 )
1105 if idNumber is not None and idNumber != ref.id:
1106 if collections is None:
1107 collections = self.registry.defaults.collections
1108 raise ValueError(
1109 f"DatasetRef.id provided ({idNumber}) does not match "
1110 f"id ({ref.id}) in registry in collections {collections}."
1111 )
1112 if datasetType != ref.datasetType:
1113 # If they differ it is because the user explicitly specified
1114 # a compatible dataset type to this call rather than using the
1115 # registry definition. The DatasetRef must therefore be recreated
1116 # using the user definition such that the expected type is
1117 # returned.
1118 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1120 return ref
1122 @transactional
1123 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
1124 # Docstring inherited.
1125 (imported_ref,) = self.registry._importDatasets(
1126 [ref],
1127 expand=True,
1128 )
1129 if imported_ref.id != ref.getCheckedId():
1130 raise RuntimeError("This registry configuration does not support putDirect.")
1131 self.datastore.put(obj, ref)
1132 return ref
1134 @transactional
1135 def put(
1136 self,
1137 obj: Any,
1138 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1139 dataId: Optional[DataId] = None,
1140 *,
1141 run: Optional[str] = None,
1142 **kwargs: Any,
1143 ) -> DatasetRef:
1144 """Store and register a dataset.
1146 Parameters
1147 ----------
1148 obj : `object`
1149 The dataset.
1150 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1151 When `DatasetRef` is provided, ``dataId`` should be `None`.
1152 Otherwise the `DatasetType` or name thereof.
1153 dataId : `dict` or `DataCoordinate`
1154 A `dict` of `Dimension` link name, value pairs that label the
1155 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1156 should be provided as the second argument.
1157 run : `str`, optional
1158 The name of the run the dataset should be added to, overriding
1159 ``self.run``.
1160 **kwargs
1161 Additional keyword arguments used to augment or construct a
1162 `DataCoordinate`. See `DataCoordinate.standardize`
1163 parameters.
1165 Returns
1166 -------
1167 ref : `DatasetRef`
1168 A reference to the stored dataset, updated with the correct id if
1169 given.
1171 Raises
1172 ------
1173 TypeError
1174 Raised if the butler is read-only or if no run has been provided.
1175 """
1176 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1177 if not self.isWriteable():
1178 raise TypeError("Butler is read-only.")
1179 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1180 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1181 raise ValueError("DatasetRef must not be in registry, must have None id")
1183 # Handle dimension records in dataId
1184 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1186 # Add Registry Dataset entry.
1187 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1189 # For an execution butler the datasets will be pre-defined.
1190 # If the butler is configured that way datasets should only be inserted
1191 # if they do not already exist in registry. Trying and catching
1192 # ConflictingDefinitionError will not work because the transaction
1193 # will be corrupted. Instead, in this mode always check first.
1194 ref = None
1195 ref_is_predefined = False
1196 if self._allow_put_of_predefined_dataset:
1197 # Get the matching ref for this run.
1198 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1200 if ref:
1201 # Must be expanded form for datastore templating
1202 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1203 ref = ref.expanded(dataId)
1204 ref_is_predefined = True
1206 if not ref:
1207 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1209 # If the ref is predefined it is possible that the datastore also
1210 # has the record. Asking datastore to put it again will result in
1211 # the artifact being recreated, overwriting previous, then will cause
1212 # a failure in writing the record which will cause the artifact
1213 # to be removed. Much safer to ask first before attempting to
1214 # overwrite. Race conditions should not be an issue for the
1215 # execution butler environment.
1216 if ref_is_predefined:
1217 if self.datastore.knows(ref):
1218 raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.")
1220 self.datastore.put(obj, ref)
1222 return ref
1224 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1225 """Retrieve a stored dataset.
1227 Unlike `Butler.get`, this method allows datasets outside the Butler's
1228 collection to be read as long as the `DatasetRef` that identifies them
1229 can be obtained separately.
1231 Parameters
1232 ----------
1233 ref : `DatasetRef`
1234 Resolved reference to an already stored dataset.
1235 parameters : `dict`
1236 Additional StorageClass-defined options to control reading,
1237 typically used to efficiently read only a subset of the dataset.
1239 Returns
1240 -------
1241 obj : `object`
1242 The dataset.
1243 """
1244 return self.datastore.get(ref, parameters=parameters)
1246 def getDirectDeferred(
1247 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
1248 ) -> DeferredDatasetHandle:
1249 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1250 from a resolved `DatasetRef`.
1252 Parameters
1253 ----------
1254 ref : `DatasetRef`
1255 Resolved reference to an already stored dataset.
1256 parameters : `dict`
1257 Additional StorageClass-defined options to control reading,
1258 typically used to efficiently read only a subset of the dataset.
1260 Returns
1261 -------
1262 obj : `DeferredDatasetHandle`
1263 A handle which can be used to retrieve a dataset at a later time.
1265 Raises
1266 ------
1267 AmbiguousDatasetError
1268 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1269 """
1270 if ref.id is None:
1271 raise AmbiguousDatasetError(
1272 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1273 )
1274 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1276 def getDeferred(
1277 self,
1278 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1279 dataId: Optional[DataId] = None,
1280 *,
1281 parameters: Union[dict, None] = None,
1282 collections: Any = None,
1283 **kwargs: Any,
1284 ) -> DeferredDatasetHandle:
1285 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1286 after an immediate registry lookup.
1288 Parameters
1289 ----------
1290 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1291 When `DatasetRef` the `dataId` should be `None`.
1292 Otherwise the `DatasetType` or name thereof.
1293 dataId : `dict` or `DataCoordinate`, optional
1294 A `dict` of `Dimension` link name, value pairs that label the
1295 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1296 should be provided as the first argument.
1297 parameters : `dict`
1298 Additional StorageClass-defined options to control reading,
1299 typically used to efficiently read only a subset of the dataset.
1300 collections : Any, optional
1301 Collections to be searched, overriding ``self.collections``.
1302 Can be any of the types supported by the ``collections`` argument
1303 to butler construction.
1304 **kwargs
1305 Additional keyword arguments used to augment or construct a
1306 `DataId`. See `DataId` parameters.
1308 Returns
1309 -------
1310 obj : `DeferredDatasetHandle`
1311 A handle which can be used to retrieve a dataset at a later time.
1313 Raises
1314 ------
1315 LookupError
1316 Raised if no matching dataset exists in the `Registry` (and
1317 ``allowUnresolved is False``).
1318 ValueError
1319 Raised if a resolved `DatasetRef` was passed as an input, but it
1320 differs from the one found in the registry.
1321 TypeError
1322 Raised if no collections were provided.
1323 """
1324 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1325 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1327 def get(
1328 self,
1329 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1330 dataId: Optional[DataId] = None,
1331 *,
1332 parameters: Optional[Dict[str, Any]] = None,
1333 collections: Any = None,
1334 **kwargs: Any,
1335 ) -> Any:
1336 """Retrieve a stored dataset.
1338 Parameters
1339 ----------
1340 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1341 When `DatasetRef` the `dataId` should be `None`.
1342 Otherwise the `DatasetType` or name thereof.
1343 dataId : `dict` or `DataCoordinate`
1344 A `dict` of `Dimension` link name, value pairs that label the
1345 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1346 should be provided as the first argument.
1347 parameters : `dict`
1348 Additional StorageClass-defined options to control reading,
1349 typically used to efficiently read only a subset of the dataset.
1350 collections : Any, optional
1351 Collections to be searched, overriding ``self.collections``.
1352 Can be any of the types supported by the ``collections`` argument
1353 to butler construction.
1354 **kwargs
1355 Additional keyword arguments used to augment or construct a
1356 `DataCoordinate`. See `DataCoordinate.standardize`
1357 parameters.
1359 Returns
1360 -------
1361 obj : `object`
1362 The dataset.
1364 Raises
1365 ------
1366 ValueError
1367 Raised if a resolved `DatasetRef` was passed as an input, but it
1368 differs from the one found in the registry.
1369 LookupError
1370 Raised if no matching dataset exists in the `Registry`.
1371 TypeError
1372 Raised if no collections were provided.
1374 Notes
1375 -----
1376 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1377 this method requires that the given data ID include temporal dimensions
1378 beyond the dimensions of the dataset type itself, in order to find the
1379 dataset with the appropriate validity range. For example, a "bias"
1380 dataset with native dimensions ``{instrument, detector}`` could be
1381 fetched with a ``{instrument, detector, exposure}`` data ID, because
1382 ``exposure`` is a temporal dimension.
1383 """
1384 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1385 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1386 return self.getDirect(ref, parameters=parameters)
1388 def getURIs(
1389 self,
1390 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1391 dataId: Optional[DataId] = None,
1392 *,
1393 predict: bool = False,
1394 collections: Any = None,
1395 run: Optional[str] = None,
1396 **kwargs: Any,
1397 ) -> DatasetRefURIs:
1398 """Returns the URIs associated with the dataset.
1400 Parameters
1401 ----------
1402 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1403 When `DatasetRef` the `dataId` should be `None`.
1404 Otherwise the `DatasetType` or name thereof.
1405 dataId : `dict` or `DataCoordinate`
1406 A `dict` of `Dimension` link name, value pairs that label the
1407 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1408 should be provided as the first argument.
1409 predict : `bool`
1410 If `True`, allow URIs to be returned of datasets that have not
1411 been written.
1412 collections : Any, optional
1413 Collections to be searched, overriding ``self.collections``.
1414 Can be any of the types supported by the ``collections`` argument
1415 to butler construction.
1416 run : `str`, optional
1417 Run to use for predictions, overriding ``self.run``.
1418 **kwargs
1419 Additional keyword arguments used to augment or construct a
1420 `DataCoordinate`. See `DataCoordinate.standardize`
1421 parameters.
1423 Returns
1424 -------
1425 uris : `DatasetRefURIs`
1426 The URI to the primary artifact associated with this dataset (if
1427 the dataset was disassembled within the datastore this may be
1428 `None`), and the URIs to any components associated with the dataset
1429 artifact. (can be empty if there are no components).
1430 """
1431 ref = self._findDatasetRef(
1432 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1433 )
1434 if ref.id is None: # only possible if predict is True
1435 if run is None:
1436 run = self.run
1437 if run is None:
1438 raise TypeError("Cannot predict location with run=None.")
1439 # Lie about ID, because we can't guess it, and only
1440 # Datastore.getURIs() will ever see it (and it doesn't use it).
1441 ref = ref.resolved(id=0, run=run)
1442 return self.datastore.getURIs(ref, predict)
1444 def getURI(
1445 self,
1446 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1447 dataId: Optional[DataId] = None,
1448 *,
1449 predict: bool = False,
1450 collections: Any = None,
1451 run: Optional[str] = None,
1452 **kwargs: Any,
1453 ) -> ResourcePath:
1454 """Return the URI to the Dataset.
1456 Parameters
1457 ----------
1458 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1459 When `DatasetRef` the `dataId` should be `None`.
1460 Otherwise the `DatasetType` or name thereof.
1461 dataId : `dict` or `DataCoordinate`
1462 A `dict` of `Dimension` link name, value pairs that label the
1463 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1464 should be provided as the first argument.
1465 predict : `bool`
1466 If `True`, allow URIs to be returned of datasets that have not
1467 been written.
1468 collections : Any, optional
1469 Collections to be searched, overriding ``self.collections``.
1470 Can be any of the types supported by the ``collections`` argument
1471 to butler construction.
1472 run : `str`, optional
1473 Run to use for predictions, overriding ``self.run``.
1474 **kwargs
1475 Additional keyword arguments used to augment or construct a
1476 `DataCoordinate`. See `DataCoordinate.standardize`
1477 parameters.
1479 Returns
1480 -------
1481 uri : `lsst.resources.ResourcePath`
1482 URI pointing to the Dataset within the datastore. If the
1483 Dataset does not exist in the datastore, and if ``predict`` is
1484 `True`, the URI will be a prediction and will include a URI
1485 fragment "#predicted".
1486 If the datastore does not have entities that relate well
1487 to the concept of a URI the returned URI string will be
1488 descriptive. The returned URI is not guaranteed to be obtainable.
1490 Raises
1491 ------
1492 LookupError
1493 A URI has been requested for a dataset that does not exist and
1494 guessing is not allowed.
1495 ValueError
1496 Raised if a resolved `DatasetRef` was passed as an input, but it
1497 differs from the one found in the registry.
1498 TypeError
1499 Raised if no collections were provided.
1500 RuntimeError
1501 Raised if a URI is requested for a dataset that consists of
1502 multiple artifacts.
1503 """
1504 primary, components = self.getURIs(
1505 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1506 )
1508 if primary is None or components:
1509 raise RuntimeError(
1510 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1511 "Use Butler.getURIs() instead."
1512 )
1513 return primary
1515 def retrieveArtifacts(
1516 self,
1517 refs: Iterable[DatasetRef],
1518 destination: ResourcePathExpression,
1519 transfer: str = "auto",
1520 preserve_path: bool = True,
1521 overwrite: bool = False,
1522 ) -> List[ResourcePath]:
1523 """Retrieve the artifacts associated with the supplied refs.
1525 Parameters
1526 ----------
1527 refs : iterable of `DatasetRef`
1528 The datasets for which artifacts are to be retrieved.
1529 A single ref can result in multiple artifacts. The refs must
1530 be resolved.
1531 destination : `lsst.resources.ResourcePath` or `str`
1532 Location to write the artifacts.
1533 transfer : `str`, optional
1534 Method to use to transfer the artifacts. Must be one of the options
1535 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1536 "move" is not allowed.
1537 preserve_path : `bool`, optional
1538 If `True` the full path of the artifact within the datastore
1539 is preserved. If `False` the final file component of the path
1540 is used.
1541 overwrite : `bool`, optional
1542 If `True` allow transfers to overwrite existing files at the
1543 destination.
1545 Returns
1546 -------
1547 targets : `list` of `lsst.resources.ResourcePath`
1548 URIs of file artifacts in destination location. Order is not
1549 preserved.
1551 Notes
1552 -----
1553 For non-file datastores the artifacts written to the destination
1554 may not match the representation inside the datastore. For example
1555 a hierarchical data structure in a NoSQL database may well be stored
1556 as a JSON file.
1557 """
1558 return self.datastore.retrieveArtifacts(
1559 refs,
1560 ResourcePath(destination),
1561 transfer=transfer,
1562 preserve_path=preserve_path,
1563 overwrite=overwrite,
1564 )
1566 def datasetExists(
1567 self,
1568 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1569 dataId: Optional[DataId] = None,
1570 *,
1571 collections: Any = None,
1572 **kwargs: Any,
1573 ) -> bool:
1574 """Return True if the Dataset is actually present in the Datastore.
1576 Parameters
1577 ----------
1578 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1579 When `DatasetRef` the `dataId` should be `None`.
1580 Otherwise the `DatasetType` or name thereof.
1581 dataId : `dict` or `DataCoordinate`
1582 A `dict` of `Dimension` link name, value pairs that label the
1583 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1584 should be provided as the first argument.
1585 collections : Any, optional
1586 Collections to be searched, overriding ``self.collections``.
1587 Can be any of the types supported by the ``collections`` argument
1588 to butler construction.
1589 **kwargs
1590 Additional keyword arguments used to augment or construct a
1591 `DataCoordinate`. See `DataCoordinate.standardize`
1592 parameters.
1594 Raises
1595 ------
1596 LookupError
1597 Raised if the dataset is not even present in the Registry.
1598 ValueError
1599 Raised if a resolved `DatasetRef` was passed as an input, but it
1600 differs from the one found in the registry.
1601 TypeError
1602 Raised if no collections were provided.
1603 """
1604 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1605 return self.datastore.exists(ref)
1607 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1608 """Remove one or more `~CollectionType.RUN` collections and the
1609 datasets within them.
1611 Parameters
1612 ----------
1613 names : `Iterable` [ `str` ]
1614 The names of the collections to remove.
1615 unstore : `bool`, optional
1616 If `True` (default), delete datasets from all datastores in which
1617 they are present, and attempt to rollback the registry deletions if
1618 datastore deletions fail (which may not always be possible). If
1619 `False`, datastore records for these datasets are still removed,
1620 but any artifacts (e.g. files) will not be.
1622 Raises
1623 ------
1624 TypeError
1625 Raised if one or more collections are not of type
1626 `~CollectionType.RUN`.
1627 """
1628 if not self.isWriteable():
1629 raise TypeError("Butler is read-only.")
1630 names = list(names)
1631 refs: List[DatasetRef] = []
1632 for name in names:
1633 collectionType = self.registry.getCollectionType(name)
1634 if collectionType is not CollectionType.RUN:
1635 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1636 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1637 with self.registry.transaction():
1638 if unstore:
1639 self.datastore.trash(refs)
1640 else:
1641 self.datastore.forget(refs)
1642 for name in names:
1643 self.registry.removeCollection(name)
1644 if unstore:
1645 # Point of no return for removing artifacts
1646 self.datastore.emptyTrash()
1648 def pruneCollection(
1649 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1650 ) -> None:
1651 """Remove a collection and possibly prune datasets within it.
1653 Parameters
1654 ----------
1655 name : `str`
1656 Name of the collection to remove. If this is a
1657 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1658 datasets within the collection are not modified unless ``unstore``
1659 is `True`. If this is a `~CollectionType.RUN` collection,
1660 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1661 are fully removed from the data repository.
1662 purge : `bool`, optional
1663 If `True`, permit `~CollectionType.RUN` collections to be removed,
1664 fully removing datasets within them. Requires ``unstore=True`` as
1665 well as an added precaution against accidental deletion. Must be
1666 `False` (default) if the collection is not a ``RUN``.
1667 unstore: `bool`, optional
1668 If `True`, remove all datasets in the collection from all
1669 datastores in which they appear.
1670 unlink: `list` [`str`], optional
1671 Before removing the given `collection` unlink it from from these
1672 parent collections.
1674 Raises
1675 ------
1676 TypeError
1677 Raised if the butler is read-only or arguments are mutually
1678 inconsistent.
1679 """
1680 # See pruneDatasets comments for more information about the logic here;
1681 # the cases are almost the same, but here we can rely on Registry to
1682 # take care everything but Datastore deletion when we remove the
1683 # collection.
1684 if not self.isWriteable():
1685 raise TypeError("Butler is read-only.")
1686 collectionType = self.registry.getCollectionType(name)
1687 if purge and not unstore:
1688 raise PurgeWithoutUnstorePruneCollectionsError()
1689 if collectionType is CollectionType.RUN and not purge:
1690 raise RunWithoutPurgePruneCollectionsError(collectionType)
1691 if collectionType is not CollectionType.RUN and purge:
1692 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1694 def remove(child: str, parent: str) -> None:
1695 """Remove a child collection from a parent collection."""
1696 # Remove child from parent.
1697 chain = list(self.registry.getCollectionChain(parent))
1698 try:
1699 chain.remove(name)
1700 except ValueError as e:
1701 raise RuntimeError(f"{name} is not a child of {parent}") from e
1702 self.registry.setCollectionChain(parent, chain)
1704 with self.registry.transaction():
1705 if unlink:
1706 for parent in unlink:
1707 remove(name, parent)
1708 if unstore:
1709 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1710 self.datastore.trash(refs)
1711 self.registry.removeCollection(name)
1713 if unstore:
1714 # Point of no return for removing artifacts
1715 self.datastore.emptyTrash()
1717 def pruneDatasets(
1718 self,
1719 refs: Iterable[DatasetRef],
1720 *,
1721 disassociate: bool = True,
1722 unstore: bool = False,
1723 tags: Iterable[str] = (),
1724 purge: bool = False,
1725 ) -> None:
1726 # docstring inherited from LimitedButler
1728 if not self.isWriteable():
1729 raise TypeError("Butler is read-only.")
1730 if purge:
1731 if not disassociate:
1732 raise TypeError("Cannot pass purge=True without disassociate=True.")
1733 if not unstore:
1734 raise TypeError("Cannot pass purge=True without unstore=True.")
1735 elif disassociate:
1736 tags = tuple(tags)
1737 if not tags:
1738 raise TypeError("No tags provided but disassociate=True.")
1739 for tag in tags:
1740 collectionType = self.registry.getCollectionType(tag)
1741 if collectionType is not CollectionType.TAGGED:
1742 raise TypeError(
1743 f"Cannot disassociate from collection '{tag}' "
1744 f"of non-TAGGED type {collectionType.name}."
1745 )
1746 # For an execution butler we want to keep existing UUIDs for the
1747 # datasets, for that we need to keep them in the collections but
1748 # remove from datastore.
1749 if self._allow_put_of_predefined_dataset and purge:
1750 purge = False
1751 disassociate = False
1752 # Transform possibly-single-pass iterable into something we can iterate
1753 # over multiple times.
1754 refs = list(refs)
1755 # Pruning a component of a DatasetRef makes no sense since registry
1756 # doesn't know about components and datastore might not store
1757 # components in a separate file
1758 for ref in refs:
1759 if ref.datasetType.component():
1760 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1761 # We don't need an unreliable Datastore transaction for this, because
1762 # we've been extra careful to ensure that Datastore.trash only involves
1763 # mutating the Registry (it can _look_ at Datastore-specific things,
1764 # but shouldn't change them), and hence all operations here are
1765 # Registry operations.
1766 with self.registry.transaction():
1767 if unstore:
1768 self.datastore.trash(refs)
1769 if purge:
1770 self.registry.removeDatasets(refs)
1771 elif disassociate:
1772 assert tags, "Guaranteed by earlier logic in this function."
1773 for tag in tags:
1774 self.registry.disassociate(tag, refs)
1775 # We've exited the Registry transaction, and apparently committed.
1776 # (if there was an exception, everything rolled back, and it's as if
1777 # nothing happened - and we never get here).
1778 # Datastore artifacts are not yet gone, but they're clearly marked
1779 # as trash, so if we fail to delete now because of (e.g.) filesystem
1780 # problems we can try again later, and if manual administrative
1781 # intervention is required, it's pretty clear what that should entail:
1782 # deleting everything on disk and in private Datastore tables that is
1783 # in the dataset_location_trash table.
1784 if unstore:
1785 # Point of no return for removing artifacts
1786 self.datastore.emptyTrash()
1788 @transactional
1789 def ingest(
1790 self,
1791 *datasets: FileDataset,
1792 transfer: Optional[str] = "auto",
1793 run: Optional[str] = None,
1794 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1795 record_validation_info: bool = True,
1796 ) -> None:
1797 """Store and register one or more datasets that already exist on disk.
1799 Parameters
1800 ----------
1801 datasets : `FileDataset`
1802 Each positional argument is a struct containing information about
1803 a file to be ingested, including its URI (either absolute or
1804 relative to the datastore root, if applicable), a `DatasetRef`,
1805 and optionally a formatter class or its fully-qualified string
1806 name. If a formatter is not provided, the formatter that would be
1807 used for `put` is assumed. On successful return, all
1808 `FileDataset.ref` attributes will have their `DatasetRef.id`
1809 attribute populated and all `FileDataset.formatter` attributes will
1810 be set to the formatter class used. `FileDataset.path` attributes
1811 may be modified to put paths in whatever the datastore considers a
1812 standardized form.
1813 transfer : `str`, optional
1814 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1815 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1816 transfer the file.
1817 run : `str`, optional
1818 The name of the run ingested datasets should be added to,
1819 overriding ``self.run``.
1820 idGenerationMode : `DatasetIdGenEnum`, optional
1821 Specifies option for generating dataset IDs. By default unique IDs
1822 are generated for each inserted dataset.
1823 record_validation_info : `bool`, optional
1824 If `True`, the default, the datastore can record validation
1825 information associated with the file. If `False` the datastore
1826 will not attempt to track any information such as checksums
1827 or file sizes. This can be useful if such information is tracked
1828 in an external system or if the file is to be compressed in place.
1829 It is up to the datastore whether this parameter is relevant.
1831 Raises
1832 ------
1833 TypeError
1834 Raised if the butler is read-only or if no run was provided.
1835 NotImplementedError
1836 Raised if the `Datastore` does not support the given transfer mode.
1837 DatasetTypeNotSupportedError
1838 Raised if one or more files to be ingested have a dataset type that
1839 is not supported by the `Datastore`..
1840 FileNotFoundError
1841 Raised if one of the given files does not exist.
1842 FileExistsError
1843 Raised if transfer is not `None` but the (internal) location the
1844 file would be moved to is already occupied.
1846 Notes
1847 -----
1848 This operation is not fully exception safe: if a database operation
1849 fails, the given `FileDataset` instances may be only partially updated.
1851 It is atomic in terms of database operations (they will either all
1852 succeed or all fail) providing the database engine implements
1853 transactions correctly. It will attempt to be atomic in terms of
1854 filesystem operations as well, but this cannot be implemented
1855 rigorously for most datastores.
1856 """
1857 if not self.isWriteable():
1858 raise TypeError("Butler is read-only.")
1859 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1860 # Reorganize the inputs so they're grouped by DatasetType and then
1861 # data ID. We also include a list of DatasetRefs for each FileDataset
1862 # to hold the resolved DatasetRefs returned by the Registry, before
1863 # it's safe to swap them into FileDataset.refs.
1864 # Some type annotation aliases to make that clearer:
1865 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1866 GroupedData = MutableMapping[DatasetType, GroupForType]
1867 # The actual data structure:
1868 groupedData: GroupedData = defaultdict(dict)
1869 # And the nested loop that populates it:
1870 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1871 # This list intentionally shared across the inner loop, since it's
1872 # associated with `dataset`.
1873 resolvedRefs: List[DatasetRef] = []
1875 # Somewhere to store pre-existing refs if we have an
1876 # execution butler.
1877 existingRefs: List[DatasetRef] = []
1879 for ref in dataset.refs:
1880 if ref.dataId in groupedData[ref.datasetType]:
1881 raise ConflictingDefinitionError(
1882 f"Ingest conflict. Dataset {dataset.path} has same"
1883 " DataId as other ingest dataset"
1884 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1885 f" ({ref.dataId})"
1886 )
1887 if self._allow_put_of_predefined_dataset:
1888 existing_ref = self.registry.findDataset(
1889 ref.datasetType, dataId=ref.dataId, collections=run
1890 )
1891 if existing_ref:
1892 if self.datastore.knows(existing_ref):
1893 raise ConflictingDefinitionError(
1894 f"Dataset associated with path {dataset.path}"
1895 f" already exists as {existing_ref}."
1896 )
1897 # Store this ref elsewhere since it already exists
1898 # and we do not want to remake it but we do want
1899 # to store it in the datastore.
1900 existingRefs.append(existing_ref)
1902 # Nothing else to do until we have finished
1903 # iterating.
1904 continue
1906 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1908 if existingRefs:
1909 if len(dataset.refs) != len(existingRefs):
1910 # Keeping track of partially pre-existing datasets is hard
1911 # and should generally never happen. For now don't allow
1912 # it.
1913 raise ConflictingDefinitionError(
1914 f"For dataset {dataset.path} some dataIds already exist"
1915 " in registry but others do not. This is not supported."
1916 )
1918 # Attach the resolved refs if we found them.
1919 dataset.refs = existingRefs
1921 # Now we can bulk-insert into Registry for each DatasetType.
1922 for datasetType, groupForType in progress.iter_item_chunks(
1923 groupedData.items(), desc="Bulk-inserting datasets by type"
1924 ):
1925 refs = self.registry.insertDatasets(
1926 datasetType,
1927 dataIds=groupForType.keys(),
1928 run=run,
1929 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1930 idGenerationMode=idGenerationMode,
1931 )
1932 # Append those resolved DatasetRefs to the new lists we set up for
1933 # them.
1934 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1935 resolvedRefs.append(ref)
1937 # Go back to the original FileDatasets to replace their refs with the
1938 # new resolved ones.
1939 for groupForType in progress.iter_chunks(
1940 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1941 ):
1942 for dataset, resolvedRefs in groupForType.values():
1943 dataset.refs = resolvedRefs
1945 # Bulk-insert everything into Datastore.
1946 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1948 @contextlib.contextmanager
1949 def export(
1950 self,
1951 *,
1952 directory: Optional[str] = None,
1953 filename: Optional[str] = None,
1954 format: Optional[str] = None,
1955 transfer: Optional[str] = None,
1956 ) -> Iterator[RepoExportContext]:
1957 """Export datasets from the repository represented by this `Butler`.
1959 This method is a context manager that returns a helper object
1960 (`RepoExportContext`) that is used to indicate what information from
1961 the repository should be exported.
1963 Parameters
1964 ----------
1965 directory : `str`, optional
1966 Directory dataset files should be written to if ``transfer`` is not
1967 `None`.
1968 filename : `str`, optional
1969 Name for the file that will include database information associated
1970 with the exported datasets. If this is not an absolute path and
1971 ``directory`` is not `None`, it will be written to ``directory``
1972 instead of the current working directory. Defaults to
1973 "export.{format}".
1974 format : `str`, optional
1975 File format for the database information file. If `None`, the
1976 extension of ``filename`` will be used.
1977 transfer : `str`, optional
1978 Transfer mode passed to `Datastore.export`.
1980 Raises
1981 ------
1982 TypeError
1983 Raised if the set of arguments passed is inconsistent.
1985 Examples
1986 --------
1987 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1988 methods are used to provide the iterables over data IDs and/or datasets
1989 to be exported::
1991 with butler.export("exports.yaml") as export:
1992 # Export all flats, but none of the dimension element rows
1993 # (i.e. data ID information) associated with them.
1994 export.saveDatasets(butler.registry.queryDatasets("flat"),
1995 elements=())
1996 # Export all datasets that start with "deepCoadd_" and all of
1997 # their associated data ID information.
1998 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1999 """
2000 if directory is None and transfer is not None:
2001 raise TypeError("Cannot transfer without providing a directory.")
2002 if transfer == "move":
2003 raise TypeError("Transfer may not be 'move': export is read-only")
2004 if format is None:
2005 if filename is None:
2006 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2007 else:
2008 _, format = os.path.splitext(filename)
2009 elif filename is None:
2010 filename = f"export.{format}"
2011 if directory is not None:
2012 filename = os.path.join(directory, filename)
2013 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
2014 with open(filename, "w") as stream:
2015 backend = BackendClass(stream, universe=self.registry.dimensions)
2016 try:
2017 helper = RepoExportContext(
2018 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2019 )
2020 yield helper
2021 except BaseException:
2022 raise
2023 else:
2024 helper._finish()
2026 def import_(
2027 self,
2028 *,
2029 directory: Optional[str] = None,
2030 filename: Union[str, TextIO, None] = None,
2031 format: Optional[str] = None,
2032 transfer: Optional[str] = None,
2033 skip_dimensions: Optional[Set] = None,
2034 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2035 reuseIds: bool = False,
2036 ) -> None:
2037 """Import datasets into this repository that were exported from a
2038 different butler repository via `~lsst.daf.butler.Butler.export`.
2040 Parameters
2041 ----------
2042 directory : `str`, optional
2043 Directory containing dataset files to import from. If `None`,
2044 ``filename`` and all dataset file paths specified therein must
2045 be absolute.
2046 filename : `str` or `TextIO`, optional
2047 A stream or name of file that contains database information
2048 associated with the exported datasets, typically generated by
2049 `~lsst.daf.butler.Butler.export`. If this a string (name) and
2050 is not an absolute path, does not exist in the current working
2051 directory, and ``directory`` is not `None`, it is assumed to be in
2052 ``directory``. Defaults to "export.{format}".
2053 format : `str`, optional
2054 File format for ``filename``. If `None`, the extension of
2055 ``filename`` will be used.
2056 transfer : `str`, optional
2057 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2058 skip_dimensions : `set`, optional
2059 Names of dimensions that should be skipped and not imported.
2060 idGenerationMode : `DatasetIdGenEnum`, optional
2061 Specifies option for generating dataset IDs when IDs are not
2062 provided or their type does not match backend type. By default
2063 unique IDs are generated for each inserted dataset.
2064 reuseIds : `bool`, optional
2065 If `True` then forces re-use of imported dataset IDs for integer
2066 IDs which are normally generated as auto-incremented; exception
2067 will be raised if imported IDs clash with existing ones. This
2068 option has no effect on the use of globally-unique IDs which are
2069 always re-used (or generated if integer IDs are being imported).
2071 Raises
2072 ------
2073 TypeError
2074 Raised if the set of arguments passed is inconsistent, or if the
2075 butler is read-only.
2076 """
2077 if not self.isWriteable():
2078 raise TypeError("Butler is read-only.")
2079 if format is None:
2080 if filename is None:
2081 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2082 else:
2083 _, format = os.path.splitext(filename) # type: ignore
2084 elif filename is None:
2085 filename = f"export.{format}"
2086 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2087 filename = os.path.join(directory, filename)
2088 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2090 def doImport(importStream: TextIO) -> None:
2091 backend = BackendClass(importStream, self.registry)
2092 backend.register()
2093 with self.transaction():
2094 backend.load(
2095 self.datastore,
2096 directory=directory,
2097 transfer=transfer,
2098 skip_dimensions=skip_dimensions,
2099 idGenerationMode=idGenerationMode,
2100 reuseIds=reuseIds,
2101 )
2103 if isinstance(filename, str):
2104 with open(filename, "r") as stream:
2105 doImport(stream)
2106 else:
2107 doImport(filename)
2109 def transfer_from(
2110 self,
2111 source_butler: Butler,
2112 source_refs: Iterable[DatasetRef],
2113 transfer: str = "auto",
2114 id_gen_map: Dict[str, DatasetIdGenEnum] = None,
2115 skip_missing: bool = True,
2116 register_dataset_types: bool = False,
2117 ) -> List[DatasetRef]:
2118 """Transfer datasets to this Butler from a run in another Butler.
2120 Parameters
2121 ----------
2122 source_butler : `Butler`
2123 Butler from which the datasets are to be transferred.
2124 source_refs : iterable of `DatasetRef`
2125 Datasets defined in the source butler that should be transferred to
2126 this butler.
2127 transfer : `str`, optional
2128 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2129 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
2130 A mapping of dataset type to ID generation mode. Only used if
2131 the source butler is using integer IDs. Should not be used
2132 if this receiving butler uses integer IDs. Without this dataset
2133 import always uses unique.
2134 skip_missing : `bool`
2135 If `True`, datasets with no datastore artifact associated with
2136 them are not transferred. If `False` a registry entry will be
2137 created even if no datastore record is created (and so will
2138 look equivalent to the dataset being unstored).
2139 register_dataset_types : `bool`
2140 If `True` any missing dataset types are registered. Otherwise
2141 an exception is raised.
2143 Returns
2144 -------
2145 refs : `list` of `DatasetRef`
2146 The refs added to this Butler.
2148 Notes
2149 -----
2150 Requires that any dimension definitions are already present in the
2151 receiving Butler. The datastore artifact has to exist for a transfer
2152 to be made but non-existence is not an error.
2154 Datasets that already exist in this run will be skipped.
2156 The datasets are imported as part of a transaction, although
2157 dataset types are registered before the transaction is started.
2158 This means that it is possible for a dataset type to be registered
2159 even though transfer has failed.
2160 """
2161 if not self.isWriteable():
2162 raise TypeError("Butler is read-only.")
2163 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2165 # Will iterate through the refs multiple times so need to convert
2166 # to a list if this isn't a collection.
2167 if not isinstance(source_refs, collections.abc.Collection):
2168 source_refs = list(source_refs)
2170 original_count = len(source_refs)
2171 log.info("Transferring %d datasets into %s", original_count, str(self))
2173 if id_gen_map is None:
2174 id_gen_map = {}
2176 # In some situations the datastore artifact may be missing
2177 # and we do not want that registry entry to be imported.
2178 # Asking datastore is not sufficient, the records may have been
2179 # purged, we have to ask for the (predicted) URI and check
2180 # existence explicitly. Execution butler is set up exactly like
2181 # this with no datastore records.
2182 artifact_existence: Dict[ResourcePath, bool] = {}
2183 if skip_missing:
2184 dataset_existence = source_butler.datastore.mexists(
2185 source_refs, artifact_existence=artifact_existence
2186 )
2187 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2188 filtered_count = len(source_refs)
2189 log.verbose(
2190 "%d datasets removed because the artifact does not exist. Now have %d.",
2191 original_count - filtered_count,
2192 filtered_count,
2193 )
2195 # Importing requires that we group the refs by dataset type and run
2196 # before doing the import.
2197 source_dataset_types = set()
2198 grouped_refs = defaultdict(list)
2199 grouped_indices = defaultdict(list)
2200 for i, ref in enumerate(source_refs):
2201 grouped_refs[ref.datasetType, ref.run].append(ref)
2202 grouped_indices[ref.datasetType, ref.run].append(i)
2203 source_dataset_types.add(ref.datasetType)
2205 # Check to see if the dataset type in the source butler has
2206 # the same definition in the target butler and register missing
2207 # ones if requested. Registration must happen outside a transaction.
2208 newly_registered_dataset_types = set()
2209 for datasetType in source_dataset_types:
2210 if register_dataset_types:
2211 # Let this raise immediately if inconsistent. Continuing
2212 # on to find additional inconsistent dataset types
2213 # might result in additional unwanted dataset types being
2214 # registered.
2215 if self.registry.registerDatasetType(datasetType):
2216 newly_registered_dataset_types.add(datasetType)
2217 else:
2218 # If the dataset type is missing, let it fail immediately.
2219 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2220 if target_dataset_type != datasetType:
2221 raise ConflictingDefinitionError(
2222 "Source butler dataset type differs from definition"
2223 f" in target butler: {datasetType} !="
2224 f" {target_dataset_type}"
2225 )
2226 if newly_registered_dataset_types:
2227 # We may have registered some even if there were inconsistencies
2228 # but should let people know (or else remove them again).
2229 log.log(
2230 VERBOSE,
2231 "Registered the following dataset types in the target Butler: %s",
2232 ", ".join(d.name for d in newly_registered_dataset_types),
2233 )
2234 else:
2235 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2237 # The returned refs should be identical for UUIDs.
2238 # For now must also support integers and so need to retain the
2239 # newly-created refs from this registry.
2240 # Pre-size it so we can assign refs into the correct slots
2241 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2242 default_id_gen = DatasetIdGenEnum.UNIQUE
2244 handled_collections: Set[str] = set()
2246 # Do all the importing in a single transaction.
2247 with self.transaction():
2248 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2249 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2250 ):
2251 if run not in handled_collections:
2252 run_doc = source_butler.registry.getCollectionDocumentation(run)
2253 registered = self.registry.registerRun(run, doc=run_doc)
2254 handled_collections.add(run)
2255 if registered:
2256 log.log(VERBOSE, "Creating output run %s", run)
2258 id_generation_mode = default_id_gen
2259 if isinstance(refs_to_import[0].id, int):
2260 # ID generation mode might need to be overridden when
2261 # targetting UUID
2262 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2264 n_refs = len(refs_to_import)
2265 log.verbose(
2266 "Importing %d ref%s of dataset type %s into run %s",
2267 n_refs,
2268 "" if n_refs == 1 else "s",
2269 datasetType.name,
2270 run,
2271 )
2273 # No way to know if this butler's registry uses UUID.
2274 # We have to trust the caller on this. If it fails they will
2275 # have to change their approach. We can't catch the exception
2276 # and retry with unique because that will mess up the
2277 # transaction handling. We aren't allowed to ask the registry
2278 # manager what type of ID it is using.
2279 imported_refs = self.registry._importDatasets(
2280 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2281 )
2283 # Map them into the correct slots to match the initial order
2284 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2285 transferred_refs_tmp[i] = ref
2287 # Mypy insists that we might have None in here so we have to make
2288 # that explicit by assigning to a new variable and filtering out
2289 # something that won't be there.
2290 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2292 # Check consistency
2293 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2295 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2297 # The transferred refs need to be reordered to match the original
2298 # ordering given by the caller. Without this the datastore transfer
2299 # will be broken.
2301 # Ask the datastore to transfer. The datastore has to check that
2302 # the source datastore is compatible with the target datastore.
2303 self.datastore.transfer_from(
2304 source_butler.datastore,
2305 source_refs,
2306 local_refs=transferred_refs,
2307 transfer=transfer,
2308 artifact_existence=artifact_existence,
2309 )
2311 return transferred_refs
2313 def validateConfiguration(
2314 self,
2315 logFailures: bool = False,
2316 datasetTypeNames: Optional[Iterable[str]] = None,
2317 ignore: Iterable[str] = None,
2318 ) -> None:
2319 """Validate butler configuration.
2321 Checks that each `DatasetType` can be stored in the `Datastore`.
2323 Parameters
2324 ----------
2325 logFailures : `bool`, optional
2326 If `True`, output a log message for every validation error
2327 detected.
2328 datasetTypeNames : iterable of `str`, optional
2329 The `DatasetType` names that should be checked. This allows
2330 only a subset to be selected.
2331 ignore : iterable of `str`, optional
2332 Names of DatasetTypes to skip over. This can be used to skip
2333 known problems. If a named `DatasetType` corresponds to a
2334 composite, all components of that `DatasetType` will also be
2335 ignored.
2337 Raises
2338 ------
2339 ButlerValidationError
2340 Raised if there is some inconsistency with how this Butler
2341 is configured.
2342 """
2343 if datasetTypeNames:
2344 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2345 else:
2346 datasetTypes = list(self.registry.queryDatasetTypes())
2348 # filter out anything from the ignore list
2349 if ignore:
2350 ignore = set(ignore)
2351 datasetTypes = [
2352 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2353 ]
2354 else:
2355 ignore = set()
2357 # Find all the registered instruments
2358 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2360 # For each datasetType that has an instrument dimension, create
2361 # a DatasetRef for each defined instrument
2362 datasetRefs = []
2364 for datasetType in datasetTypes:
2365 if "instrument" in datasetType.dimensions:
2366 for instrument in instruments:
2367 datasetRef = DatasetRef(
2368 datasetType, {"instrument": instrument}, conform=False # type: ignore
2369 )
2370 datasetRefs.append(datasetRef)
2372 entities: List[Union[DatasetType, DatasetRef]] = []
2373 entities.extend(datasetTypes)
2374 entities.extend(datasetRefs)
2376 datastoreErrorStr = None
2377 try:
2378 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2379 except ValidationError as e:
2380 datastoreErrorStr = str(e)
2382 # Also check that the LookupKeys used by the datastores match
2383 # registry and storage class definitions
2384 keys = self.datastore.getLookupKeys()
2386 failedNames = set()
2387 failedDataId = set()
2388 for key in keys:
2389 if key.name is not None:
2390 if key.name in ignore:
2391 continue
2393 # skip if specific datasetType names were requested and this
2394 # name does not match
2395 if datasetTypeNames and key.name not in datasetTypeNames:
2396 continue
2398 # See if it is a StorageClass or a DatasetType
2399 if key.name in self.storageClasses:
2400 pass
2401 else:
2402 try:
2403 self.registry.getDatasetType(key.name)
2404 except KeyError:
2405 if logFailures:
2406 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2407 failedNames.add(key)
2408 else:
2409 # Dimensions are checked for consistency when the Butler
2410 # is created and rendezvoused with a universe.
2411 pass
2413 # Check that the instrument is a valid instrument
2414 # Currently only support instrument so check for that
2415 if key.dataId:
2416 dataIdKeys = set(key.dataId)
2417 if set(["instrument"]) != dataIdKeys:
2418 if logFailures:
2419 log.critical("Key '%s' has unsupported DataId override", key)
2420 failedDataId.add(key)
2421 elif key.dataId["instrument"] not in instruments:
2422 if logFailures:
2423 log.critical("Key '%s' has unknown instrument", key)
2424 failedDataId.add(key)
2426 messages = []
2428 if datastoreErrorStr:
2429 messages.append(datastoreErrorStr)
2431 for failed, msg in (
2432 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2433 (failedDataId, "Keys with bad DataId entries: "),
2434 ):
2435 if failed:
2436 msg += ", ".join(str(k) for k in failed)
2437 messages.append(msg)
2439 if messages:
2440 raise ValidationError(";\n".join(messages))
2442 @property
2443 def collections(self) -> CollectionSearch:
2444 """The collections to search by default, in order (`CollectionSearch`).
2446 This is an alias for ``self.registry.defaults.collections``. It cannot
2447 be set directly in isolation, but all defaults may be changed together
2448 by assigning a new `RegistryDefaults` instance to
2449 ``self.registry.defaults``.
2450 """
2451 return self.registry.defaults.collections
2453 @property
2454 def run(self) -> Optional[str]:
2455 """Name of the run this butler writes outputs to by default (`str` or
2456 `None`).
2458 This is an alias for ``self.registry.defaults.run``. It cannot be set
2459 directly in isolation, but all defaults may be changed together by
2460 assigning a new `RegistryDefaults` instance to
2461 ``self.registry.defaults``.
2462 """
2463 return self.registry.defaults.run
2465 @property
2466 def dimensions(self) -> DimensionUniverse:
2467 # Docstring inherited.
2468 return self.registry.dimensions
2470 registry: Registry
2471 """The object that manages dataset metadata and relationships (`Registry`).
2473 Most operations that don't involve reading or writing butler datasets are
2474 accessible only via `Registry` methods.
2475 """
2477 datastore: Datastore
2478 """The object that manages actual dataset storage (`Datastore`).
2480 Direct user access to the datastore should rarely be necessary; the primary
2481 exception is the case where a `Datastore` implementation provides extra
2482 functionality beyond what the base class defines.
2483 """
2485 storageClasses: StorageClassFactory
2486 """An object that maps known storage class names to objects that fully
2487 describe them (`StorageClassFactory`).
2488 """
2490 _allow_put_of_predefined_dataset: bool
2491 """Allow a put to succeed even if there is already a registry entry for it
2492 but not a datastore record. (`bool`)."""