Coverage for python/lsst/daf/butler/_butler.py: 9%
687 statements
coverage.py v6.4.4, created at 2022-09-30 02:19 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41from collections import defaultdict
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_class_of
62from lsst.utils.logging import VERBOSE, getLogger
64from ._butlerConfig import ButlerConfig
65from ._butlerRepoIndex import ButlerRepoIndex
66from ._deferredDatasetHandle import DeferredDatasetHandle
67from ._limited_butler import LimitedButler
68from .core import (
69 AmbiguousDatasetError,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetRefURIs,
77 DatasetType,
78 Datastore,
79 Dimension,
80 DimensionConfig,
81 DimensionElement,
82 DimensionRecord,
83 DimensionUniverse,
84 FileDataset,
85 Progress,
86 StorageClassFactory,
87 Timespan,
88 ValidationError,
89)
90from .core.repoRelocation import BUTLER_ROOT_TAG
91from .core.utils import transactional
92from .registry import (
93 CollectionSearch,
94 CollectionType,
95 ConflictingDefinitionError,
96 DataIdError,
97 DatasetIdGenEnum,
98 Registry,
99 RegistryConfig,
100 RegistryDefaults,
101)
102from .transfers import RepoExportContext
104log = getLogger(__name__)
107class ButlerValidationError(ValidationError):
108 """There is a problem with the Butler configuration."""
110 pass
113class PruneCollectionsArgsError(TypeError):
114 """Base class for errors relating to Butler.pruneCollections input
115 arguments.
116 """
118 pass
121class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
122 """Raised when purge and unstore are both required to be True, and
123 purge is True but unstore is False.
124 """
126 def __init__(self) -> None:
127 super().__init__("Cannot pass purge=True without unstore=True.")
130class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
131 """Raised when pruning a RUN collection but purge is False."""
133 def __init__(self, collectionType: CollectionType):
134 self.collectionType = collectionType
135 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
138class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
139 """Raised when purge is True but is not supported for the given
140 collection."""
142 def __init__(self, collectionType: CollectionType):
143 self.collectionType = collectionType
144 super().__init__(
145 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
146 )
149class Butler(LimitedButler):
150 """Main entry point for the data access system.
152 Parameters
153 ----------
154 config : `ButlerConfig`, `Config` or `str`, optional.
155 Configuration. Anything acceptable to the
156 `ButlerConfig` constructor. If a directory path
157 is given the configuration will be read from a ``butler.yaml`` file in
158 that location. If `None` is given default values will be used.
159 butler : `Butler`, optional.
160 If provided, construct a new Butler that uses the same registry and
161 datastore as the given one, but with the given collection and run.
162 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
163 arguments.
164 collections : `str` or `Iterable` [ `str` ], optional
165 An expression specifying the collections to be searched (in order) when
166 reading datasets.
167 This may be a `str` collection name or an iterable thereof.
168 See :ref:`daf_butler_collection_expressions` for more information.
169 These collections are not registered automatically and must exist
170 before they are used by any method, but they may be registered
171 after the `Butler` is initialized.
172 run : `str`, optional
173 Name of the `~CollectionType.RUN` collection new datasets should be
174 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
175 ``collections`` will be set to ``[run]``. If not `None`, this
176 collection will automatically be registered. If this is not set (and
177 ``writeable`` is not set either), a read-only butler will be created.
178 searchPaths : `list` of `str`, optional
179 Directory paths to search when calculating the full Butler
180 configuration. Not used if the supplied config is already a
181 `ButlerConfig`.
182 writeable : `bool`, optional
183 Explicitly sets whether the butler supports write operations. If not
184 provided, a read-write butler is created if ``run`` is set and a
185 read-only butler otherwise.
186 inferDefaults : `bool`, optional
187 If `True` (default) infer default data ID values from the values
188 present in the datasets in ``collections``: if all collections have the
189 same value (or no value) for a governor dimension, that value will be
190 the default for that dimension. Nonexistent collections are ignored.
191 If a default value is provided explicitly for a governor dimension via
192 ``**kwargs``, no default will be inferred for that dimension.
193 **kwargs : `str`
194 Default data ID key-value pairs. These may only identify "governor"
195 dimensions like ``instrument`` and ``skymap``.
197 Examples
198 --------
199 While there are many ways to control exactly how a `Butler` interacts with
200 the collections in its `Registry`, the most common cases are still simple.
202 For a read-only `Butler` that searches one collection, do::
204 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
206 For a read-write `Butler` that writes to and reads from a
207 `~CollectionType.RUN` collection::
209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
211 The `Butler` passed to a ``PipelineTask`` is often much more complex,
212 because we want to write to one `~CollectionType.RUN` collection but read
213 from several others (as well)::
215 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
216 collections=["u/alice/DM-50000/a",
217 "u/bob/DM-49998",
218 "HSC/defaults"])
220 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
221 Datasets will be read first from that run (since it appears first in the
222 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
224 Finally, one can always create a `Butler` with no collections::
226 butler = Butler("/path/to/repo", writeable=True)
228 This can be extremely useful when you just want to use ``butler.registry``,
229 e.g. for inserting dimension data or managing collections, or when the
230 collections you want to use with the butler are not consistent.
231 Passing ``writeable`` explicitly here is only necessary if you want to be
232 able to make changes to the repo - usually the value for ``writeable`` can
233 be guessed from the collection arguments provided, but it defaults to
234 `False` when there are no collection arguments.
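Default data ID values for "governor" dimensions can also be supplied as
keyword arguments. A hedged sketch (the instrument, skymap, and dataset type
names below are illustrative and assume matching data exist in the repo)::
butler = Butler("/path/to/repo", collections=["HSC/defaults"],
                instrument="HSC", skymap="hsc_rings_v1")
coadd = butler.get("deepCoadd", tract=0, patch=5, band="r")
Here ``instrument`` and ``skymap`` do not need to be repeated in each data ID.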
235 """
237 def __init__(
238 self,
239 config: Union[Config, str, None] = None,
240 *,
241 butler: Optional[Butler] = None,
242 collections: Any = None,
243 run: Optional[str] = None,
244 searchPaths: Optional[List[str]] = None,
245 writeable: Optional[bool] = None,
246 inferDefaults: bool = True,
247 **kwargs: str,
248 ):
249 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
250 # Load registry, datastore, etc. from config or existing butler.
251 if butler is not None:
252 if config is not None or searchPaths is not None or writeable is not None:
253 raise TypeError(
254 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
255 )
256 self.registry = butler.registry.copy(defaults)
257 self.datastore = butler.datastore
258 self.storageClasses = butler.storageClasses
259 self._config: ButlerConfig = butler._config
260 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
261 else:
262 # Can only look for strings in the known repos list.
263 if isinstance(config, str) and config in self.get_known_repos():
264 config = str(self.get_repo_uri(config))
265 try:
266 self._config = ButlerConfig(config, searchPaths=searchPaths)
267 except FileNotFoundError as e:
268 if known := self.get_known_repos():
269 aliases = f"(known aliases: {', '.join(known)})"
270 else:
271 aliases = "(no known aliases)"
272 raise FileNotFoundError(f"{e} {aliases}") from e
273 self._config = ButlerConfig(config, searchPaths=searchPaths)
274 try:
275 if "root" in self._config:
276 butlerRoot = self._config["root"]
277 else:
278 butlerRoot = self._config.configDir
279 if writeable is None:
280 writeable = run is not None
281 self.registry = Registry.fromConfig(
282 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
283 )
284 self.datastore = Datastore.fromConfig(
285 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
286 )
287 self.storageClasses = StorageClassFactory()
288 self.storageClasses.addFromConfig(self._config)
289 self._allow_put_of_predefined_dataset = self._config.get(
290 "allow_put_of_predefined_dataset", False
291 )
292 except Exception:
293 # Failures here usually mean that configuration is incomplete,
294 # just issue an error message which includes config file URI.
295 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
296 raise
298 if "run" in self._config or "collection" in self._config:
299 raise ValueError("Passing a run or collection via configuration is no longer supported.")
301 GENERATION: ClassVar[int] = 3
302 """This is a Generation 3 Butler.
304 This attribute may be removed in the future, once the Generation 2 Butler
305 interface has been fully retired; it should only be used in transitional
306 code.
307 """
309 @classmethod
310 def get_repo_uri(cls, label: str) -> ResourcePath:
311 """Look up the label in a butler repository index.
313 Parameters
314 ----------
315 label : `str`
316 Label of the Butler repository to look up.
318 Returns
319 -------
320 uri : `lsst.resources.ResourcePath`
321 URI to the Butler repository associated with the given label.
323 Raises
324 ------
325 KeyError
326 Raised if the label is not found in the index, or if an index
327 can not be found at all.
329 Notes
330 -----
331 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
332 information is discovered.
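For example (a sketch; the ``"main"`` label is hypothetical and must be
present in the index)::
uri = Butler.get_repo_uri("main")
butler = Butler(str(uri), writeable=False)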
333 """
334 return ButlerRepoIndex.get_repo_uri(label)
336 @classmethod
337 def get_known_repos(cls) -> Set[str]:
338 """Retrieve the list of known repository labels.
340 Returns
341 -------
342 repos : `set` of `str`
343 All the known labels. Can be empty if no index can be found.
345 Notes
346 -----
347 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
348 information is discovered.
349 """
350 return ButlerRepoIndex.get_known_repos()
352 @staticmethod
353 def makeRepo(
354 root: ResourcePathExpression,
355 config: Union[Config, str, None] = None,
356 dimensionConfig: Union[Config, str, None] = None,
357 standalone: bool = False,
358 searchPaths: Optional[List[str]] = None,
359 forceConfigRoot: bool = True,
360 outfile: Optional[ResourcePathExpression] = None,
361 overwrite: bool = False,
362 ) -> Config:
363 """Create an empty data repository by adding a butler.yaml config
364 to a repository root directory.
366 Parameters
367 ----------
368 root : `lsst.resources.ResourcePathExpression`
369 Path or URI to the root location of the new repository. Will be
370 created if it does not exist.
371 config : `Config` or `str`, optional
372 Configuration to write to the repository, after setting any
373 root-dependent Registry or Datastore config options. Can not
374 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
375 configuration will be used. Root-dependent config options
376 specified in this config are overwritten if ``forceConfigRoot``
377 is `True`.
378 dimensionConfig : `Config` or `str`, optional
379 Configuration for dimensions, will be used to initialize registry
380 database.
381 standalone : `bool`
382 If `True`, write all expanded defaults, not just customized or
383 repository-specific settings.
384 This (mostly) decouples the repository from the default
385 configuration, insulating it from changes to the defaults (which
386 may be good or bad, depending on the nature of the changes).
387 Future *additions* to the defaults will still be picked up when
388 initializing `Butlers` to repos created with ``standalone=True``.
389 searchPaths : `list` of `str`, optional
390 Directory paths to search when calculating the full butler
391 configuration.
392 forceConfigRoot : `bool`, optional
393 If `False`, any values present in the supplied ``config`` that
394 would normally be reset are not overridden and will appear
395 directly in the output config. This allows non-standard overrides
396 of the root directory for a datastore or registry to be given.
397 If this parameter is `True` the values for ``root`` will be
398 forced into the resulting config if appropriate.
399 outfile : `lsst.resources.ResourcePathExpression`, optional
400 If not `None`, the output configuration will be written to this
401 location rather than into the repository itself. Can be a URI
402 string. Can refer to a directory that will be used to write
403 ``butler.yaml``.
404 overwrite : `bool`, optional
405 Create a new configuration file even if one already exists
406 in the specified output location. Default is to raise
407 an exception.
409 Returns
410 -------
411 config : `Config`
412 The updated `Config` instance written to the repo.
414 Raises
415 ------
416 ValueError
417 Raised if a ButlerConfig or ConfigSubset is passed instead of a
418 regular Config (as these subclasses would make it impossible to
419 support ``standalone=False``).
420 FileExistsError
421 Raised if the output config file already exists.
422 os.error
423 Raised if the directory does not exist, exists but is not a
424 directory, or cannot be created.
426 Notes
427 -----
428 Note that when ``standalone=False`` (the default), the configuration
429 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
430 construct the repository should also be used to construct any Butlers
431 to avoid configuration inconsistencies.
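As a minimal usage sketch (the repository path is illustrative), a new
repository is typically created and then opened with a separate `Butler`
instance::
config = Butler.makeRepo("/path/to/new/repo")
butler = Butler("/path/to/new/repo", writeable=True)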
432 """
433 if isinstance(config, (ButlerConfig, ConfigSubset)):
434 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
436 # Ensure that the root of the repository exists or can be made
437 root_uri = ResourcePath(root, forceDirectory=True)
438 root_uri.mkdir()
440 config = Config(config)
442 # If we are creating a new repo from scratch with relative roots,
443 # do not propagate an explicit root from the config file
444 if "root" in config:
445 del config["root"]
447 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
448 imported_class = doImportType(full["datastore", "cls"])
449 if not issubclass(imported_class, Datastore):
450 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
451 datastoreClass: Type[Datastore] = imported_class
452 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
454 # if key exists in given config, parse it, otherwise parse the defaults
455 # in the expanded config
456 if config.get(("registry", "db")):
457 registryConfig = RegistryConfig(config)
458 else:
459 registryConfig = RegistryConfig(full)
460 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
461 if defaultDatabaseUri is not None:
462 Config.updateParameters(
463 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
464 )
465 else:
466 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
468 if standalone:
469 config.merge(full)
470 else:
471 # Always expand the registry.managers section into the per-repo
472 # config, because after the database schema is created, it's not
473 # allowed to change anymore. Note that in the standalone=True
474 # branch, _everything_ in the config is expanded, so there's no
475 # need to special case this.
476 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
477 configURI: ResourcePathExpression
478 if outfile is not None:
479 # When writing to a separate location we must include
480 # the root of the butler repo in the config else it won't know
481 # where to look.
482 config["root"] = root_uri.geturl()
483 configURI = outfile
484 else:
485 configURI = root_uri
486 # Strip obscore configuration, if it is present, before writing config
487 # to a file; the obscore config will be stored in the registry.
488 config_to_write = config
489 if ("registry", "managers", "obscore") in config:
490 config_to_write = config.copy()
491 del config_to_write["registry", "managers", "obscore", "config"]
492 config_to_write.dumpToUri(configURI, overwrite=overwrite)
494 # Create Registry and populate tables
495 registryConfig = RegistryConfig(config.get("registry"))
496 dimensionConfig = DimensionConfig(dimensionConfig)
497 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
499 log.verbose("Wrote new Butler configuration file to %s", configURI)
501 return config
503 @classmethod
504 def _unpickle(
505 cls,
506 config: ButlerConfig,
507 collections: Optional[CollectionSearch],
508 run: Optional[str],
509 defaultDataId: Dict[str, str],
510 writeable: bool,
511 ) -> Butler:
512 """Callable used to unpickle a Butler.
514 We prefer not to use ``Butler.__init__`` directly so we can force some
515 of its many arguments to be keyword-only (note that ``__reduce__``
516 can only invoke callables with positional arguments).
518 Parameters
519 ----------
520 config : `ButlerConfig`
521 Butler configuration, already coerced into a true `ButlerConfig`
522 instance (and hence after any search paths for overrides have been
523 utilized).
524 collections : `CollectionSearch`
525 Names of the default collections to read from.
526 run : `str`, optional
527 Name of the default `~CollectionType.RUN` collection to write to.
528 defaultDataId : `dict` [ `str`, `str` ]
529 Default data ID values.
530 writeable : `bool`
531 Whether the Butler should support write operations.
533 Returns
534 -------
535 butler : `Butler`
536 A new `Butler` instance.
537 """
538 # MyPy doesn't recognize that the kwargs below are totally valid; it
539 # seems to think ``**defaultDataId`` is a _positional_ argument!
540 return cls(
541 config=config,
542 collections=collections,
543 run=run,
544 writeable=writeable,
545 **defaultDataId, # type: ignore
546 )
548 def __reduce__(self) -> tuple:
549 """Support pickling."""
550 return (
551 Butler._unpickle,
552 (
553 self._config,
554 self.collections,
555 self.run,
556 self.registry.defaults.dataId.byName(),
557 self.registry.isWriteable(),
558 ),
559 )
561 def __str__(self) -> str:
562 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
563 self.collections, self.run, self.datastore, self.registry
564 )
566 def isWriteable(self) -> bool:
567 """Return `True` if this `Butler` supports write operations."""
568 return self.registry.isWriteable()
570 @contextlib.contextmanager
571 def transaction(self) -> Iterator[None]:
572 """Context manager supporting `Butler` transactions.
574 Transactions can be nested.
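A hedged sketch of intended use (the dataset type and data IDs are
hypothetical)::
with butler.transaction():
    butler.put(obj1, "calexp", instrument="HSC", visit=123, detector=42)
    butler.put(obj2, "calexp", instrument="HSC", visit=123, detector=43)
An exception raised inside the block rolls back both the registry and the
datastore changes made within it.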
575 """
576 with self.registry.transaction():
577 with self.datastore.transaction():
578 yield
580 def _standardizeArgs(
581 self,
582 datasetRefOrType: Union[DatasetRef, DatasetType, str],
583 dataId: Optional[DataId] = None,
584 for_put: bool = True,
585 **kwargs: Any,
586 ) -> Tuple[DatasetType, Optional[DataId]]:
587 """Standardize the arguments passed to several Butler APIs.
589 Parameters
590 ----------
591 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
592 When `DatasetRef` the `dataId` should be `None`.
593 Otherwise the `DatasetType` or name thereof.
594 dataId : `dict` or `DataCoordinate`
595 A `dict` of `Dimension` link name, value pairs that label the
596 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
597 should be provided as the second argument.
598 for_put : `bool`, optional
599 If `True` this call is invoked as part of a `Butler.put()`.
600 Otherwise it is assumed to be part of a `Butler.get()`. This
601 parameter is only relevant if there is dataset type
602 inconsistency.
603 **kwargs
604 Additional keyword arguments used to augment or construct a
605 `DataCoordinate`. See `DataCoordinate.standardize`
606 parameters.
608 Returns
609 -------
610 datasetType : `DatasetType`
611 A `DatasetType` instance extracted from ``datasetRefOrType``.
612 dataId : `dict` or `DataId`, optional
613 Argument that can be used (along with ``kwargs``) to construct a
614 `DataId`.
616 Notes
617 -----
618 Butler APIs that conceptually need a DatasetRef also allow passing a
619 `DatasetType` (or the name of one) and a `DataId` (or a dict and
620 keyword arguments that can be used to construct one) separately. This
621 method accepts those arguments and always returns a true `DatasetType`
622 and a `DataId` or `dict`.
624 Standardization of `dict` vs `DataId` is best handled by passing the
625 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
626 generally similarly flexible.
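For example (a sketch; the dataset type name and data ID values are
illustrative), the following calls all identify the same dataset::
butler.get(ref)  # a resolved DatasetRef
butler.get("calexp", {"visit": 123, "detector": 42}, instrument="HSC")
butler.get("calexp", visit=123, detector=42, instrument="HSC")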
627 """
628 externalDatasetType: Optional[DatasetType] = None
629 internalDatasetType: Optional[DatasetType] = None
630 if isinstance(datasetRefOrType, DatasetRef):
631 if dataId is not None or kwargs:
632 raise ValueError("DatasetRef given, cannot use dataId as well")
633 externalDatasetType = datasetRefOrType.datasetType
634 dataId = datasetRefOrType.dataId
635 else:
636 # Don't check whether DataId is provided, because Registry APIs
637 # can usually construct a better error message when it wasn't.
638 if isinstance(datasetRefOrType, DatasetType):
639 externalDatasetType = datasetRefOrType
640 else:
641 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
643 # Check that they are self-consistent
644 if externalDatasetType is not None:
645 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
646 if externalDatasetType != internalDatasetType:
647 # We can allow differences if they are compatible, depending
648 # on whether this is a get or a put. A get requires that
649 # the python type associated with the datastore can be
650 # converted to the user type. A put requires that the user
651 # supplied python type can be converted to the internal
652 # type expected by registry.
653 relevantDatasetType = internalDatasetType
654 if for_put:
655 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
656 else:
657 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
658 relevantDatasetType = externalDatasetType
659 if not is_compatible:
660 raise ValueError(
661 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
662 f"registry definition ({internalDatasetType})"
663 )
664 # Override the internal definition.
665 internalDatasetType = relevantDatasetType
667 assert internalDatasetType is not None
668 return internalDatasetType, dataId
670 def _rewrite_data_id(
671 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
672 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
673 """Rewrite a data ID taking into account dimension records.
675 Take a Data ID and keyword args and rewrite it if necessary to
676 allow the user to specify dimension records rather than dimension
677 primary values.
679 This allows a user to include a dataId dict with keys of
680 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
681 the integer exposure ID. It also allows a string to be given
682 for a dimension value rather than the integer ID if that is more
683 convenient. For example, rather than having to specify the
684 detector with ``detector.full_name``, a string given for ``detector``
685 will be interpreted as the full name and converted to the integer
686 value.
688 Keyword arguments can also use strings for dimensions like detector
689 and exposure but python does not allow them to include ``.`` and
690 so the ``exposure.day_obs`` syntax can not be used in a keyword
691 argument.
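For example (the values shown are illustrative), a data ID of::
{"instrument": "LATISS", "exposure.day_obs": 20221001, "exposure.seq_num": 45}
would be rewritten so that ``exposure`` holds the matching integer exposure ID.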
693 Parameters
694 ----------
695 dataId : `dict` or `DataCoordinate`
696 A `dict` of `Dimension` link name, value pairs that will label the
697 `DatasetRef` within a Collection.
698 datasetType : `DatasetType`
699 The dataset type associated with this dataId. Required to
700 determine the relevant dimensions.
701 **kwargs
702 Additional keyword arguments used to augment or construct a
703 `DataId`. See `DataId` parameters.
705 Returns
706 -------
707 dataId : `dict` or `DataCoordinate`
708 The possibly rewritten dataId. If given a `DataCoordinate` and
709 no keyword arguments, the original dataId will be returned
710 unchanged.
711 **kwargs : `dict`
712 Any unused keyword arguments (would normally be empty dict).
713 """
714 # Do nothing if we have a standalone DataCoordinate.
715 if isinstance(dataId, DataCoordinate) and not kwargs:
716 return dataId, kwargs
718 # Process dimension records that are using record information
719 # rather than ids
720 newDataId: Dict[str, DataIdValue] = {}
721 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
723 # if all the dataId comes from keyword parameters we do not need
724 # to do anything here because they can't be of the form
725 # exposure.obs_id (a "." is not allowed in a keyword parameter).
726 if dataId:
727 for k, v in dataId.items():
728 # If we have a Dimension we do not need to do anything
729 # because it cannot be a compound key.
730 if isinstance(k, str) and "." in k:
731 # Someone is using a more human-readable dataId
732 dimensionName, record = k.split(".", 1)
733 byRecord[dimensionName][record] = v
734 elif isinstance(k, Dimension):
735 newDataId[k.name] = v
736 else:
737 newDataId[k] = v
739 # Go through the updated dataId and check the type in case someone is
740 # using an alternate key. We have already filtered out the compound
741 # keys in the dimension.record format.
742 not_dimensions = {}
744 # Will need to look in the dataId and the keyword arguments
745 # and will remove them if they need to be fixed or are unrecognized.
746 for dataIdDict in (newDataId, kwargs):
747 # Use a list so we can adjust the dict safely in the loop
748 for dimensionName in list(dataIdDict):
749 value = dataIdDict[dimensionName]
750 try:
751 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
752 except KeyError:
753 # This is not a real dimension
754 not_dimensions[dimensionName] = value
755 del dataIdDict[dimensionName]
756 continue
758 # Convert an integral type to an explicit int to simplify
759 # comparisons here
760 if isinstance(value, numbers.Integral):
761 value = int(value)
763 if not isinstance(value, dimension.primaryKey.getPythonType()):
764 for alternate in dimension.alternateKeys:
765 if isinstance(value, alternate.getPythonType()):
766 byRecord[dimensionName][alternate.name] = value
767 del dataIdDict[dimensionName]
768 log.debug(
769 "Converting dimension %s to %s.%s=%s",
770 dimensionName,
771 dimensionName,
772 alternate.name,
773 value,
774 )
775 break
776 else:
777 log.warning(
778 "Type mismatch found for value '%r' provided for dimension %s. "
779 "Could not find matching alternative (primary key has type %s) "
780 "so attempting to use as-is.",
781 value,
782 dimensionName,
783 dimension.primaryKey.getPythonType(),
784 )
786 # By this point kwargs and newDataId should only include valid
787 # dimensions. Merge kwargs into the new dataId and log if there
788 # are dimensions in both (rather than calling update).
789 for k, v in kwargs.items():
790 if k in newDataId and newDataId[k] != v:
791 log.debug(
792 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
793 )
794 newDataId[k] = v
795 # No need to retain any values in kwargs now.
796 kwargs = {}
798 # If we have some unrecognized dimensions we have to try to connect
799 # them to records in other dimensions. This is made more complicated
800 # by some dimensions having records with clashing names. A mitigation
801 # is that we can tell by this point which dimensions are missing
802 # for the DatasetType but this does not work for calibrations
803 # where additional dimensions can be used to constrain the temporal
804 # axis.
805 if not_dimensions:
806 # Search for all dimensions even if we have been given a value
807 # explicitly. In some cases records are given as well as the
808 actual dimension and this should not be an error if they
809 # match.
810 mandatoryDimensions = datasetType.dimensions.names # - provided
812 candidateDimensions: Set[str] = set()
813 candidateDimensions.update(mandatoryDimensions)
815 # For calibrations we may well be needing temporal dimensions
816 # so rather than always including all dimensions in the scan
817 # restrict things a little. It is still possible for there
818 # to be confusion over day_obs in visit vs exposure for example.
819 # If we are not searching calibration collections things may
820 # fail but they are going to fail anyway because of the
821 # ambiguousness of the dataId...
822 if datasetType.isCalibration():
823 for dim in self.registry.dimensions.getStaticDimensions():
824 if dim.temporal:
825 candidateDimensions.add(str(dim))
827 # Look up table for the first association with a dimension
828 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
830 # Keep track of whether an item is associated with multiple
831 # dimensions.
832 counter: Counter[str] = Counter()
833 assigned: Dict[str, Set[str]] = defaultdict(set)
835 # Go through the missing dimensions and associate the
836 # given names with records within those dimensions
837 matched_dims = set()
838 for dimensionName in candidateDimensions:
839 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
840 fields = dimension.metadata.names | dimension.uniqueKeys.names
841 for field in not_dimensions:
842 if field in fields:
843 guessedAssociation[dimensionName][field] = not_dimensions[field]
844 counter[dimensionName] += 1
845 assigned[field].add(dimensionName)
846 matched_dims.add(field)
848 # Calculate the fields that matched nothing.
849 never_found = set(not_dimensions) - matched_dims
851 if never_found:
852 raise ValueError(f"Unrecognized keyword args given: {never_found}")
854 # There is a chance we have allocated a single dataId item
855 # to multiple dimensions. Need to decide which should be retained.
856 # For now assume that the most popular alternative wins.
857 # This means that day_obs with seq_num will result in
858 # exposure.day_obs and not visit.day_obs
859 # Also prefer an explicitly missing dimension over an inferred
860 # temporal dimension.
861 for fieldName, assignedDimensions in assigned.items():
862 if len(assignedDimensions) > 1:
863 # Pick the most popular (preferring mandatory dimensions)
864 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
865 if requiredButMissing:
866 candidateDimensions = requiredButMissing
867 else:
868 candidateDimensions = assignedDimensions
870 # If this is a choice between visit and exposure and
871 # neither was a required part of the dataset type,
872 # (hence in this branch) always prefer exposure over
873 # visit since exposures are always defined and visits
874 # are defined from exposures.
875 if candidateDimensions == {"exposure", "visit"}:
876 candidateDimensions = {"exposure"}
878 # Select the relevant items and get a new restricted
879 # counter.
880 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
881 duplicatesCounter: Counter[str] = Counter()
882 duplicatesCounter.update(theseCounts)
884 # Choose the most common. If they are equally common
885 # we will pick the one that was found first.
886 # Returns a list of tuples
887 selected = duplicatesCounter.most_common(1)[0][0]
889 log.debug(
890 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
891 " Removed ambiguity by choosing dimension %s.",
892 fieldName,
893 ", ".join(assignedDimensions),
894 selected,
895 )
897 for candidateDimension in assignedDimensions:
898 if candidateDimension != selected:
899 del guessedAssociation[candidateDimension][fieldName]
901 # Update the record look up dict with the new associations
902 for dimensionName, values in guessedAssociation.items():
903 if values: # A dict might now be empty
904 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
905 byRecord[dimensionName].update(values)
907 if byRecord:
908 # Some record specifiers were found so we need to convert
909 # them to the Id form
910 for dimensionName, values in byRecord.items():
911 if dimensionName in newDataId:
912 log.debug(
913 "DataId specified explicit %s dimension value of %s in addition to"
914 " general record specifiers for it of %s. Ignoring record information.",
915 dimensionName,
916 newDataId[dimensionName],
917 str(values),
918 )
919 # Get the actual record and compare with these values.
920 try:
921 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
922 except DataIdError:
923 raise ValueError(
924 f"Could not find dimension '{dimensionName}'"
925 f" with dataId {newDataId} as part of comparing with"
926 f" record values {byRecord[dimensionName]}"
927 ) from None
928 if len(recs) == 1:
929 errmsg: List[str] = []
930 for k, v in values.items():
931 if (recval := getattr(recs[0], k)) != v:
932 errmsg.append(f"{k}({recval} != {v})")
933 if errmsg:
934 raise ValueError(
935 f"Dimension {dimensionName} in dataId has explicit value"
936 " inconsistent with records: " + ", ".join(errmsg)
937 )
938 else:
939 # Multiple matches for an explicit dimension
940 # should never happen but let downstream complain.
941 pass
942 continue
944 # Build up a WHERE expression
945 bind = {k: v for k, v in values.items()}
946 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
948 # Hopefully we get a single record that matches
949 records = set(
950 self.registry.queryDimensionRecords(
951 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
952 )
953 )
955 if len(records) != 1:
956 if len(records) > 1:
957 # visit can have an ambiguous answer without involving
958 # visit_system. The default visit_system is defined
959 # by the instrument.
960 if (
961 dimensionName == "visit"
962 and "visit_system_membership" in self.registry.dimensions
963 and "visit_system"
964 in self.registry.dimensions["instrument"].metadata # type: ignore
965 ):
966 instrument_records = list(
967 self.registry.queryDimensionRecords(
968 "instrument",
969 dataId=newDataId,
970 **kwargs,
971 )
972 )
973 if len(instrument_records) == 1:
974 visit_system = instrument_records[0].visit_system
975 if visit_system is None:
976 # Set to a value that will never match.
977 visit_system = -1
979 # Look up each visit in the
980 # visit_system_membership records.
981 for rec in records:
982 membership = list(
983 self.registry.queryDimensionRecords(
984 # Use bind to allow zero results.
985 # This is a fully-specified query.
986 "visit_system_membership",
987 where="instrument = inst AND visit_system = system AND visit = v",
988 bind=dict(
989 inst=instrument_records[0].name, system=visit_system, v=rec.id
990 ),
991 )
992 )
993 if membership:
994 # This record is the right answer.
995 records = set([rec])
996 break
998 # The ambiguity may have been resolved so check again.
999 if len(records) > 1:
1000 log.debug("Received %d records from constraints of %s", len(records), str(values))
1001 for r in records:
1002 log.debug("- %s", str(r))
1003 raise ValueError(
1004 f"DataId specification for dimension {dimensionName} is not"
1005 f" uniquely constrained to a single dataset by {values}."
1006 f" Got {len(records)} results."
1007 )
1008 else:
1009 raise ValueError(
1010 f"DataId specification for dimension {dimensionName} matched no"
1011 f" records when constrained by {values}"
1012 )
1014 # Get the primary key from the real dimension object
1015 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
1016 if not isinstance(dimension, Dimension):
1017 raise RuntimeError(
1018 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
1019 )
1020 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
1022 return newDataId, kwargs
1024 def _findDatasetRef(
1025 self,
1026 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1027 dataId: Optional[DataId] = None,
1028 *,
1029 collections: Any = None,
1030 allowUnresolved: bool = False,
1031 **kwargs: Any,
1032 ) -> DatasetRef:
1033 """Shared logic for methods that start with a search for a dataset in
1034 the registry.
1036 Parameters
1037 ----------
1038 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1039 When `DatasetRef` the `dataId` should be `None`.
1040 Otherwise the `DatasetType` or name thereof.
1041 dataId : `dict` or `DataCoordinate`, optional
1042 A `dict` of `Dimension` link name, value pairs that label the
1043 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1044 should be provided as the first argument.
1045 collections : Any, optional
1046 Collections to be searched, overriding ``self.collections``.
1047 Can be any of the types supported by the ``collections`` argument
1048 to butler construction.
1049 allowUnresolved : `bool`, optional
1050 If `True`, return an unresolved `DatasetRef` if finding a resolved
1051 one in the `Registry` fails. Defaults to `False`.
1052 **kwargs
1053 Additional keyword arguments used to augment or construct a
1054 `DataId`. See `DataId` parameters.
1056 Returns
1057 -------
1058 ref : `DatasetRef`
1059 A reference to the dataset identified by the given arguments.
1061 Raises
1062 ------
1063 LookupError
1064 Raised if no matching dataset exists in the `Registry` (and
1065 ``allowUnresolved is False``).
1066 ValueError
1067 Raised if a resolved `DatasetRef` was passed as an input, but it
1068 differs from the one found in the registry.
1069 TypeError
1070 Raised if no collections were provided.
1071 """
1072 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1073 if isinstance(datasetRefOrType, DatasetRef):
1074 idNumber = datasetRefOrType.id
1075 else:
1076 idNumber = None
1077 timespan: Optional[Timespan] = None
1079 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1081 if datasetType.isCalibration():
1082 # Because this is a calibration dataset, first try to
1083 # standardize the data ID without restricting the dimensions to
1084 # those of the dataset type requested, because there may be extra
1085 # dimensions that provide temporal information for a validity-range
1086 # lookup.
1087 dataId = DataCoordinate.standardize(
1088 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1089 )
1090 if dataId.graph.temporal:
1091 dataId = self.registry.expandDataId(dataId)
1092 timespan = dataId.timespan
1093 else:
1094 # Standardize the data ID to just the dimensions of the dataset
1095 # type instead of letting registry.findDataset do it, so we get the
1096 # result even if no dataset is found.
1097 dataId = DataCoordinate.standardize(
1098 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1099 )
1100 # Always lookup the DatasetRef, even if one is given, to ensure it is
1101 # present in the current collection.
1102 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1103 if ref is None:
1104 if allowUnresolved:
1105 return DatasetRef(datasetType, dataId)
1106 else:
1107 if collections is None:
1108 collections = self.registry.defaults.collections
1109 raise LookupError(
1110 f"Dataset {datasetType.name} with data ID {dataId} "
1111 f"could not be found in collections {collections}."
1112 )
1113 if idNumber is not None and idNumber != ref.id:
1114 if collections is None:
1115 collections = self.registry.defaults.collections
1116 raise ValueError(
1117 f"DatasetRef.id provided ({idNumber}) does not match "
1118 f"id ({ref.id}) in registry in collections {collections}."
1119 )
1120 if datasetType != ref.datasetType:
1121 # If they differ it is because the user explicitly specified
1122 # a compatible dataset type to this call rather than using the
1123 # registry definition. The DatasetRef must therefore be recreated
1124 # using the user definition such that the expected type is
1125 # returned.
1126 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1128 return ref
1130 @transactional
1131 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
1132 # Docstring inherited.
1133 (imported_ref,) = self.registry._importDatasets(
1134 [ref],
1135 expand=True,
1136 )
1137 if imported_ref.id != ref.getCheckedId():
1138 raise RuntimeError("This registry configuration does not support putDirect.")
1139 self.datastore.put(obj, ref)
1140 return ref
1142 @transactional
1143 def put(
1144 self,
1145 obj: Any,
1146 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1147 dataId: Optional[DataId] = None,
1148 *,
1149 run: Optional[str] = None,
1150 **kwargs: Any,
1151 ) -> DatasetRef:
1152 """Store and register a dataset.
1154 Parameters
1155 ----------
1156 obj : `object`
1157 The dataset.
1158 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1159 When `DatasetRef` is provided, ``dataId`` should be `None`.
1160 Otherwise the `DatasetType` or name thereof.
1161 dataId : `dict` or `DataCoordinate`
1162 A `dict` of `Dimension` link name, value pairs that label the
1163 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1164 should be provided as the second argument.
1165 run : `str`, optional
1166 The name of the run the dataset should be added to, overriding
1167 ``self.run``.
1168 **kwargs
1169 Additional keyword arguments used to augment or construct a
1170 `DataCoordinate`. See `DataCoordinate.standardize`
1171 parameters.
1173 Returns
1174 -------
1175 ref : `DatasetRef`
1176 A reference to the stored dataset, updated with the correct id if
1177 given.
1179 Raises
1180 ------
1181 TypeError
1182 Raised if the butler is read-only or if no run has been provided.
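Examples
--------
A hedged usage sketch (the in-memory object ``exposure``, the run name, the
dataset type, and the data ID values are all illustrative)::
butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
ref = butler.put(exposure, "calexp", instrument="HSC", visit=123, detector=42)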
1183 """
1184 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1185 if not self.isWriteable():
1186 raise TypeError("Butler is read-only.")
1187 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1188 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1189 raise ValueError("DatasetRef must not be in registry, must have None id")
1191 # Handle dimension records in dataId
1192 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1194 # Add Registry Dataset entry.
1195 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1197 # For an execution butler the datasets will be pre-defined.
1198 # If the butler is configured that way, datasets should only be inserted
1199 # if they do not already exist in registry. Trying and catching
1200 # ConflictingDefinitionError will not work because the transaction
1201 # will be corrupted. Instead, in this mode always check first.
1202 ref = None
1203 ref_is_predefined = False
1204 if self._allow_put_of_predefined_dataset:
1205 # Get the matching ref for this run.
1206 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1208 if ref:
1209 # Must be expanded form for datastore templating
1210 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1211 ref = ref.expanded(dataId)
1212 ref_is_predefined = True
1214 if not ref:
1215 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1217 # If the ref is predefined it is possible that the datastore also
1218 # has the record. Asking datastore to put it again will result in
1219 # the artifact being recreated, overwriting the previous one; the
1220 # subsequent failure to write the record will then cause the artifact
1221 # to be removed. Much safer to ask first before attempting to
1222 # overwrite. Race conditions should not be an issue for the
1223 # execution butler environment.
1224 if ref_is_predefined:
1225 if self.datastore.knows(ref):
1226 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
1228 self.datastore.put(obj, ref)
1230 return ref
1232 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1233 """Retrieve a stored dataset.
1235 Unlike `Butler.get`, this method allows datasets outside the Butler's
1236 collection to be read as long as the `DatasetRef` that identifies them
1237 can be obtained separately.
1239 Parameters
1240 ----------
1241 ref : `DatasetRef`
1242 Resolved reference to an already stored dataset.
1243 parameters : `dict`
1244 Additional StorageClass-defined options to control reading,
1245 typically used to efficiently read only a subset of the dataset.
1247 Returns
1248 -------
1249 obj : `object`
1250 The dataset.
1251 """
1252 return self.datastore.get(ref, parameters=parameters)
1254 def getDirectDeferred(
1255 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
1256 ) -> DeferredDatasetHandle:
1257 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1258 from a resolved `DatasetRef`.
1260 Parameters
1261 ----------
1262 ref : `DatasetRef`
1263 Resolved reference to an already stored dataset.
1264 parameters : `dict`
1265 Additional StorageClass-defined options to control reading,
1266 typically used to efficiently read only a subset of the dataset.
1268 Returns
1269 -------
1270 obj : `DeferredDatasetHandle`
1271 A handle which can be used to retrieve a dataset at a later time.
1273 Raises
1274 ------
1275 AmbiguousDatasetError
1276 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1277 """
1278 if ref.id is None:
1279 raise AmbiguousDatasetError(
1280 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1281 )
1282 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1284 def getDeferred(
1285 self,
1286 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1287 dataId: Optional[DataId] = None,
1288 *,
1289 parameters: Union[dict, None] = None,
1290 collections: Any = None,
1291 **kwargs: Any,
1292 ) -> DeferredDatasetHandle:
1293 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1294 after an immediate registry lookup.
1296 Parameters
1297 ----------
1298 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1299 When `DatasetRef` the `dataId` should be `None`.
1300 Otherwise the `DatasetType` or name thereof.
1301 dataId : `dict` or `DataCoordinate`, optional
1302 A `dict` of `Dimension` link name, value pairs that label the
1303 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1304 should be provided as the first argument.
1305 parameters : `dict`
1306 Additional StorageClass-defined options to control reading,
1307 typically used to efficiently read only a subset of the dataset.
1308 collections : Any, optional
1309 Collections to be searched, overriding ``self.collections``.
1310 Can be any of the types supported by the ``collections`` argument
1311 to butler construction.
1312 **kwargs
1313 Additional keyword arguments used to augment or construct a
1314 `DataId`. See `DataId` parameters.
1316 Returns
1317 -------
1318 obj : `DeferredDatasetHandle`
1319 A handle which can be used to retrieve a dataset at a later time.
1321 Raises
1322 ------
1323 LookupError
1324 Raised if no matching dataset exists in the `Registry`.
1326 ValueError
1327 Raised if a resolved `DatasetRef` was passed as an input, but it
1328 differs from the one found in the registry.
1329 TypeError
1330 Raised if no collections were provided.
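Examples
--------
A usage sketch (the dataset type and data ID are illustrative)::
handle = butler.getDeferred("calexp", instrument="HSC", visit=123, detector=42)
exposure = handle.get()  # the actual read happens here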
1331 """
1332 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1333 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1335 def get(
1336 self,
1337 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1338 dataId: Optional[DataId] = None,
1339 *,
1340 parameters: Optional[Dict[str, Any]] = None,
1341 collections: Any = None,
1342 **kwargs: Any,
1343 ) -> Any:
1344 """Retrieve a stored dataset.
1346 Parameters
1347 ----------
1348 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1349 When `DatasetRef` the `dataId` should be `None`.
1350 Otherwise the `DatasetType` or name thereof.
1351 dataId : `dict` or `DataCoordinate`
1352 A `dict` of `Dimension` link name, value pairs that label the
1353 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1354 should be provided as the first argument.
1355 parameters : `dict`
1356 Additional StorageClass-defined options to control reading,
1357 typically used to efficiently read only a subset of the dataset.
1358 collections : Any, optional
1359 Collections to be searched, overriding ``self.collections``.
1360 Can be any of the types supported by the ``collections`` argument
1361 to butler construction.
1362 **kwargs
1363 Additional keyword arguments used to augment or construct a
1364 `DataCoordinate`. See `DataCoordinate.standardize`
1365 parameters.
1367 Returns
1368 -------
1369 obj : `object`
1370 The dataset.
1372 Raises
1373 ------
1374 ValueError
1375 Raised if a resolved `DatasetRef` was passed as an input, but it
1376 differs from the one found in the registry.
1377 LookupError
1378 Raised if no matching dataset exists in the `Registry`.
1379 TypeError
1380 Raised if no collections were provided.
1382 Notes
1383 -----
1384 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1385 this method requires that the given data ID include temporal dimensions
1386 beyond the dimensions of the dataset type itself, in order to find the
1387 dataset with the appropriate validity range. For example, a "bias"
1388 dataset with native dimensions ``{instrument, detector}`` could be
1389 fetched with a ``{instrument, detector, exposure}`` data ID, because
1390 ``exposure`` is a temporal dimension.
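For example (the collection name and data ID values are illustrative)::
bias = butler.get("bias", instrument="HSC", detector=42, exposure=1234,
                  collections="HSC/calib")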
1391 """
1392 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1393 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1394 return self.getDirect(ref, parameters=parameters)
1396 def getURIs(
1397 self,
1398 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1399 dataId: Optional[DataId] = None,
1400 *,
1401 predict: bool = False,
1402 collections: Any = None,
1403 run: Optional[str] = None,
1404 **kwargs: Any,
1405 ) -> DatasetRefURIs:
1406 """Returns the URIs associated with the dataset.
1408 Parameters
1409 ----------
1410 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1411 When `DatasetRef` the `dataId` should be `None`.
1412 Otherwise the `DatasetType` or name thereof.
1413 dataId : `dict` or `DataCoordinate`
1414 A `dict` of `Dimension` link name, value pairs that label the
1415 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1416 should be provided as the first argument.
1417 predict : `bool`
1418 If `True`, allow URIs to be returned of datasets that have not
1419 been written.
1420 collections : Any, optional
1421 Collections to be searched, overriding ``self.collections``.
1422 Can be any of the types supported by the ``collections`` argument
1423 to butler construction.
1424 run : `str`, optional
1425 Run to use for predictions, overriding ``self.run``.
1426 **kwargs
1427 Additional keyword arguments used to augment or construct a
1428 `DataCoordinate`. See `DataCoordinate.standardize`
1429 parameters.
1431 Returns
1432 -------
1433 uris : `DatasetRefURIs`
1434 The URI to the primary artifact associated with this dataset (if
1435 the dataset was disassembled within the datastore this may be
1436 `None`), and the URIs to any components associated with the dataset
1437 artifact (this can be empty if there are no components).
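Examples
--------
A sketch (the dataset type and data ID are illustrative); `DatasetRefURIs`
unpacks like a two-item tuple::
uris = butler.getURIs("calexp", instrument="HSC", visit=123, detector=42)
primary, components = uris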
1438 """
1439 ref = self._findDatasetRef(
1440 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1441 )
1442 if ref.id is None: # only possible if predict is True
1443 if run is None:
1444 run = self.run
1445 if run is None:
1446 raise TypeError("Cannot predict location with run=None.")
1447 # Lie about ID, because we can't guess it, and only
1448 # Datastore.getURIs() will ever see it (and it doesn't use it).
1449 ref = ref.resolved(id=0, run=run)
1450 return self.datastore.getURIs(ref, predict)
1452 def getURI(
1453 self,
1454 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1455 dataId: Optional[DataId] = None,
1456 *,
1457 predict: bool = False,
1458 collections: Any = None,
1459 run: Optional[str] = None,
1460 **kwargs: Any,
1461 ) -> ResourcePath:
1462 """Return the URI to the Dataset.
1464 Parameters
1465 ----------
1466 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1467 When `DatasetRef` the `dataId` should be `None`.
1468 Otherwise the `DatasetType` or name thereof.
1469 dataId : `dict` or `DataCoordinate`
1470 A `dict` of `Dimension` link name, value pairs that label the
1471 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1472 should be provided as the first argument.
1473 predict : `bool`
1474 If `True`, allow URIs to be returned of datasets that have not
1475 been written.
1476 collections : Any, optional
1477 Collections to be searched, overriding ``self.collections``.
1478 Can be any of the types supported by the ``collections`` argument
1479 to butler construction.
1480 run : `str`, optional
1481 Run to use for predictions, overriding ``self.run``.
1482 **kwargs
1483 Additional keyword arguments used to augment or construct a
1484 `DataCoordinate`. See `DataCoordinate.standardize`
1485 parameters.
1487 Returns
1488 -------
1489 uri : `lsst.resources.ResourcePath`
1490 URI pointing to the Dataset within the datastore. If the
1491 Dataset does not exist in the datastore, and if ``predict`` is
1492 `True`, the URI will be a prediction and will include a URI
1493 fragment "#predicted".
1494 If the datastore does not have entities that relate well
1495 to the concept of a URI the returned URI string will be
1496 descriptive. The returned URI is not guaranteed to be obtainable.
1498 Raises
1499 ------
1500 LookupError
1501 Raised if a URI has been requested for a dataset that does not exist and
1502 guessing is not allowed.
1503 ValueError
1504 Raised if a resolved `DatasetRef` was passed as an input, but it
1505 differs from the one found in the registry.
1506 TypeError
1507 Raised if no collections were provided.
1508 RuntimeError
1509 Raised if a URI is requested for a dataset that consists of
1510 multiple artifacts.
1511 """
1512 primary, components = self.getURIs(
1513 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1514 )
1516 if primary is None or components:
1517 raise RuntimeError(
1518 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1519 "Use Butler.getURIs() instead."
1520 )
1521 return primary
1523 def retrieveArtifacts(
1524 self,
1525 refs: Iterable[DatasetRef],
1526 destination: ResourcePathExpression,
1527 transfer: str = "auto",
1528 preserve_path: bool = True,
1529 overwrite: bool = False,
1530 ) -> List[ResourcePath]:
1531 """Retrieve the artifacts associated with the supplied refs.
1533 Parameters
1534 ----------
1535 refs : iterable of `DatasetRef`
1536 The datasets for which artifacts are to be retrieved.
1537 A single ref can result in multiple artifacts. The refs must
1538 be resolved.
1539 destination : `lsst.resources.ResourcePath` or `str`
1540 Location to write the artifacts.
1541 transfer : `str`, optional
1542 Method to use to transfer the artifacts. Must be one of the options
1543 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1544 "move" is not allowed.
1545 preserve_path : `bool`, optional
1546 If `True` the full path of the artifact within the datastore
1547 is preserved. If `False` the final file component of the path
1548 is used.
1549 overwrite : `bool`, optional
1550 If `True` allow transfers to overwrite existing files at the
1551 destination.
1553 Returns
1554 -------
1555 targets : `list` of `lsst.resources.ResourcePath`
1556 URIs of file artifacts in the destination location. Order is not
1557 preserved.
1559 Notes
1560 -----
1561 For non-file datastores the artifacts written to the destination
1562 may not match the representation inside the datastore. For example
1563 a hierarchical data structure in a NoSQL database may well be stored
1564 as a JSON file.
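Examples
--------
A sketch of copying file artifacts out of the datastore; the dataset type
and collection name are illustrative::
    refs = butler.registry.queryDatasets("calexp",
                                         collections="HSC/runs/RC2")
    paths = butler.retrieveArtifacts(refs, destination="/tmp/calexps",
                                     transfer="copy", preserve_path=False)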
1565 """
1566 return self.datastore.retrieveArtifacts(
1567 refs,
1568 ResourcePath(destination),
1569 transfer=transfer,
1570 preserve_path=preserve_path,
1571 overwrite=overwrite,
1572 )
1574 def datasetExists(
1575 self,
1576 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1577 dataId: Optional[DataId] = None,
1578 *,
1579 collections: Any = None,
1580 **kwargs: Any,
1581 ) -> bool:
1582 """Return True if the Dataset is actually present in the Datastore.
1584 Parameters
1585 ----------
1586 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1587 When `DatasetRef` the `dataId` should be `None`.
1588 Otherwise the `DatasetType` or name thereof.
1589 dataId : `dict` or `DataCoordinate`
1590 A `dict` of `Dimension` link name, value pairs that label the
1591 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1592 should be provided as the first argument.
1593 collections : Any, optional
1594 Collections to be searched, overriding ``self.collections``.
1595 Can be any of the types supported by the ``collections`` argument
1596 to butler construction.
1597 **kwargs
1598 Additional keyword arguments used to augment or construct a
1599 `DataCoordinate`. See `DataCoordinate.standardize`
1600 parameters.
1602 Raises
1603 ------
1604 LookupError
1605 Raised if the dataset is not even present in the Registry.
1606 ValueError
1607 Raised if a resolved `DatasetRef` was passed as an input, but it
1608 differs from the one found in the registry.
1609 TypeError
1610 Raised if no collections were provided.
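Examples
--------
A sketch of checking datastore existence before reading; the dataset type
and data ID values are illustrative::
    if butler.datasetExists("calexp", collections="HSC/runs/RC2",
                            instrument="HSC", visit=903334, detector=16):
        calexp = butler.get("calexp", collections="HSC/runs/RC2",
                            instrument="HSC", visit=903334, detector=16)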
1611 """
1612 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1613 return self.datastore.exists(ref)
1615 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1616 """Remove one or more `~CollectionType.RUN` collections and the
1617 datasets within them.
1619 Parameters
1620 ----------
1621 names : `Iterable` [ `str` ]
1622 The names of the collections to remove.
1623 unstore : `bool`, optional
1624 If `True` (default), delete datasets from all datastores in which
1625 they are present, and attempt to roll back the registry deletions if
1626 datastore deletions fail (which may not always be possible). If
1627 `False`, datastore records for these datasets are still removed,
1628 but any artifacts (e.g. files) will not be.
1630 Raises
1631 ------
1632 TypeError
1633 Raised if one or more collections are not of type
1634 `~CollectionType.RUN`.
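Examples
--------
A sketch of deleting two output runs and their artifacts; the repository
path and run names are illustrative::
    butler = Butler("/path/to/repo", writeable=True)
    butler.removeRuns(["u/someone/run1", "u/someone/run2"], unstore=True)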
1635 """
1636 if not self.isWriteable():
1637 raise TypeError("Butler is read-only.")
1638 names = list(names)
1639 refs: List[DatasetRef] = []
1640 for name in names:
1641 collectionType = self.registry.getCollectionType(name)
1642 if collectionType is not CollectionType.RUN:
1643 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1644 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1645 with self.datastore.transaction():
1646 with self.registry.transaction():
1647 if unstore:
1648 self.datastore.trash(refs)
1649 else:
1650 self.datastore.forget(refs)
1651 for name in names:
1652 self.registry.removeCollection(name)
1653 if unstore:
1654 # Point of no return for removing artifacts
1655 self.datastore.emptyTrash()
1657 def pruneCollection(
1658 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1659 ) -> None:
1660 """Remove a collection and possibly prune datasets within it.
1662 Parameters
1663 ----------
1664 name : `str`
1665 Name of the collection to remove. If this is a
1666 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1667 datasets within the collection are not modified unless ``unstore``
1668 is `True`. If this is a `~CollectionType.RUN` collection,
1669 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1670 are fully removed from the data repository.
1671 purge : `bool`, optional
1672 If `True`, permit `~CollectionType.RUN` collections to be removed,
1673 fully removing datasets within them. Requires ``unstore=True`` as
1674 well as an added precaution against accidental deletion. Must be
1675 `False` (default) if the collection is not a ``RUN``.
1676 unstore : `bool`, optional
1677 If `True`, remove all datasets in the collection from all
1678 datastores in which they appear.
1679 unlink : `list` [ `str` ], optional
1680 Before removing the collection given by ``name``, unlink it from
1681 these parent collections.
1683 Raises
1684 ------
1685 TypeError
1686 Raised if the butler is read-only or arguments are mutually
1687 inconsistent.
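Examples
--------
A sketch of the two supported modes; the collection names are
illustrative::
    # Remove a TAGGED collection, leaving its datasets unmodified.
    butler.pruneCollection("u/someone/tagged-selection")
    # Fully delete a RUN collection and the datasets within it.
    butler.pruneCollection("u/someone/old-run", purge=True, unstore=True)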
1688 """
1689 # See pruneDatasets comments for more information about the logic here;
1690 # the cases are almost the same, but here we can rely on Registry to
1691 # take care of everything but Datastore deletion when we remove the
1692 # collection.
1693 if not self.isWriteable():
1694 raise TypeError("Butler is read-only.")
1695 collectionType = self.registry.getCollectionType(name)
1696 if purge and not unstore:
1697 raise PurgeWithoutUnstorePruneCollectionsError()
1698 if collectionType is CollectionType.RUN and not purge:
1699 raise RunWithoutPurgePruneCollectionsError(collectionType)
1700 if collectionType is not CollectionType.RUN and purge:
1701 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1703 def remove(child: str, parent: str) -> None:
1704 """Remove a child collection from a parent collection."""
1705 # Remove child from parent.
1706 chain = list(self.registry.getCollectionChain(parent))
1707 try:
1708 chain.remove(child)
1709 except ValueError as e:
1710 raise RuntimeError(f"{child} is not a child of {parent}") from e
1711 self.registry.setCollectionChain(parent, chain)
1713 with self.datastore.transaction():
1714 with self.registry.transaction():
1715 if unlink:
1716 for parent in unlink:
1717 remove(name, parent)
1718 if unstore:
1719 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1720 self.datastore.trash(refs)
1721 self.registry.removeCollection(name)
1723 if unstore:
1724 # Point of no return for removing artifacts
1725 self.datastore.emptyTrash()
1727 def pruneDatasets(
1728 self,
1729 refs: Iterable[DatasetRef],
1730 *,
1731 disassociate: bool = True,
1732 unstore: bool = False,
1733 tags: Iterable[str] = (),
1734 purge: bool = False,
1735 ) -> None:
1736 # docstring inherited from LimitedButler
1738 if not self.isWriteable():
1739 raise TypeError("Butler is read-only.")
1740 if purge:
1741 if not disassociate:
1742 raise TypeError("Cannot pass purge=True without disassociate=True.")
1743 if not unstore:
1744 raise TypeError("Cannot pass purge=True without unstore=True.")
1745 elif disassociate:
1746 tags = tuple(tags)
1747 if not tags:
1748 raise TypeError("No tags provided but disassociate=True.")
1749 for tag in tags:
1750 collectionType = self.registry.getCollectionType(tag)
1751 if collectionType is not CollectionType.TAGGED:
1752 raise TypeError(
1753 f"Cannot disassociate from collection '{tag}' "
1754 f"of non-TAGGED type {collectionType.name}."
1755 )
1756 # For an execution butler we want to keep existing UUIDs for the
1757 # datasets, for that we need to keep them in the collections but
1758 # remove from datastore.
1759 if self._allow_put_of_predefined_dataset and purge:
1760 purge = False
1761 disassociate = False
1762 # Transform possibly-single-pass iterable into something we can iterate
1763 # over multiple times.
1764 refs = list(refs)
1765 # Pruning a component of a DatasetRef makes no sense since registry
1766 # doesn't know about components and datastore might not store
1767 # components in a separate file
1768 for ref in refs:
1769 if ref.datasetType.component():
1770 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1771 # We don't need an unreliable Datastore transaction for this, because
1772 # we've been extra careful to ensure that Datastore.trash only involves
1773 # mutating the Registry (it can _look_ at Datastore-specific things,
1774 # but shouldn't change them), and hence all operations here are
1775 # Registry operations.
1776 with self.datastore.transaction():
1777 with self.registry.transaction():
1778 if unstore:
1779 self.datastore.trash(refs)
1780 if purge:
1781 self.registry.removeDatasets(refs)
1782 elif disassociate:
1783 assert tags, "Guaranteed by earlier logic in this function."
1784 for tag in tags:
1785 self.registry.disassociate(tag, refs)
1786 # We've exited the Registry transaction, and apparently committed.
1787 # (if there was an exception, everything rolled back, and it's as if
1788 # nothing happened - and we never get here).
1789 # Datastore artifacts are not yet gone, but they're clearly marked
1790 # as trash, so if we fail to delete now because of (e.g.) filesystem
1791 # problems we can try again later, and if manual administrative
1792 # intervention is required, it's pretty clear what that should entail:
1793 # deleting everything on disk and in private Datastore tables that is
1794 # in the dataset_location_trash table.
1795 if unstore:
1796 # Point of no return for removing artifacts
1797 self.datastore.emptyTrash()
1799 @transactional
1800 def ingest(
1801 self,
1802 *datasets: FileDataset,
1803 transfer: Optional[str] = "auto",
1804 run: Optional[str] = None,
1805 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1806 record_validation_info: bool = True,
1807 ) -> None:
1808 """Store and register one or more datasets that already exist on disk.
1810 Parameters
1811 ----------
1812 datasets : `FileDataset`
1813 Each positional argument is a struct containing information about
1814 a file to be ingested, including its URI (either absolute or
1815 relative to the datastore root, if applicable), a `DatasetRef`,
1816 and optionally a formatter class or its fully-qualified string
1817 name. If a formatter is not provided, the formatter that would be
1818 used for `put` is assumed. On successful return, all
1819 `FileDataset.ref` attributes will have their `DatasetRef.id`
1820 attribute populated and all `FileDataset.formatter` attributes will
1821 be set to the formatter class used. `FileDataset.path` attributes
1822 may be modified to put paths in whatever the datastore considers a
1823 standardized form.
1824 transfer : `str`, optional
1825 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1826 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1827 transfer the file.
1828 run : `str`, optional
1829 The name of the run ingested datasets should be added to,
1830 overriding ``self.run``.
1831 idGenerationMode : `DatasetIdGenEnum`, optional
1832 Specifies option for generating dataset IDs. By default unique IDs
1833 are generated for each inserted dataset.
1834 record_validation_info : `bool`, optional
1835 If `True`, the default, the datastore can record validation
1836 information associated with the file. If `False` the datastore
1837 will not attempt to track any information such as checksums
1838 or file sizes. This can be useful if such information is tracked
1839 in an external system or if the file is to be compressed in place.
1840 It is up to the datastore whether this parameter is relevant.
1842 Raises
1843 ------
1844 TypeError
1845 Raised if the butler is read-only or if no run was provided.
1846 NotImplementedError
1847 Raised if the `Datastore` does not support the given transfer mode.
1848 DatasetTypeNotSupportedError
1849 Raised if one or more files to be ingested have a dataset type that
1850 is not supported by the `Datastore`.
1851 FileNotFoundError
1852 Raised if one of the given files does not exist.
1853 FileExistsError
1854 Raised if transfer is not `None` but the (internal) location the
1855 file would be moved to is already occupied.
1857 Notes
1858 -----
1859 This operation is not fully exception safe: if a database operation
1860 fails, the given `FileDataset` instances may be only partially updated.
1862 It is atomic in terms of database operations (they will either all
1863 succeed or all fail) provided the database engine implements
1864 transactions correctly. It will attempt to be atomic in terms of
1865 filesystem operations as well, but this cannot be implemented
1866 rigorously for most datastores.
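Examples
--------
A sketch of ingesting an existing file into a writeable butler; the file
path, dataset type, run, and data ID values are illustrative::
    dataset_type = butler.registry.getDatasetType("raw")
    ref = DatasetRef(dataset_type, {"instrument": "HSC", "exposure": 903334,
                                    "detector": 16})
    butler.ingest(FileDataset(path="/data/HSC-903334-16.fits", refs=[ref]),
                  transfer="copy", run="HSC/raw/all")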
1867 """
1868 if not self.isWriteable():
1869 raise TypeError("Butler is read-only.")
1870 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1871 # Reorganize the inputs so they're grouped by DatasetType and then
1872 # data ID. We also include a list of DatasetRefs for each FileDataset
1873 # to hold the resolved DatasetRefs returned by the Registry, before
1874 # it's safe to swap them into FileDataset.refs.
1875 # Some type annotation aliases to make that clearer:
1876 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1877 GroupedData = MutableMapping[DatasetType, GroupForType]
1878 # The actual data structure:
1879 groupedData: GroupedData = defaultdict(dict)
1880 # And the nested loop that populates it:
1881 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1882 # This list intentionally shared across the inner loop, since it's
1883 # associated with `dataset`.
1884 resolvedRefs: List[DatasetRef] = []
1886 # Somewhere to store pre-existing refs if we have an
1887 # execution butler.
1888 existingRefs: List[DatasetRef] = []
1890 for ref in dataset.refs:
1891 if ref.dataId in groupedData[ref.datasetType]:
1892 raise ConflictingDefinitionError(
1893 f"Ingest conflict. Dataset {dataset.path} has same"
1894 " DataId as other ingest dataset"
1895 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1896 f" ({ref.dataId})"
1897 )
1898 if self._allow_put_of_predefined_dataset:
1899 existing_ref = self.registry.findDataset(
1900 ref.datasetType, dataId=ref.dataId, collections=run
1901 )
1902 if existing_ref:
1903 if self.datastore.knows(existing_ref):
1904 raise ConflictingDefinitionError(
1905 f"Dataset associated with path {dataset.path}"
1906 f" already exists as {existing_ref}."
1907 )
1908 # Store this ref elsewhere since it already exists
1909 # and we do not want to remake it but we do want
1910 # to store it in the datastore.
1911 existingRefs.append(existing_ref)
1913 # Nothing else to do until we have finished
1914 # iterating.
1915 continue
1917 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1919 if existingRefs:
1921 if len(dataset.refs) != len(existingRefs):
1922 # Keeping track of partially pre-existing datasets is hard
1923 # and should generally never happen. For now don't allow
1924 # it.
1925 raise ConflictingDefinitionError(
1926 f"For dataset {dataset.path} some dataIds already exist"
1927 " in registry but others do not. This is not supported."
1928 )
1930 # Attach the resolved refs if we found them.
1931 dataset.refs = existingRefs
1933 # Now we can bulk-insert into Registry for each DatasetType.
1934 for datasetType, groupForType in progress.iter_item_chunks(
1935 groupedData.items(), desc="Bulk-inserting datasets by type"
1936 ):
1937 refs = self.registry.insertDatasets(
1938 datasetType,
1939 dataIds=groupForType.keys(),
1940 run=run,
1941 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1942 idGenerationMode=idGenerationMode,
1943 )
1944 # Append those resolved DatasetRefs to the new lists we set up for
1945 # them.
1946 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1947 resolvedRefs.append(ref)
1949 # Go back to the original FileDatasets to replace their refs with the
1950 # new resolved ones.
1951 for groupForType in progress.iter_chunks(
1952 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1953 ):
1954 for dataset, resolvedRefs in groupForType.values():
1955 dataset.refs = resolvedRefs
1957 # Bulk-insert everything into Datastore.
1958 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1960 @contextlib.contextmanager
1961 def export(
1962 self,
1963 *,
1964 directory: Optional[str] = None,
1965 filename: Optional[str] = None,
1966 format: Optional[str] = None,
1967 transfer: Optional[str] = None,
1968 ) -> Iterator[RepoExportContext]:
1969 """Export datasets from the repository represented by this `Butler`.
1971 This method is a context manager that returns a helper object
1972 (`RepoExportContext`) that is used to indicate what information from
1973 the repository should be exported.
1975 Parameters
1976 ----------
1977 directory : `str`, optional
1978 Directory dataset files should be written to if ``transfer`` is not
1979 `None`.
1980 filename : `str`, optional
1981 Name for the file that will include database information associated
1982 with the exported datasets. If this is not an absolute path and
1983 ``directory`` is not `None`, it will be written to ``directory``
1984 instead of the current working directory. Defaults to
1985 "export.{format}".
1986 format : `str`, optional
1987 File format for the database information file. If `None`, the
1988 extension of ``filename`` will be used.
1989 transfer : `str`, optional
1990 Transfer mode passed to `Datastore.export`.
1992 Raises
1993 ------
1994 TypeError
1995 Raised if the set of arguments passed is inconsistent.
1997 Examples
1998 --------
1999 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
2000 methods are used to provide the iterables over data IDs and/or datasets
2001 to be exported::
2003 with butler.export(filename="exports.yaml") as export:
2004 # Export all flats, but none of the dimension element rows
2005 # (i.e. data ID information) associated with them.
2006 export.saveDatasets(butler.registry.queryDatasets("flat"),
2007 elements=())
2008 # Export all datasets that start with "deepCoadd_" and all of
2009 # their associated data ID information.
2010 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
2011 """
2012 if directory is None and transfer is not None:
2013 raise TypeError("Cannot transfer without providing a directory.")
2014 if transfer == "move":
2015 raise TypeError("Transfer may not be 'move': export is read-only")
2016 if format is None:
2017 if filename is None:
2018 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2019 else:
2020 _, format = os.path.splitext(filename)
2021 elif filename is None:
2022 filename = f"export.{format}"
2023 if directory is not None:
2024 filename = os.path.join(directory, filename)
2025 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
2026 with open(filename, "w") as stream:
2027 backend = BackendClass(stream, universe=self.registry.dimensions)
2028 try:
2029 helper = RepoExportContext(
2030 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2031 )
2032 yield helper
2033 except BaseException:
2034 raise
2035 else:
2036 helper._finish()
2038 def import_(
2039 self,
2040 *,
2041 directory: Optional[str] = None,
2042 filename: Union[str, TextIO, None] = None,
2043 format: Optional[str] = None,
2044 transfer: Optional[str] = None,
2045 skip_dimensions: Optional[Set] = None,
2046 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2047 reuseIds: bool = False,
2048 ) -> None:
2049 """Import datasets into this repository that were exported from a
2050 different butler repository via `~lsst.daf.butler.Butler.export`.
2052 Parameters
2053 ----------
2054 directory : `str`, optional
2055 Directory containing dataset files to import from. If `None`,
2056 ``filename`` and all dataset file paths specified therein must
2057 be absolute.
2058 filename : `str` or `TextIO`, optional
2059 A stream or name of file that contains database information
2060 associated with the exported datasets, typically generated by
2061 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
2062 is not an absolute path, does not exist in the current working
2063 directory, and ``directory`` is not `None`, it is assumed to be in
2064 ``directory``. Defaults to "export.{format}".
2065 format : `str`, optional
2066 File format for ``filename``. If `None`, the extension of
2067 ``filename`` will be used.
2068 transfer : `str`, optional
2069 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2070 skip_dimensions : `set`, optional
2071 Names of dimensions that should be skipped and not imported.
2072 idGenerationMode : `DatasetIdGenEnum`, optional
2073 Specifies option for generating dataset IDs when IDs are not
2074 provided or their type does not match backend type. By default
2075 unique IDs are generated for each inserted dataset.
2076 reuseIds : `bool`, optional
2077 If `True` then forces re-use of imported dataset IDs for integer
2078 IDs which are normally generated as auto-incremented; exception
2079 will be raised if imported IDs clash with existing ones. This
2080 option has no effect on the use of globally-unique IDs which are
2081 always re-used (or generated if integer IDs are being imported).
2083 Raises
2084 ------
2085 TypeError
2086 Raised if the set of arguments passed is inconsistent, or if the
2087 butler is read-only.
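Examples
--------
A sketch of importing a previously exported repository subset; the paths
are illustrative::
    butler.import_(directory="/path/to/exported", filename="export.yaml",
                   transfer="symlink")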
2088 """
2089 if not self.isWriteable():
2090 raise TypeError("Butler is read-only.")
2091 if format is None:
2092 if filename is None:
2093 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2094 else:
2095 _, format = os.path.splitext(filename) # type: ignore
2096 elif filename is None:
2097 filename = f"export.{format}"
2098 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2099 filename = os.path.join(directory, filename)
2100 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2102 def doImport(importStream: TextIO) -> None:
2103 backend = BackendClass(importStream, self.registry)
2104 backend.register()
2105 with self.transaction():
2106 backend.load(
2107 self.datastore,
2108 directory=directory,
2109 transfer=transfer,
2110 skip_dimensions=skip_dimensions,
2111 idGenerationMode=idGenerationMode,
2112 reuseIds=reuseIds,
2113 )
2115 if isinstance(filename, str):
2116 with open(filename, "r") as stream:
2117 doImport(stream)
2118 else:
2119 doImport(filename)
2121 def transfer_from(
2122 self,
2123 source_butler: Butler,
2124 source_refs: Iterable[DatasetRef],
2125 transfer: str = "auto",
2126 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
2127 skip_missing: bool = True,
2128 register_dataset_types: bool = False,
2129 transfer_dimensions: bool = False,
2130 ) -> List[DatasetRef]:
2131 """Transfer datasets to this Butler from a run in another Butler.
2133 Parameters
2134 ----------
2135 source_butler : `Butler`
2136 Butler from which the datasets are to be transferred.
2137 source_refs : iterable of `DatasetRef`
2138 Datasets defined in the source butler that should be transferred to
2139 this butler.
2140 transfer : `str`, optional
2141 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2142 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
2143 A mapping of dataset type to ID generation mode. Only used if
2144 the source butler is using integer IDs. Should not be used
2145 if this (receiving) butler uses integer IDs. If not provided,
2146 dataset import always uses unique ID generation.
2147 skip_missing : `bool`
2148 If `True`, datasets with no datastore artifact associated with
2149 them are not transferred. If `False` a registry entry will be
2150 created even if no datastore record is created (and so will
2151 look equivalent to the dataset being unstored).
2152 register_dataset_types : `bool`
2153 If `True` any missing dataset types are registered. Otherwise
2154 an exception is raised.
2155 transfer_dimensions : `bool`, optional
2156 If `True`, dimension record data associated with the new datasets
2157 will be transferred.
2159 Returns
2160 -------
2161 refs : `list` of `DatasetRef`
2162 The refs added to this Butler.
2164 Notes
2165 -----
2166 Requires that any dimension definitions are already present in the
2167 receiving Butler. The datastore artifact has to exist for a transfer
2168 to be made but non-existence is not an error.
2170 Datasets that already exist in this run will be skipped.
2172 The datasets are imported as part of a transaction, although
2173 dataset types are registered before the transaction is started.
2174 This means that it is possible for a dataset type to be registered
2175 even though transfer has failed.
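Examples
--------
A sketch of transferring datasets from another repository; the repository
path, dataset type, and collection name are illustrative::
    source_butler = Butler("/path/to/source-repo")
    refs = source_butler.registry.queryDatasets("calexp",
                                                collections="HSC/runs/RC2")
    butler.transfer_from(source_butler, refs, transfer="copy",
                         register_dataset_types=True)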
2176 """
2177 if not self.isWriteable():
2178 raise TypeError("Butler is read-only.")
2179 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2181 # Will iterate through the refs multiple times so need to convert
2182 # to a list if this isn't a collection.
2183 if not isinstance(source_refs, collections.abc.Collection):
2184 source_refs = list(source_refs)
2186 original_count = len(source_refs)
2187 log.info("Transferring %d datasets into %s", original_count, str(self))
2189 if id_gen_map is None:
2190 id_gen_map = {}
2192 # In some situations the datastore artifact may be missing
2193 # and we do not want that registry entry to be imported.
2194 # Asking datastore is not sufficient, the records may have been
2195 # purged, we have to ask for the (predicted) URI and check
2196 # existence explicitly. Execution butler is set up exactly like
2197 # this with no datastore records.
2198 artifact_existence: Dict[ResourcePath, bool] = {}
2199 if skip_missing:
2200 dataset_existence = source_butler.datastore.mexists(
2201 source_refs, artifact_existence=artifact_existence
2202 )
2203 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2204 filtered_count = len(source_refs)
2205 log.verbose(
2206 "%d datasets removed because the artifact does not exist. Now have %d.",
2207 original_count - filtered_count,
2208 filtered_count,
2209 )
2211 # Importing requires that we group the refs by dataset type and run
2212 # before doing the import.
2213 source_dataset_types = set()
2214 grouped_refs = defaultdict(list)
2215 grouped_indices = defaultdict(list)
2216 for i, ref in enumerate(source_refs):
2217 grouped_refs[ref.datasetType, ref.run].append(ref)
2218 grouped_indices[ref.datasetType, ref.run].append(i)
2219 source_dataset_types.add(ref.datasetType)
2221 # Check to see if the dataset type in the source butler has
2222 # the same definition in the target butler and register missing
2223 # ones if requested. Registration must happen outside a transaction.
2224 newly_registered_dataset_types = set()
2225 for datasetType in source_dataset_types:
2226 if register_dataset_types:
2227 # Let this raise immediately if inconsistent. Continuing
2228 # on to find additional inconsistent dataset types
2229 # might result in additional unwanted dataset types being
2230 # registered.
2231 if self.registry.registerDatasetType(datasetType):
2232 newly_registered_dataset_types.add(datasetType)
2233 else:
2234 # If the dataset type is missing, let it fail immediately.
2235 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2236 if target_dataset_type != datasetType:
2237 raise ConflictingDefinitionError(
2238 "Source butler dataset type differs from definition"
2239 f" in target butler: {datasetType} !="
2240 f" {target_dataset_type}"
2241 )
2242 if newly_registered_dataset_types:
2243 # We may have registered some even if there were inconsistencies
2244 # but should let people know (or else remove them again).
2245 log.log(
2246 VERBOSE,
2247 "Registered the following dataset types in the target Butler: %s",
2248 ", ".join(d.name for d in newly_registered_dataset_types),
2249 )
2250 else:
2251 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2253 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2254 if transfer_dimensions:
2255 # Collect all the dimension records for these refs.
2256 # All dimensions are to be copied but the list of valid dimensions
2257 # comes from this butler's universe.
2258 elements = frozenset(
2259 element
2260 for element in self.registry.dimensions.getStaticElements()
2261 if element.hasTable() and element.viewOf is None
2262 )
2263 dataIds = set(ref.dataId for ref in source_refs)
2264 # This logic comes from saveDataIds.
2265 for dataId in dataIds:
2266 # Should be a no-op if the ref has already been expanded.
2267 dataId = source_butler.registry.expandDataId(dataId)
2268 # If this butler doesn't know about a dimension in the source
2269 # butler things will break later.
2270 for record in dataId.records.values():
2271 if record is not None and record.definition in elements:
2272 dimension_records[record.definition].setdefault(record.dataId, record)
2274 # The returned refs should be identical for UUIDs.
2275 # For now must also support integers and so need to retain the
2276 # newly-created refs from this registry.
2277 # Pre-size it so we can assign refs into the correct slots
2278 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2279 default_id_gen = DatasetIdGenEnum.UNIQUE
2281 handled_collections: Set[str] = set()
2283 # Do all the importing in a single transaction.
2284 with self.transaction():
2285 if dimension_records:
2286 log.verbose("Ensuring that dimension records exist for transferred datasets.")
2287 for element, r in dimension_records.items():
2288 records = [r[dataId] for dataId in r]
2289 # Assume that if the record is already present that we can
2290 # use it without having to check that the record metadata
2291 # is consistent.
2292 self.registry.insertDimensionData(element, *records, skip_existing=True)
2294 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2295 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2296 ):
2297 if run not in handled_collections:
2298 run_doc = source_butler.registry.getCollectionDocumentation(run)
2299 registered = self.registry.registerRun(run, doc=run_doc)
2300 handled_collections.add(run)
2301 if registered:
2302 log.log(VERBOSE, "Creating output run %s", run)
2304 id_generation_mode = default_id_gen
2305 if isinstance(refs_to_import[0].id, int):
2306 # ID generation mode might need to be overridden when
2307 # targeting UUIDs.
2308 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2310 n_refs = len(refs_to_import)
2311 log.verbose(
2312 "Importing %d ref%s of dataset type %s into run %s",
2313 n_refs,
2314 "" if n_refs == 1 else "s",
2315 datasetType.name,
2316 run,
2317 )
2319 # No way to know if this butler's registry uses UUID.
2320 # We have to trust the caller on this. If it fails they will
2321 # have to change their approach. We can't catch the exception
2322 # and retry with unique because that will mess up the
2323 # transaction handling. We aren't allowed to ask the registry
2324 # manager what type of ID it is using.
2325 imported_refs = self.registry._importDatasets(
2326 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2327 )
2329 # Map them into the correct slots to match the initial order
2330 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2331 transferred_refs_tmp[i] = ref
2333 # Mypy insists that we might have None in here so we have to make
2334 # that explicit by assigning to a new variable and filtering out
2335 # something that won't be there.
2336 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2338 # Check consistency
2339 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2341 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2343 # The transferred refs need to be reordered to match the original
2344 # ordering given by the caller. Without this the datastore transfer
2345 # will be broken.
2347 # Ask the datastore to transfer. The datastore has to check that
2348 # the source datastore is compatible with the target datastore.
2349 self.datastore.transfer_from(
2350 source_butler.datastore,
2351 source_refs,
2352 local_refs=transferred_refs,
2353 transfer=transfer,
2354 artifact_existence=artifact_existence,
2355 )
2357 return transferred_refs
2359 def validateConfiguration(
2360 self,
2361 logFailures: bool = False,
2362 datasetTypeNames: Optional[Iterable[str]] = None,
2363 ignore: Optional[Iterable[str]] = None,
2364 ) -> None:
2365 """Validate butler configuration.
2367 Checks that each `DatasetType` can be stored in the `Datastore`.
2369 Parameters
2370 ----------
2371 logFailures : `bool`, optional
2372 If `True`, output a log message for every validation error
2373 detected.
2374 datasetTypeNames : iterable of `str`, optional
2375 The `DatasetType` names that should be checked. This allows
2376 only a subset to be selected.
2377 ignore : iterable of `str`, optional
2378 Names of DatasetTypes to skip over. This can be used to skip
2379 known problems. If a named `DatasetType` corresponds to a
2380 composite, all components of that `DatasetType` will also be
2381 ignored.
2383 Raises
2384 ------
2385 ButlerValidationError
2386 Raised if there is some inconsistency with how this Butler
2387 is configured.
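Examples
--------
A sketch of validating the configuration while skipping a known-problem
dataset type (the name is illustrative)::
    butler.validateConfiguration(logFailures=True, ignore=["raw"])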
2388 """
2389 if datasetTypeNames:
2390 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2391 else:
2392 datasetTypes = list(self.registry.queryDatasetTypes())
2394 # filter out anything from the ignore list
2395 if ignore:
2396 ignore = set(ignore)
2397 datasetTypes = [
2398 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2399 ]
2400 else:
2401 ignore = set()
2403 # Find all the registered instruments
2404 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2406 # For each datasetType that has an instrument dimension, create
2407 # a DatasetRef for each defined instrument
2408 datasetRefs = []
2410 for datasetType in datasetTypes:
2411 if "instrument" in datasetType.dimensions:
2412 for instrument in instruments:
2413 datasetRef = DatasetRef(
2414 datasetType, {"instrument": instrument}, conform=False # type: ignore
2415 )
2416 datasetRefs.append(datasetRef)
2418 entities: List[Union[DatasetType, DatasetRef]] = []
2419 entities.extend(datasetTypes)
2420 entities.extend(datasetRefs)
2422 datastoreErrorStr = None
2423 try:
2424 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2425 except ValidationError as e:
2426 datastoreErrorStr = str(e)
2428 # Also check that the LookupKeys used by the datastores match
2429 # registry and storage class definitions
2430 keys = self.datastore.getLookupKeys()
2432 failedNames = set()
2433 failedDataId = set()
2434 for key in keys:
2435 if key.name is not None:
2436 if key.name in ignore:
2437 continue
2439 # skip if specific datasetType names were requested and this
2440 # name does not match
2441 if datasetTypeNames and key.name not in datasetTypeNames:
2442 continue
2444 # See if it is a StorageClass or a DatasetType
2445 if key.name in self.storageClasses:
2446 pass
2447 else:
2448 try:
2449 self.registry.getDatasetType(key.name)
2450 except KeyError:
2451 if logFailures:
2452 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2453 failedNames.add(key)
2454 else:
2455 # Dimensions are checked for consistency when the Butler
2456 # is created and rendezvoused with a universe.
2457 pass
2459 # Check that the instrument is a valid instrument
2460 # Currently only support instrument so check for that
2461 if key.dataId:
2462 dataIdKeys = set(key.dataId)
2463 if set(["instrument"]) != dataIdKeys:
2464 if logFailures:
2465 log.critical("Key '%s' has unsupported DataId override", key)
2466 failedDataId.add(key)
2467 elif key.dataId["instrument"] not in instruments:
2468 if logFailures:
2469 log.critical("Key '%s' has unknown instrument", key)
2470 failedDataId.add(key)
2472 messages = []
2474 if datastoreErrorStr:
2475 messages.append(datastoreErrorStr)
2477 for failed, msg in (
2478 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2479 (failedDataId, "Keys with bad DataId entries: "),
2480 ):
2481 if failed:
2482 msg += ", ".join(str(k) for k in failed)
2483 messages.append(msg)
2485 if messages:
2486 raise ValidationError(";\n".join(messages))
2488 @property
2489 def collections(self) -> CollectionSearch:
2490 """The collections to search by default, in order (`CollectionSearch`).
2492 This is an alias for ``self.registry.defaults.collections``. It cannot
2493 be set directly in isolation, but all defaults may be changed together
2494 by assigning a new `RegistryDefaults` instance to
2495 ``self.registry.defaults``.
2496 """
2497 return self.registry.defaults.collections
2499 @property
2500 def run(self) -> Optional[str]:
2501 """Name of the run this butler writes outputs to by default (`str` or
2502 `None`).
2504 This is an alias for ``self.registry.defaults.run``. It cannot be set
2505 directly in isolation, but all defaults may be changed together by
2506 assigning a new `RegistryDefaults` instance to
2507 ``self.registry.defaults``.
2508 """
2509 return self.registry.defaults.run
2511 @property
2512 def dimensions(self) -> DimensionUniverse:
2513 # Docstring inherited.
2514 return self.registry.dimensions
2516 registry: Registry
2517 """The object that manages dataset metadata and relationships (`Registry`).
2519 Most operations that don't involve reading or writing butler datasets are
2520 accessible only via `Registry` methods.
2521 """
2523 datastore: Datastore
2524 """The object that manages actual dataset storage (`Datastore`).
2526 Direct user access to the datastore should rarely be necessary; the primary
2527 exception is the case where a `Datastore` implementation provides extra
2528 functionality beyond what the base class defines.
2529 """
2531 storageClasses: StorageClassFactory
2532 """An object that maps known storage class names to objects that fully
2533 describe them (`StorageClassFactory`).
2534 """
2536 _allow_put_of_predefined_dataset: bool
2537 """Allow a put to succeed even if there is already a registry entry for it
2538 but not a datastore record. (`bool`)."""