Coverage for python/lsst/daf/butler/_butler.py: 9%
666 statements
coverage.py v6.4.1, created at 2022-06-17 02:08 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41from collections import defaultdict
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_class_of
62from lsst.utils.logging import VERBOSE, getLogger
64from ._butlerConfig import ButlerConfig
65from ._butlerRepoIndex import ButlerRepoIndex
66from ._deferredDatasetHandle import DeferredDatasetHandle
67from ._limited_butler import LimitedButler
68from .core import (
69 AmbiguousDatasetError,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetType,
77 Datastore,
78 Dimension,
79 DimensionConfig,
80 DimensionUniverse,
81 FileDataset,
82 Progress,
83 StorageClassFactory,
84 Timespan,
85 ValidationError,
86)
87from .core.repoRelocation import BUTLER_ROOT_TAG
88from .core.utils import transactional
89from .registry import (
90 CollectionSearch,
91 CollectionType,
92 ConflictingDefinitionError,
93 DataIdError,
94 DatasetIdGenEnum,
95 Registry,
96 RegistryConfig,
97 RegistryDefaults,
98)
99from .transfers import RepoExportContext
101log = getLogger(__name__)
104class ButlerValidationError(ValidationError):
105 """There is a problem with the Butler configuration."""
107 pass
110class PruneCollectionsArgsError(TypeError):
111 """Base class for errors relating to Butler.pruneCollections input
112 arguments.
113 """
115 pass
118class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
119 """Raised when ``purge=True`` is passed but ``unstore`` is `False`:
120 purging a collection also requires ``unstore=True``.
121 """
123 def __init__(self) -> None:
124 super().__init__("Cannot pass purge=True without unstore=True.")
127class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
128 """Raised when pruning a RUN collection but purge is False."""
130 def __init__(self, collectionType: CollectionType):
131 self.collectionType = collectionType
132 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
135class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
136 """Raised when purge is True but is not supported for the given
137 collection."""
139 def __init__(self, collectionType: CollectionType):
140 self.collectionType = collectionType
141 super().__init__(
142 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
143 )
146class Butler(LimitedButler):
147 """Main entry point for the data access system.
149 Parameters
150 ----------
151 config : `ButlerConfig`, `Config` or `str`, optional.
152 Configuration. Anything acceptable to the
153 `ButlerConfig` constructor. If a directory path
154 is given the configuration will be read from a ``butler.yaml`` file in
155 that location. If `None` is given default values will be used.
156 butler : `Butler`, optional.
157 If provided, construct a new Butler that uses the same registry and
158 datastore as the given one, but with the given collection and run.
159 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
160 arguments.
161 collections : `str` or `Iterable` [ `str` ], optional
162 An expression specifying the collections to be searched (in order) when
163 reading datasets.
164 This may be a `str` collection name or an iterable thereof.
165 See :ref:`daf_butler_collection_expressions` for more information.
166 These collections are not registered automatically and must be
167 manually registered before they are used by any method, but they may be
168 manually registered after the `Butler` is initialized.
169 run : `str`, optional
170 Name of the `~CollectionType.RUN` collection new datasets should be
171 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
172 ``collections`` will be set to ``[run]``. If not `None`, this
173 collection will automatically be registered. If this is not set (and
174 ``writeable`` is not set either), a read-only butler will be created.
175 searchPaths : `list` of `str`, optional
176 Directory paths to search when calculating the full Butler
177 configuration. Not used if the supplied config is already a
178 `ButlerConfig`.
179 writeable : `bool`, optional
180 Explicitly sets whether the butler supports write operations. If not
181 provided, a read-write butler is created if any of ``run``, ``tags``,
182 or ``chains`` is non-empty.
183 inferDefaults : `bool`, optional
184 If `True` (default) infer default data ID values from the values
185 present in the datasets in ``collections``: if all collections have the
186 same value (or no value) for a governor dimension, that value will be
187 the default for that dimension. Nonexistent collections are ignored.
188 If a default value is provided explicitly for a governor dimension via
189 ``**kwargs``, no default will be inferred for that dimension.
190 **kwargs : `str`
191 Default data ID key-value pairs. These may only identify "governor"
192 dimensions like ``instrument`` and ``skymap``.
194 Examples
195 --------
196 While there are many ways to control exactly how a `Butler` interacts with
197 the collections in its `Registry`, the most common cases are still simple.
199 For a read-only `Butler` that searches one collection, do::
201 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
203 For a read-write `Butler` that writes to and reads from a
204 `~CollectionType.RUN` collection::
206 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
208 The `Butler` passed to a ``PipelineTask`` is often much more complex,
209 because we want to write to one `~CollectionType.RUN` collection but read
210 from several others (as well)::
212 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
213 collections=["u/alice/DM-50000/a",
214 "u/bob/DM-49998",
215 "HSC/defaults"])
217 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
218 Datasets will be read first from that run (since it appears first in the
219 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
221 Finally, one can always create a `Butler` with no collections::
223 butler = Butler("/path/to/repo", writeable=True)
225 This can be extremely useful when you just want to use ``butler.registry``,
226 e.g. for inserting dimension data or managing collections, or when the
227 collections you want to use with the butler are not consistent.
228 Passing ``writeable`` explicitly here is only necessary if you want to be
229 able to make changes to the repo - usually the value for ``writeable`` can
230 be guessed from the collection arguments provided, but it defaults to
231 `False` when there are no collection arguments.
232 """
234 def __init__(
235 self,
236 config: Union[Config, str, None] = None,
237 *,
238 butler: Optional[Butler] = None,
239 collections: Any = None,
240 run: Optional[str] = None,
241 searchPaths: Optional[List[str]] = None,
242 writeable: Optional[bool] = None,
243 inferDefaults: bool = True,
244 **kwargs: str,
245 ):
246 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
247 # Load registry, datastore, etc. from config or existing butler.
248 if butler is not None:
249 if config is not None or searchPaths is not None or writeable is not None:
250 raise TypeError(
251 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
252 )
253 self.registry = butler.registry.copy(defaults)
254 self.datastore = butler.datastore
255 self.storageClasses = butler.storageClasses
256 self._config: ButlerConfig = butler._config
257 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
258 else:
259 # Can only look for strings in the known repos list.
260 if isinstance(config, str) and config in self.get_known_repos():
261 config = str(self.get_repo_uri(config))
262 try:
263 self._config = ButlerConfig(config, searchPaths=searchPaths)
264 except FileNotFoundError as e:
265 if known := self.get_known_repos():
266 aliases = f"(known aliases: {', '.join(known)})"
267 else:
268 aliases = "(no known aliases)"
269 raise FileNotFoundError(f"{e} {aliases}") from e
271 try:
272 if "root" in self._config:
273 butlerRoot = self._config["root"]
274 else:
275 butlerRoot = self._config.configDir
276 if writeable is None:
277 writeable = run is not None
278 self.registry = Registry.fromConfig(
279 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
280 )
281 self.datastore = Datastore.fromConfig(
282 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
283 )
284 self.storageClasses = StorageClassFactory()
285 self.storageClasses.addFromConfig(self._config)
286 self._allow_put_of_predefined_dataset = self._config.get(
287 "allow_put_of_predefined_dataset", False
288 )
289 except Exception:
290 # Failures here usually mean that configuration is incomplete,
291 # just issue an error message which includes config file URI.
292 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
293 raise
295 if "run" in self._config or "collection" in self._config:
296 raise ValueError("Passing a run or collection via configuration is no longer supported.")
298 GENERATION: ClassVar[int] = 3
299 """This is a Generation 3 Butler.
301 This attribute may be removed in the future, once the Generation 2 Butler
302 interface has been fully retired; it should only be used in transitional
303 code.
304 """
306 @classmethod
307 def get_repo_uri(cls, label: str) -> ResourcePath:
308 """Look up the label in a butler repository index.
310 Parameters
311 ----------
312 label : `str`
313 Label of the Butler repository to look up.
315 Returns
316 -------
317 uri : `lsst.resources.ResourcePath`
318 URI to the Butler repository associated with the given label.
320 Raises
321 ------
322 KeyError
323 Raised if the label is not found in the index, or if an index
324 can not be found at all.
326 Notes
327 -----
328 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
329 information is discovered.
330 """
331 return ButlerRepoIndex.get_repo_uri(label)
333 @classmethod
334 def get_known_repos(cls) -> Set[str]:
335 """Retrieve the list of known repository labels.
337 Returns
338 -------
339 repos : `set` of `str`
340 All the known labels. Can be empty if no index can be found.
342 Notes
343 -----
344 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
345 information is discovered.
346 """
347 return ButlerRepoIndex.get_known_repos()
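# Usage sketch for the repository-index lookups above. The alias "main" is an
# illustrative assumption; real labels come from the butler repository index
# described in `~lsst.daf.butler.ButlerRepoIndex`.
#
#     from lsst.daf.butler import Butler
#
#     if "main" in Butler.get_known_repos():
#         uri = Butler.get_repo_uri("main")
#         butler = Butler(str(uri), writeable=False)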
349 @staticmethod
350 def makeRepo(
351 root: ResourcePathExpression,
352 config: Union[Config, str, None] = None,
353 dimensionConfig: Union[Config, str, None] = None,
354 standalone: bool = False,
355 searchPaths: Optional[List[str]] = None,
356 forceConfigRoot: bool = True,
357 outfile: Optional[ResourcePathExpression] = None,
358 overwrite: bool = False,
359 ) -> Config:
360 """Create an empty data repository by adding a butler.yaml config
361 to a repository root directory.
363 Parameters
364 ----------
365 root : `lsst.resources.ResourcePathExpression`
366 Path or URI to the root location of the new repository. Will be
367 created if it does not exist.
368 config : `Config` or `str`, optional
369 Configuration to write to the repository, after setting any
370 root-dependent Registry or Datastore config options. Can not
371 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
372 configuration will be used. Root-dependent config options
373 specified in this config are overwritten if ``forceConfigRoot``
374 is `True`.
375 dimensionConfig : `Config` or `str`, optional
376 Configuration for dimensions, will be used to initialize registry
377 database.
378 standalone : `bool`
379 If True, write all expanded defaults, not just customized or
380 repository-specific settings.
381 This (mostly) decouples the repository from the default
382 configuration, insulating it from changes to the defaults (which
383 may be good or bad, depending on the nature of the changes).
384 Future *additions* to the defaults will still be picked up when
385 initializing `Butlers` to repos created with ``standalone=True``.
386 searchPaths : `list` of `str`, optional
387 Directory paths to search when calculating the full butler
388 configuration.
389 forceConfigRoot : `bool`, optional
390 If `False`, any values present in the supplied ``config`` that
391 would normally be reset are not overridden and will appear
392 directly in the output config. This allows non-standard overrides
393 of the root directory for a datastore or registry to be given.
394 If this parameter is `True` the values for ``root`` will be
395 forced into the resulting config if appropriate.
396 outfile : `lsst.resources.ResourcePathExpression`, optional
397 If not `None`, the output configuration will be written to this
398 location rather than into the repository itself. Can be a URI
399 string. Can refer to a directory that will be used to write
400 ``butler.yaml``.
401 overwrite : `bool`, optional
402 Create a new configuration file even if one already exists
403 in the specified output location. Default is to raise
404 an exception.
406 Returns
407 -------
408 config : `Config`
409 The updated `Config` instance written to the repo.
411 Raises
412 ------
413 ValueError
414 Raised if a ButlerConfig or ConfigSubset is passed instead of a
415 regular Config (as these subclasses would make it impossible to
416 support ``standalone=False``).
417 FileExistsError
418 Raised if the output config file already exists.
419 os.error
420 Raised if the directory does not exist, exists but is not a
421 directory, or cannot be created.
423 Notes
424 -----
425 Note that when ``standalone=False`` (the default), the configuration
426 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
427 construct the repository should also be used to construct any Butlers
428 to avoid configuration inconsistencies.
429 """
430 if isinstance(config, (ButlerConfig, ConfigSubset)):
431 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
433 # Ensure that the root of the repository exists or can be made
434 root_uri = ResourcePath(root, forceDirectory=True)
435 root_uri.mkdir()
437 config = Config(config)
439 # If we are creating a new repo from scratch with relative roots,
440 # do not propagate an explicit root from the config file
441 if "root" in config:
442 del config["root"]
444 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
445 imported_class = doImportType(full["datastore", "cls"])
446 if not issubclass(imported_class, Datastore):
447 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
448 datastoreClass: Type[Datastore] = imported_class
449 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
451 # if key exists in given config, parse it, otherwise parse the defaults
452 # in the expanded config
453 if config.get(("registry", "db")):
454 registryConfig = RegistryConfig(config)
455 else:
456 registryConfig = RegistryConfig(full)
457 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
458 if defaultDatabaseUri is not None:
459 Config.updateParameters(
460 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
461 )
462 else:
463 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
465 if standalone:
466 config.merge(full)
467 else:
468 # Always expand the registry.managers section into the per-repo
469 # config, because after the database schema is created, it's not
470 # allowed to change anymore. Note that in the standalone=True
471 # branch, _everything_ in the config is expanded, so there's no
472 # need to special case this.
473 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
474 configURI: ResourcePathExpression
475 if outfile is not None:
476 # When writing to a separate location we must include
477 # the root of the butler repo in the config else it won't know
478 # where to look.
479 config["root"] = root_uri.geturl()
480 configURI = outfile
481 else:
482 configURI = root_uri
483 config.dumpToUri(configURI, overwrite=overwrite)
485 # Create Registry and populate tables
486 registryConfig = RegistryConfig(config.get("registry"))
487 dimensionConfig = DimensionConfig(dimensionConfig)
488 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
490 log.verbose("Wrote new Butler configuration file to %s", configURI)
492 return config
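# Usage sketch for makeRepo. The repository root is an illustrative
# assumption; with dimensionConfig=None the default dimension configuration
# is used when creating the registry.
#
#     from lsst.daf.butler import Butler
#
#     Butler.makeRepo("/tmp/demo_repo")
#     butler = Butler("/tmp/demo_repo", run="demo/run")  # run is auto-registered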
494 @classmethod
495 def _unpickle(
496 cls,
497 config: ButlerConfig,
498 collections: Optional[CollectionSearch],
499 run: Optional[str],
500 defaultDataId: Dict[str, str],
501 writeable: bool,
502 ) -> Butler:
503 """Callable used to unpickle a Butler.
505 We prefer not to use ``Butler.__init__`` directly so we can force some
506 of its many arguments to be keyword-only (note that ``__reduce__``
507 can only invoke callables with positional arguments).
509 Parameters
510 ----------
511 config : `ButlerConfig`
512 Butler configuration, already coerced into a true `ButlerConfig`
513 instance (and hence after any search paths for overrides have been
514 utilized).
515 collections : `CollectionSearch`
516 Names of the default collections to read from.
517 run : `str`, optional
518 Name of the default `~CollectionType.RUN` collection to write to.
519 defaultDataId : `dict` [ `str`, `str` ]
520 Default data ID values.
521 writeable : `bool`
522 Whether the Butler should support write operations.
524 Returns
525 -------
526 butler : `Butler`
527 A new `Butler` instance.
528 """
529 # MyPy doesn't recognize that the kwargs below are totally valid; it
530 # seems to think ``**defaultDataId`` is a _positional_ argument!
531 return cls(
532 config=config,
533 collections=collections,
534 run=run,
535 writeable=writeable,
536 **defaultDataId, # type: ignore
537 )
539 def __reduce__(self) -> tuple:
540 """Support pickling."""
541 return (
542 Butler._unpickle,
543 (
544 self._config,
545 self.collections,
546 self.run,
547 self.registry.defaults.dataId.byName(),
548 self.registry.isWriteable(),
549 ),
550 )
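# Pickling sketch: __reduce__ above routes through _unpickle, preserving the
# config, default collections/run, default data ID, and writeability.
# Assumes a `butler` instance already exists.
#
#     import pickle
#
#     clone = pickle.loads(pickle.dumps(butler))
#     assert clone.isWriteable() == butler.isWriteable()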
552 def __str__(self) -> str:
553 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
554 self.collections, self.run, self.datastore, self.registry
555 )
557 def isWriteable(self) -> bool:
558 """Return `True` if this `Butler` supports write operations."""
559 return self.registry.isWriteable()
561 @contextlib.contextmanager
562 def transaction(self) -> Iterator[None]:
563 """Context manager supporting `Butler` transactions.
565 Transactions can be nested.
566 """
567 with self.registry.transaction():
568 with self.datastore.transaction():
569 yield
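# Transaction sketch: registry and datastore operations inside the context
# commit or roll back together. The dataset type "calexp", the data ID values
# and the run name are illustrative assumptions.
#
#     with butler.transaction():
#         butler.put(exposure, "calexp", instrument="HSC", visit=1234,
#                    detector=42, run="u/alice/tmp")
#         # An exception raised before the block exits rolls back both the
#         # registry insert and the datastore write.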
571 def _standardizeArgs(
572 self,
573 datasetRefOrType: Union[DatasetRef, DatasetType, str],
574 dataId: Optional[DataId] = None,
575 for_put: bool = True,
576 **kwargs: Any,
577 ) -> Tuple[DatasetType, Optional[DataId]]:
578 """Standardize the arguments passed to several Butler APIs.
580 Parameters
581 ----------
582 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
583 When `DatasetRef` the `dataId` should be `None`.
584 Otherwise the `DatasetType` or name thereof.
585 dataId : `dict` or `DataCoordinate`
586 A `dict` of `Dimension` link name, value pairs that label the
587 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
588 should be provided as the second argument.
589 for_put : `bool`, optional
590 If `True` this call is invoked as part of a `Butler.put()`.
591 Otherwise it is assumed to be part of a `Butler.get()`. This
592 parameter is only relevant if there is dataset type
593 inconsistency.
594 **kwargs
595 Additional keyword arguments used to augment or construct a
596 `DataCoordinate`. See `DataCoordinate.standardize`
597 parameters.
599 Returns
600 -------
601 datasetType : `DatasetType`
602 A `DatasetType` instance extracted from ``datasetRefOrType``.
603 dataId : `dict` or `DataId`, optional
604 Argument that can be used (along with ``kwargs``) to construct a
605 `DataId`.
607 Notes
608 -----
609 Butler APIs that conceptually need a DatasetRef also allow passing a
610 `DatasetType` (or the name of one) and a `DataId` (or a dict and
611 keyword arguments that can be used to construct one) separately. This
612 method accepts those arguments and always returns a true `DatasetType`
613 and a `DataId` or `dict`.
615 Standardization of `dict` vs `DataId` is best handled by passing the
616 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
617 generally similarly flexible.
618 """
619 externalDatasetType: Optional[DatasetType] = None
620 internalDatasetType: Optional[DatasetType] = None
621 if isinstance(datasetRefOrType, DatasetRef):
622 if dataId is not None or kwargs:
623 raise ValueError("DatasetRef given, cannot use dataId as well")
624 externalDatasetType = datasetRefOrType.datasetType
625 dataId = datasetRefOrType.dataId
626 else:
627 # Don't check whether DataId is provided, because Registry APIs
628 # can usually construct a better error message when it wasn't.
629 if isinstance(datasetRefOrType, DatasetType):
630 externalDatasetType = datasetRefOrType
631 else:
632 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
634 # Check that they are self-consistent
635 if externalDatasetType is not None:
636 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
637 if externalDatasetType != internalDatasetType:
638 # We can allow differences if they are compatible, depending
639 # on whether this is a get or a put. A get requires that
640 # the python type associated with the datastore can be
641 # converted to the user type. A put requires that the user
642 # supplied python type can be converted to the internal
643 # type expected by registry.
644 relevantDatasetType = internalDatasetType
645 if for_put:
646 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
647 else:
648 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
649 relevantDatasetType = externalDatasetType
650 if not is_compatible:
651 raise ValueError(
652 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
653 f"registry definition ({internalDatasetType})"
654 )
655 # Override the internal definition.
656 internalDatasetType = relevantDatasetType
658 assert internalDatasetType is not None
659 return internalDatasetType, dataId
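# The flexibility described above means these calls are equivalent ways of
# identifying the same dataset (names and values are illustrative
# assumptions; `ref` is a resolved DatasetRef):
#
#     butler.get("raw", {"instrument": "HSC", "exposure": 903334, "detector": 20})
#     butler.get("raw", instrument="HSC", exposure=903334, detector=20)
#     butler.get(ref)  # no separate dataId allowed in this form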
661 def _rewrite_data_id(
662 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
663 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
664 """Rewrite a data ID taking into account dimension records.
666 Take a Data ID and keyword args and rewrite it if necessary to
667 allow the user to specify dimension records rather than dimension
668 primary values.
670 This allows a user to include a dataId dict with keys of
671 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
672 the integer exposure ID. It also allows a string to be given
673 for a dimension value rather than the integer ID if that is more
674 convenient. For example, rather than having to specify the
675 detector with ``detector.full_name``, a string given for ``detector``
676 will be interpreted as the full name and converted to the integer
677 value.
679 Keyword arguments can also use strings for dimensions like detector
680 and exposure but python does not allow them to include ``.`` and
681 so the ``exposure.day_obs`` syntax can not be used in a keyword
682 argument.
684 Parameters
685 ----------
686 dataId : `dict` or `DataCoordinate`
687 A `dict` of `Dimension` link name, value pairs that will label the
688 `DatasetRef` within a Collection.
689 datasetType : `DatasetType`
690 The dataset type associated with this dataId. Required to
691 determine the relevant dimensions.
692 **kwargs
693 Additional keyword arguments used to augment or construct a
694 `DataId`. See `DataId` parameters.
696 Returns
697 -------
698 dataId : `dict` or `DataCoordinate`
699 The possibly rewritten dataId. If given a `DataCoordinate` and
700 no keyword arguments, the original dataId will be returned
701 unchanged.
702 **kwargs : `dict`
703 Any unused keyword arguments (would normally be empty dict).
704 """
705 # Do nothing if we have a standalone DataCoordinate.
706 if isinstance(dataId, DataCoordinate) and not kwargs:
707 return dataId, kwargs
709 # Process dimension records that are using record information
710 # rather than ids
711 newDataId: Dict[str, DataIdValue] = {}
712 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
714 # if all the dataId comes from keyword parameters we do not need
715 # to do anything here because they can't be of the form
716 # exposure.obs_id because a "." is not allowed in a keyword parameter.
717 if dataId:
718 for k, v in dataId.items():
719 # If we have a Dimension we do not need to do anything
720 # because it cannot be a compound key.
721 if isinstance(k, str) and "." in k:
722 # Someone is using a more human-readable dataId
723 dimensionName, record = k.split(".", 1)
724 byRecord[dimensionName][record] = v
725 elif isinstance(k, Dimension):
726 newDataId[k.name] = v
727 else:
728 newDataId[k] = v
730 # Go through the updated dataId and check the type in case someone is
731 # using an alternate key. We have already filtered out the compound
732 # keys dimensions.record format.
733 not_dimensions = {}
735 # Will need to look in the dataId and the keyword arguments
736 # and will remove them if they need to be fixed or are unrecognized.
737 for dataIdDict in (newDataId, kwargs):
738 # Use a list so we can adjust the dict safely in the loop
739 for dimensionName in list(dataIdDict):
740 value = dataIdDict[dimensionName]
741 try:
742 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
743 except KeyError:
744 # This is not a real dimension
745 not_dimensions[dimensionName] = value
746 del dataIdDict[dimensionName]
747 continue
749 # Convert an integral type to an explicit int to simplify
750 # comparisons here
751 if isinstance(value, numbers.Integral):
752 value = int(value)
754 if not isinstance(value, dimension.primaryKey.getPythonType()):
755 for alternate in dimension.alternateKeys:
756 if isinstance(value, alternate.getPythonType()):
757 byRecord[dimensionName][alternate.name] = value
758 del dataIdDict[dimensionName]
759 log.debug(
760 "Converting dimension %s to %s.%s=%s",
761 dimensionName,
762 dimensionName,
763 alternate.name,
764 value,
765 )
766 break
767 else:
768 log.warning(
769 "Type mismatch found for value '%r' provided for dimension %s. "
770 "Could not find matching alternative (primary key has type %s) "
771 "so attempting to use as-is.",
772 value,
773 dimensionName,
774 dimension.primaryKey.getPythonType(),
775 )
777 # By this point kwargs and newDataId should only include valid
778 # dimensions. Merge kwargs in to the new dataId and log if there
779 # are dimensions in both (rather than calling update).
780 for k, v in kwargs.items():
781 if k in newDataId and newDataId[k] != v:
782 log.debug(
783 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
784 )
785 newDataId[k] = v
786 # No need to retain any values in kwargs now.
787 kwargs = {}
789 # If we have some unrecognized dimensions we have to try to connect
790 # them to records in other dimensions. This is made more complicated
791 # by some dimensions having records with clashing names. A mitigation
792 # is that we can tell by this point which dimensions are missing
793 # for the DatasetType but this does not work for calibrations
794 # where additional dimensions can be used to constrain the temporal
795 # axis.
796 if not_dimensions:
797 # Search for all dimensions even if we have been given a value
798 # explicitly. In some cases records are given as well as the
799 actual dimension and this should not be an error if they
800 # match.
801 mandatoryDimensions = datasetType.dimensions.names # - provided
803 candidateDimensions: Set[str] = set()
804 candidateDimensions.update(mandatoryDimensions)
806 # For calibrations we may well be needing temporal dimensions
807 # so rather than always including all dimensions in the scan
808 # restrict things a little. It is still possible for there
809 # to be confusion over day_obs in visit vs exposure for example.
810 # If we are not searching calibration collections things may
811 # fail but they are going to fail anyway because of the
812 ambiguity of the dataId...
813 if datasetType.isCalibration():
814 for dim in self.registry.dimensions.getStaticDimensions():
815 if dim.temporal:
816 candidateDimensions.add(str(dim))
818 # Look up table for the first association with a dimension
819 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
821 # Keep track of whether an item is associated with multiple
822 # dimensions.
823 counter: Counter[str] = Counter()
824 assigned: Dict[str, Set[str]] = defaultdict(set)
826 # Go through the missing dimensions and associate the
827 # given names with records within those dimensions
828 matched_dims = set()
829 for dimensionName in candidateDimensions:
830 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
831 fields = dimension.metadata.names | dimension.uniqueKeys.names
832 for field in not_dimensions:
833 if field in fields:
834 guessedAssociation[dimensionName][field] = not_dimensions[field]
835 counter[dimensionName] += 1
836 assigned[field].add(dimensionName)
837 matched_dims.add(field)
839 # Calculate the fields that matched nothing.
840 never_found = set(not_dimensions) - matched_dims
842 if never_found:
843 raise ValueError(f"Unrecognized keyword args given: {never_found}")
845 # There is a chance we have allocated a single dataId item
846 # to multiple dimensions. Need to decide which should be retained.
847 # For now assume that the most popular alternative wins.
848 # This means that day_obs with seq_num will result in
849 # exposure.day_obs and not visit.day_obs
850 # Also prefer an explicitly missing dimension over an inferred
851 # temporal dimension.
852 for fieldName, assignedDimensions in assigned.items():
853 if len(assignedDimensions) > 1:
854 # Pick the most popular (preferring mandatory dimensions)
855 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
856 if requiredButMissing:
857 candidateDimensions = requiredButMissing
858 else:
859 candidateDimensions = assignedDimensions
861 # If this is a choice between visit and exposure and
862 # neither was a required part of the dataset type,
863 # (hence in this branch) always prefer exposure over
864 # visit since exposures are always defined and visits
865 # are defined from exposures.
866 if candidateDimensions == {"exposure", "visit"}:
867 candidateDimensions = {"exposure"}
869 # Select the relevant items and get a new restricted
870 # counter.
871 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
872 duplicatesCounter: Counter[str] = Counter()
873 duplicatesCounter.update(theseCounts)
875 # Choose the most common. If they are equally common
876 # we will pick the one that was found first.
877 # Returns a list of tuples
878 selected = duplicatesCounter.most_common(1)[0][0]
880 log.debug(
881 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
882 " Removed ambiguity by choosing dimension %s.",
883 fieldName,
884 ", ".join(assignedDimensions),
885 selected,
886 )
888 for candidateDimension in assignedDimensions:
889 if candidateDimension != selected:
890 del guessedAssociation[candidateDimension][fieldName]
892 # Update the record look up dict with the new associations
893 for dimensionName, values in guessedAssociation.items():
894 if values: # A dict might now be empty
895 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
896 byRecord[dimensionName].update(values)
898 if byRecord:
899 # Some record specifiers were found so we need to convert
900 # them to the Id form
901 for dimensionName, values in byRecord.items():
902 if dimensionName in newDataId:
903 log.debug(
904 "DataId specified explicit %s dimension value of %s in addition to"
905 " general record specifiers for it of %s. Ignoring record information.",
906 dimensionName,
907 newDataId[dimensionName],
908 str(values),
909 )
910 # Get the actual record and compare with these values.
911 try:
912 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
913 except DataIdError:
914 raise ValueError(
915 f"Could not find dimension '{dimensionName}'"
916 f" with dataId {newDataId} as part of comparing with"
917 f" record values {byRecord[dimensionName]}"
918 ) from None
919 if len(recs) == 1:
920 errmsg: List[str] = []
921 for k, v in values.items():
922 if (recval := getattr(recs[0], k)) != v:
923 errmsg.append(f"{k}({recval} != {v})")
924 if errmsg:
925 raise ValueError(
926 f"Dimension {dimensionName} in dataId has explicit value"
927 " inconsistent with records: " + ", ".join(errmsg)
928 )
929 else:
930 # Multiple matches for an explicit dimension
931 # should never happen but let downstream complain.
932 pass
933 continue
935 # Build up a WHERE expression
936 bind = {k: v for k, v in values.items()}
937 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
939 # Hopefully we get a single record that matches
940 records = set(
941 self.registry.queryDimensionRecords(
942 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
943 )
944 )
946 if len(records) != 1:
947 if len(records) > 1:
948 # visit can have an ambiguous answer without involving
949 # visit_system. The default visit_system is defined
950 # by the instrument.
951 if (
952 dimensionName == "visit"
953 and "visit_system_membership" in self.registry.dimensions
954 and "visit_system"
955 in self.registry.dimensions["instrument"].metadata # type: ignore
956 ):
957 instrument_records = list(
958 self.registry.queryDimensionRecords(
959 "instrument",
960 dataId=newDataId,
961 **kwargs,
962 )
963 )
964 if len(instrument_records) == 1:
965 visit_system = instrument_records[0].visit_system
966 if visit_system is None:
967 # Set to a value that will never match.
968 visit_system = -1
970 # Look up each visit in the
971 # visit_system_membership records.
972 for rec in records:
973 membership = list(
974 self.registry.queryDimensionRecords(
975 # Use bind to allow zero results.
976 # This is a fully-specified query.
977 "visit_system_membership",
978 where="instrument = inst AND visit_system = system AND visit = v",
979 bind=dict(
980 inst=instrument_records[0].name, system=visit_system, v=rec.id
981 ),
982 )
983 )
984 if membership:
985 # This record is the right answer.
986 records = set([rec])
987 break
989 # The ambiguity may have been resolved so check again.
990 if len(records) > 1:
991 log.debug("Received %d records from constraints of %s", len(records), str(values))
992 for r in records:
993 log.debug("- %s", str(r))
994 raise ValueError(
995 f"DataId specification for dimension {dimensionName} is not"
996 f" uniquely constrained to a single dataset by {values}."
997 f" Got {len(records)} results."
998 )
999 else:
1000 raise ValueError(
1001 f"DataId specification for dimension {dimensionName} matched no"
1002 f" records when constrained by {values}"
1003 )
1005 # Get the primary key from the real dimension object
1006 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
1007 if not isinstance(dimension, Dimension):
1008 raise RuntimeError(
1009 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
1010 )
1011 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
1013 return newDataId, kwargs
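# Sketch of the record-based data IDs handled above. The instrument, dataset
# types and record values are illustrative assumptions.
#
#     # Identify an exposure by day_obs/seq_num instead of its integer ID.
#     raw = butler.get(
#         "raw",
#         {"exposure.day_obs": 20220101, "exposure.seq_num": 17},
#         instrument="LATISS",
#         detector=0,
#     )
#     # A string given for detector is matched against the alternate
#     # full_name key and converted to the integer detector ID.
#     raw2 = butler.get("raw", instrument="HSC", detector="1_53", exposure=903334)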
1015 def _findDatasetRef(
1016 self,
1017 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1018 dataId: Optional[DataId] = None,
1019 *,
1020 collections: Any = None,
1021 allowUnresolved: bool = False,
1022 **kwargs: Any,
1023 ) -> DatasetRef:
1024 """Shared logic for methods that start with a search for a dataset in
1025 the registry.
1027 Parameters
1028 ----------
1029 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1030 When `DatasetRef` the `dataId` should be `None`.
1031 Otherwise the `DatasetType` or name thereof.
1032 dataId : `dict` or `DataCoordinate`, optional
1033 A `dict` of `Dimension` link name, value pairs that label the
1034 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1035 should be provided as the first argument.
1036 collections : Any, optional
1037 Collections to be searched, overriding ``self.collections``.
1038 Can be any of the types supported by the ``collections`` argument
1039 to butler construction.
1040 allowUnresolved : `bool`, optional
1041 If `True`, return an unresolved `DatasetRef` if finding a resolved
1042 one in the `Registry` fails. Defaults to `False`.
1043 **kwargs
1044 Additional keyword arguments used to augment or construct a
1045 `DataId`. See `DataId` parameters.
1047 Returns
1048 -------
1049 ref : `DatasetRef`
1050 A reference to the dataset identified by the given arguments.
1052 Raises
1053 ------
1054 LookupError
1055 Raised if no matching dataset exists in the `Registry` (and
1056 ``allowUnresolved is False``).
1057 ValueError
1058 Raised if a resolved `DatasetRef` was passed as an input, but it
1059 differs from the one found in the registry.
1060 TypeError
1061 Raised if no collections were provided.
1062 """
1063 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1064 if isinstance(datasetRefOrType, DatasetRef):
1065 idNumber = datasetRefOrType.id
1066 else:
1067 idNumber = None
1068 timespan: Optional[Timespan] = None
1070 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1072 if datasetType.isCalibration():
1073 # Because this is a calibration dataset, first try to
1074 # standardize the data ID without restricting the dimensions to
1075 # those of the dataset type requested, because there may be extra
1076 # dimensions that provide temporal information for a validity-range
1077 # lookup.
1078 dataId = DataCoordinate.standardize(
1079 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1080 )
1081 if dataId.graph.temporal:
1082 dataId = self.registry.expandDataId(dataId)
1083 timespan = dataId.timespan
1084 else:
1085 # Standardize the data ID to just the dimensions of the dataset
1086 # type instead of letting registry.findDataset do it, so we get the
1087 # result even if no dataset is found.
1088 dataId = DataCoordinate.standardize(
1089 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1090 )
1091 # Always lookup the DatasetRef, even if one is given, to ensure it is
1092 # present in the current collection.
1093 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1094 if ref is None:
1095 if allowUnresolved:
1096 return DatasetRef(datasetType, dataId)
1097 else:
1098 if collections is None:
1099 collections = self.registry.defaults.collections
1100 raise LookupError(
1101 f"Dataset {datasetType.name} with data ID {dataId} "
1102 f"could not be found in collections {collections}."
1103 )
1104 if idNumber is not None and idNumber != ref.id:
1105 if collections is None:
1106 collections = self.registry.defaults.collections
1107 raise ValueError(
1108 f"DatasetRef.id provided ({idNumber}) does not match "
1109 f"id ({ref.id}) in registry in collections {collections}."
1110 )
1111 if datasetType != ref.datasetType:
1112 # If they differ it is because the user explicitly specified
1113 # a compatible dataset type to this call rather than using the
1114 # registry definition. The DatasetRef must therefore be recreated
1115 # using the user definition such that the expected type is
1116 # returned.
1117 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1119 return ref
1121 @transactional
1122 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
1123 # Docstring inherited.
1124 (imported_ref,) = self.registry._importDatasets(
1125 [ref],
1126 expand=True,
1127 )
1128 if imported_ref.id != ref.getCheckedId():
1129 raise RuntimeError("This registry configuration does not support putDirect.")
1130 self.datastore.put(obj, ref)
1131 return ref
1133 @transactional
1134 def put(
1135 self,
1136 obj: Any,
1137 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1138 dataId: Optional[DataId] = None,
1139 *,
1140 run: Optional[str] = None,
1141 **kwargs: Any,
1142 ) -> DatasetRef:
1143 """Store and register a dataset.
1145 Parameters
1146 ----------
1147 obj : `object`
1148 The dataset.
1149 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1150 When `DatasetRef` is provided, ``dataId`` should be `None`.
1151 Otherwise the `DatasetType` or name thereof.
1152 dataId : `dict` or `DataCoordinate`
1153 A `dict` of `Dimension` link name, value pairs that label the
1154 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1155 should be provided as the second argument.
1156 run : `str`, optional
1157 The name of the run the dataset should be added to, overriding
1158 ``self.run``.
1159 **kwargs
1160 Additional keyword arguments used to augment or construct a
1161 `DataCoordinate`. See `DataCoordinate.standardize`
1162 parameters.
1164 Returns
1165 -------
1166 ref : `DatasetRef`
1167 A reference to the stored dataset, updated with the correct id if
1168 given.
1170 Raises
1171 ------
1172 TypeError
1173 Raised if the butler is read-only or if no run has been provided.
1174 """
1175 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1176 if not self.isWriteable():
1177 raise TypeError("Butler is read-only.")
1178 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1179 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1180 raise ValueError("DatasetRef must not be in registry, must have None id")
1182 # Handle dimension records in dataId
1183 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1185 # Add Registry Dataset entry.
1186 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1188 # For an execution butler the datasets will be pre-defined.
1189 # If the butler is configured that way datasets should only be inserted
1190 # if they do not already exist in registry. Trying and catching
1191 # ConflictingDefinitionError will not work because the transaction
1192 # will be corrupted. Instead, in this mode always check first.
1193 ref = None
1194 ref_is_predefined = False
1195 if self._allow_put_of_predefined_dataset:
1196 # Get the matching ref for this run.
1197 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1199 if ref:
1200 # Must be expanded form for datastore templating
1201 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1202 ref = ref.expanded(dataId)
1203 ref_is_predefined = True
1205 if not ref:
1206 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1208 # If the ref is predefined it is possible that the datastore also
1209 # has the record. Asking datastore to put it again will result in
1210 # the artifact being recreated, overwriting previous, then will cause
1211 # a failure in writing the record which will cause the artifact
1212 # to be removed. Much safer to ask first before attempting to
1213 # overwrite. Race conditions should not be an issue for the
1214 # execution butler environment.
1215 if ref_is_predefined:
1216 if self.datastore.knows(ref):
1217 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
1219 self.datastore.put(obj, ref)
1221 return ref
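# put() usage sketch. The dataset type, data ID values and run name are
# illustrative assumptions; the butler must be writeable.
#
#     ref = butler.put(catalog, "src", instrument="HSC", visit=903334,
#                      detector=20, run="u/alice/DM-50000/a")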
1223 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1224 """Retrieve a stored dataset.
1226 Unlike `Butler.get`, this method allows datasets outside the Butler's
1227 collection to be read as long as the `DatasetRef` that identifies them
1228 can be obtained separately.
1230 Parameters
1231 ----------
1232 ref : `DatasetRef`
1233 Resolved reference to an already stored dataset.
1234 parameters : `dict`
1235 Additional StorageClass-defined options to control reading,
1236 typically used to efficiently read only a subset of the dataset.
1238 Returns
1239 -------
1240 obj : `object`
1241 The dataset.
1242 """
1243 return self.datastore.get(ref, parameters=parameters)
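# getDirect() usage sketch: read by resolved DatasetRef, bypassing the
# default collection search. Registry.queryDatasets is assumed here to
# supply resolved refs; the dataset type and collection are illustrative.
#
#     for ref in butler.registry.queryDatasets("calexp", collections="HSC/runs/RC2"):
#         calexp = butler.getDirect(ref)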
1245 def getDirectDeferred(
1246 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
1247 ) -> DeferredDatasetHandle:
1248 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
1249 from a resolved `DatasetRef`.
1251 Parameters
1252 ----------
1253 ref : `DatasetRef`
1254 Resolved reference to an already stored dataset.
1255 parameters : `dict`
1256 Additional StorageClass-defined options to control reading,
1257 typically used to efficiently read only a subset of the dataset.
1259 Returns
1260 -------
1261 obj : `DeferredDatasetHandle`
1262 A handle which can be used to retrieve a dataset at a later time.
1264 Raises
1265 ------
1266 AmbiguousDatasetError
1267 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1268 """
1269 if ref.id is None:
1270 raise AmbiguousDatasetError(
1271 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1272 )
1273 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1275 def getDeferred(
1276 self,
1277 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1278 dataId: Optional[DataId] = None,
1279 *,
1280 parameters: Union[dict, None] = None,
1281 collections: Any = None,
1282 **kwargs: Any,
1283 ) -> DeferredDatasetHandle:
1284 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1285 after an immediate registry lookup.
1287 Parameters
1288 ----------
1289 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1290 When `DatasetRef` the `dataId` should be `None`.
1291 Otherwise the `DatasetType` or name thereof.
1292 dataId : `dict` or `DataCoordinate`, optional
1293 A `dict` of `Dimension` link name, value pairs that label the
1294 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1295 should be provided as the first argument.
1296 parameters : `dict`
1297 Additional StorageClass-defined options to control reading,
1298 typically used to efficiently read only a subset of the dataset.
1299 collections : Any, optional
1300 Collections to be searched, overriding ``self.collections``.
1301 Can be any of the types supported by the ``collections`` argument
1302 to butler construction.
1303 **kwargs
1304 Additional keyword arguments used to augment or construct a
1305 `DataId`. See `DataId` parameters.
1307 Returns
1308 -------
1309 obj : `DeferredDatasetHandle`
1310 A handle which can be used to retrieve a dataset at a later time.
1312 Raises
1313 ------
1314 LookupError
1315 Raised if no matching dataset exists in the `Registry` (and
1316 ``allowUnresolved is False``).
1317 ValueError
1318 Raised if a resolved `DatasetRef` was passed as an input, but it
1319 differs from the one found in the registry.
1320 TypeError
1321 Raised if no collections were provided.
1322 """
1323 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1324 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
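# getDeferred() usage sketch: resolve the registry lookup now, read the data
# later, optionally with StorageClass parameters. The "bbox" parameter name
# is an illustrative assumption tied to the dataset's storage class.
#
#     handle = butler.getDeferred("calexp", data_id, collections="HSC/runs/RC2")
#     cutout = handle.get(parameters={"bbox": bbox})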
1326 def get(
1327 self,
1328 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1329 dataId: Optional[DataId] = None,
1330 *,
1331 parameters: Optional[Dict[str, Any]] = None,
1332 collections: Any = None,
1333 **kwargs: Any,
1334 ) -> Any:
1335 """Retrieve a stored dataset.
1337 Parameters
1338 ----------
1339 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1340 When `DatasetRef` the `dataId` should be `None`.
1341 Otherwise the `DatasetType` or name thereof.
1342 dataId : `dict` or `DataCoordinate`
1343 A `dict` of `Dimension` link name, value pairs that label the
1344 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1345 should be provided as the first argument.
1346 parameters : `dict`
1347 Additional StorageClass-defined options to control reading,
1348 typically used to efficiently read only a subset of the dataset.
1349 collections : Any, optional
1350 Collections to be searched, overriding ``self.collections``.
1351 Can be any of the types supported by the ``collections`` argument
1352 to butler construction.
1353 **kwargs
1354 Additional keyword arguments used to augment or construct a
1355 `DataCoordinate`. See `DataCoordinate.standardize`
1356 parameters.
1358 Returns
1359 -------
1360 obj : `object`
1361 The dataset.
1363 Raises
1364 ------
1365 ValueError
1366 Raised if a resolved `DatasetRef` was passed as an input, but it
1367 differs from the one found in the registry.
1368 LookupError
1369 Raised if no matching dataset exists in the `Registry`.
1370 TypeError
1371 Raised if no collections were provided.
1373 Notes
1374 -----
1375 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1376 this method requires that the given data ID include temporal dimensions
1377 beyond the dimensions of the dataset type itself, in order to find the
1378 dataset with the appropriate validity range. For example, a "bias"
1379 dataset with native dimensions ``{instrument, detector}`` could be
1380 fetched with a ``{instrument, detector, exposure}`` data ID, because
1381 ``exposure`` is a temporal dimension.
1382 """
1383 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1384 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1385 return self.getDirect(ref, parameters=parameters)
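# get() usage sketch for the calibration lookup described in the Notes: the
# extra exposure dimension selects the validity range. Names and values are
# illustrative assumptions.
#
#     bias = butler.get("bias", instrument="HSC", detector=20, exposure=903334,
#                       collections="HSC/calib")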
1387 def getURIs(
1388 self,
1389 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1390 dataId: Optional[DataId] = None,
1391 *,
1392 predict: bool = False,
1393 collections: Any = None,
1394 run: Optional[str] = None,
1395 **kwargs: Any,
1396 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
1397 """Return the URIs associated with the dataset.
1399 Parameters
1400 ----------
1401 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1402 When `DatasetRef` the `dataId` should be `None`.
1403 Otherwise the `DatasetType` or name thereof.
1404 dataId : `dict` or `DataCoordinate`
1405 A `dict` of `Dimension` link name, value pairs that label the
1406 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1407 should be provided as the first argument.
1408 predict : `bool`
1409 If `True`, allow URIs to be returned of datasets that have not
1410 been written.
1411 collections : Any, optional
1412 Collections to be searched, overriding ``self.collections``.
1413 Can be any of the types supported by the ``collections`` argument
1414 to butler construction.
1415 run : `str`, optional
1416 Run to use for predictions, overriding ``self.run``.
1417 **kwargs
1418 Additional keyword arguments used to augment or construct a
1419 `DataCoordinate`. See `DataCoordinate.standardize`
1420 parameters.
1422 Returns
1423 -------
1424 primary : `lsst.resources.ResourcePath`
1425 The URI to the primary artifact associated with this dataset.
1426 If the dataset was disassembled within the datastore this
1427 may be `None`.
1428 components : `dict`
1429 URIs to any components associated with the dataset artifact.
1430 Can be empty if there are no components.
1431 """
1432 ref = self._findDatasetRef(
1433 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1434 )
1435 if ref.id is None: # only possible if predict is True
1436 if run is None:
1437 run = self.run
1438 if run is None:
1439 raise TypeError("Cannot predict location with run=None.")
1440 # Lie about ID, because we can't guess it, and only
1441 # Datastore.getURIs() will ever see it (and it doesn't use it).
1442 ref = ref.resolved(id=0, run=run)
1443 return self.datastore.getURIs(ref, predict)
1445 def getURI(
1446 self,
1447 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1448 dataId: Optional[DataId] = None,
1449 *,
1450 predict: bool = False,
1451 collections: Any = None,
1452 run: Optional[str] = None,
1453 **kwargs: Any,
1454 ) -> ResourcePath:
1455 """Return the URI to the Dataset.
1457 Parameters
1458 ----------
1459 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1460 When `DatasetRef` the `dataId` should be `None`.
1461 Otherwise the `DatasetType` or name thereof.
1462 dataId : `dict` or `DataCoordinate`
1463 A `dict` of `Dimension` link name, value pairs that label the
1464 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1465 should be provided as the first argument.
1466 predict : `bool`
1467 If `True`, allow URIs to be returned of datasets that have not
1468 been written.
1469 collections : Any, optional
1470 Collections to be searched, overriding ``self.collections``.
1471 Can be any of the types supported by the ``collections`` argument
1472 to butler construction.
1473 run : `str`, optional
1474 Run to use for predictions, overriding ``self.run``.
1475 **kwargs
1476 Additional keyword arguments used to augment or construct a
1477 `DataCoordinate`. See `DataCoordinate.standardize`
1478 parameters.
1480 Returns
1481 -------
1482 uri : `lsst.resources.ResourcePath`
1483 URI pointing to the Dataset within the datastore. If the
1484 Dataset does not exist in the datastore, and if ``predict`` is
1485 `True`, the URI will be a prediction and will include a URI
1486 fragment "#predicted".
1487 If the datastore does not have entities that relate well
1488 to the concept of a URI the returned URI string will be
1489 descriptive. The returned URI is not guaranteed to be obtainable.
1491 Raises
1492 ------
1493 LookupError
1494 Raised if a URI has been requested for a dataset that does not exist and
1495 guessing is not allowed.
1496 ValueError
1497 Raised if a resolved `DatasetRef` was passed as an input, but it
1498 differs from the one found in the registry.
1499 TypeError
1500 Raised if no collections were provided.
1501 RuntimeError
1502 Raised if a URI is requested for a dataset that consists of
1503 multiple artifacts.
1504 """
1505 primary, components = self.getURIs(
1506 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1507 )
1509 if primary is None or components:
1510 raise RuntimeError(
1511 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1512 "Use Butler.getURIs() instead."
1513 )
1514 return primary
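# getURI()/getURIs() usage sketch; dataset types and data IDs are
# illustrative assumptions.
#
#     uri = butler.getURI("raw", instrument="HSC", exposure=903334, detector=20)
#     primary, components = butler.getURIs("calexp", data_id)  # disassembled case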
1516 def retrieveArtifacts(
1517 self,
1518 refs: Iterable[DatasetRef],
1519 destination: ResourcePathExpression,
1520 transfer: str = "auto",
1521 preserve_path: bool = True,
1522 overwrite: bool = False,
1523 ) -> List[ResourcePath]:
1524 """Retrieve the artifacts associated with the supplied refs.
1526 Parameters
1527 ----------
1528 refs : iterable of `DatasetRef`
1529 The datasets for which artifacts are to be retrieved.
1530 A single ref can result in multiple artifacts. The refs must
1531 be resolved.
1532 destination : `lsst.resources.ResourcePath` or `str`
1533 Location to write the artifacts.
1534 transfer : `str`, optional
1535 Method to use to transfer the artifacts. Must be one of the options
1536 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1537 "move" is not allowed.
1538 preserve_path : `bool`, optional
1539 If `True` the full path of the artifact within the datastore
1540 is preserved. If `False` the final file component of the path
1541 is used.
1542 overwrite : `bool`, optional
1543 If `True` allow transfers to overwrite existing files at the
1544 destination.
1546 Returns
1547 -------
1548 targets : `list` of `lsst.resources.ResourcePath`
1549 URIs of file artifacts in destination location. Order is not
1550 preserved.
1552 Notes
1553 -----
1554 For non-file datastores the artifacts written to the destination
1555 may not match the representation inside the datastore. For example
1556 a hierarchical data structure in a NoSQL database may well be stored
1557 as a JSON file.
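Examples
--------
A sketch of copying query results to a local directory; the repository
path, dataset type, and collection name are hypothetical::
    butler = Butler("/path/to/repo")
    # Resolved refs, e.g. from a registry query.
    refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/test")
    # Copy the file artifacts, preserving their datastore-relative paths.
    transferred = butler.retrieveArtifacts(refs, destination="artifacts/", transfer="copy")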
1558 """
1559 return self.datastore.retrieveArtifacts(
1560 refs,
1561 ResourcePath(destination),
1562 transfer=transfer,
1563 preserve_path=preserve_path,
1564 overwrite=overwrite,
1565 )
1567 def datasetExists(
1568 self,
1569 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1570 dataId: Optional[DataId] = None,
1571 *,
1572 collections: Any = None,
1573 **kwargs: Any,
1574 ) -> bool:
1575 """Return True if the Dataset is actually present in the Datastore.
1577 Parameters
1578 ----------
1579 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1580 When `DatasetRef` the `dataId` should be `None`.
1581 Otherwise the `DatasetType` or name thereof.
1582 dataId : `dict` or `DataCoordinate`
1583 A `dict` of `Dimension` link name, value pairs that label the
1584 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1585 should be provided as the first argument.
1586 collections : Any, optional
1587 Collections to be searched, overriding ``self.collections``.
1588 Can be any of the types supported by the ``collections`` argument
1589 to butler construction.
1590 **kwargs
1591 Additional keyword arguments used to augment or construct a
1592 `DataCoordinate`. See `DataCoordinate.standardize`
1593 parameters.
1595 Raises
1596 ------
1597 LookupError
1598 Raised if the dataset is not even present in the Registry.
1599 ValueError
1600 Raised if a resolved `DatasetRef` was passed as an input, but it
1601 differs from the one found in the registry.
1602 TypeError
1603 Raised if no collections were provided.
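Examples
--------
A sketch with hypothetical repository, collection, and data ID values::
    butler = Butler("/path/to/repo", collections=["HSC/defaults"])
    # True only if both registry and datastore know about the dataset.
    exists = butler.datasetExists("calexp", instrument="HSC", visit=903334, detector=16)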
1604 """
1605 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1606 return self.datastore.exists(ref)
1608 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1609 """Remove one or more `~CollectionType.RUN` collections and the
1610 datasets within them.
1612 Parameters
1613 ----------
1614 names : `Iterable` [ `str` ]
1615 The names of the collections to remove.
1616 unstore : `bool`, optional
1617 If `True` (default), delete datasets from all datastores in which
1618 they are present, and attempt to roll back the registry deletions if
1619 datastore deletions fail (which may not always be possible). If
1620 `False`, datastore records for these datasets are still removed,
1621 but any artifacts (e.g. files) will not be.
1623 Raises
1624 ------
1625 TypeError
1626 Raised if one or more collections are not of type
1627 `~CollectionType.RUN`.
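Examples
--------
A sketch; the run names are hypothetical and the butler must be
writeable::
    butler = Butler("/path/to/repo", writeable=True)
    # Remove two output runs and delete their file artifacts.
    butler.removeRuns(["u/user/scratch-1", "u/user/scratch-2"], unstore=True)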
1628 """
1629 if not self.isWriteable():
1630 raise TypeError("Butler is read-only.")
1631 names = list(names)
1632 refs: List[DatasetRef] = []
1633 for name in names:
1634 collectionType = self.registry.getCollectionType(name)
1635 if collectionType is not CollectionType.RUN:
1636 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1637 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1638 with self.registry.transaction():
1639 if unstore:
1640 self.datastore.trash(refs)
1641 else:
1642 self.datastore.forget(refs)
1643 for name in names:
1644 self.registry.removeCollection(name)
1645 if unstore:
1646 # Point of no return for removing artifacts
1647 self.datastore.emptyTrash()
1649 def pruneCollection(
1650 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1651 ) -> None:
1652 """Remove a collection and possibly prune datasets within it.
1654 Parameters
1655 ----------
1656 name : `str`
1657 Name of the collection to remove. If this is a
1658 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1659 datasets within the collection are not modified unless ``unstore``
1660 is `True`. If this is a `~CollectionType.RUN` collection,
1661 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1662 are fully removed from the data repository.
1663 purge : `bool`, optional
1664 If `True`, permit `~CollectionType.RUN` collections to be removed,
1665 fully removing datasets within them. Requires ``unstore=True`` as
1666 well, as an added precaution against accidental deletion. Must be
1667 `False` (default) if the collection is not a ``RUN``.
1668 unstore : `bool`, optional
1669 If `True`, remove all datasets in the collection from all
1670 datastores in which they appear.
1671 unlink : `list` [`str`], optional
1672 Before removing the given collection, unlink it from these
1673 parent collections.
1675 Raises
1676 ------
1677 TypeError
1678 Raised if the butler is read-only or arguments are mutually
1679 inconsistent.
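Examples
--------
A sketch of the two common cases; the collection names are
hypothetical::
    butler = Butler("/path/to/repo", writeable=True)
    # Drop a TAGGED collection, leaving the tagged datasets themselves alone.
    butler.pruneCollection("u/user/tagged-selection")
    # Fully remove a RUN collection and all datasets within it.
    butler.pruneCollection("u/user/scratch-run", purge=True, unstore=True)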
1680 """
1681 # See pruneDatasets comments for more information about the logic here;
1682 # the cases are almost the same, but here we can rely on Registry to
1683 take care of everything but Datastore deletion when we remove the
1684 # collection.
1685 if not self.isWriteable():
1686 raise TypeError("Butler is read-only.")
1687 collectionType = self.registry.getCollectionType(name)
1688 if purge and not unstore:
1689 raise PurgeWithoutUnstorePruneCollectionsError()
1690 if collectionType is CollectionType.RUN and not purge:
1691 raise RunWithoutPurgePruneCollectionsError(collectionType)
1692 if collectionType is not CollectionType.RUN and purge:
1693 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1695 def remove(child: str, parent: str) -> None:
1696 """Remove a child collection from a parent collection."""
1697 # Remove child from parent.
1698 chain = list(self.registry.getCollectionChain(parent))
1699 try:
1700 chain.remove(child)
1701 except ValueError as e:
1702 raise RuntimeError(f"{child} is not a child of {parent}") from e
1703 self.registry.setCollectionChain(parent, chain)
1705 with self.registry.transaction():
1706 if unlink:
1707 for parent in unlink:
1708 remove(name, parent)
1709 if unstore:
1710 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1711 self.datastore.trash(refs)
1712 self.registry.removeCollection(name)
1714 if unstore:
1715 # Point of no return for removing artifacts
1716 self.datastore.emptyTrash()
1718 def pruneDatasets(
1719 self,
1720 refs: Iterable[DatasetRef],
1721 *,
1722 disassociate: bool = True,
1723 unstore: bool = False,
1724 tags: Iterable[str] = (),
1725 purge: bool = False,
1726 ) -> None:
1727 # docstring inherited from LimitedButler
1729 if not self.isWriteable():
1730 raise TypeError("Butler is read-only.")
1731 if purge:
1732 if not disassociate:
1733 raise TypeError("Cannot pass purge=True without disassociate=True.")
1734 if not unstore:
1735 raise TypeError("Cannot pass purge=True without unstore=True.")
1736 elif disassociate:
1737 tags = tuple(tags)
1738 if not tags:
1739 raise TypeError("No tags provided but disassociate=True.")
1740 for tag in tags:
1741 collectionType = self.registry.getCollectionType(tag)
1742 if collectionType is not CollectionType.TAGGED:
1743 raise TypeError(
1744 f"Cannot disassociate from collection '{tag}' "
1745 f"of non-TAGGED type {collectionType.name}."
1746 )
1747 # For an execution butler we want to keep existing UUIDs for the
1748 # datasets, for that we need to keep them in the collections but
1749 # remove from datastore.
1750 if self._allow_put_of_predefined_dataset and purge:
1751 purge = False
1752 disassociate = False
1753 # Transform possibly-single-pass iterable into something we can iterate
1754 # over multiple times.
1755 refs = list(refs)
1756 # Pruning a component of a DatasetRef makes no sense since registry
1757 # doesn't know about components and datastore might not store
1758 # components in a separate file
1759 for ref in refs:
1760 if ref.datasetType.component():
1761 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1762 # We don't need an unreliable Datastore transaction for this, because
1763 # we've been extra careful to ensure that Datastore.trash only involves
1764 # mutating the Registry (it can _look_ at Datastore-specific things,
1765 # but shouldn't change them), and hence all operations here are
1766 # Registry operations.
1767 with self.registry.transaction():
1768 if unstore:
1769 self.datastore.trash(refs)
1770 if purge:
1771 self.registry.removeDatasets(refs)
1772 elif disassociate:
1773 assert tags, "Guaranteed by earlier logic in this function."
1774 for tag in tags:
1775 self.registry.disassociate(tag, refs)
1776 # We've exited the Registry transaction, and apparently committed.
1777 # (if there was an exception, everything rolled back, and it's as if
1778 # nothing happened - and we never get here).
1779 # Datastore artifacts are not yet gone, but they're clearly marked
1780 # as trash, so if we fail to delete now because of (e.g.) filesystem
1781 # problems we can try again later, and if manual administrative
1782 # intervention is required, it's pretty clear what that should entail:
1783 # deleting everything on disk and in private Datastore tables that is
1784 # in the dataset_location_trash table.
1785 if unstore:
1786 # Point of no return for removing artifacts
1787 self.datastore.emptyTrash()
1789 @transactional
1790 def ingest(
1791 self,
1792 *datasets: FileDataset,
1793 transfer: Optional[str] = "auto",
1794 run: Optional[str] = None,
1795 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1796 record_validation_info: bool = True,
1797 ) -> None:
1798 """Store and register one or more datasets that already exist on disk.
1800 Parameters
1801 ----------
1802 datasets : `FileDataset`
1803 Each positional argument is a struct containing information about
1804 a file to be ingested, including its URI (either absolute or
1805 relative to the datastore root, if applicable), a `DatasetRef`,
1806 and optionally a formatter class or its fully-qualified string
1807 name. If a formatter is not provided, the formatter that would be
1808 used for `put` is assumed. On successful return, all
1809 `FileDataset.refs` attributes will have their `DatasetRef.id`
1810 attribute populated and all `FileDataset.formatter` attributes will
1811 be set to the formatter class used. `FileDataset.path` attributes
1812 may be modified to put paths in whatever the datastore considers a
1813 standardized form.
1814 transfer : `str`, optional
1815 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1816 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1817 transfer the file.
1818 run : `str`, optional
1819 The name of the run ingested datasets should be added to,
1820 overriding ``self.run``.
1821 idGenerationMode : `DatasetIdGenEnum`, optional
1822 Specifies option for generating dataset IDs. By default unique IDs
1823 are generated for each inserted dataset.
1824 record_validation_info : `bool`, optional
1825 If `True`, the default, the datastore can record validation
1826 information associated with the file. If `False` the datastore
1827 will not attempt to track any information such as checksums
1828 or file sizes. This can be useful if such information is tracked
1829 in an external system or if the file is to be compressed in place.
1830 It is up to the datastore whether this parameter is relevant.
1832 Raises
1833 ------
1834 TypeError
1835 Raised if the butler is read-only or if no run was provided.
1836 NotImplementedError
1837 Raised if the `Datastore` does not support the given transfer mode.
1838 DatasetTypeNotSupportedError
1839 Raised if one or more files to be ingested have a dataset type that
1840 is not supported by the `Datastore`.
1841 FileNotFoundError
1842 Raised if one of the given files does not exist.
1843 FileExistsError
1844 Raised if transfer is not `None` but the (internal) location the
1845 file would be moved to is already occupied.
1847 Notes
1848 -----
1849 This operation is not fully exception safe: if a database operation
1850 fails, the given `FileDataset` instances may be only partially updated.
1852 It is atomic in terms of database operations (they will either all
1853 succeed or all fail) provided the database engine implements
1854 transactions correctly. It will attempt to be atomic in terms of
1855 filesystem operations as well, but this cannot be implemented
1856 rigorously for most datastores.
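Examples
--------
A sketch of ingesting a single externally produced file; the paths,
dataset type, and data ID are hypothetical, and the dataset type and
the relevant dimension records must already be registered::
    butler = Butler("/path/to/repo", writeable=True, run="u/user/ingest")
    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 903334, "detector": 16})
    butler.ingest(FileDataset(path="/data/file.fits", refs=[ref]), transfer="symlink")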
1857 """
1858 if not self.isWriteable():
1859 raise TypeError("Butler is read-only.")
1860 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1861 # Reorganize the inputs so they're grouped by DatasetType and then
1862 # data ID. We also include a list of DatasetRefs for each FileDataset
1863 # to hold the resolved DatasetRefs returned by the Registry, before
1864 # it's safe to swap them into FileDataset.refs.
1865 # Some type annotation aliases to make that clearer:
1866 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1867 GroupedData = MutableMapping[DatasetType, GroupForType]
1868 # The actual data structure:
1869 groupedData: GroupedData = defaultdict(dict)
1870 # And the nested loop that populates it:
1871 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1872 # This list intentionally shared across the inner loop, since it's
1873 # associated with `dataset`.
1874 resolvedRefs: List[DatasetRef] = []
1876 # Somewhere to store pre-existing refs if we have an
1877 # execution butler.
1878 existingRefs: List[DatasetRef] = []
1880 for ref in dataset.refs:
1881 if ref.dataId in groupedData[ref.datasetType]:
1882 raise ConflictingDefinitionError(
1883 f"Ingest conflict. Dataset {dataset.path} has the same"
1884 " DataId as other ingest dataset"
1885 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1886 f" ({ref.dataId})"
1887 )
1888 if self._allow_put_of_predefined_dataset:
1889 existing_ref = self.registry.findDataset(
1890 ref.datasetType, dataId=ref.dataId, collections=run
1891 )
1892 if existing_ref:
1893 if self.datastore.knows(existing_ref):
1894 raise ConflictingDefinitionError(
1895 f"Dataset associated with path {dataset.path}"
1896 f" already exists as {existing_ref}."
1897 )
1898 # Store this ref elsewhere since it already exists
1899 # and we do not want to remake it but we do want
1900 # to store it in the datastore.
1901 existingRefs.append(existing_ref)
1903 # Nothing else to do until we have finished
1904 # iterating.
1905 continue
1907 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1909 if existingRefs:
1911 if len(dataset.refs) != len(existingRefs):
1912 # Keeping track of partially pre-existing datasets is hard
1913 # and should generally never happen. For now don't allow
1914 # it.
1915 raise ConflictingDefinitionError(
1916 f"For dataset {dataset.path} some dataIds already exist"
1917 " in registry but others do not. This is not supported."
1918 )
1920 # Attach the resolved refs if we found them.
1921 dataset.refs = existingRefs
1923 # Now we can bulk-insert into Registry for each DatasetType.
1924 for datasetType, groupForType in progress.iter_item_chunks(
1925 groupedData.items(), desc="Bulk-inserting datasets by type"
1926 ):
1927 refs = self.registry.insertDatasets(
1928 datasetType,
1929 dataIds=groupForType.keys(),
1930 run=run,
1931 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1932 idGenerationMode=idGenerationMode,
1933 )
1934 # Append those resolved DatasetRefs to the new lists we set up for
1935 # them.
1936 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1937 resolvedRefs.append(ref)
1939 # Go back to the original FileDatasets to replace their refs with the
1940 # new resolved ones.
1941 for groupForType in progress.iter_chunks(
1942 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1943 ):
1944 for dataset, resolvedRefs in groupForType.values():
1945 dataset.refs = resolvedRefs
1947 # Bulk-insert everything into Datastore.
1948 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1950 @contextlib.contextmanager
1951 def export(
1952 self,
1953 *,
1954 directory: Optional[str] = None,
1955 filename: Optional[str] = None,
1956 format: Optional[str] = None,
1957 transfer: Optional[str] = None,
1958 ) -> Iterator[RepoExportContext]:
1959 """Export datasets from the repository represented by this `Butler`.
1961 This method is a context manager that returns a helper object
1962 (`RepoExportContext`) that is used to indicate what information from
1963 the repository should be exported.
1965 Parameters
1966 ----------
1967 directory : `str`, optional
1968 Directory dataset files should be written to if ``transfer`` is not
1969 `None`.
1970 filename : `str`, optional
1971 Name for the file that will include database information associated
1972 with the exported datasets. If this is not an absolute path and
1973 ``directory`` is not `None`, it will be written to ``directory``
1974 instead of the current working directory. Defaults to
1975 "export.{format}".
1976 format : `str`, optional
1977 File format for the database information file. If `None`, the
1978 extension of ``filename`` will be used.
1979 transfer : `str`, optional
1980 Transfer mode passed to `Datastore.export`.
1982 Raises
1983 ------
1984 TypeError
1985 Raised if the set of arguments passed is inconsistent.
1987 Examples
1988 --------
1989 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1990 methods are used to provide the iterables over data IDs and/or datasets
1991 to be exported::
1993 with butler.export(filename="exports.yaml") as export:
1994 # Export all flats, but none of the dimension element rows
1995 # (i.e. data ID information) associated with them.
1996 export.saveDatasets(butler.registry.queryDatasets("flat"),
1997 elements=())
1998 # Export all datasets that start with "deepCoadd_" and all of
1999 # their associated data ID information.
2000 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
2001 """
2002 if directory is None and transfer is not None:
2003 raise TypeError("Cannot transfer without providing a directory.")
2004 if transfer == "move":
2005 raise TypeError("Transfer may not be 'move': export is read-only")
2006 if format is None:
2007 if filename is None:
2008 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2009 else:
2010 format = os.path.splitext(filename)[1].lstrip(".")  # drop the leading "."
2011 elif filename is None:
2012 filename = f"export.{format}"
2013 if directory is not None:
2014 filename = os.path.join(directory, filename)
2015 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
2016 with open(filename, "w") as stream:
2017 backend = BackendClass(stream, universe=self.registry.dimensions)
2018 try:
2019 helper = RepoExportContext(
2020 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2021 )
2022 yield helper
2023 except BaseException:
2024 raise
2025 else:
2026 helper._finish()
2028 def import_(
2029 self,
2030 *,
2031 directory: Optional[str] = None,
2032 filename: Union[str, TextIO, None] = None,
2033 format: Optional[str] = None,
2034 transfer: Optional[str] = None,
2035 skip_dimensions: Optional[Set] = None,
2036 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2037 reuseIds: bool = False,
2038 ) -> None:
2039 """Import datasets into this repository that were exported from a
2040 different butler repository via `~lsst.daf.butler.Butler.export`.
2042 Parameters
2043 ----------
2044 directory : `str`, optional
2045 Directory containing dataset files to import from. If `None`,
2046 ``filename`` and all dataset file paths specified therein must
2047 be absolute.
2048 filename : `str` or `TextIO`, optional
2049 A stream or name of file that contains database information
2050 associated with the exported datasets, typically generated by
2051 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
2052 is not an absolute path, does not exist in the current working
2053 directory, and ``directory`` is not `None`, it is assumed to be in
2054 ``directory``. Defaults to "export.{format}".
2055 format : `str`, optional
2056 File format for ``filename``. If `None`, the extension of
2057 ``filename`` will be used.
2058 transfer : `str`, optional
2059 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2060 skip_dimensions : `set`, optional
2061 Names of dimensions that should be skipped and not imported.
2062 idGenerationMode : `DatasetIdGenEnum`, optional
2063 Specifies option for generating dataset IDs when IDs are not
2064 provided or their type does not match backend type. By default
2065 unique IDs are generated for each inserted dataset.
2066 reuseIds : `bool`, optional
2067 If `True`, force re-use of imported dataset IDs for integer IDs,
2068 which are normally generated as auto-incremented; an exception
2069 will be raised if imported IDs clash with existing ones. This
2070 option has no effect on the use of globally-unique IDs, which are
2071 always re-used (or generated if integer IDs are being imported).
2073 Raises
2074 ------
2075 TypeError
2076 Raised if the set of arguments passed is inconsistent, or if the
2077 butler is read-only.
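Examples
--------
A sketch of importing a previously exported repository subset; the
paths are hypothetical::
    butler = Butler("/path/to/repo", writeable=True)
    butler.import_(directory="/path/to/exported/files", filename="export.yaml", transfer="copy")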
2078 """
2079 if not self.isWriteable():
2080 raise TypeError("Butler is read-only.")
2081 if format is None:
2082 if filename is None:
2083 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2084 else:
2085 format = os.path.splitext(filename)[1].lstrip(".")  # type: ignore
2086 elif filename is None:
2087 filename = f"export.{format}"
2088 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2089 filename = os.path.join(directory, filename)
2090 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2092 def doImport(importStream: TextIO) -> None:
2093 backend = BackendClass(importStream, self.registry)
2094 backend.register()
2095 with self.transaction():
2096 backend.load(
2097 self.datastore,
2098 directory=directory,
2099 transfer=transfer,
2100 skip_dimensions=skip_dimensions,
2101 idGenerationMode=idGenerationMode,
2102 reuseIds=reuseIds,
2103 )
2105 if isinstance(filename, str):
2106 with open(filename, "r") as stream:
2107 doImport(stream)
2108 else:
2109 doImport(filename)
2111 def transfer_from(
2112 self,
2113 source_butler: Butler,
2114 source_refs: Iterable[DatasetRef],
2115 transfer: str = "auto",
2116 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
2117 skip_missing: bool = True,
2118 register_dataset_types: bool = False,
2119 ) -> List[DatasetRef]:
2120 """Transfer datasets to this Butler from a run in another Butler.
2122 Parameters
2123 ----------
2124 source_butler : `Butler`
2125 Butler from which the datasets are to be transferred.
2126 source_refs : iterable of `DatasetRef`
2127 Datasets defined in the source butler that should be transferred to
2128 this butler.
2129 transfer : `str`, optional
2130 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2131 id_gen_map : `dict` [`str`, `DatasetIdGenEnum`], optional
2132 A mapping of dataset type to ID generation mode. Only used if
2133 the source butler is using integer IDs. Should not be used
2134 if this receiving butler uses integer IDs. Without this, dataset
2135 import always uses `DatasetIdGenEnum.UNIQUE`.
2136 skip_missing : `bool`
2137 If `True`, datasets with no datastore artifact associated with
2138 them are not transferred. If `False` a registry entry will be
2139 created even if no datastore record is created (and so will
2140 look equivalent to the dataset being unstored).
2141 register_dataset_types : `bool`
2142 If `True` any missing dataset types are registered. Otherwise
2143 an exception is raised.
2145 Returns
2146 -------
2147 refs : `list` of `DatasetRef`
2148 The refs added to this Butler.
2150 Notes
2151 -----
2152 Requires that any dimension definitions are already present in the
2153 receiving Butler. The datastore artifact has to exist for a transfer
2154 to be made but non-existence is not an error.
2156 Datasets that already exist in this run will be skipped.
2158 The datasets are imported as part of a transaction, although
2159 dataset types are registered before the transaction is started.
2160 This means that it is possible for a dataset type to be registered
2161 even though transfer has failed.
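Examples
--------
A sketch of copying the results of a query between two repositories;
the repository paths, dataset type, and collection name are
hypothetical::
    source = Butler("/path/to/source-repo")
    target = Butler("/path/to/target-repo", writeable=True)
    refs = source.registry.queryDatasets("calexp", collections="HSC/runs/test")
    # Register any missing dataset types and copy the file artifacts.
    target.transfer_from(source, refs, transfer="copy", register_dataset_types=True)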
2162 """
2163 if not self.isWriteable():
2164 raise TypeError("Butler is read-only.")
2165 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2167 # Will iterate through the refs multiple times so need to convert
2168 # to a list if this isn't a collection.
2169 if not isinstance(source_refs, collections.abc.Collection):
2170 source_refs = list(source_refs)
2172 original_count = len(source_refs)
2173 log.info("Transferring %d datasets into %s", original_count, str(self))
2175 if id_gen_map is None:
2176 id_gen_map = {}
2178 # In some situations the datastore artifact may be missing
2179 # and we do not want that registry entry to be imported.
2180 # Asking datastore is not sufficient, the records may have been
2181 # purged, we have to ask for the (predicted) URI and check
2182 # existence explicitly. Execution butler is set up exactly like
2183 # this with no datastore records.
2184 artifact_existence: Dict[ResourcePath, bool] = {}
2185 if skip_missing:
2186 dataset_existence = source_butler.datastore.mexists(
2187 source_refs, artifact_existence=artifact_existence
2188 )
2189 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2190 filtered_count = len(source_refs)
2191 log.verbose(
2192 "%d datasets removed because the artifact does not exist. Now have %d.",
2193 original_count - filtered_count,
2194 filtered_count,
2195 )
2197 # Importing requires that we group the refs by dataset type and run
2198 # before doing the import.
2199 source_dataset_types = set()
2200 grouped_refs = defaultdict(list)
2201 grouped_indices = defaultdict(list)
2202 for i, ref in enumerate(source_refs):
2203 grouped_refs[ref.datasetType, ref.run].append(ref)
2204 grouped_indices[ref.datasetType, ref.run].append(i)
2205 source_dataset_types.add(ref.datasetType)
2207 # Check to see if the dataset type in the source butler has
2208 # the same definition in the target butler and register missing
2209 # ones if requested. Registration must happen outside a transaction.
2210 newly_registered_dataset_types = set()
2211 for datasetType in source_dataset_types:
2212 if register_dataset_types:
2213 # Let this raise immediately if inconsistent. Continuing
2214 # on to find additional inconsistent dataset types
2215 # might result in additional unwanted dataset types being
2216 # registered.
2217 if self.registry.registerDatasetType(datasetType):
2218 newly_registered_dataset_types.add(datasetType)
2219 else:
2220 # If the dataset type is missing, let it fail immediately.
2221 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2222 if target_dataset_type != datasetType:
2223 raise ConflictingDefinitionError(
2224 "Source butler dataset type differs from definition"
2225 f" in target butler: {datasetType} !="
2226 f" {target_dataset_type}"
2227 )
2228 if newly_registered_dataset_types:
2229 # We may have registered some even if there were inconsistencies
2230 # but should let people know (or else remove them again).
2231 log.log(
2232 VERBOSE,
2233 "Registered the following dataset types in the target Butler: %s",
2234 ", ".join(d.name for d in newly_registered_dataset_types),
2235 )
2236 else:
2237 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2239 # The returned refs should be identical for UUIDs.
2240 # For now we must also support integers, and so we need to retain
2241 # the newly-created refs from this registry.
2242 # Pre-size it so we can assign refs into the correct slots
2243 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2244 default_id_gen = DatasetIdGenEnum.UNIQUE
2246 handled_collections: Set[str] = set()
2248 # Do all the importing in a single transaction.
2249 with self.transaction():
2250 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2251 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2252 ):
2253 if run not in handled_collections:
2254 run_doc = source_butler.registry.getCollectionDocumentation(run)
2255 registered = self.registry.registerRun(run, doc=run_doc)
2256 handled_collections.add(run)
2257 if registered:
2258 log.log(VERBOSE, "Creating output run %s", run)
2260 id_generation_mode = default_id_gen
2261 if isinstance(refs_to_import[0].id, int):
2262 # ID generation mode might need to be overridden when
2263 # targeting UUID
2264 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2266 n_refs = len(refs_to_import)
2267 log.verbose(
2268 "Importing %d ref%s of dataset type %s into run %s",
2269 n_refs,
2270 "" if n_refs == 1 else "s",
2271 datasetType.name,
2272 run,
2273 )
2275 # No way to know if this butler's registry uses UUID.
2276 # We have to trust the caller on this. If it fails they will
2277 # have to change their approach. We can't catch the exception
2278 # and retry with unique because that will mess up the
2279 # transaction handling. We aren't allowed to ask the registry
2280 # manager what type of ID it is using.
2281 imported_refs = self.registry._importDatasets(
2282 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2283 )
2285 # Map them into the correct slots to match the initial order
2286 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2287 transferred_refs_tmp[i] = ref
2289 # Mypy insists that we might have None in here so we have to make
2290 # that explicit by assigning to a new variable and filtering out
2291 # something that won't be there.
2292 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2294 # Check consistency
2295 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2297 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2299 # The transferred refs need to be reordered to match the original
2300 # ordering given by the caller. Without this the datastore transfer
2301 # will be broken.
2303 # Ask the datastore to transfer. The datastore has to check that
2304 # the source datastore is compatible with the target datastore.
2305 self.datastore.transfer_from(
2306 source_butler.datastore,
2307 source_refs,
2308 local_refs=transferred_refs,
2309 transfer=transfer,
2310 artifact_existence=artifact_existence,
2311 )
2313 return transferred_refs
2315 def validateConfiguration(
2316 self,
2317 logFailures: bool = False,
2318 datasetTypeNames: Optional[Iterable[str]] = None,
2319 ignore: Optional[Iterable[str]] = None,
2320 ) -> None:
2321 """Validate butler configuration.
2323 Checks that each `DatasetType` can be stored in the `Datastore`.
2325 Parameters
2326 ----------
2327 logFailures : `bool`, optional
2328 If `True`, output a log message for every validation error
2329 detected.
2330 datasetTypeNames : iterable of `str`, optional
2331 The `DatasetType` names that should be checked. This allows
2332 only a subset to be selected.
2333 ignore : iterable of `str`, optional
2334 Names of DatasetTypes to skip over. This can be used to skip
2335 known problems. If a named `DatasetType` corresponds to a
2336 composite, all components of that `DatasetType` will also be
2337 ignored.
2339 Raises
2340 ------
2341 ButlerValidationError
2342 Raised if there is some inconsistency with how this Butler
2343 is configured.
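Examples
--------
A sketch; the dataset type names are hypothetical::
    butler = Butler("/path/to/repo")
    # Restrict the check to two dataset types and log every problem found.
    butler.validateConfiguration(logFailures=True, datasetTypeNames=["calexp", "raw"])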
2344 """
2345 if datasetTypeNames:
2346 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2347 else:
2348 datasetTypes = list(self.registry.queryDatasetTypes())
2350 # filter out anything from the ignore list
2351 if ignore:
2352 ignore = set(ignore)
2353 datasetTypes = [
2354 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2355 ]
2356 else:
2357 ignore = set()
2359 # Find all the registered instruments
2360 instruments = {record.name for record in self.registry.queryDimensionRecords("instrument")}
2362 # For each datasetType that has an instrument dimension, create
2363 # a DatasetRef for each defined instrument
2364 datasetRefs = []
2366 for datasetType in datasetTypes:
2367 if "instrument" in datasetType.dimensions:
2368 for instrument in instruments:
2369 datasetRef = DatasetRef(
2370 datasetType, {"instrument": instrument}, conform=False # type: ignore
2371 )
2372 datasetRefs.append(datasetRef)
2374 entities: List[Union[DatasetType, DatasetRef]] = []
2375 entities.extend(datasetTypes)
2376 entities.extend(datasetRefs)
2378 datastoreErrorStr = None
2379 try:
2380 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2381 except ValidationError as e:
2382 datastoreErrorStr = str(e)
2384 # Also check that the LookupKeys used by the datastores match
2385 # registry and storage class definitions
2386 keys = self.datastore.getLookupKeys()
2388 failedNames = set()
2389 failedDataId = set()
2390 for key in keys:
2391 if key.name is not None:
2392 if key.name in ignore:
2393 continue
2395 # skip if specific datasetType names were requested and this
2396 # name does not match
2397 if datasetTypeNames and key.name not in datasetTypeNames:
2398 continue
2400 # See if it is a StorageClass or a DatasetType
2401 if key.name in self.storageClasses:
2402 pass
2403 else:
2404 try:
2405 self.registry.getDatasetType(key.name)
2406 except KeyError:
2407 if logFailures:
2408 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2409 failedNames.add(key)
2410 else:
2411 # Dimensions are checked for consistency when the Butler
2412 # is created and rendezvoused with a universe.
2413 pass
2415 # Check that the instrument is a valid instrument.
2416 # Currently only the instrument dimension is supported, so check for that.
2417 if key.dataId:
2418 dataIdKeys = set(key.dataId)
2419 if {"instrument"} != dataIdKeys:
2420 if logFailures:
2421 log.critical("Key '%s' has unsupported DataId override", key)
2422 failedDataId.add(key)
2423 elif key.dataId["instrument"] not in instruments:
2424 if logFailures:
2425 log.critical("Key '%s' has unknown instrument", key)
2426 failedDataId.add(key)
2428 messages = []
2430 if datastoreErrorStr:
2431 messages.append(datastoreErrorStr)
2433 for failed, msg in (
2434 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2435 (failedDataId, "Keys with bad DataId entries: "),
2436 ):
2437 if failed:
2438 msg += ", ".join(str(k) for k in failed)
2439 messages.append(msg)
2441 if messages:
2442 raise ValidationError(";\n".join(messages))
2444 @property
2445 def collections(self) -> CollectionSearch:
2446 """The collections to search by default, in order (`CollectionSearch`).
2448 This is an alias for ``self.registry.defaults.collections``. It cannot
2449 be set directly in isolation, but all defaults may be changed together
2450 by assigning a new `RegistryDefaults` instance to
2451 ``self.registry.defaults``.
2452 """
2453 return self.registry.defaults.collections
2455 @property
2456 def run(self) -> Optional[str]:
2457 """Name of the run this butler writes outputs to by default (`str` or
2458 `None`).
2460 This is an alias for ``self.registry.defaults.run``. It cannot be set
2461 directly in isolation, but all defaults may be changed together by
2462 assigning a new `RegistryDefaults` instance to
2463 ``self.registry.defaults``.
2464 """
2465 return self.registry.defaults.run
2467 @property
2468 def dimensions(self) -> DimensionUniverse:
2469 # Docstring inherited.
2470 return self.registry.dimensions
2472 registry: Registry
2473 """The object that manages dataset metadata and relationships (`Registry`).
2475 Most operations that don't involve reading or writing butler datasets are
2476 accessible only via `Registry` methods.
2477 """
2479 datastore: Datastore
2480 """The object that manages actual dataset storage (`Datastore`).
2482 Direct user access to the datastore should rarely be necessary; the primary
2483 exception is the case where a `Datastore` implementation provides extra
2484 functionality beyond what the base class defines.
2485 """
2487 storageClasses: StorageClassFactory
2488 """An object that maps known storage class names to objects that fully
2489 describe them (`StorageClassFactory`).
2490 """
2492 _allow_put_of_predefined_dataset: bool
2493 """Allow a put to succeed even if there is already a registry entry for it
2494 but not a datastore record. (`bool`)."""