Coverage for python/lsst/daf/butler/_butler.py: 10%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36 import collections.abc
37 import contextlib
38 import logging
39 import numbers
40 import os
41 from collections import defaultdict
42 from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57 )
59 from lsst.resources import ResourcePath, ResourcePathExpression
60 from lsst.utils import doImportType
61 from lsst.utils.introspection import get_class_of
62 from lsst.utils.logging import VERBOSE, getLogger
64 from ._butlerConfig import ButlerConfig
65 from ._butlerRepoIndex import ButlerRepoIndex
66 from ._deferredDatasetHandle import DeferredDatasetHandle
67 from ._limited_butler import LimitedButler
68 from .core import (
69 AmbiguousDatasetError,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetType,
77 Datastore,
78 Dimension,
79 DimensionConfig,
80 DimensionUniverse,
81 FileDataset,
82 Progress,
83 StorageClassFactory,
84 Timespan,
85 ValidationError,
86 )
87 from .core.repoRelocation import BUTLER_ROOT_TAG
88 from .core.utils import transactional
89 from .registry import (
90 CollectionSearch,
91 CollectionType,
92 ConflictingDefinitionError,
93 DataIdError,
94 DataIdValueError,
95 DatasetIdGenEnum,
96 DimensionNameError,
97 InconsistentDataIdError,
98 Registry,
99 RegistryConfig,
100 RegistryDefaults,
101 )
102 from .transfers import RepoExportContext
104 log = getLogger(__name__)
107 class ButlerValidationError(ValidationError):
108 """There is a problem with the Butler configuration."""
110 pass
113 class PruneCollectionsArgsError(TypeError):
114 """Base class for errors relating to Butler.pruneCollections input
115 arguments.
116 """
118 pass
121 class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
122 """Raised when purge and unstore are both required to be True, and
123 purge is True but unstore is False.
124 """
126 def __init__(self) -> None:
127 super().__init__("Cannot pass purge=True without unstore=True.")
130 class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
131 """Raised when pruning a RUN collection but purge is False."""
133 def __init__(self, collectionType: CollectionType):
134 self.collectionType = collectionType
135 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
138 class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
139 """Raised when purge is True but is not supported for the given
140 collection."""
142 def __init__(self, collectionType: CollectionType):
143 self.collectionType = collectionType
144 super().__init__(
145 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
146 )
149 class Butler(LimitedButler):
150 """Main entry point for the data access system.
152 Parameters
153 ----------
154 config : `ButlerConfig`, `Config` or `str`, optional
155 Configuration. Anything acceptable to the
156 `ButlerConfig` constructor. If a directory path
157 is given, the configuration will be read from a ``butler.yaml`` file in
158 that location. If `None` is given, default values will be used.
159 butler : `Butler`, optional
160 If provided, construct a new Butler that uses the same registry and
161 datastore as the given one, but with the given collection and run.
162 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
163 arguments.
164 collections : `str` or `Iterable` [ `str` ], optional
165 An expression specifying the collections to be searched (in order) when
166 reading datasets.
167 This may be a `str` collection name or an iterable thereof.
168 See :ref:`daf_butler_collection_expressions` for more information.
169 These collections are not registered automatically and must be
170 manually registered before they are used by any method, but they may be
171 manually registered after the `Butler` is initialized.
172 run : `str`, optional
173 Name of the `~CollectionType.RUN` collection new datasets should be
174 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
175 ``collections`` will be set to ``[run]``. If not `None`, this
176 collection will automatically be registered. If this is not set (and
177 ``writeable`` is not set either), a read-only butler will be created.
178 searchPaths : `list` of `str`, optional
179 Directory paths to search when calculating the full Butler
180 configuration. Not used if the supplied config is already a
181 `ButlerConfig`.
182 writeable : `bool`, optional
183 Explicitly sets whether the butler supports write operations. If not
184 provided, a read-write butler is created automatically when ``run`` is
185 not `None`.
186 inferDefaults : `bool`, optional
187 If `True` (default) infer default data ID values from the values
188 present in the datasets in ``collections``: if all collections have the
189 same value (or no value) for a governor dimension, that value will be
190 the default for that dimension. Nonexistent collections are ignored.
191 If a default value is provided explicitly for a governor dimension via
192 ``**kwargs``, no default will be inferred for that dimension.
193 **kwargs : `str`
194 Default data ID key-value pairs. These may only identify "governor"
195 dimensions like ``instrument`` and ``skymap``.
197 Examples
198 --------
199 While there are many ways to control exactly how a `Butler` interacts with
200 the collections in its `Registry`, the most common cases are still simple.
202 For a read-only `Butler` that searches one collection, do::
204 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
206 For a read-write `Butler` that writes to and reads from a
207 `~CollectionType.RUN` collection::
209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
211 The `Butler` passed to a ``PipelineTask`` is often much more complex,
212 because we want to write to one `~CollectionType.RUN` collection but read
213 from several others (as well)::
215 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
216 collections=["u/alice/DM-50000/a",
217 "u/bob/DM-49998",
218 "HSC/defaults"])
220 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
221 Datasets will be read first from that run (since it appears first in the
222 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
224 Finally, one can always create a `Butler` with no collections::
226 butler = Butler("/path/to/repo", writeable=True)
228 This can be extremely useful when you just want to use ``butler.registry``,
229 e.g. for inserting dimension data or managing collections, or when the
230 collections you want to use with the butler are not consistent.
231 Passing ``writeable`` explicitly here is only necessary if you want to be
232 able to make changes to the repo; usually the value for ``writeable`` can
233 be guessed from the collection arguments provided, but it defaults to
234 `False` when there are no collection arguments.
235 """
237 def __init__(
238 self,
239 config: Union[Config, str, None] = None,
240 *,
241 butler: Optional[Butler] = None,
242 collections: Any = None,
243 run: Optional[str] = None,
244 searchPaths: Optional[List[str]] = None,
245 writeable: Optional[bool] = None,
246 inferDefaults: bool = True,
247 **kwargs: str,
248 ):
249 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
250 # Load registry, datastore, etc. from config or existing butler.
251 if butler is not None:
252 if config is not None or searchPaths is not None or writeable is not None:
253 raise TypeError(
254 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
255 )
256 self.registry = butler.registry.copy(defaults)
257 self.datastore = butler.datastore
258 self.storageClasses = butler.storageClasses
259 self._config: ButlerConfig = butler._config
260 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
261 else:
262 # Can only look for strings in the known repos list.
263 if isinstance(config, str) and config in self.get_known_repos():
264 config = str(self.get_repo_uri(config))
265 try:
266 self._config = ButlerConfig(config, searchPaths=searchPaths)
267 except FileNotFoundError as e:
268 if known := self.get_known_repos():
269 aliases = f"(known aliases: {', '.join(known)})"
270 else:
271 aliases = "(no known aliases)"
272 raise FileNotFoundError(f"{e} {aliases}") from e
273 self._config = ButlerConfig(config, searchPaths=searchPaths)
274 try:
275 if "root" in self._config:
276 butlerRoot = self._config["root"]
277 else:
278 butlerRoot = self._config.configDir
279 if writeable is None:
280 writeable = run is not None
281 self.registry = Registry.fromConfig(
282 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
283 )
284 self.datastore = Datastore.fromConfig(
285 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
286 )
287 self.storageClasses = StorageClassFactory()
288 self.storageClasses.addFromConfig(self._config)
289 self._allow_put_of_predefined_dataset = self._config.get(
290 "allow_put_of_predefined_dataset", False
291 )
292 except Exception:
293 # Failures here usually mean that configuration is incomplete,
294 # just issue an error message which includes config file URI.
295 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
296 raise
298 if "run" in self._config or "collection" in self._config:
299 raise ValueError("Passing a run or collection via configuration is no longer supported.")
301 GENERATION: ClassVar[int] = 3
302 """This is a Generation 3 Butler.
304 This attribute may be removed in the future, once the Generation 2 Butler
305 interface has been fully retired; it should only be used in transitional
306 code.
307 """
309 @classmethod
310 def get_repo_uri(cls, label: str) -> ResourcePath:
311 """Look up the label in a butler repository index.
313 Parameters
314 ----------
315 label : `str`
316 Label of the Butler repository to look up.
318 Returns
319 -------
320 uri : `lsst.resources.ResourcePath`
321 URI to the Butler repository associated with the given label.
323 Raises
324 ------
325 KeyError
326 Raised if the label is not found in the index, or if an index
327 can not be found at all.
329 Notes
330 -----
331 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
332 information is discovered.
333 """
334 return ButlerRepoIndex.get_repo_uri(label)
336 @classmethod
337 def get_known_repos(cls) -> Set[str]:
338 """Retrieve the list of known repository labels.
340 Returns
341 -------
342 repos : `set` of `str`
343 All the known labels. Can be empty if no index can be found.
345 Notes
346 -----
347 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
348 information is discovered.
349 """
350 return ButlerRepoIndex.get_known_repos()
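# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of resolving a repository label through the repository
# index described above. The label "main" is an assumption; real labels
# depend on the index file configured for the site.
from lsst.daf.butler import Butler

known = Butler.get_known_repos()       # empty set if no index can be found
if "main" in known:
    uri = Butler.get_repo_uri("main")  # raises KeyError for unknown labels
    butler = Butler(str(uri), writeable=False)
    # A known label can also be passed directly to the Butler constructor.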
352 @staticmethod
353 def makeRepo(
354 root: ResourcePathExpression,
355 config: Union[Config, str, None] = None,
356 dimensionConfig: Union[Config, str, None] = None,
357 standalone: bool = False,
358 searchPaths: Optional[List[str]] = None,
359 forceConfigRoot: bool = True,
360 outfile: Optional[ResourcePathExpression] = None,
361 overwrite: bool = False,
362 ) -> Config:
363 """Create an empty data repository by adding a butler.yaml config
364 to a repository root directory.
366 Parameters
367 ----------
368 root : `lsst.resources.ResourcePathExpression`
369 Path or URI to the root location of the new repository. Will be
370 created if it does not exist.
371 config : `Config` or `str`, optional
372 Configuration to write to the repository, after setting any
373 root-dependent Registry or Datastore config options. Can not
374 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
375 configuration will be used. Root-dependent config options
376 specified in this config are overwritten if ``forceConfigRoot``
377 is `True`.
378 dimensionConfig : `Config` or `str`, optional
379 Configuration for dimensions; will be used to initialize the
380 registry database.
381 standalone : `bool`
382 If True, write all expanded defaults, not just customized or
383 repository-specific settings.
384 This (mostly) decouples the repository from the default
385 configuration, insulating it from changes to the defaults (which
386 may be good or bad, depending on the nature of the changes).
387 Future *additions* to the defaults will still be picked up when
388 initializing `Butlers` to repos created with ``standalone=True``.
389 searchPaths : `list` of `str`, optional
390 Directory paths to search when calculating the full butler
391 configuration.
392 forceConfigRoot : `bool`, optional
393 If `False`, any values present in the supplied ``config`` that
394 would normally be reset are not overridden and will appear
395 directly in the output config. This allows non-standard overrides
396 of the root directory for a datastore or registry to be given.
397 If this parameter is `True` the values for ``root`` will be
398 forced into the resulting config if appropriate.
399 outfile : `lsst.resources.ResourcePathExpression`, optional
400 If not `None`, the output configuration will be written to this
401 location rather than into the repository itself. Can be a URI
402 string. Can refer to a directory that will be used to write
403 ``butler.yaml``.
404 overwrite : `bool`, optional
405 Create a new configuration file even if one already exists
406 in the specified output location. Default is to raise
407 an exception.
409 Returns
410 -------
411 config : `Config`
412 The updated `Config` instance written to the repo.
414 Raises
415 ------
416 ValueError
417 Raised if a ButlerConfig or ConfigSubset is passed instead of a
418 regular Config (as these subclasses would make it impossible to
419 support ``standalone=False``).
420 FileExistsError
421 Raised if the output config file already exists.
422 os.error
423 Raised if the directory does not exist, exists but is not a
424 directory, or cannot be created.
426 Notes
427 -----
428 Note that when ``standalone=False`` (the default), the configuration
429 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
430 construct the repository should also be used to construct any Butlers
431 to avoid configuration inconsistencies.
432 """
433 if isinstance(config, (ButlerConfig, ConfigSubset)):
434 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
436 # Ensure that the root of the repository exists or can be made
437 root_uri = ResourcePath(root, forceDirectory=True)
438 root_uri.mkdir()
440 config = Config(config)
442 # If we are creating a new repo from scratch with relative roots,
443 # do not propagate an explicit root from the config file
444 if "root" in config:
445 del config["root"]
447 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
448 imported_class = doImportType(full["datastore", "cls"])
449 if not issubclass(imported_class, Datastore):
450 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
451 datastoreClass: Type[Datastore] = imported_class
452 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
454 # if key exists in given config, parse it, otherwise parse the defaults
455 # in the expanded config
456 if config.get(("registry", "db")):
457 registryConfig = RegistryConfig(config)
458 else:
459 registryConfig = RegistryConfig(full)
460 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
461 if defaultDatabaseUri is not None:
462 Config.updateParameters(
463 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
464 )
465 else:
466 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
468 if standalone:
469 config.merge(full)
470 else:
471 # Always expand the registry.managers section into the per-repo
472 # config, because after the database schema is created, it's not
473 # allowed to change anymore. Note that in the standalone=True
474 # branch, _everything_ in the config is expanded, so there's no
475 # need to special case this.
476 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
477 configURI: ResourcePathExpression
478 if outfile is not None:
479 # When writing to a separate location we must include
480 # the root of the butler repo in the config else it won't know
481 # where to look.
482 config["root"] = root_uri.geturl()
483 configURI = outfile
484 else:
485 configURI = root_uri
486 config.dumpToUri(configURI, overwrite=overwrite)
488 # Create Registry and populate tables
489 registryConfig = RegistryConfig(config.get("registry"))
490 dimensionConfig = DimensionConfig(dimensionConfig)
491 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
493 log.verbose("Wrote new Butler configuration file to %s", configURI)
495 return config
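# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of creating an empty repository with makeRepo() and then
# opening it. The local path is an assumption chosen for illustration.
from lsst.daf.butler import Butler

config = Butler.makeRepo("/tmp/demo_repo")         # writes butler.yaml and creates the registry
butler = Butler("/tmp/demo_repo", writeable=True)  # open the new, empty repository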
497 @classmethod
498 def _unpickle(
499 cls,
500 config: ButlerConfig,
501 collections: Optional[CollectionSearch],
502 run: Optional[str],
503 defaultDataId: Dict[str, str],
504 writeable: bool,
505 ) -> Butler:
506 """Callable used to unpickle a Butler.
508 We prefer not to use ``Butler.__init__`` directly so we can force some
509 of its many arguments to be keyword-only (note that ``__reduce__``
510 can only invoke callables with positional arguments).
512 Parameters
513 ----------
514 config : `ButlerConfig`
515 Butler configuration, already coerced into a true `ButlerConfig`
516 instance (and hence after any search paths for overrides have been
517 utilized).
518 collections : `CollectionSearch`
519 Names of the default collections to read from.
520 run : `str`, optional
521 Name of the default `~CollectionType.RUN` collection to write to.
522 defaultDataId : `dict` [ `str`, `str` ]
523 Default data ID values.
524 writeable : `bool`
525 Whether the Butler should support write operations.
527 Returns
528 -------
529 butler : `Butler`
530 A new `Butler` instance.
531 """
532 # MyPy doesn't recognize that the kwargs below are totally valid; it
533 # seems to think ``**defaultDataId`` is a _positional_ argument!
534 return cls(
535 config=config,
536 collections=collections,
537 run=run,
538 writeable=writeable,
539 **defaultDataId, # type: ignore
540 )
542 def __reduce__(self) -> tuple:
543 """Support pickling."""
544 return (
545 Butler._unpickle,
546 (
547 self._config,
548 self.collections,
549 self.run,
550 self.registry.defaults.dataId.byName(),
551 self.registry.isWriteable(),
552 ),
553 )
555 def __str__(self) -> str:
556 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
557 self.collections, self.run, self.datastore, self.registry
558 )
560 def isWriteable(self) -> bool:
561 """Return `True` if this `Butler` supports write operations."""
562 return self.registry.isWriteable()
564 @contextlib.contextmanager
565 def transaction(self) -> Iterator[None]:
566 """Context manager supporting `Butler` transactions.
568 Transactions can be nested.
569 """
570 with self.registry.transaction():
571 with self.datastore.transaction():
572 yield
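# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of grouping several writes in one transaction so that a
# failure rolls back both registry and datastore changes together. The
# repository path, run name, dataset type "metrics" (assumed registered with
# {instrument, visit} dimensions) and data ID values are all assumptions.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", run="u/alice/demo")
with butler.transaction():
    butler.put({"value": 1}, "metrics", instrument="HSC", visit=903334)
    butler.put({"value": 2}, "metrics", instrument="HSC", visit=903336)
    # An exception raised anywhere in this block rolls back both puts.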
574 def _standardizeArgs(
575 self,
576 datasetRefOrType: Union[DatasetRef, DatasetType, str],
577 dataId: Optional[DataId] = None,
578 for_put: bool = True,
579 **kwargs: Any,
580 ) -> Tuple[DatasetType, Optional[DataId]]:
581 """Standardize the arguments passed to several Butler APIs.
583 Parameters
584 ----------
585 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
586 When `DatasetRef` the `dataId` should be `None`.
587 Otherwise the `DatasetType` or name thereof.
588 dataId : `dict` or `DataCoordinate`
589 A `dict` of `Dimension` link name, value pairs that label the
590 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
591 should be provided as the first argument.
592 for_put : `bool`, optional
593 If `True` this call is invoked as part of a `Butler.put()`.
594 Otherwise it is assumed to be part of a `Butler.get()`. This
595 parameter is only relevant if there is dataset type
596 inconsistency.
597 **kwargs
598 Additional keyword arguments used to augment or construct a
599 `DataCoordinate`. See `DataCoordinate.standardize`
600 parameters.
602 Returns
603 -------
604 datasetType : `DatasetType`
605 A `DatasetType` instance extracted from ``datasetRefOrType``.
606 dataId : `dict` or `DataId`, optional
607 Argument that can be used (along with ``kwargs``) to construct a
608 `DataId`.
610 Notes
611 -----
612 Butler APIs that conceptually need a DatasetRef also allow passing a
613 `DatasetType` (or the name of one) and a `DataId` (or a dict and
614 keyword arguments that can be used to construct one) separately. This
615 method accepts those arguments and always returns a true `DatasetType`
616 and a `DataId` or `dict`.
618 Standardization of `dict` vs `DataId` is best handled by passing the
619 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
620 generally similarly flexible.
621 """
622 externalDatasetType: Optional[DatasetType] = None
623 internalDatasetType: Optional[DatasetType] = None
624 if isinstance(datasetRefOrType, DatasetRef):
625 if dataId is not None or kwargs:
626 raise ValueError("DatasetRef given, cannot use dataId as well")
627 externalDatasetType = datasetRefOrType.datasetType
628 dataId = datasetRefOrType.dataId
629 else:
630 # Don't check whether DataId is provided, because Registry APIs
631 # can usually construct a better error message when it wasn't.
632 if isinstance(datasetRefOrType, DatasetType):
633 externalDatasetType = datasetRefOrType
634 else:
635 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
637 # Check that they are self-consistent
638 if externalDatasetType is not None:
639 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
640 if externalDatasetType != internalDatasetType:
641 # We can allow differences if they are compatible, depending
642 # on whether this is a get or a put. A get requires that
643 # the python type associated with the datastore can be
644 # converted to the user type. A put requires that the user
645 # supplied python type can be converted to the internal
646 # type expected by registry.
647 relevantDatasetType = internalDatasetType
648 if for_put:
649 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
650 else:
651 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
652 relevantDatasetType = externalDatasetType
653 if not is_compatible:
654 raise ValueError(
655 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
656 f"registry definition ({internalDatasetType})"
657 )
658 # Override the internal definition.
659 internalDatasetType = relevantDatasetType
661 assert internalDatasetType is not None
662 return internalDatasetType, dataId
664 def _rewrite_data_id(
665 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
666 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
667 """Rewrite a data ID taking into account dimension records.
669 Take a Data ID and keyword args and rewrite it if necessary to
670 allow the user to specify dimension records rather than dimension
671 primary values.
673 This allows a user to include a dataId dict with keys of
674 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
675 the integer exposure ID. It also allows a string to be given
676 for a dimension value rather than the integer ID if that is more
677 convenient. For example, rather than having to specify the
678 detector with ``detector.full_name``, a string given for ``detector``
679 will be interpreted as the full name and converted to the integer
680 value.
682 Keyword arguments can also use strings for dimensions like detector
683 and exposure but python does not allow them to include ``.`` and
684 so the ``exposure.day_obs`` syntax can not be used in a keyword
685 argument.
687 Parameters
688 ----------
689 dataId : `dict` or `DataCoordinate`
690 A `dict` of `Dimension` link name, value pairs that will label the
691 `DatasetRef` within a Collection.
692 datasetType : `DatasetType`
693 The dataset type associated with this dataId. Required to
694 determine the relevant dimensions.
695 **kwargs
696 Additional keyword arguments used to augment or construct a
697 `DataId`. See `DataId` parameters.
699 Returns
700 -------
701 dataId : `dict` or `DataCoordinate`
702 The possibly rewritten dataId. If given a `DataCoordinate` and
703 no keyword arguments, the original dataId will be returned
704 unchanged.
705 **kwargs : `dict`
706 Any unused keyword arguments (normally an empty `dict`).
707 """
708 # Do nothing if we have a standalone DataCoordinate.
709 if isinstance(dataId, DataCoordinate) and not kwargs:
710 return dataId, kwargs
712 # Process dimension records that are using record information
713 # rather than ids
714 newDataId: Dict[str, DataIdValue] = {}
715 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
717 # if all the dataId comes from keyword parameters we do not need
718 # to do anything here because they can't be of the form
719 # exposure.obs_id, since a "." is not allowed in a keyword parameter.
720 if dataId:
721 for k, v in dataId.items():
722 # If we have a Dimension we do not need to do anything
723 # because it cannot be a compound key.
724 if isinstance(k, str) and "." in k:
725 # Someone is using a more human-readable dataId
726 dimensionName, record = k.split(".", 1)
727 byRecord[dimensionName][record] = v
728 elif isinstance(k, Dimension):
729 newDataId[k.name] = v
730 else:
731 newDataId[k] = v
733 # Go through the updated dataId and check the type in case someone is
734 # using an alternate key. We have already filtered out the
735 # compound-key dimension.record format.
736 not_dimensions = {}
738 # Will need to look in the dataId and the keyword arguments
739 # and will remove them if they need to be fixed or are unrecognized.
740 for dataIdDict in (newDataId, kwargs):
741 # Use a list so we can adjust the dict safely in the loop
742 for dimensionName in list(dataIdDict):
743 value = dataIdDict[dimensionName]
744 try:
745 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
746 except KeyError:
747 # This is not a real dimension
748 not_dimensions[dimensionName] = value
749 del dataIdDict[dimensionName]
750 continue
752 # Convert an integral type to an explicit int to simplify
753 # comparisons here
754 if isinstance(value, numbers.Integral):
755 value = int(value)
757 if not isinstance(value, dimension.primaryKey.getPythonType()):
758 for alternate in dimension.alternateKeys:
759 if isinstance(value, alternate.getPythonType()):
760 byRecord[dimensionName][alternate.name] = value
761 del dataIdDict[dimensionName]
762 log.debug(
763 "Converting dimension %s to %s.%s=%s",
764 dimensionName,
765 dimensionName,
766 alternate.name,
767 value,
768 )
769 break
770 else:
771 log.warning(
772 "Type mismatch found for value '%r' provided for dimension %s. "
773 "Could not find matching alternative (primary key has type %s) "
774 "so attempting to use as-is.",
775 value,
776 dimensionName,
777 dimension.primaryKey.getPythonType(),
778 )
780 # By this point kwargs and newDataId should only include valid
781 # dimensions. Merge kwargs into the new dataId and log if there
782 # are dimensions in both (rather than calling update).
783 for k, v in kwargs.items():
784 if k in newDataId and newDataId[k] != v:
785 log.debug(
786 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
787 )
788 newDataId[k] = v
789 # No need to retain any values in kwargs now.
790 kwargs = {}
792 # If we have some unrecognized dimensions we have to try to connect
793 # them to records in other dimensions. This is made more complicated
794 # by some dimensions having records with clashing names. A mitigation
795 # is that we can tell by this point which dimensions are missing
796 # for the DatasetType but this does not work for calibrations
797 # where additional dimensions can be used to constrain the temporal
798 # axis.
799 if not_dimensions:
800 # Search for all dimensions even if we have been given a value
801 # explicitly. In some cases records are given as well as the
802 # actual dimension, and this should not be an error if they
803 # match.
804 mandatoryDimensions = datasetType.dimensions.names # - provided
806 candidateDimensions: Set[str] = set()
807 candidateDimensions.update(mandatoryDimensions)
809 # For calibrations we may well be needing temporal dimensions
810 # so rather than always including all dimensions in the scan
811 # restrict things a little. It is still possible for there
812 # to be confusion over day_obs in visit vs exposure for example.
813 # If we are not searching calibration collections things may
814 # fail but they are going to fail anyway because of the
815 # ambiguity of the dataId...
816 if datasetType.isCalibration():
817 for dim in self.registry.dimensions.getStaticDimensions():
818 if dim.temporal:
819 candidateDimensions.add(str(dim))
821 # Look up table for the first association with a dimension
822 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
824 # Keep track of whether an item is associated with multiple
825 # dimensions.
826 counter: Counter[str] = Counter()
827 assigned: Dict[str, Set[str]] = defaultdict(set)
829 # Go through the missing dimensions and associate the
830 # given names with records within those dimensions
831 matched_dims = set()
832 for dimensionName in candidateDimensions:
833 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
834 fields = dimension.metadata.names | dimension.uniqueKeys.names
835 for field in not_dimensions:
836 if field in fields:
837 guessedAssociation[dimensionName][field] = not_dimensions[field]
838 counter[dimensionName] += 1
839 assigned[field].add(dimensionName)
840 matched_dims.add(field)
842 # Calculate the fields that matched nothing.
843 never_found = set(not_dimensions) - matched_dims
845 if never_found:
846 raise DimensionNameError(f"Unrecognized keyword args given: {never_found}")
848 # There is a chance we have allocated a single dataId item
849 # to multiple dimensions. Need to decide which should be retained.
850 # For now assume that the most popular alternative wins.
851 # This means that day_obs with seq_num will result in
852 # exposure.day_obs and not visit.day_obs
853 # Also prefer an explicitly missing dimension over an inferred
854 # temporal dimension.
855 for fieldName, assignedDimensions in assigned.items():
856 if len(assignedDimensions) > 1:
857 # Pick the most popular (preferring mandatory dimensions)
858 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
859 if requiredButMissing:
860 candidateDimensions = requiredButMissing
861 else:
862 candidateDimensions = assignedDimensions
864 # Select the relevant items and get a new restricted
865 # counter.
866 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
867 duplicatesCounter: Counter[str] = Counter()
868 duplicatesCounter.update(theseCounts)
870 # Choose the most common. If they are equally common
871 # we will pick the one that was found first.
872 # Returns a list of tuples
873 selected = duplicatesCounter.most_common(1)[0][0]
875 log.debug(
876 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
877 " Removed ambiguity by choosing dimension %s.",
878 fieldName,
879 ", ".join(assignedDimensions),
880 selected,
881 )
883 for candidateDimension in assignedDimensions:
884 if candidateDimension != selected:
885 del guessedAssociation[candidateDimension][fieldName]
887 # Update the record look up dict with the new associations
888 for dimensionName, values in guessedAssociation.items():
889 if values: # A dict might now be empty
890 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
891 byRecord[dimensionName].update(values)
893 if byRecord:
894 # Some record specifiers were found so we need to convert
895 # them to the Id form
896 for dimensionName, values in byRecord.items():
897 if dimensionName in newDataId:
898 log.debug(
899 "DataId specified explicit %s dimension value of %s in addition to"
900 " general record specifiers for it of %s. Ignoring record information.",
901 dimensionName,
902 newDataId[dimensionName],
903 str(values),
904 )
905 # Get the actual record and compare with these values.
906 try:
907 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
908 except DataIdError:
909 raise DataIdValueError(
910 f"Could not find dimension '{dimensionName}'"
911 f" with dataId {newDataId} as part of comparing with"
912 f" record values {byRecord[dimensionName]}"
913 ) from None
914 if len(recs) == 1:
915 errmsg: List[str] = []
916 for k, v in values.items():
917 if (recval := getattr(recs[0], k)) != v:
918 errmsg.append(f"{k}({recval} != {v})")
919 if errmsg:
920 raise InconsistentDataIdError(
921 f"Dimension {dimensionName} in dataId has explicit value"
922 " inconsistent with records: " + ", ".join(errmsg)
923 )
924 else:
925 # Multiple matches for an explicit dimension
926 # should never happen but let downstream complain.
927 pass
928 continue
930 # Build up a WHERE expression
931 bind = {k: v for k, v in values.items()}
932 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
934 # Hopefully we get a single record that matches
935 records = set(
936 self.registry.queryDimensionRecords(
937 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
938 )
939 )
941 if len(records) != 1:
942 if len(records) > 1:
943 log.debug("Received %d records from constraints of %s", len(records), str(values))
944 for r in records:
945 log.debug("- %s", str(r))
946 raise InconsistentDataIdError(
947 f"DataId specification for dimension {dimensionName} is not"
948 f" uniquely constrained to a single dataset by {values}."
949 f" Got {len(records)} results."
950 )
951 raise InconsistentDataIdError(
952 f"DataId specification for dimension {dimensionName} matched no"
953 f" records when constrained by {values}"
954 )
956 # Get the primary key from the real dimension object
957 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
958 if not isinstance(dimension, Dimension):
959 raise RuntimeError(
960 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
961 )
962 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
964 return newDataId, kwargs
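# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of the alternative data ID spellings handled by the
# rewriting above. The repository, collection, instrument, detector and
# exposure values are assumptions; any dataset type with
# {instrument, detector, exposure} dimensions behaves the same way.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections="LATISS/raw/all")

# Canonical form: integer primary keys.
raw = butler.get("raw", instrument="LATISS", detector=0, exposure=2021052500001)

# Alternate key: a string for ``detector`` is resolved via its full name.
raw = butler.get("raw", instrument="LATISS", detector="RXX_S00", exposure=2021052500001)

# Record syntax (dict form only, because "." is not valid in a keyword).
raw = butler.get(
    "raw",
    {"exposure.day_obs": 20210525, "exposure.seq_num": 1},
    instrument="LATISS",
    detector=0,
)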
966 def _findDatasetRef(
967 self,
968 datasetRefOrType: Union[DatasetRef, DatasetType, str],
969 dataId: Optional[DataId] = None,
970 *,
971 collections: Any = None,
972 allowUnresolved: bool = False,
973 **kwargs: Any,
974 ) -> DatasetRef:
975 """Shared logic for methods that start with a search for a dataset in
976 the registry.
978 Parameters
979 ----------
980 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
981 When `DatasetRef` the `dataId` should be `None`.
982 Otherwise the `DatasetType` or name thereof.
983 dataId : `dict` or `DataCoordinate`, optional
984 A `dict` of `Dimension` link name, value pairs that label the
985 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
986 should be provided as the first argument.
987 collections : Any, optional
988 Collections to be searched, overriding ``self.collections``.
989 Can be any of the types supported by the ``collections`` argument
990 to butler construction.
991 allowUnresolved : `bool`, optional
992 If `True`, return an unresolved `DatasetRef` if finding a resolved
993 one in the `Registry` fails. Defaults to `False`.
994 **kwargs
995 Additional keyword arguments used to augment or construct a
996 `DataId`. See `DataId` parameters.
998 Returns
999 -------
1000 ref : `DatasetRef`
1001 A reference to the dataset identified by the given arguments.
1003 Raises
1004 ------
1005 LookupError
1006 Raised if no matching dataset exists in the `Registry` (and
1007 ``allowUnresolved is False``).
1008 ValueError
1009 Raised if a resolved `DatasetRef` was passed as an input, but it
1010 differs from the one found in the registry.
1011 TypeError
1012 Raised if no collections were provided.
1013 """
1014 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1015 if isinstance(datasetRefOrType, DatasetRef):
1016 idNumber = datasetRefOrType.id
1017 else:
1018 idNumber = None
1019 timespan: Optional[Timespan] = None
1021 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1023 if datasetType.isCalibration():
1024 # Because this is a calibration dataset, first try to
1025 # standardize the data ID without restricting the dimensions to
1026 # those of the dataset type requested, because there may be extra
1027 # dimensions that provide temporal information for a validity-range
1028 # lookup.
1029 dataId = DataCoordinate.standardize(
1030 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1031 )
1032 if dataId.graph.temporal:
1033 dataId = self.registry.expandDataId(dataId)
1034 timespan = dataId.timespan
1035 else:
1036 # Standardize the data ID to just the dimensions of the dataset
1037 # type instead of letting registry.findDataset do it, so we get the
1038 # result even if no dataset is found.
1039 dataId = DataCoordinate.standardize(
1040 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1041 )
1042 # Always lookup the DatasetRef, even if one is given, to ensure it is
1043 # present in the current collection.
1044 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1045 if ref is None:
1046 if allowUnresolved:
1047 return DatasetRef(datasetType, dataId)
1048 else:
1049 if collections is None:
1050 collections = self.registry.defaults.collections
1051 raise LookupError(
1052 f"Dataset {datasetType.name} with data ID {dataId} "
1053 f"could not be found in collections {collections}."
1054 )
1055 if idNumber is not None and idNumber != ref.id:
1056 if collections is None:
1057 collections = self.registry.defaults.collections
1058 raise ValueError(
1059 f"DatasetRef.id provided ({idNumber}) does not match "
1060 f"id ({ref.id}) in registry in collections {collections}."
1061 )
1062 if datasetType != ref.datasetType:
1063 # If they differ it is because the user explicitly specified
1064 # a compatible dataset type to this call rather than using the
1065 # registry definition. The DatasetRef must therefore be recreated
1066 # using the user definition such that the expected type is
1067 # returned.
1068 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1070 return ref
1072 @transactional
1073 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
1074 # Docstring inherited.
1075 (imported_ref,) = self.registry._importDatasets(
1076 [ref],
1077 expand=True,
1078 )
1079 if imported_ref.id != ref.getCheckedId():
1080 raise RuntimeError("This registry configuration does not support putDirect.")
1081 self.datastore.put(obj, ref)
1082 return ref
1084 @transactional
1085 def put(
1086 self,
1087 obj: Any,
1088 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1089 dataId: Optional[DataId] = None,
1090 *,
1091 run: Optional[str] = None,
1092 **kwargs: Any,
1093 ) -> DatasetRef:
1094 """Store and register a dataset.
1096 Parameters
1097 ----------
1098 obj : `object`
1099 The dataset.
1100 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1101 When `DatasetRef` is provided, ``dataId`` should be `None`.
1102 Otherwise the `DatasetType` or name thereof.
1103 dataId : `dict` or `DataCoordinate`
1104 A `dict` of `Dimension` link name, value pairs that label the
1105 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1106 should be provided as the second argument.
1107 run : `str`, optional
1108 The name of the run the dataset should be added to, overriding
1109 ``self.run``.
1110 **kwargs
1111 Additional keyword arguments used to augment or construct a
1112 `DataCoordinate`. See `DataCoordinate.standardize`
1113 parameters.
1115 Returns
1116 -------
1117 ref : `DatasetRef`
1118 A reference to the stored dataset, updated with the correct id if
1119 given.
1121 Raises
1122 ------
1123 TypeError
1124 Raised if the butler is read-only or if no run has been provided.
1125 """
1126 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1127 if not self.isWriteable():
1128 raise TypeError("Butler is read-only.")
1129 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1130 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1131 raise ValueError("DatasetRef must not be in registry, must have None id")
1133 # Handle dimension records in dataId
1134 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1136 # Add Registry Dataset entry.
1137 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1139 # For an execution butler the datasets will be pre-defined.
1140 # If the butler is configured that way datasets should only be inserted
1141 # if they do not already exist in registry. Trying and catching
1142 # ConflictingDefinitionError will not work because the transaction
1143 # will be corrupted. Instead, in this mode always check first.
1144 ref = None
1145 ref_is_predefined = False
1146 if self._allow_put_of_predefined_dataset:
1147 # Get the matching ref for this run.
1148 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1150 if ref:
1151 # Must be expanded form for datastore templating
1152 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1153 ref = ref.expanded(dataId)
1154 ref_is_predefined = True
1156 if not ref:
1157 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1159 # If the ref is predefined it is possible that the datastore also
1160 # has the record. Asking the datastore to put it again would recreate
1161 # the artifact, overwriting the previous one, and the subsequent
1162 # failure to write the record would then cause the artifact
1163 # to be removed. Much safer to ask first before attempting to
1164 # overwrite. Race conditions should not be an issue for the
1165 # execution butler environment.
1166 if ref_is_predefined:
1167 if self.datastore.knows(ref):
1168 raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.")
1170 self.datastore.put(obj, ref)
1172 return ref
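# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of registering a dataset type and putting a dataset into
# a run. The type name, its dimensions, the run name and the data ID are
# assumptions; the storage class ("StructuredDataDict" here) must exist in
# the repository's storage class configuration.
from lsst.daf.butler import Butler, DatasetType

butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
datasetType = DatasetType(
    "sourceCountSummary",
    dimensions=["instrument", "visit"],
    storageClass="StructuredDataDict",
    universe=butler.registry.dimensions,
)
butler.registry.registerDatasetType(datasetType)
ref = butler.put({"nSources": 42}, "sourceCountSummary", instrument="HSC", visit=903334)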
1174 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1175 """Retrieve a stored dataset.
1177 Unlike `Butler.get`, this method allows datasets outside the Butler's
1178 collection to be read as long as the `DatasetRef` that identifies them
1179 can be obtained separately.
1181 Parameters
1182 ----------
1183 ref : `DatasetRef`
1184 Resolved reference to an already stored dataset.
1185 parameters : `dict`
1186 Additional StorageClass-defined options to control reading,
1187 typically used to efficiently read only a subset of the dataset.
1189 Returns
1190 -------
1191 obj : `object`
1192 The dataset.
1193 """
1194 return self.datastore.get(ref, parameters=parameters)
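# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of reading datasets without a default collection search:
# resolve refs through the registry first and hand them to getDirect(). The
# collection and dataset type names are assumptions.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")
refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/RC2", findFirst=True)
for ref in refs:
    exposure = butler.getDirect(ref)  # no collection search; ref is already resolved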
1196 def getDirectDeferred(
1197 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
1198 ) -> DeferredDatasetHandle:
1199 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1200 from a resolved `DatasetRef`.
1202 Parameters
1203 ----------
1204 ref : `DatasetRef`
1205 Resolved reference to an already stored dataset.
1206 parameters : `dict`
1207 Additional StorageClass-defined options to control reading,
1208 typically used to efficiently read only a subset of the dataset.
1210 Returns
1211 -------
1212 obj : `DeferredDatasetHandle`
1213 A handle which can be used to retrieve a dataset at a later time.
1215 Raises
1216 ------
1217 AmbiguousDatasetError
1218 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1219 """
1220 if ref.id is None:
1221 raise AmbiguousDatasetError(
1222 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1223 )
1224 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1226 def getDeferred(
1227 self,
1228 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1229 dataId: Optional[DataId] = None,
1230 *,
1231 parameters: Union[dict, None] = None,
1232 collections: Any = None,
1233 **kwargs: Any,
1234 ) -> DeferredDatasetHandle:
1235 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1236 after an immediate registry lookup.
1238 Parameters
1239 ----------
1240 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1241 When `DatasetRef` the `dataId` should be `None`.
1242 Otherwise the `DatasetType` or name thereof.
1243 dataId : `dict` or `DataCoordinate`, optional
1244 A `dict` of `Dimension` link name, value pairs that label the
1245 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1246 should be provided as the first argument.
1247 parameters : `dict`
1248 Additional StorageClass-defined options to control reading,
1249 typically used to efficiently read only a subset of the dataset.
1250 collections : Any, optional
1251 Collections to be searched, overriding ``self.collections``.
1252 Can be any of the types supported by the ``collections`` argument
1253 to butler construction.
1254 **kwargs
1255 Additional keyword arguments used to augment or construct a
1256 `DataId`. See `DataId` parameters.
1258 Returns
1259 -------
1260 obj : `DeferredDatasetHandle`
1261 A handle which can be used to retrieve a dataset at a later time.
1263 Raises
1264 ------
1265 LookupError
1266 Raised if no matching dataset exists in the `Registry` (and
1267 ``allowUnresolved is False``).
1268 ValueError
1269 Raised if a resolved `DatasetRef` was passed as an input, but it
1270 differs from the one found in the registry.
1271 TypeError
1272 Raised if no collections were provided.
1273 """
1274 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1275 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
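# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of deferring the read: the registry lookup happens now,
# while the (potentially large) dataset is only fetched when the handle's
# get() is called. Repository, collection and data ID values are assumptions.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections="HSC/runs/RC2")
handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=10)
# ... later, possibly in a different part of the code ...
exposure = handle.get()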
1277 def get(
1278 self,
1279 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1280 dataId: Optional[DataId] = None,
1281 *,
1282 parameters: Optional[Dict[str, Any]] = None,
1283 collections: Any = None,
1284 **kwargs: Any,
1285 ) -> Any:
1286 """Retrieve a stored dataset.
1288 Parameters
1289 ----------
1290 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1291 When `DatasetRef` the `dataId` should be `None`.
1292 Otherwise the `DatasetType` or name thereof.
1293 dataId : `dict` or `DataCoordinate`
1294 A `dict` of `Dimension` link name, value pairs that label the
1295 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1296 should be provided as the first argument.
1297 parameters : `dict`
1298 Additional StorageClass-defined options to control reading,
1299 typically used to efficiently read only a subset of the dataset.
1300 collections : Any, optional
1301 Collections to be searched, overriding ``self.collections``.
1302 Can be any of the types supported by the ``collections`` argument
1303 to butler construction.
1304 **kwargs
1305 Additional keyword arguments used to augment or construct a
1306 `DataCoordinate`. See `DataCoordinate.standardize`
1307 parameters.
1309 Returns
1310 -------
1311 obj : `object`
1312 The dataset.
1314 Raises
1315 ------
1316 ValueError
1317 Raised if a resolved `DatasetRef` was passed as an input, but it
1318 differs from the one found in the registry.
1319 LookupError
1320 Raised if no matching dataset exists in the `Registry`.
1321 TypeError
1322 Raised if no collections were provided.
1324 Notes
1325 -----
1326 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1327 this method requires that the given data ID include temporal dimensions
1328 beyond the dimensions of the dataset type itself, in order to find the
1329 dataset with the appropriate validity range. For example, a "bias"
1330 dataset with native dimensions ``{instrument, detector}`` could be
1331 fetched with a ``{instrument, detector, exposure}`` data ID, because
1332 ``exposure`` is a temporal dimension.
1333 """
1334 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1335 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1336 return self.getDirect(ref, parameters=parameters)
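# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of the calibration lookup described in the Notes above:
# "bias" has only {instrument, detector} dimensions, but adding the temporal
# ``exposure`` dimension selects the matching validity range. The collection
# name and data ID values are assumptions.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections="LATISS/calib")
bias = butler.get("bias", instrument="LATISS", detector=0, exposure=2021052500001)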
1338 def getURIs(
1339 self,
1340 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1341 dataId: Optional[DataId] = None,
1342 *,
1343 predict: bool = False,
1344 collections: Any = None,
1345 run: Optional[str] = None,
1346 **kwargs: Any,
1347 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
1348 """Returns the URIs associated with the dataset.
1350 Parameters
1351 ----------
1352 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1353 When `DatasetRef` the `dataId` should be `None`.
1354 Otherwise the `DatasetType` or name thereof.
1355 dataId : `dict` or `DataCoordinate`
1356 A `dict` of `Dimension` link name, value pairs that label the
1357 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1358 should be provided as the first argument.
1359 predict : `bool`
1360 If `True`, allow URIs to be returned for datasets that have not
1361 yet been written.
1362 collections : Any, optional
1363 Collections to be searched, overriding ``self.collections``.
1364 Can be any of the types supported by the ``collections`` argument
1365 to butler construction.
1366 run : `str`, optional
1367 Run to use for predictions, overriding ``self.run``.
1368 **kwargs
1369 Additional keyword arguments used to augment or construct a
1370 `DataCoordinate`. See `DataCoordinate.standardize`
1371 parameters.
1373 Returns
1374 -------
1375 primary : `lsst.resources.ResourcePath`
1376 The URI to the primary artifact associated with this dataset.
1377 If the dataset was disassembled within the datastore this
1378 may be `None`.
1379 components : `dict`
1380 URIs to any components associated with the dataset artifact.
1381 Can be empty if there are no components.
1382 """
1383 ref = self._findDatasetRef(
1384 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1385 )
1386 if ref.id is None: # only possible if predict is True
1387 if run is None:
1388 run = self.run
1389 if run is None:
1390 raise TypeError("Cannot predict location with run=None.")
1391 # Lie about ID, because we can't guess it, and only
1392 # Datastore.getURIs() will ever see it (and it doesn't use it).
1393 ref = ref.resolved(id=0, run=run)
1394 return self.datastore.getURIs(ref, predict)
1396 def getURI(
1397 self,
1398 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1399 dataId: Optional[DataId] = None,
1400 *,
1401 predict: bool = False,
1402 collections: Any = None,
1403 run: Optional[str] = None,
1404 **kwargs: Any,
1405 ) -> ResourcePath:
1406 """Return the URI to the Dataset.
1408 Parameters
1409 ----------
1410 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1411 When `DatasetRef` the `dataId` should be `None`.
1412 Otherwise the `DatasetType` or name thereof.
1413 dataId : `dict` or `DataCoordinate`
1414 A `dict` of `Dimension` link name, value pairs that label the
1415 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1416 should be provided as the first argument.
1417 predict : `bool`
1418 If `True`, allow URIs to be returned for datasets that have not
1419 yet been written.
1420 collections : Any, optional
1421 Collections to be searched, overriding ``self.collections``.
1422 Can be any of the types supported by the ``collections`` argument
1423 to butler construction.
1424 run : `str`, optional
1425 Run to use for predictions, overriding ``self.run``.
1426 **kwargs
1427 Additional keyword arguments used to augment or construct a
1428 `DataCoordinate`. See `DataCoordinate.standardize`
1429 parameters.
1431 Returns
1432 -------
1433 uri : `lsst.resources.ResourcePath`
1434 URI pointing to the Dataset within the datastore. If the
1435 Dataset does not exist in the datastore, and if ``predict`` is
1436 `True`, the URI will be a prediction and will include a URI
1437 fragment "#predicted".
1438 If the datastore does not have entities that relate well
1439 to the concept of a URI the returned URI string will be
1440 descriptive. The returned URI is not guaranteed to be obtainable.
1442 Raises
1443 ------
1444 LookupError
1445 Raised if a URI has been requested for a dataset that does not
1446 exist and guessing is not allowed.
1447 ValueError
1448 Raised if a resolved `DatasetRef` was passed as an input, but it
1449 differs from the one found in the registry.
1450 TypeError
1451 Raised if no collections were provided.
1452 RuntimeError
1453 Raised if a URI is requested for a dataset that consists of
1454 multiple artifacts.
1455 """
1456 primary, components = self.getURIs(
1457 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1458 )
1460 if primary is None or components:
1461 raise RuntimeError(
1462 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1463 "Use Butler.getURIs() instead."
1464 )
1465 return primary
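# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of asking for dataset locations, including a predicted
# URI for a dataset that has not been written yet. Names and data ID values
# are assumptions; datasets stored as multiple artifacts need getURIs().
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections="HSC/runs/RC2")
uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=10)

# Predicted location for a not-yet-written dataset (URI ends in "#predicted").
future = butler.getURI(
    "calexp", instrument="HSC", visit=903336, detector=10,
    predict=True, run="u/alice/DM-50000/a",
)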
1467 def retrieveArtifacts(
1468 self,
1469 refs: Iterable[DatasetRef],
1470 destination: ResourcePathExpression,
1471 transfer: str = "auto",
1472 preserve_path: bool = True,
1473 overwrite: bool = False,
1474 ) -> List[ResourcePath]:
1475 """Retrieve the artifacts associated with the supplied refs.
1477 Parameters
1478 ----------
1479 refs : iterable of `DatasetRef`
1480 The datasets for which artifacts are to be retrieved.
1481 A single ref can result in multiple artifacts. The refs must
1482 be resolved.
1483 destination : `lsst.resources.ResourcePath` or `str`
1484 Location to write the artifacts.
1485 transfer : `str`, optional
1486 Method to use to transfer the artifacts. Must be one of the options
1487 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1488 "move" is not allowed.
1489 preserve_path : `bool`, optional
1490 If `True` the full path of the artifact within the datastore
1491 is preserved. If `False` the final file component of the path
1492 is used.
1493 overwrite : `bool`, optional
1494 If `True` allow transfers to overwrite existing files at the
1495 destination.
1497 Returns
1498 -------
1499 targets : `list` of `lsst.resources.ResourcePath`
1500 URIs of file artifacts in the destination location. Order is not
1501 preserved.
1503 Notes
1504 -----
1505 For non-file datastores the artifacts written to the destination
1506 may not match the representation inside the datastore. For example
1507 a hierarchical data structure in a NoSQL database may well be stored
1508 as a JSON file.
1509 """
1510 return self.datastore.retrieveArtifacts(
1511 refs,
1512 ResourcePath(destination),
1513 transfer=transfer,
1514 preserve_path=preserve_path,
1515 overwrite=overwrite,
1516 )
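# --- Editor-added illustrative sketch (not part of the module source) ---
# A minimal sketch of copying the file artifacts behind a query result to a
# local directory. The collection, dataset type and destination path are
# assumptions; "copy" is one of the ResourcePath.transfer_from() modes.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")
refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/RC2", findFirst=True)
paths = butler.retrieveArtifacts(refs, destination="/tmp/export", transfer="copy")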
1518 def datasetExists(
1519 self,
1520 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1521 dataId: Optional[DataId] = None,
1522 *,
1523 collections: Any = None,
1524 **kwargs: Any,
1525 ) -> bool:
1526 """Return True if the Dataset is actually present in the Datastore.
1528 Parameters
1529 ----------
1530 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1531 When `DatasetRef` the `dataId` should be `None`.
1532 Otherwise the `DatasetType` or name thereof.
1533 dataId : `dict` or `DataCoordinate`
1534 A `dict` of `Dimension` link name, value pairs that label the
1535 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1536 should be provided as the first argument.
1537 collections : Any, optional
1538 Collections to be searched, overriding ``self.collections``.
1539 Can be any of the types supported by the ``collections`` argument
1540 to butler construction.
1541 **kwargs
1542 Additional keyword arguments used to augment or construct a
1543 `DataCoordinate`. See `DataCoordinate.standardize`
1544 parameters.
1546 Raises
1547 ------
1548 LookupError
1549 Raised if the dataset is not even present in the Registry.
1550 ValueError
1551 Raised if a resolved `DatasetRef` was passed as an input, but it
1552 differs from the one found in the registry.
1553 TypeError
1554 Raised if no collections were provided.
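Examples
--------
A minimal sketch; the dataset type, collection and data ID values are
illustrative assumptions::

    # Hypothetical data ID values used for illustration only.
    if butler.datasetExists("calexp", instrument="HSC", visit=903334, detector=16,
                            collections="HSC/runs/example"):
        print("Artifact is present in the datastore.")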
1555 """
1556 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1557 return self.datastore.exists(ref)
1559 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1560 """Remove one or more `~CollectionType.RUN` collections and the
1561 datasets within them.
1563 Parameters
1564 ----------
1565 names : `Iterable` [ `str` ]
1566 The names of the collections to remove.
1567 unstore : `bool`, optional
1568 If `True` (default), delete datasets from all datastores in which
1569 they are present, and attempt to rollback the registry deletions if
1570 datastore deletions fail (which may not always be possible). If
1571 `False`, datastore records for these datasets are still removed,
1572 but any artifacts (e.g. files) will not be.
1574 Raises
1575 ------
1576 TypeError
1577 Raised if one or more collections are not of type
1578 `~CollectionType.RUN`.
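Examples
--------
A minimal sketch; the run name is an illustrative assumption::

    # Remove a hypothetical scratch run, deleting its datasets and artifacts.
    butler.removeRuns(["u/someone/scratch"], unstore=True)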
1579 """
1580 if not self.isWriteable():
1581 raise TypeError("Butler is read-only.")
1582 names = list(names)
1583 refs: List[DatasetRef] = []
1584 for name in names:
1585 collectionType = self.registry.getCollectionType(name)
1586 if collectionType is not CollectionType.RUN:
1587 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1588 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1589 with self.registry.transaction():
1590 if unstore:
1591 self.datastore.trash(refs)
1592 else:
1593 self.datastore.forget(refs)
1594 for name in names:
1595 self.registry.removeCollection(name)
1596 if unstore:
1597 # Point of no return for removing artifacts
1598 self.datastore.emptyTrash()
1600 def pruneCollection(
1601 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1602 ) -> None:
1603 """Remove a collection and possibly prune datasets within it.
1605 Parameters
1606 ----------
1607 name : `str`
1608 Name of the collection to remove. If this is a
1609 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1610 datasets within the collection are not modified unless ``unstore``
1611 is `True`. If this is a `~CollectionType.RUN` collection,
1612 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1613 are fully removed from the data repository.
1614 purge : `bool`, optional
1615 If `True`, permit `~CollectionType.RUN` collections to be removed,
1616 fully removing datasets within them. Requires ``unstore=True`` as
1617 well as an added precaution against accidental deletion. Must be
1618 `False` (default) if the collection is not a ``RUN``.
1619 unstore : `bool`, optional
1620 If `True`, remove all datasets in the collection from all
1621 datastores in which they appear.
1622 unlink : `list` [`str`], optional
1623 Before removing the given collection, unlink it from these
1624 parent collections.
1626 Raises
1627 ------
1628 TypeError
1629 Raised if the butler is read-only or arguments are mutually
1630 inconsistent.
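Examples
--------
A hedged sketch; the collection names are illustrative assumptions::

    # Fully remove a hypothetical RUN collection and the datasets within it.
    butler.pruneCollection("u/someone/old-run", purge=True, unstore=True)
    # Remove a hypothetical TAGGED collection, leaving its datasets stored.
    butler.pruneCollection("u/someone/tagged-selection")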
1631 """
1632 # See pruneDatasets comments for more information about the logic here;
1633 # the cases are almost the same, but here we can rely on Registry to
1634 # take care of everything but Datastore deletion when we remove the
1635 # collection.
1636 if not self.isWriteable():
1637 raise TypeError("Butler is read-only.")
1638 collectionType = self.registry.getCollectionType(name)
1639 if purge and not unstore:
1640 raise PurgeWithoutUnstorePruneCollectionsError()
1641 if collectionType is CollectionType.RUN and not purge:
1642 raise RunWithoutPurgePruneCollectionsError(collectionType)
1643 if collectionType is not CollectionType.RUN and purge:
1644 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1646 def remove(child: str, parent: str) -> None:
1647 """Remove a child collection from a parent collection."""
1648 # Remove child from parent.
1649 chain = list(self.registry.getCollectionChain(parent))
1650 try:
1651 chain.remove(child)
1652 except ValueError as e:
1653 raise RuntimeError(f"{child} is not a child of {parent}") from e
1654 self.registry.setCollectionChain(parent, chain)
1656 with self.registry.transaction():
1657 if unlink:
1658 for parent in unlink:
1659 remove(name, parent)
1660 if unstore:
1661 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1662 self.datastore.trash(refs)
1663 self.registry.removeCollection(name)
1665 if unstore:
1666 # Point of no return for removing artifacts
1667 self.datastore.emptyTrash()
1669 def pruneDatasets(
1670 self,
1671 refs: Iterable[DatasetRef],
1672 *,
1673 disassociate: bool = True,
1674 unstore: bool = False,
1675 tags: Iterable[str] = (),
1676 purge: bool = False,
1677 run: Optional[str] = None,
1678 ) -> None:
1679 """Remove one or more datasets from a collection and/or storage.
1681 Parameters
1682 ----------
1683 refs : `~collections.abc.Iterable` of `DatasetRef`
1684 Datasets to prune. These must be "resolved" references (not just
1685 a `DatasetType` and data ID).
1686 disassociate : `bool`, optional
1687 Disassociate pruned datasets from ``tags``, or from all collections
1688 if ``purge=True``.
1689 unstore : `bool`, optional
1690 If `True` (`False` is default) remove these datasets from all
1691 datastores known to this butler. Note that this will make it
1692 impossible to retrieve these datasets even via other collections.
1693 Datasets that are already not stored are ignored by this option.
1694 tags : `Iterable` [ `str` ], optional
1695 `~CollectionType.TAGGED` collections to disassociate the datasets
1696 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1697 `True`.
1698 purge : `bool`, optional
1699 If `True` (`False` is default), completely remove the dataset from
1700 the `Registry`. To prevent accidental deletions, ``purge`` may
1701 only be `True` if all of the following conditions are met:
1703 - All given datasets are in the given run;
1704 - ``disassociate`` is `True`;
1705 - ``unstore`` is `True`.
1707 This mode may remove provenance information from datasets other
1708 than those provided, and should be used with extreme care.
1710 Raises
1711 ------
1712 TypeError
1713 Raised if the butler is read-only, if no collection was provided,
1714 or the conditions for ``purge=True`` were not met.
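Examples
--------
A hedged sketch; the dataset type and run collection are illustrative
assumptions::

    # Fully remove some hypothetical datasets from registry and all datastores.
    refs = list(butler.registry.queryDatasets("calexp", collections="u/someone/run"))
    butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)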
1715 """
1716 if not self.isWriteable():
1717 raise TypeError("Butler is read-only.")
1718 if purge:
1719 if not disassociate:
1720 raise TypeError("Cannot pass purge=True without disassociate=True.")
1721 if not unstore:
1722 raise TypeError("Cannot pass purge=True without unstore=True.")
1723 elif disassociate:
1724 tags = tuple(tags)
1725 if not tags:
1726 raise TypeError("No tags provided but disassociate=True.")
1727 for tag in tags:
1728 collectionType = self.registry.getCollectionType(tag)
1729 if collectionType is not CollectionType.TAGGED:
1730 raise TypeError(
1731 f"Cannot disassociate from collection '{tag}' "
1732 f"of non-TAGGED type {collectionType.name}."
1733 )
1734 # Transform possibly-single-pass iterable into something we can iterate
1735 # over multiple times.
1736 refs = list(refs)
1737 # Pruning a component of a DatasetRef makes no sense since registry
1738 # doesn't know about components and datastore might not store
1739 # components in a separate file
1740 for ref in refs:
1741 if ref.datasetType.component():
1742 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1743 # We don't need an unreliable Datastore transaction for this, because
1744 # we've been extra careful to ensure that Datastore.trash only involves
1745 # mutating the Registry (it can _look_ at Datastore-specific things,
1746 # but shouldn't change them), and hence all operations here are
1747 # Registry operations.
1748 with self.registry.transaction():
1749 if unstore:
1750 self.datastore.trash(refs)
1751 if purge:
1752 self.registry.removeDatasets(refs)
1753 elif disassociate:
1754 assert tags, "Guaranteed by earlier logic in this function."
1755 for tag in tags:
1756 self.registry.disassociate(tag, refs)
1757 # We've exited the Registry transaction, and apparently committed.
1758 # (if there was an exception, everything rolled back, and it's as if
1759 # nothing happened - and we never get here).
1760 # Datastore artifacts are not yet gone, but they're clearly marked
1761 # as trash, so if we fail to delete now because of (e.g.) filesystem
1762 # problems we can try again later, and if manual administrative
1763 # intervention is required, it's pretty clear what that should entail:
1764 # deleting everything on disk and in private Datastore tables that is
1765 # in the dataset_location_trash table.
1766 if unstore:
1767 # Point of no return for removing artifacts
1768 self.datastore.emptyTrash()
1770 @transactional
1771 def ingest(
1772 self,
1773 *datasets: FileDataset,
1774 transfer: Optional[str] = "auto",
1775 run: Optional[str] = None,
1776 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1777 record_validation_info: bool = True,
1778 ) -> None:
1779 """Store and register one or more datasets that already exist on disk.
1781 Parameters
1782 ----------
1783 datasets : `FileDataset`
1784 Each positional argument is a struct containing information about
1785 a file to be ingested, including its URI (either absolute or
1786 relative to the datastore root, if applicable), a `DatasetRef`,
1787 and optionally a formatter class or its fully-qualified string
1788 name. If a formatter is not provided, the formatter that would be
1789 used for `put` is assumed. On successful return, all
1790 `FileDataset.ref` attributes will have their `DatasetRef.id`
1791 attribute populated and all `FileDataset.formatter` attributes will
1792 be set to the formatter class used. `FileDataset.path` attributes
1793 may be modified to put paths in whatever the datastore considers a
1794 standardized form.
1795 transfer : `str`, optional
1796 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1797 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1798 transfer the file.
1799 run : `str`, optional
1800 The name of the run ingested datasets should be added to,
1801 overriding ``self.run``.
1802 idGenerationMode : `DatasetIdGenEnum`, optional
1803 Specifies option for generating dataset IDs. By default unique IDs
1804 are generated for each inserted dataset.
1805 record_validation_info : `bool`, optional
1806 If `True`, the default, the datastore can record validation
1807 information associated with the file. If `False` the datastore
1808 will not attempt to track any information such as checksums
1809 or file sizes. This can be useful if such information is tracked
1810 in an external system or if the file is to be compressed in place.
1811 It is up to the datastore whether this parameter is relevant.
1813 Raises
1814 ------
1815 TypeError
1816 Raised if the butler is read-only or if no run was provided.
1817 NotImplementedError
1818 Raised if the `Datastore` does not support the given transfer mode.
1819 DatasetTypeNotSupportedError
1820 Raised if one or more files to be ingested have a dataset type that
1821 is not supported by the `Datastore`.
1822 FileNotFoundError
1823 Raised if one of the given files does not exist.
1824 FileExistsError
1825 Raised if transfer is not `None` but the (internal) location the
1826 file would be moved to is already occupied.
1828 Notes
1829 -----
1830 This operation is not fully exception safe: if a database operation
1831 fails, the given `FileDataset` instances may be only partially updated.
1833 It is atomic in terms of database operations (they will either all
1834 succeed or all fail) providing the database engine implements
1835 transactions correctly. It will attempt to be atomic in terms of
1836 filesystem operations as well, but this cannot be implemented
1837 rigorously for most datastores.
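Examples
--------
A hedged sketch; the file path, dataset type, run and data ID are
illustrative assumptions, and the dataset type and dimension records
must already exist in the repository::

    from lsst.daf.butler import DatasetRef, FileDataset

    # Hypothetical raw file and data ID, for illustration only.
    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 903334, "detector": 16})
    butler.ingest(FileDataset(path="/data/file.fits", refs=[ref]),
                  transfer="copy", run="HSC/raw/example")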
1838 """
1839 if not self.isWriteable():
1840 raise TypeError("Butler is read-only.")
1841 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1842 # Reorganize the inputs so they're grouped by DatasetType and then
1843 # data ID. We also include a list of DatasetRefs for each FileDataset
1844 # to hold the resolved DatasetRefs returned by the Registry, before
1845 # it's safe to swap them into FileDataset.refs.
1846 # Some type annotation aliases to make that clearer:
1847 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1848 GroupedData = MutableMapping[DatasetType, GroupForType]
1849 # The actual data structure:
1850 groupedData: GroupedData = defaultdict(dict)
1851 # And the nested loop that populates it:
1852 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1853 # This list intentionally shared across the inner loop, since it's
1854 # associated with `dataset`.
1855 resolvedRefs: List[DatasetRef] = []
1857 # Somewhere to store pre-existing refs if we have an
1858 # execution butler.
1859 existingRefs: List[DatasetRef] = []
1861 for ref in dataset.refs:
1862 if ref.dataId in groupedData[ref.datasetType]:
1863 raise ConflictingDefinitionError(
1864 f"Ingest conflict. Dataset {dataset.path} has same"
1865 " DataId as other ingest dataset"
1866 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1867 f" ({ref.dataId})"
1868 )
1869 if self._allow_put_of_predefined_dataset:
1870 existing_ref = self.registry.findDataset(
1871 ref.datasetType, dataId=ref.dataId, collections=run
1872 )
1873 if existing_ref:
1874 if self.datastore.knows(existing_ref):
1875 raise ConflictingDefinitionError(
1876 f"Dataset associated with path {dataset.path}"
1877 f" already exists as {existing_ref}."
1878 )
1879 # Store this ref elsewhere since it already exists
1880 # and we do not want to remake it but we do want
1881 # to store it in the datastore.
1882 existingRefs.append(existing_ref)
1884 # Nothing else to do until we have finished
1885 # iterating.
1886 continue
1888 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1890 if existingRefs:
1892 if len(dataset.refs) != len(existingRefs):
1893 # Keeping track of partially pre-existing datasets is hard
1894 # and should generally never happen. For now don't allow
1895 # it.
1896 raise ConflictingDefinitionError(
1897 f"For dataset {dataset.path} some dataIds already exist"
1898 " in registry but others do not. This is not supported."
1899 )
1901 # Attach the resolved refs if we found them.
1902 dataset.refs = existingRefs
1904 # Now we can bulk-insert into Registry for each DatasetType.
1905 for datasetType, groupForType in progress.iter_item_chunks(
1906 groupedData.items(), desc="Bulk-inserting datasets by type"
1907 ):
1908 refs = self.registry.insertDatasets(
1909 datasetType,
1910 dataIds=groupForType.keys(),
1911 run=run,
1912 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1913 idGenerationMode=idGenerationMode,
1914 )
1915 # Append those resolved DatasetRefs to the new lists we set up for
1916 # them.
1917 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1918 resolvedRefs.append(ref)
1920 # Go back to the original FileDatasets to replace their refs with the
1921 # new resolved ones.
1922 for groupForType in progress.iter_chunks(
1923 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1924 ):
1925 for dataset, resolvedRefs in groupForType.values():
1926 dataset.refs = resolvedRefs
1928 # Bulk-insert everything into Datastore.
1929 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1931 @contextlib.contextmanager
1932 def export(
1933 self,
1934 *,
1935 directory: Optional[str] = None,
1936 filename: Optional[str] = None,
1937 format: Optional[str] = None,
1938 transfer: Optional[str] = None,
1939 ) -> Iterator[RepoExportContext]:
1940 """Export datasets from the repository represented by this `Butler`.
1942 This method is a context manager that returns a helper object
1943 (`RepoExportContext`) that is used to indicate what information from
1944 the repository should be exported.
1946 Parameters
1947 ----------
1948 directory : `str`, optional
1949 Directory dataset files should be written to if ``transfer`` is not
1950 `None`.
1951 filename : `str`, optional
1952 Name for the file that will include database information associated
1953 with the exported datasets. If this is not an absolute path and
1954 ``directory`` is not `None`, it will be written to ``directory``
1955 instead of the current working directory. Defaults to
1956 "export.{format}".
1957 format : `str`, optional
1958 File format for the database information file. If `None`, the
1959 extension of ``filename`` will be used.
1960 transfer : `str`, optional
1961 Transfer mode passed to `Datastore.export`.
1963 Raises
1964 ------
1965 TypeError
1966 Raised if the set of arguments passed is inconsistent.
1968 Examples
1969 --------
1970 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1971 methods are used to provide the iterables over data IDs and/or datasets
1972 to be exported::
1974 with butler.export(filename="exports.yaml") as export:
1975 # Export all flats, but none of the dimension element rows
1976 # (i.e. data ID information) associated with them.
1977 export.saveDatasets(butler.registry.queryDatasets("flat"),
1978 elements=())
1979 # Export all datasets that start with "deepCoadd_" and all of
1980 # their associated data ID information.
1981 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1982 """
1983 if directory is None and transfer is not None:
1984 raise TypeError("Cannot transfer without providing a directory.")
1985 if transfer == "move":
1986 raise TypeError("Transfer may not be 'move': export is read-only")
1987 if format is None:
1988 if filename is None:
1989 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1990 else:
1991 _, format = os.path.splitext(filename)
1992 elif filename is None:
1993 filename = f"export.{format}"
1994 if directory is not None:
1995 filename = os.path.join(directory, filename)
1996 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
1997 with open(filename, "w") as stream:
1998 backend = BackendClass(stream)
1999 try:
2000 helper = RepoExportContext(
2001 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2002 )
2003 yield helper
2004 except BaseException:
2005 raise
2006 else:
2007 helper._finish()
2009 def import_(
2010 self,
2011 *,
2012 directory: Optional[str] = None,
2013 filename: Union[str, TextIO, None] = None,
2014 format: Optional[str] = None,
2015 transfer: Optional[str] = None,
2016 skip_dimensions: Optional[Set] = None,
2017 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2018 reuseIds: bool = False,
2019 ) -> None:
2020 """Import datasets into this repository that were exported from a
2021 different butler repository via `~lsst.daf.butler.Butler.export`.
2023 Parameters
2024 ----------
2025 directory : `str`, optional
2026 Directory containing dataset files to import from. If `None`,
2027 ``filename`` and all dataset file paths specified therein must
2028 be absolute.
2029 filename : `str` or `TextIO`, optional
2030 A stream or name of file that contains database information
2031 associated with the exported datasets, typically generated by
2032 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
2033 is not an absolute path, does not exist in the current working
2034 directory, and ``directory`` is not `None`, it is assumed to be in
2035 ``directory``. Defaults to "export.{format}".
2036 format : `str`, optional
2037 File format for ``filename``. If `None`, the extension of
2038 ``filename`` will be used.
2039 transfer : `str`, optional
2040 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2041 skip_dimensions : `set`, optional
2042 Names of dimensions that should be skipped and not imported.
2043 idGenerationMode : `DatasetIdGenEnum`, optional
2044 Specifies option for generating dataset IDs when IDs are not
2045 provided or their type does not match backend type. By default
2046 unique IDs are generated for each inserted dataset.
2047 reuseIds : `bool`, optional
2048 If `True`, force re-use of imported dataset IDs for integer
2049 IDs, which are normally generated as auto-incremented; an
2050 exception will be raised if imported IDs clash with existing ones.
2051 This option has no effect on globally-unique IDs, which are
2052 always re-used (or generated if integer IDs are being imported).
2054 Raises
2055 ------
2056 TypeError
2057 Raised if the set of arguments passed is inconsistent, or if the
2058 butler is read-only.
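Examples
--------
A minimal sketch; the directory and file name are illustrative
assumptions::

    # Import from a hypothetical export directory created by Butler.export.
    butler.import_(directory="/path/to/exports", filename="export.yaml", transfer="copy")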
2059 """
2060 if not self.isWriteable():
2061 raise TypeError("Butler is read-only.")
2062 if format is None:
2063 if filename is None:
2064 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2065 else:
2066 _, format = os.path.splitext(filename) # type: ignore
2067 elif filename is None:
2068 filename = f"export.{format}"
2069 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2070 filename = os.path.join(directory, filename)
2071 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2073 def doImport(importStream: TextIO) -> None:
2074 backend = BackendClass(importStream, self.registry)
2075 backend.register()
2076 with self.transaction():
2077 backend.load(
2078 self.datastore,
2079 directory=directory,
2080 transfer=transfer,
2081 skip_dimensions=skip_dimensions,
2082 idGenerationMode=idGenerationMode,
2083 reuseIds=reuseIds,
2084 )
2086 if isinstance(filename, str):
2087 with open(filename, "r") as stream:
2088 doImport(stream)
2089 else:
2090 doImport(filename)
2092 def transfer_from(
2093 self,
2094 source_butler: Butler,
2095 source_refs: Iterable[DatasetRef],
2096 transfer: str = "auto",
2097 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
2098 skip_missing: bool = True,
2099 register_dataset_types: bool = False,
2100 ) -> List[DatasetRef]:
2101 """Transfer datasets to this Butler from a run in another Butler.
2103 Parameters
2104 ----------
2105 source_butler : `Butler`
2106 Butler from which the datasets are to be transferred.
2107 source_refs : iterable of `DatasetRef`
2108 Datasets defined in the source butler that should be transferred to
2109 this butler.
2110 transfer : `str`, optional
2111 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2112 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
2113 A mapping of dataset type to ID generation mode. Only used if
2114 the source butler is using integer IDs. Should not be used
2115 if this receiving butler uses integer IDs. If not given, dataset
2116 import always uses `DatasetIdGenEnum.UNIQUE`.
2117 skip_missing : `bool`, optional
2118 If `True`, datasets with no datastore artifact associated with
2119 them are not transferred. If `False` a registry entry will be
2120 created even if no datastore record is created (and so will
2121 look equivalent to the dataset being unstored).
2122 register_dataset_types : `bool`, optional
2123 If `True` any missing dataset types are registered. Otherwise
2124 an exception is raised.
2126 Returns
2127 -------
2128 refs : `list` of `DatasetRef`
2129 The refs added to this Butler.
2131 Notes
2132 -----
2133 Requires that any dimension definitions are already present in the
2134 receiving Butler. The datastore artifact has to exist for a transfer
2135 to be made but non-existence is not an error.
2137 Datasets that already exist in this run will be skipped.
2139 The datasets are imported as part of a transaction, although
2140 dataset types are registered before the transaction is started.
2141 This means that it is possible for a dataset type to be registered
2142 even though transfer has failed.
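Examples
--------
A hedged sketch; the source repository path, collection and dataset
type are illustrative assumptions::

    # Hypothetical source repository and query, for illustration only.
    source = Butler("/path/to/source/repo")
    refs = source.registry.queryDatasets("calexp", collections="HSC/runs/example")
    butler.transfer_from(source, refs, transfer="copy", register_dataset_types=True)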
2143 """
2144 if not self.isWriteable():
2145 raise TypeError("Butler is read-only.")
2146 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2148 # Will iterate through the refs multiple times so need to convert
2149 # to a list if this isn't a collection.
2150 if not isinstance(source_refs, collections.abc.Collection):
2151 source_refs = list(source_refs)
2153 original_count = len(source_refs)
2154 log.info("Transferring %d datasets into %s", original_count, str(self))
2156 if id_gen_map is None:
2157 id_gen_map = {}
2159 # In some situations the datastore artifact may be missing
2160 # and we do not want that registry entry to be imported.
2161 # Asking datastore is not sufficient, the records may have been
2162 # purged, we have to ask for the (predicted) URI and check
2163 # existence explicitly. Execution butler is set up exactly like
2164 # this with no datastore records.
2165 artifact_existence: Dict[ResourcePath, bool] = {}
2166 if skip_missing:
2167 dataset_existence = source_butler.datastore.mexists(
2168 source_refs, artifact_existence=artifact_existence
2169 )
2170 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2171 filtered_count = len(source_refs)
2172 log.verbose(
2173 "%d datasets removed because the artifact does not exist. Now have %d.",
2174 original_count - filtered_count,
2175 filtered_count,
2176 )
2178 # Importing requires that we group the refs by dataset type and run
2179 # before doing the import.
2180 source_dataset_types = set()
2181 grouped_refs = defaultdict(list)
2182 grouped_indices = defaultdict(list)
2183 for i, ref in enumerate(source_refs):
2184 grouped_refs[ref.datasetType, ref.run].append(ref)
2185 grouped_indices[ref.datasetType, ref.run].append(i)
2186 source_dataset_types.add(ref.datasetType)
2188 # Check to see if the dataset type in the source butler has
2189 # the same definition in the target butler and register missing
2190 # ones if requested. Registration must happen outside a transaction.
2191 newly_registered_dataset_types = set()
2192 for datasetType in source_dataset_types:
2193 if register_dataset_types:
2194 # Let this raise immediately if inconsistent. Continuing
2195 # on to find additional inconsistent dataset types
2196 # might result in additional unwanted dataset types being
2197 # registered.
2198 if self.registry.registerDatasetType(datasetType):
2199 newly_registered_dataset_types.add(datasetType)
2200 else:
2201 # If the dataset type is missing, let it fail immediately.
2202 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2203 if target_dataset_type != datasetType:
2204 raise ConflictingDefinitionError(
2205 "Source butler dataset type differs from definition"
2206 f" in target butler: {datasetType} !="
2207 f" {target_dataset_type}"
2208 )
2209 if newly_registered_dataset_types:
2210 # We may have registered some even if there were inconsistencies
2211 # but should let people know (or else remove them again).
2212 log.log(
2213 VERBOSE,
2214 "Registered the following dataset types in the target Butler: %s",
2215 ", ".join(d.name for d in newly_registered_dataset_types),
2216 )
2217 else:
2218 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2220 # The returned refs should be identical for UUIDs.
2221 # For now must also support integers and so need to retain the
2222 # newly-created refs from this registry.
2223 # Pre-size it so we can assign refs into the correct slots
2224 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2225 default_id_gen = DatasetIdGenEnum.UNIQUE
2227 handled_collections: Set[str] = set()
2229 # Do all the importing in a single transaction.
2230 with self.transaction():
2231 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2232 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2233 ):
2234 if run not in handled_collections:
2235 run_doc = source_butler.registry.getCollectionDocumentation(run)
2236 registered = self.registry.registerRun(run, doc=run_doc)
2237 handled_collections.add(run)
2238 if registered:
2239 log.log(VERBOSE, "Creating output run %s", run)
2241 id_generation_mode = default_id_gen
2242 if isinstance(refs_to_import[0].id, int):
2243 # ID generation mode might need to be overridden when
2244 # targeting UUID
2245 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2247 n_refs = len(refs_to_import)
2248 log.verbose(
2249 "Importing %d ref%s of dataset type %s into run %s",
2250 n_refs,
2251 "" if n_refs == 1 else "s",
2252 datasetType.name,
2253 run,
2254 )
2256 # No way to know if this butler's registry uses UUID.
2257 # We have to trust the caller on this. If it fails they will
2258 # have to change their approach. We can't catch the exception
2259 # and retry with unique because that will mess up the
2260 # transaction handling. We aren't allowed to ask the registry
2261 # manager what type of ID it is using.
2262 imported_refs = self.registry._importDatasets(
2263 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2264 )
2266 # Map them into the correct slots to match the initial order
2267 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2268 transferred_refs_tmp[i] = ref
2270 # Mypy insists that we might have None in here so we have to make
2271 # that explicit by assigning to a new variable and filtering out
2272 # something that won't be there.
2273 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2275 # Check consistency
2276 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2278 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2280 # The transferred refs need to be reordered to match the original
2281 # ordering given by the caller. Without this the datastore transfer
2282 # will be broken.
2284 # Ask the datastore to transfer. The datastore has to check that
2285 # the source datastore is compatible with the target datastore.
2286 self.datastore.transfer_from(
2287 source_butler.datastore,
2288 source_refs,
2289 local_refs=transferred_refs,
2290 transfer=transfer,
2291 artifact_existence=artifact_existence,
2292 )
2294 return transferred_refs
2296 def validateConfiguration(
2297 self,
2298 logFailures: bool = False,
2299 datasetTypeNames: Optional[Iterable[str]] = None,
2300 ignore: Optional[Iterable[str]] = None,
2301 ) -> None:
2302 """Validate butler configuration.
2304 Checks that each `DatasetType` can be stored in the `Datastore`.
2306 Parameters
2307 ----------
2308 logFailures : `bool`, optional
2309 If `True`, output a log message for every validation error
2310 detected.
2311 datasetTypeNames : iterable of `str`, optional
2312 The `DatasetType` names that should be checked. This allows
2313 only a subset to be selected.
2314 ignore : iterable of `str`, optional
2315 Names of DatasetTypes to skip over. This can be used to skip
2316 known problems. If a named `DatasetType` corresponds to a
2317 composite, all components of that `DatasetType` will also be
2318 ignored.
2320 Raises
2321 ------
2322 ButlerValidationError
2323 Raised if there is some inconsistency with how this Butler
2324 is configured.
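Examples
--------
A minimal sketch; the ignored dataset type name is an illustrative
assumption::

    # Log every problem found, skipping a hypothetical known-problematic type.
    butler.validateConfiguration(logFailures=True, ignore=["packages"])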
2325 """
2326 if datasetTypeNames:
2327 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2328 else:
2329 datasetTypes = list(self.registry.queryDatasetTypes())
2331 # filter out anything from the ignore list
2332 if ignore:
2333 ignore = set(ignore)
2334 datasetTypes = [
2335 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2336 ]
2337 else:
2338 ignore = set()
2340 # Find all the registered instruments
2341 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2343 # For each datasetType that has an instrument dimension, create
2344 # a DatasetRef for each defined instrument
2345 datasetRefs = []
2347 for datasetType in datasetTypes:
2348 if "instrument" in datasetType.dimensions:
2349 for instrument in instruments:
2350 datasetRef = DatasetRef(
2351 datasetType, {"instrument": instrument}, conform=False # type: ignore
2352 )
2353 datasetRefs.append(datasetRef)
2355 entities: List[Union[DatasetType, DatasetRef]] = []
2356 entities.extend(datasetTypes)
2357 entities.extend(datasetRefs)
2359 datastoreErrorStr = None
2360 try:
2361 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2362 except ValidationError as e:
2363 datastoreErrorStr = str(e)
2365 # Also check that the LookupKeys used by the datastores match
2366 # registry and storage class definitions
2367 keys = self.datastore.getLookupKeys()
2369 failedNames = set()
2370 failedDataId = set()
2371 for key in keys:
2372 if key.name is not None:
2373 if key.name in ignore:
2374 continue
2376 # skip if specific datasetType names were requested and this
2377 # name does not match
2378 if datasetTypeNames and key.name not in datasetTypeNames:
2379 continue
2381 # See if it is a StorageClass or a DatasetType
2382 if key.name in self.storageClasses:
2383 pass
2384 else:
2385 try:
2386 self.registry.getDatasetType(key.name)
2387 except KeyError:
2388 if logFailures:
2389 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2390 failedNames.add(key)
2391 else:
2392 # Dimensions are checked for consistency when the Butler
2393 # is created and rendezvoused with a universe.
2394 pass
2396 # Check that the instrument is a valid instrument
2397 # Currently only support instrument so check for that
2398 if key.dataId:
2399 dataIdKeys = set(key.dataId)
2400 if set(["instrument"]) != dataIdKeys:
2401 if logFailures:
2402 log.critical("Key '%s' has unsupported DataId override", key)
2403 failedDataId.add(key)
2404 elif key.dataId["instrument"] not in instruments:
2405 if logFailures:
2406 log.critical("Key '%s' has unknown instrument", key)
2407 failedDataId.add(key)
2409 messages = []
2411 if datastoreErrorStr:
2412 messages.append(datastoreErrorStr)
2414 for failed, msg in (
2415 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2416 (failedDataId, "Keys with bad DataId entries: "),
2417 ):
2418 if failed:
2419 msg += ", ".join(str(k) for k in failed)
2420 messages.append(msg)
2422 if messages:
2423 raise ValidationError(";\n".join(messages))
2425 @property
2426 def collections(self) -> CollectionSearch:
2427 """The collections to search by default, in order (`CollectionSearch`).
2429 This is an alias for ``self.registry.defaults.collections``. It cannot
2430 be set directly in isolation, but all defaults may be changed together
2431 by assigning a new `RegistryDefaults` instance to
2432 ``self.registry.defaults``.
2433 """
2434 return self.registry.defaults.collections
2436 @property
2437 def run(self) -> Optional[str]:
2438 """Name of the run this butler writes outputs to by default (`str` or
2439 `None`).
2441 This is an alias for ``self.registry.defaults.run``. It cannot be set
2442 directly in isolation, but all defaults may be changed together by
2443 assigning a new `RegistryDefaults` instance to
2444 ``self.registry.defaults``.
2445 """
2446 return self.registry.defaults.run
2448 @property
2449 def dimensions(self) -> DimensionUniverse:
2450 # Docstring inherited.
2451 return self.registry.dimensions
2453 registry: Registry
2454 """The object that manages dataset metadata and relationships (`Registry`).
2456 Most operations that don't involve reading or writing butler datasets are
2457 accessible only via `Registry` methods.
2458 """
2460 datastore: Datastore
2461 """The object that manages actual dataset storage (`Datastore`).
2463 Direct user access to the datastore should rarely be necessary; the primary
2464 exception is the case where a `Datastore` implementation provides extra
2465 functionality beyond what the base class defines.
2466 """
2468 storageClasses: StorageClassFactory
2469 """An object that maps known storage class names to objects that fully
2470 describe them (`StorageClassFactory`).
2471 """
2473 _allow_put_of_predefined_dataset: bool
2474 """Allow a put to succeed even if there is already a registry entry for it
2475 but not a datastore record. (`bool`)."""