Coverage for python/lsst/daf/butler/_butler.py: 10%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41from collections import defaultdict
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_class_of
62from lsst.utils.logging import VERBOSE, getLogger
64from ._butlerConfig import ButlerConfig
65from ._butlerRepoIndex import ButlerRepoIndex
66from ._deferredDatasetHandle import DeferredDatasetHandle
67from ._limited_butler import LimitedButler
68from .core import (
69 AmbiguousDatasetError,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetType,
77 Datastore,
78 Dimension,
79 DimensionConfig,
80 DimensionUniverse,
81 FileDataset,
82 Progress,
83 StorageClassFactory,
84 Timespan,
85 ValidationError,
86)
87from .core.repoRelocation import BUTLER_ROOT_TAG
88from .core.utils import transactional
89from .registry import (
90 CollectionSearch,
91 CollectionType,
92 ConflictingDefinitionError,
93 DataIdError,
94 DatasetIdGenEnum,
95 Registry,
96 RegistryConfig,
97 RegistryDefaults,
98)
99from .transfers import RepoExportContext
101log = getLogger(__name__)
104class ButlerValidationError(ValidationError):
105 """There is a problem with the Butler configuration."""
107 pass
110class PruneCollectionsArgsError(TypeError):
111 """Base class for errors relating to Butler.pruneCollections input
112 arguments.
113 """
115 pass
118class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
119 """Raised when ``purge=True`` is passed but ``unstore`` is `False`;
120 purging a collection requires ``unstore=True`` as well.
121 """
123 def __init__(self) -> None:
124 super().__init__("Cannot pass purge=True without unstore=True.")
127class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
128 """Raised when pruning a RUN collection but purge is False."""
130 def __init__(self, collectionType: CollectionType):
131 self.collectionType = collectionType
132 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
135class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
136 """Raised when purge is True but is not supported for the given
137 collection."""
139 def __init__(self, collectionType: CollectionType):
140 self.collectionType = collectionType
141 super().__init__(
142 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
143 )
146class Butler(LimitedButler):
147 """Main entry point for the data access system.
149 Parameters
150 ----------
151 config : `ButlerConfig`, `Config` or `str`, optional.
152 Configuration. Anything acceptable to the
153 `ButlerConfig` constructor. If a directory path
154 is given the configuration will be read from a ``butler.yaml`` file in
155 that location. If `None` is given default values will be used.
156 butler : `Butler`, optional.
157 If provided, construct a new Butler that uses the same registry and
158 datastore as the given one, but with the given collection and run.
159 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
160 arguments.
161 collections : `str` or `Iterable` [ `str` ], optional
162 An expression specifying the collections to be searched (in order) when
163 reading datasets.
164 This may be a `str` collection name or an iterable thereof.
165 See :ref:`daf_butler_collection_expressions` for more information.
166 These collections are not registered automatically and must be
167 manually registered before they are used by any method, but they may be
168 manually registered after the `Butler` is initialized.
169 run : `str`, optional
170 Name of the `~CollectionType.RUN` collection new datasets should be
171 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
172 ``collections`` will be set to ``[run]``. If not `None`, this
173 collection will automatically be registered. If this is not set (and
174 ``writeable`` is not set either), a read-only butler will be created.
175 searchPaths : `list` of `str`, optional
176 Directory paths to search when calculating the full Butler
177 configuration. Not used if the supplied config is already a
178 `ButlerConfig`.
179 writeable : `bool`, optional
180 Explicitly sets whether the butler supports write operations. If not
181 provided, a read-write butler is created if ``run`` is set and a
182 read-only butler otherwise.
183 inferDefaults : `bool`, optional
184 If `True` (default) infer default data ID values from the values
185 present in the datasets in ``collections``: if all collections have the
186 same value (or no value) for a governor dimension, that value will be
187 the default for that dimension. Nonexistent collections are ignored.
188 If a default value is provided explicitly for a governor dimension via
189 ``**kwargs``, no default will be inferred for that dimension.
190 **kwargs : `str`
191 Default data ID key-value pairs. These may only identify "governor"
192 dimensions like ``instrument`` and ``skymap``.
194 Examples
195 --------
196 While there are many ways to control exactly how a `Butler` interacts with
197 the collections in its `Registry`, the most common cases are still simple.
199 For a read-only `Butler` that searches one collection, do::
201 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
203 For a read-write `Butler` that writes to and reads from a
204 `~CollectionType.RUN` collection::
206 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
208 The `Butler` passed to a ``PipelineTask`` is often much more complex,
209 because we want to write to one `~CollectionType.RUN` collection but read
210 from several others (as well)::
212 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
213 collections=["u/alice/DM-50000/a",
214 "u/bob/DM-49998",
215 "HSC/defaults"])
217 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
218 Datasets will be read first from that run (since it appears first in the
219 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
221 Finally, one can always create a `Butler` with no collections::
223 butler = Butler("/path/to/repo", writeable=True)
225 This can be extremely useful when you just want to use ``butler.registry``,
226 e.g. for inserting dimension data or managing collections, or when the
227 collections you want to use with the butler are not consistent.
228 Passing ``writeable`` explicitly here is only necessary if you want to be
229 able to make changes to the repo; usually the value for ``writeable`` can
230 be guessed from the collection arguments provided, but it defaults to
231 `False` when no collection arguments are given.
232 """
234 def __init__(
235 self,
236 config: Union[Config, str, None] = None,
237 *,
238 butler: Optional[Butler] = None,
239 collections: Any = None,
240 run: Optional[str] = None,
241 searchPaths: Optional[List[str]] = None,
242 writeable: Optional[bool] = None,
243 inferDefaults: bool = True,
244 **kwargs: str,
245 ):
246 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
247 # Load registry, datastore, etc. from config or existing butler.
248 if butler is not None:
249 if config is not None or searchPaths is not None or writeable is not None:
250 raise TypeError(
251 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
252 )
253 self.registry = butler.registry.copy(defaults)
254 self.datastore = butler.datastore
255 self.storageClasses = butler.storageClasses
256 self._config: ButlerConfig = butler._config
257 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
258 else:
259 # Can only look for strings in the known repos list.
260 if isinstance(config, str) and config in self.get_known_repos():
261 config = str(self.get_repo_uri(config))
262 try:
263 self._config = ButlerConfig(config, searchPaths=searchPaths)
264 except FileNotFoundError as e:
265 if known := self.get_known_repos():
266 aliases = f"(known aliases: {', '.join(known)})"
267 else:
268 aliases = "(no known aliases)"
269 raise FileNotFoundError(f"{e} {aliases}") from e
271 try:
272 if "root" in self._config:
273 butlerRoot = self._config["root"]
274 else:
275 butlerRoot = self._config.configDir
276 if writeable is None:
277 writeable = run is not None
278 self.registry = Registry.fromConfig(
279 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
280 )
281 self.datastore = Datastore.fromConfig(
282 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
283 )
284 self.storageClasses = StorageClassFactory()
285 self.storageClasses.addFromConfig(self._config)
286 self._allow_put_of_predefined_dataset = self._config.get(
287 "allow_put_of_predefined_dataset", False
288 )
289 except Exception:
290 # Failures here usually mean that configuration is incomplete,
291 # just issue an error message which includes config file URI.
292 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
293 raise
295 if "run" in self._config or "collection" in self._config:
296 raise ValueError("Passing a run or collection via configuration is no longer supported.")
298 GENERATION: ClassVar[int] = 3
299 """This is a Generation 3 Butler.
301 This attribute may be removed in the future, once the Generation 2 Butler
302 interface has been fully retired; it should only be used in transitional
303 code.
304 """
306 @classmethod
307 def get_repo_uri(cls, label: str) -> ResourcePath:
308 """Look up the label in a butler repository index.
310 Parameters
311 ----------
312 label : `str`
313 Label of the Butler repository to look up.
315 Returns
316 -------
317 uri : `lsst.resources.ResourcePath`
318 URI to the Butler repository associated with the given label.
320 Raises
321 ------
322 KeyError
323 Raised if the label is not found in the index, or if an index
324 can not be found at all.
326 Notes
327 -----
328 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
329 information is discovered.
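Examples
--------
A minimal sketch; the ``"main"`` label is a placeholder and must be
defined in the repository index for the lookup to succeed::
    from lsst.daf.butler import Butler
    uri = Butler.get_repo_uri("main")
    print(uri)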
330 """
331 return ButlerRepoIndex.get_repo_uri(label)
333 @classmethod
334 def get_known_repos(cls) -> Set[str]:
335 """Retrieve the list of known repository labels.
337 Returns
338 -------
339 repos : `set` of `str`
340 All the known labels. Can be empty if no index can be found.
342 Notes
343 -----
344 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
345 information is discovered.
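Examples
--------
A short sketch printing every label in the repository index together
with its URI (the set may be empty if no index is configured)::
    from lsst.daf.butler import Butler
    for label in sorted(Butler.get_known_repos()):
        print(label, Butler.get_repo_uri(label))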
346 """
347 return ButlerRepoIndex.get_known_repos()
349 @staticmethod
350 def makeRepo(
351 root: ResourcePathExpression,
352 config: Union[Config, str, None] = None,
353 dimensionConfig: Union[Config, str, None] = None,
354 standalone: bool = False,
355 searchPaths: Optional[List[str]] = None,
356 forceConfigRoot: bool = True,
357 outfile: Optional[ResourcePathExpression] = None,
358 overwrite: bool = False,
359 ) -> Config:
360 """Create an empty data repository by adding a butler.yaml config
361 to a repository root directory.
363 Parameters
364 ----------
365 root : `lsst.resources.ResourcePathExpression`
366 Path or URI to the root location of the new repository. Will be
367 created if it does not exist.
368 config : `Config` or `str`, optional
369 Configuration to write to the repository, after setting any
370 root-dependent Registry or Datastore config options. Can not
371 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
372 configuration will be used. Root-dependent config options
373 specified in this config are overwritten if ``forceConfigRoot``
374 is `True`.
375 dimensionConfig : `Config` or `str`, optional
376 Configuration for dimensions, will be used to initialize registry
377 database.
378 standalone : `bool`
379 If True, write all expanded defaults, not just customized or
380 repository-specific settings.
381 This (mostly) decouples the repository from the default
382 configuration, insulating it from changes to the defaults (which
383 may be good or bad, depending on the nature of the changes).
384 Future *additions* to the defaults will still be picked up when
385 initializing `Butlers` to repos created with ``standalone=True``.
386 searchPaths : `list` of `str`, optional
387 Directory paths to search when calculating the full butler
388 configuration.
389 forceConfigRoot : `bool`, optional
390 If `False`, any values present in the supplied ``config`` that
391 would normally be reset are not overridden and will appear
392 directly in the output config. This allows non-standard overrides
393 of the root directory for a datastore or registry to be given.
394 If this parameter is `True` the values for ``root`` will be
395 forced into the resulting config if appropriate.
396 outfile : `lsst.resources.ResourcePathExpression`, optional
397 If not-`None`, the output configuration will be written to this
398 location rather than into the repository itself. Can be a URI
399 string. Can refer to a directory that will be used to write
400 ``butler.yaml``.
401 overwrite : `bool`, optional
402 Create a new configuration file even if one already exists
403 in the specified output location. Default is to raise
404 an exception.
406 Returns
407 -------
408 config : `Config`
409 The updated `Config` instance written to the repo.
411 Raises
412 ------
413 ValueError
414 Raised if a ButlerConfig or ConfigSubset is passed instead of a
415 regular Config (as these subclasses would make it impossible to
416 support ``standalone=False``).
417 FileExistsError
418 Raised if the output config file already exists.
419 os.error
420 Raised if the directory does not exist, exists but is not a
421 directory, or cannot be created.
423 Notes
424 -----
425 Note that when ``standalone=False`` (the default), the configuration
426 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
427 construct the repository should also be used to construct any Butlers
428 to avoid configuration inconsistencies.
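Examples
--------
A minimal sketch using default configuration; the repository path is a
placeholder::
    from lsst.daf.butler import Butler
    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)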
429 """
430 if isinstance(config, (ButlerConfig, ConfigSubset)):
431 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
433 # Ensure that the root of the repository exists or can be made
434 root_uri = ResourcePath(root, forceDirectory=True)
435 root_uri.mkdir()
437 config = Config(config)
439 # If we are creating a new repo from scratch with relative roots,
440 # do not propagate an explicit root from the config file
441 if "root" in config:
442 del config["root"]
444 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
445 imported_class = doImportType(full["datastore", "cls"])
446 if not issubclass(imported_class, Datastore):
447 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
448 datastoreClass: Type[Datastore] = imported_class
449 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
451 # if key exists in given config, parse it, otherwise parse the defaults
452 # in the expanded config
453 if config.get(("registry", "db")):
454 registryConfig = RegistryConfig(config)
455 else:
456 registryConfig = RegistryConfig(full)
457 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
458 if defaultDatabaseUri is not None:
459 Config.updateParameters(
460 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
461 )
462 else:
463 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
465 if standalone:
466 config.merge(full)
467 else:
468 # Always expand the registry.managers section into the per-repo
469 # config, because after the database schema is created, it's not
470 # allowed to change anymore. Note that in the standalone=True
471 # branch, _everything_ in the config is expanded, so there's no
472 # need to special case this.
473 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
474 configURI: ResourcePathExpression
475 if outfile is not None:
476 # When writing to a separate location we must include
477 # the root of the butler repo in the config else it won't know
478 # where to look.
479 config["root"] = root_uri.geturl()
480 configURI = outfile
481 else:
482 configURI = root_uri
483 config.dumpToUri(configURI, overwrite=overwrite)
485 # Create Registry and populate tables
486 registryConfig = RegistryConfig(config.get("registry"))
487 dimensionConfig = DimensionConfig(dimensionConfig)
488 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
490 log.verbose("Wrote new Butler configuration file to %s", configURI)
492 return config
494 @classmethod
495 def _unpickle(
496 cls,
497 config: ButlerConfig,
498 collections: Optional[CollectionSearch],
499 run: Optional[str],
500 defaultDataId: Dict[str, str],
501 writeable: bool,
502 ) -> Butler:
503 """Callable used to unpickle a Butler.
505 We prefer not to use ``Butler.__init__`` directly so we can force some
506 of its many arguments to be keyword-only (note that ``__reduce__``
507 can only invoke callables with positional arguments).
509 Parameters
510 ----------
511 config : `ButlerConfig`
512 Butler configuration, already coerced into a true `ButlerConfig`
513 instance (and hence after any search paths for overrides have been
514 utilized).
515 collections : `CollectionSearch`
516 Names of the default collections to read from.
517 run : `str`, optional
518 Name of the default `~CollectionType.RUN` collection to write to.
519 defaultDataId : `dict` [ `str`, `str` ]
520 Default data ID values.
521 writeable : `bool`
522 Whether the Butler should support write operations.
524 Returns
525 -------
526 butler : `Butler`
527 A new `Butler` instance.
528 """
529 # MyPy doesn't recognize that the kwargs below are totally valid; it
530 # seems to think ``**defaultDataId`` is a _positional_ argument!
531 return cls(
532 config=config,
533 collections=collections,
534 run=run,
535 writeable=writeable,
536 **defaultDataId, # type: ignore
537 )
539 def __reduce__(self) -> tuple:
540 """Support pickling."""
541 return (
542 Butler._unpickle,
543 (
544 self._config,
545 self.collections,
546 self.run,
547 self.registry.defaults.dataId.byName(),
548 self.registry.isWriteable(),
549 ),
550 )
552 def __str__(self) -> str:
553 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
554 self.collections, self.run, self.datastore, self.registry
555 )
557 def isWriteable(self) -> bool:
558 """Return `True` if this `Butler` supports write operations."""
559 return self.registry.isWriteable()
561 @contextlib.contextmanager
562 def transaction(self) -> Iterator[None]:
563 """Context manager supporting `Butler` transactions.
565 Transactions can be nested.
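Examples
--------
A sketch of grouping several writes so that they are committed or
rolled back together; the dataset type name, data IDs, and run are
placeholders::
    with butler.transaction():
        butler.put(obj1, "calexp", dataId1, run="u/alice/run")
        butler.put(obj2, "calexp", dataId2, run="u/alice/run")
If either `put` raises, the registry and datastore changes made inside
the ``with`` block are rolled back (to the extent the datastore
supports it).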
566 """
567 with self.registry.transaction():
568 with self.datastore.transaction():
569 yield
571 def _standardizeArgs(
572 self,
573 datasetRefOrType: Union[DatasetRef, DatasetType, str],
574 dataId: Optional[DataId] = None,
575 for_put: bool = True,
576 **kwargs: Any,
577 ) -> Tuple[DatasetType, Optional[DataId]]:
578 """Standardize the arguments passed to several Butler APIs.
580 Parameters
581 ----------
582 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
583 When `DatasetRef` the `dataId` should be `None`.
584 Otherwise the `DatasetType` or name thereof.
585 dataId : `dict` or `DataCoordinate`
586 A `dict` of `Dimension` link name, value pairs that label the
587 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
588 should be provided as the second argument.
589 for_put : `bool`, optional
590 If `True` this call is invoked as part of a `Butler.put()`.
591 Otherwise it is assumed to be part of a `Butler.get()`. This
592 parameter is only relevant if there is dataset type
593 inconsistency.
594 **kwargs
595 Additional keyword arguments used to augment or construct a
596 `DataCoordinate`. See `DataCoordinate.standardize`
597 parameters.
599 Returns
600 -------
601 datasetType : `DatasetType`
602 A `DatasetType` instance extracted from ``datasetRefOrType``.
603 dataId : `dict` or `DataId`, optional
604 Argument that can be used (along with ``kwargs``) to construct a
605 `DataId`.
607 Notes
608 -----
609 Butler APIs that conceptually need a DatasetRef also allow passing a
610 `DatasetType` (or the name of one) and a `DataId` (or a dict and
611 keyword arguments that can be used to construct one) separately. This
612 method accepts those arguments and always returns a true `DatasetType`
613 and a `DataId` or `dict`.
615 Standardization of `dict` vs `DataId` is best handled by passing the
616 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
617 generally similarly flexible.
618 """
619 externalDatasetType: Optional[DatasetType] = None
620 internalDatasetType: Optional[DatasetType] = None
621 if isinstance(datasetRefOrType, DatasetRef):
622 if dataId is not None or kwargs:
623 raise ValueError("DatasetRef given, cannot use dataId as well")
624 externalDatasetType = datasetRefOrType.datasetType
625 dataId = datasetRefOrType.dataId
626 else:
627 # Don't check whether DataId is provided, because Registry APIs
628 # can usually construct a better error message when it wasn't.
629 if isinstance(datasetRefOrType, DatasetType):
630 externalDatasetType = datasetRefOrType
631 else:
632 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
634 # Check that they are self-consistent
635 if externalDatasetType is not None:
636 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
637 if externalDatasetType != internalDatasetType:
638 # We can allow differences if they are compatible, depending
639 # on whether this is a get or a put. A get requires that
640 # the python type associated with the datastore can be
641 # converted to the user type. A put requires that the user
642 # supplied python type can be converted to the internal
643 # type expected by registry.
644 relevantDatasetType = internalDatasetType
645 if for_put:
646 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
647 else:
648 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
649 relevantDatasetType = externalDatasetType
650 if not is_compatible:
651 raise ValueError(
652 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
653 f"registry definition ({internalDatasetType})"
654 )
655 # Override the internal definition.
656 internalDatasetType = relevantDatasetType
658 assert internalDatasetType is not None
659 return internalDatasetType, dataId
661 def _rewrite_data_id(
662 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
663 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
664 """Rewrite a data ID taking into account dimension records.
666 Take a Data ID and keyword args and rewrite it if necessary to
667 allow the user to specify dimension records rather than dimension
668 primary values.
670 This allows a user to include a dataId dict with keys of
671 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
672 the integer exposure ID. It also allows a string to be given
673 for a dimension value rather than the integer ID if that is more
674 convenient. For example, rather than having to specify the
675 detector with ``detector.full_name``, a string given for ``detector``
676 will be interpreted as the full name and converted to the integer
677 value.
679 Keyword arguments can also use strings for dimensions like detector
680 and exposure but python does not allow them to include ``.`` and
681 so the ``exposure.day_obs`` syntax can not be used in a keyword
682 argument.
684 Parameters
685 ----------
686 dataId : `dict` or `DataCoordinate`
687 A `dict` of `Dimension` link name, value pairs that will label the
688 `DatasetRef` within a Collection.
689 datasetType : `DatasetType`
690 The dataset type associated with this dataId. Required to
691 determine the relevant dimensions.
692 **kwargs
693 Additional keyword arguments used to augment or construct a
694 `DataId`. See `DataId` parameters.
696 Returns
697 -------
698 dataId : `dict` or `DataCoordinate`
699 The dataId, possibly rewritten. If given a `DataCoordinate` and
700 no keyword arguments, the original dataId will be returned
701 unchanged.
702 **kwargs : `dict`
703 Any unused keyword arguments (would normally be empty dict).
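Examples
--------
An illustrative sketch only; the dataset type name and record values
are placeholders. A record-style key such as ``exposure.obs_id`` is
translated into the corresponding primary-key value::
    datasetType = butler.registry.getDatasetType("raw")
    dataId, kwargs = butler._rewrite_data_id(
        {"instrument": "HSC", "exposure.obs_id": "HSCA90333400"},
        datasetType,
    )
    # dataId now holds an integer ``exposure`` value looked up from the
    # matching dimension record.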
704 """
705 # Do nothing if we have a standalone DataCoordinate.
706 if isinstance(dataId, DataCoordinate) and not kwargs:
707 return dataId, kwargs
709 # Process dimension records that are using record information
710 # rather than ids
711 newDataId: Dict[str, DataIdValue] = {}
712 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
714 # if all the dataId comes from keyword parameters we do not need
715 # to do anything here because they can't be of the form
716 # exposure.obs_id because a "." is not allowed in a keyword parameter.
717 if dataId:
718 for k, v in dataId.items():
719 # If we have a Dimension we do not need to do anything
720 # because it cannot be a compound key.
721 if isinstance(k, str) and "." in k:
722 # Someone is using a more human-readable dataId
723 dimensionName, record = k.split(".", 1)
724 byRecord[dimensionName][record] = v
725 elif isinstance(k, Dimension):
726 newDataId[k.name] = v
727 else:
728 newDataId[k] = v
730 # Go through the updated dataId and check the type in case someone is
731 # using an alternate key. We have already filtered out the compound
732 # ``dimension.record`` style keys.
733 not_dimensions = {}
735 # Will need to look in the dataId and the keyword arguments
736 # and will remove them if they need to be fixed or are unrecognized.
737 for dataIdDict in (newDataId, kwargs):
738 # Use a list so we can adjust the dict safely in the loop
739 for dimensionName in list(dataIdDict):
740 value = dataIdDict[dimensionName]
741 try:
742 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
743 except KeyError:
744 # This is not a real dimension
745 not_dimensions[dimensionName] = value
746 del dataIdDict[dimensionName]
747 continue
749 # Convert an integral type to an explicit int to simplify
750 # comparisons here
751 if isinstance(value, numbers.Integral):
752 value = int(value)
754 if not isinstance(value, dimension.primaryKey.getPythonType()):
755 for alternate in dimension.alternateKeys:
756 if isinstance(value, alternate.getPythonType()):
757 byRecord[dimensionName][alternate.name] = value
758 del dataIdDict[dimensionName]
759 log.debug(
760 "Converting dimension %s to %s.%s=%s",
761 dimensionName,
762 dimensionName,
763 alternate.name,
764 value,
765 )
766 break
767 else:
768 log.warning(
769 "Type mismatch found for value '%r' provided for dimension %s. "
770 "Could not find matching alternative (primary key has type %s) "
771 "so attempting to use as-is.",
772 value,
773 dimensionName,
774 dimension.primaryKey.getPythonType(),
775 )
777 # By this point kwargs and newDataId should only include valid
778 # dimensions. Merge kwargs into the new dataId and log if there
779 # are dimensions in both (rather than calling update).
780 for k, v in kwargs.items():
781 if k in newDataId and newDataId[k] != v:
782 log.debug(
783 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
784 )
785 newDataId[k] = v
786 # No need to retain any values in kwargs now.
787 kwargs = {}
789 # If we have some unrecognized dimensions we have to try to connect
790 # them to records in other dimensions. This is made more complicated
791 # by some dimensions having records with clashing names. A mitigation
792 # is that we can tell by this point which dimensions are missing
793 # for the DatasetType but this does not work for calibrations
794 # where additional dimensions can be used to constrain the temporal
795 # axis.
796 if not_dimensions:
797 # Search for all dimensions even if we have been given a value
798 # explicitly. In some cases records are given as well as the
799 # actual dimension and this should not be an error if they
800 # match.
801 mandatoryDimensions = datasetType.dimensions.names # - provided
803 candidateDimensions: Set[str] = set()
804 candidateDimensions.update(mandatoryDimensions)
806 # For calibrations we may well need temporal dimensions
807 # so rather than always including all dimensions in the scan
808 # restrict things a little. It is still possible for there
809 # to be confusion over day_obs in visit vs exposure for example.
810 # If we are not searching calibration collections things may
811 # fail but they are going to fail anyway because of the
812 # ambiguity of the dataId...
813 if datasetType.isCalibration():
814 for dim in self.registry.dimensions.getStaticDimensions():
815 if dim.temporal:
816 candidateDimensions.add(str(dim))
818 # Look up table for the first association with a dimension
819 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
821 # Keep track of whether an item is associated with multiple
822 # dimensions.
823 counter: Counter[str] = Counter()
824 assigned: Dict[str, Set[str]] = defaultdict(set)
826 # Go through the missing dimensions and associate the
827 # given names with records within those dimensions
828 matched_dims = set()
829 for dimensionName in candidateDimensions:
830 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
831 fields = dimension.metadata.names | dimension.uniqueKeys.names
832 for field in not_dimensions:
833 if field in fields:
834 guessedAssociation[dimensionName][field] = not_dimensions[field]
835 counter[dimensionName] += 1
836 assigned[field].add(dimensionName)
837 matched_dims.add(field)
839 # Calculate the fields that matched nothing.
840 never_found = set(not_dimensions) - matched_dims
842 if never_found:
843 raise ValueError(f"Unrecognized keyword args given: {never_found}")
845 # There is a chance we have allocated a single dataId item
846 # to multiple dimensions. Need to decide which should be retained.
847 # For now assume that the most popular alternative wins.
848 # This means that day_obs with seq_num will result in
849 # exposure.day_obs and not visit.day_obs
850 # Also prefer an explicitly missing dimension over an inferred
851 # temporal dimension.
852 for fieldName, assignedDimensions in assigned.items():
853 if len(assignedDimensions) > 1:
854 # Pick the most popular (preferring mandatory dimensions)
855 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
856 if requiredButMissing:
857 candidateDimensions = requiredButMissing
858 else:
859 candidateDimensions = assignedDimensions
861 # Select the relevant items and get a new restricted
862 # counter.
863 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
864 duplicatesCounter: Counter[str] = Counter()
865 duplicatesCounter.update(theseCounts)
867 # Choose the most common. If they are equally common
868 # we will pick the one that was found first.
869 # Returns a list of tuples
870 selected = duplicatesCounter.most_common(1)[0][0]
872 log.debug(
873 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
874 " Removed ambiguity by choosing dimension %s.",
875 fieldName,
876 ", ".join(assignedDimensions),
877 selected,
878 )
880 for candidateDimension in assignedDimensions:
881 if candidateDimension != selected:
882 del guessedAssociation[candidateDimension][fieldName]
884 # Update the record look up dict with the new associations
885 for dimensionName, values in guessedAssociation.items():
886 if values: # A dict might now be empty
887 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
888 byRecord[dimensionName].update(values)
890 if byRecord:
891 # Some record specifiers were found so we need to convert
892 # them to the Id form
893 for dimensionName, values in byRecord.items():
894 if dimensionName in newDataId:
895 log.debug(
896 "DataId specified explicit %s dimension value of %s in addition to"
897 " general record specifiers for it of %s. Ignoring record information.",
898 dimensionName,
899 newDataId[dimensionName],
900 str(values),
901 )
902 # Get the actual record and compare with these values.
903 try:
904 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
905 except DataIdError:
906 raise ValueError(
907 f"Could not find dimension '{dimensionName}'"
908 f" with dataId {newDataId} as part of comparing with"
909 f" record values {byRecord[dimensionName]}"
910 ) from None
911 if len(recs) == 1:
912 errmsg: List[str] = []
913 for k, v in values.items():
914 if (recval := getattr(recs[0], k)) != v:
915 errmsg.append(f"{k}({recval} != {v})")
916 if errmsg:
917 raise ValueError(
918 f"Dimension {dimensionName} in dataId has explicit value"
919 " inconsistent with records: " + ", ".join(errmsg)
920 )
921 else:
922 # Multiple matches for an explicit dimension
923 # should never happen but let downstream complain.
924 pass
925 continue
927 # Build up a WHERE expression
928 bind = {k: v for k, v in values.items()}
929 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
931 # Hopefully we get a single record that matches
932 records = set(
933 self.registry.queryDimensionRecords(
934 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
935 )
936 )
938 if len(records) != 1:
939 if len(records) > 1:
940 log.debug("Received %d records from constraints of %s", len(records), str(values))
941 for r in records:
942 log.debug("- %s", str(r))
943 raise ValueError(
944 f"DataId specification for dimension {dimensionName} is not"
945 f" uniquely constrained to a single dataset by {values}."
946 f" Got {len(records)} results."
947 )
948 raise ValueError(
949 f"DataId specification for dimension {dimensionName} matched no"
950 f" records when constrained by {values}"
951 )
953 # Get the primary key from the real dimension object
954 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
955 if not isinstance(dimension, Dimension):
956 raise RuntimeError(
957 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
958 )
959 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
961 return newDataId, kwargs
963 def _findDatasetRef(
964 self,
965 datasetRefOrType: Union[DatasetRef, DatasetType, str],
966 dataId: Optional[DataId] = None,
967 *,
968 collections: Any = None,
969 allowUnresolved: bool = False,
970 **kwargs: Any,
971 ) -> DatasetRef:
972 """Shared logic for methods that start with a search for a dataset in
973 the registry.
975 Parameters
976 ----------
977 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
978 When `DatasetRef` the `dataId` should be `None`.
979 Otherwise the `DatasetType` or name thereof.
980 dataId : `dict` or `DataCoordinate`, optional
981 A `dict` of `Dimension` link name, value pairs that label the
982 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
983 should be provided as the first argument.
984 collections : Any, optional
985 Collections to be searched, overriding ``self.collections``.
986 Can be any of the types supported by the ``collections`` argument
987 to butler construction.
988 allowUnresolved : `bool`, optional
989 If `True`, return an unresolved `DatasetRef` if finding a resolved
990 one in the `Registry` fails. Defaults to `False`.
991 **kwargs
992 Additional keyword arguments used to augment or construct a
993 `DataId`. See `DataId` parameters.
995 Returns
996 -------
997 ref : `DatasetRef`
998 A reference to the dataset identified by the given arguments.
1000 Raises
1001 ------
1002 LookupError
1003 Raised if no matching dataset exists in the `Registry` (and
1004 ``allowUnresolved is False``).
1005 ValueError
1006 Raised if a resolved `DatasetRef` was passed as an input, but it
1007 differs from the one found in the registry.
1008 TypeError
1009 Raised if no collections were provided.
1010 """
1011 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1012 if isinstance(datasetRefOrType, DatasetRef):
1013 idNumber = datasetRefOrType.id
1014 else:
1015 idNumber = None
1016 timespan: Optional[Timespan] = None
1018 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1020 if datasetType.isCalibration():
1021 # Because this is a calibration dataset, first try to
1022 # standardize the data ID without restricting the dimensions to
1023 # those of the dataset type requested, because there may be extra
1024 # dimensions that provide temporal information for a validity-range
1025 # lookup.
1026 dataId = DataCoordinate.standardize(
1027 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1028 )
1029 if dataId.graph.temporal:
1030 dataId = self.registry.expandDataId(dataId)
1031 timespan = dataId.timespan
1032 else:
1033 # Standardize the data ID to just the dimensions of the dataset
1034 # type instead of letting registry.findDataset do it, so we get the
1035 # result even if no dataset is found.
1036 dataId = DataCoordinate.standardize(
1037 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1038 )
1039 # Always lookup the DatasetRef, even if one is given, to ensure it is
1040 # present in the current collection.
1041 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1042 if ref is None:
1043 if allowUnresolved:
1044 return DatasetRef(datasetType, dataId)
1045 else:
1046 if collections is None:
1047 collections = self.registry.defaults.collections
1048 raise LookupError(
1049 f"Dataset {datasetType.name} with data ID {dataId} "
1050 f"could not be found in collections {collections}."
1051 )
1052 if idNumber is not None and idNumber != ref.id:
1053 if collections is None:
1054 collections = self.registry.defaults.collections
1055 raise ValueError(
1056 f"DatasetRef.id provided ({idNumber}) does not match "
1057 f"id ({ref.id}) in registry in collections {collections}."
1058 )
1059 if datasetType != ref.datasetType:
1060 # If they differ it is because the user explicitly specified
1061 # a compatible dataset type to this call rather than using the
1062 # registry definition. The DatasetRef must therefore be recreated
1063 # using the user definition such that the expected type is
1064 # returned.
1065 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1067 return ref
1069 @transactional
1070 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
1071 # Docstring inherited.
1072 (imported_ref,) = self.registry._importDatasets(
1073 [ref],
1074 expand=True,
1075 )
1076 if imported_ref.id != ref.getCheckedId():
1077 raise RuntimeError("This registry configuration does not support putDirect.")
1078 self.datastore.put(obj, ref)
1079 return ref
1081 @transactional
1082 def put(
1083 self,
1084 obj: Any,
1085 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1086 dataId: Optional[DataId] = None,
1087 *,
1088 run: Optional[str] = None,
1089 **kwargs: Any,
1090 ) -> DatasetRef:
1091 """Store and register a dataset.
1093 Parameters
1094 ----------
1095 obj : `object`
1096 The dataset.
1097 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1098 When `DatasetRef` is provided, ``dataId`` should be `None`.
1099 Otherwise the `DatasetType` or name thereof.
1100 dataId : `dict` or `DataCoordinate`
1101 A `dict` of `Dimension` link name, value pairs that label the
1102 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1103 should be provided as the second argument.
1104 run : `str`, optional
1105 The name of the run the dataset should be added to, overriding
1106 ``self.run``.
1107 **kwargs
1108 Additional keyword arguments used to augment or construct a
1109 `DataCoordinate`. See `DataCoordinate.standardize`
1110 parameters.
1112 Returns
1113 -------
1114 ref : `DatasetRef`
1115 A reference to the stored dataset, updated with the correct id if
1116 given.
1118 Raises
1119 ------
1120 TypeError
1121 Raised if the butler is read-only or if no run has been provided.
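Examples
--------
A minimal sketch; the repository path, run, dataset type, and data ID
values are placeholders, and the dataset type must already be
registered::
    butler = Butler("/path/to/repo", run="u/alice/ingest")
    ref = butler.put(obj, "raw", instrument="HSC", exposure=903334,
                     detector=20)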
1122 """
1123 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1124 if not self.isWriteable():
1125 raise TypeError("Butler is read-only.")
1126 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1127 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1128 raise ValueError("DatasetRef must not be in registry, must have None id")
1130 # Handle dimension records in dataId
1131 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1133 # Add Registry Dataset entry.
1134 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1136 # For an execution butler the datasets will be pre-defined.
1137 # If the butler is configured that way datasets should only be inserted
1138 # if they do not already exist in registry. Trying and catching
1139 # ConflictingDefinitionError will not work because the transaction
1140 # will be corrupted. Instead, in this mode always check first.
1141 ref = None
1142 ref_is_predefined = False
1143 if self._allow_put_of_predefined_dataset:
1144 # Get the matching ref for this run.
1145 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1147 if ref:
1148 # Must be expanded form for datastore templating
1149 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1150 ref = ref.expanded(dataId)
1151 ref_is_predefined = True
1153 if not ref:
1154 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1156 # If the ref is predefined it is possible that the datastore also
1157 # has the record. Asking datastore to put it again will result in
1158 # the artifact being recreated, overwriting the previous one; the
1159 # subsequent failure to write the record will then cause the artifact
1160 # to be removed. Much safer to ask first before attempting to
1161 # overwrite. Race conditions should not be an issue for the
1162 # execution butler environment.
1163 if ref_is_predefined:
1164 if self.datastore.knows(ref):
1165 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
1167 self.datastore.put(obj, ref)
1169 return ref
1171 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1172 """Retrieve a stored dataset.
1174 Unlike `Butler.get`, this method allows datasets outside the Butler's
1175 collection to be read as long as the `DatasetRef` that identifies them
1176 can be obtained separately.
1178 Parameters
1179 ----------
1180 ref : `DatasetRef`
1181 Resolved reference to an already stored dataset.
1182 parameters : `dict`
1183 Additional StorageClass-defined options to control reading,
1184 typically used to efficiently read only a subset of the dataset.
1186 Returns
1187 -------
1188 obj : `object`
1189 The dataset.
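Examples
--------
A sketch reading datasets via resolved references returned by a
registry query; the dataset type and collection names are
placeholders::
    refs = butler.registry.queryDatasets(
        "calexp", collections="u/alice/DM-50000"
    )
    for ref in refs:
        calexp = butler.getDirect(ref)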
1190 """
1191 return self.datastore.get(ref, parameters=parameters)
1193 def getDirectDeferred(
1194 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
1195 ) -> DeferredDatasetHandle:
1196 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
1197 from a resolved `DatasetRef`.
1199 Parameters
1200 ----------
1201 ref : `DatasetRef`
1202 Resolved reference to an already stored dataset.
1203 parameters : `dict`
1204 Additional StorageClass-defined options to control reading,
1205 typically used to efficiently read only a subset of the dataset.
1207 Returns
1208 -------
1209 obj : `DeferredDatasetHandle`
1210 A handle which can be used to retrieve a dataset at a later time.
1212 Raises
1213 ------
1214 AmbiguousDatasetError
1215 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1216 """
1217 if ref.id is None:
1218 raise AmbiguousDatasetError(
1219 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1220 )
1221 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1223 def getDeferred(
1224 self,
1225 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1226 dataId: Optional[DataId] = None,
1227 *,
1228 parameters: Union[dict, None] = None,
1229 collections: Any = None,
1230 **kwargs: Any,
1231 ) -> DeferredDatasetHandle:
1232 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
1233 after an immediate registry lookup.
1235 Parameters
1236 ----------
1237 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1238 When `DatasetRef` the `dataId` should be `None`.
1239 Otherwise the `DatasetType` or name thereof.
1240 dataId : `dict` or `DataCoordinate`, optional
1241 A `dict` of `Dimension` link name, value pairs that label the
1242 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1243 should be provided as the first argument.
1244 parameters : `dict`
1245 Additional StorageClass-defined options to control reading,
1246 typically used to efficiently read only a subset of the dataset.
1247 collections : Any, optional
1248 Collections to be searched, overriding ``self.collections``.
1249 Can be any of the types supported by the ``collections`` argument
1250 to butler construction.
1251 **kwargs
1252 Additional keyword arguments used to augment or construct a
1253 `DataId`. See `DataId` parameters.
1255 Returns
1256 -------
1257 obj : `DeferredDatasetHandle`
1258 A handle which can be used to retrieve a dataset at a later time.
1260 Raises
1261 ------
1262 LookupError
1263 Raised if no matching dataset exists in the `Registry`.
1265 ValueError
1266 Raised if a resolved `DatasetRef` was passed as an input, but it
1267 differs from the one found in the registry.
1268 TypeError
1269 Raised if no collections were provided.
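Examples
--------
A sketch deferring the actual read until later; the dataset type and
data ID are placeholders, and the returned handle is assumed to expose
a ``get()`` method::
    handle = butler.getDeferred("calexp", instrument="HSC",
                                visit=903334, detector=20)
    # ... later, when the pixels are actually needed ...
    calexp = handle.get()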
1270 """
1271 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1272 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1274 def get(
1275 self,
1276 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1277 dataId: Optional[DataId] = None,
1278 *,
1279 parameters: Optional[Dict[str, Any]] = None,
1280 collections: Any = None,
1281 **kwargs: Any,
1282 ) -> Any:
1283 """Retrieve a stored dataset.
1285 Parameters
1286 ----------
1287 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1288 When `DatasetRef` the `dataId` should be `None`.
1289 Otherwise the `DatasetType` or name thereof.
1290 dataId : `dict` or `DataCoordinate`
1291 A `dict` of `Dimension` link name, value pairs that label the
1292 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1293 should be provided as the first argument.
1294 parameters : `dict`
1295 Additional StorageClass-defined options to control reading,
1296 typically used to efficiently read only a subset of the dataset.
1297 collections : Any, optional
1298 Collections to be searched, overriding ``self.collections``.
1299 Can be any of the types supported by the ``collections`` argument
1300 to butler construction.
1301 **kwargs
1302 Additional keyword arguments used to augment or construct a
1303 `DataCoordinate`. See `DataCoordinate.standardize`
1304 parameters.
1306 Returns
1307 -------
1308 obj : `object`
1309 The dataset.
1311 Raises
1312 ------
1313 ValueError
1314 Raised if a resolved `DatasetRef` was passed as an input, but it
1315 differs from the one found in the registry.
1316 LookupError
1317 Raised if no matching dataset exists in the `Registry`.
1318 TypeError
1319 Raised if no collections were provided.
1321 Notes
1322 -----
1323 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1324 this method requires that the given data ID include temporal dimensions
1325 beyond the dimensions of the dataset type itself, in order to find the
1326 dataset with the appropriate validity range. For example, a "bias"
1327 dataset with native dimensions ``{instrument, detector}`` could be
1328 fetched with a ``{instrument, detector, exposure}`` data ID, because
1329 ``exposure`` is a temporal dimension.
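Examples
--------
A sketch of the calibration lookup described above; the collection
name, instrument, detector, and exposure values are placeholders::
    bias = butler.get("bias", instrument="HSC", detector=20,
                      exposure=903334, collections="HSC/calib")
The extra ``exposure`` value supplies the temporal information needed
to select the matching validity range.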
1330 """
1331 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1332 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1333 return self.getDirect(ref, parameters=parameters)
1335 def getURIs(
1336 self,
1337 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1338 dataId: Optional[DataId] = None,
1339 *,
1340 predict: bool = False,
1341 collections: Any = None,
1342 run: Optional[str] = None,
1343 **kwargs: Any,
1344 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
1345 """Return the URIs associated with the dataset.
1347 Parameters
1348 ----------
1349 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1350 When `DatasetRef` the `dataId` should be `None`.
1351 Otherwise the `DatasetType` or name thereof.
1352 dataId : `dict` or `DataCoordinate`
1353 A `dict` of `Dimension` link name, value pairs that label the
1354 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1355 should be provided as the first argument.
1356 predict : `bool`
1357 If `True`, allow URIs to be returned for datasets that have not
1358 yet been written.
1359 collections : Any, optional
1360 Collections to be searched, overriding ``self.collections``.
1361 Can be any of the types supported by the ``collections`` argument
1362 to butler construction.
1363 run : `str`, optional
1364 Run to use for predictions, overriding ``self.run``.
1365 **kwargs
1366 Additional keyword arguments used to augment or construct a
1367 `DataCoordinate`. See `DataCoordinate.standardize`
1368 parameters.
1370 Returns
1371 -------
1372 primary : `lsst.resources.ResourcePath`
1373 The URI to the primary artifact associated with this dataset.
1374 If the dataset was disassembled within the datastore this
1375 may be `None`.
1376 components : `dict`
1377 URIs to any components associated with the dataset artifact.
1378 Can be empty if there are no components.
1379 """
1380 ref = self._findDatasetRef(
1381 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1382 )
1383 if ref.id is None: # only possible if predict is True
1384 if run is None:
1385 run = self.run
1386 if run is None:
1387 raise TypeError("Cannot predict location with run=None.")
1388 # Lie about ID, because we can't guess it, and only
1389 # Datastore.getURIs() will ever see it (and it doesn't use it).
1390 ref = ref.resolved(id=0, run=run)
1391 return self.datastore.getURIs(ref, predict)
1393 def getURI(
1394 self,
1395 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1396 dataId: Optional[DataId] = None,
1397 *,
1398 predict: bool = False,
1399 collections: Any = None,
1400 run: Optional[str] = None,
1401 **kwargs: Any,
1402 ) -> ResourcePath:
1403 """Return the URI to the Dataset.
1405 Parameters
1406 ----------
1407 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1408 When `DatasetRef` the `dataId` should be `None`.
1409 Otherwise the `DatasetType` or name thereof.
1410 dataId : `dict` or `DataCoordinate`
1411 A `dict` of `Dimension` link name, value pairs that label the
1412 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1413 should be provided as the first argument.
1414 predict : `bool`
1415 If `True`, allow URIs to be returned for datasets that have not
1416 yet been written.
1417 collections : Any, optional
1418 Collections to be searched, overriding ``self.collections``.
1419 Can be any of the types supported by the ``collections`` argument
1420 to butler construction.
1421 run : `str`, optional
1422 Run to use for predictions, overriding ``self.run``.
1423 **kwargs
1424 Additional keyword arguments used to augment or construct a
1425 `DataCoordinate`. See `DataCoordinate.standardize`
1426 parameters.
1428 Returns
1429 -------
1430 uri : `lsst.resources.ResourcePath`
1431 URI pointing to the Dataset within the datastore. If the
1432 Dataset does not exist in the datastore, and if ``predict`` is
1433 `True`, the URI will be a prediction and will include a URI
1434 fragment "#predicted".
1435 If the datastore does not have entities that relate well
1436 to the concept of a URI the returned URI string will be
1437 descriptive. The returned URI is not guaranteed to be obtainable.
1439 Raises
1440 ------
1441 LookupError
1442 Raised if a URI has been requested for a dataset that does not
1443 exist and guessing is not allowed.
1444 ValueError
1445 Raised if a resolved `DatasetRef` was passed as an input, but it
1446 differs from the one found in the registry.
1447 TypeError
1448 Raised if no collections were provided.
1449 RuntimeError
1450 Raised if a URI is requested for a dataset that consists of
1451 multiple artifacts.
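Examples
--------
A sketch; the dataset type, data ID, and collection are placeholders::
    uri = butler.getURI("calexp", instrument="HSC", visit=903334,
                        detector=20, collections="u/alice/DM-50000")
    print(uri.geturl())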
1452 """
1453 primary, components = self.getURIs(
1454 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1455 )
1457 if primary is None or components:
1458 raise RuntimeError(
1459 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1460 "Use Butler.getURIs() instead."
1461 )
1462 return primary
1464 def retrieveArtifacts(
1465 self,
1466 refs: Iterable[DatasetRef],
1467 destination: ResourcePathExpression,
1468 transfer: str = "auto",
1469 preserve_path: bool = True,
1470 overwrite: bool = False,
1471 ) -> List[ResourcePath]:
1472 """Retrieve the artifacts associated with the supplied refs.
1474 Parameters
1475 ----------
1476 refs : iterable of `DatasetRef`
1477 The datasets for which artifacts are to be retrieved.
1478 A single ref can result in multiple artifacts. The refs must
1479 be resolved.
1480 destination : `lsst.resources.ResourcePath` or `str`
1481 Location to write the artifacts.
1482 transfer : `str`, optional
1483 Method to use to transfer the artifacts. Must be one of the options
1484 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1485 "move" is not allowed.
1486 preserve_path : `bool`, optional
1487 If `True` the full path of the artifact within the datastore
1488 is preserved. If `False` the final file component of the path
1489 is used.
1490 overwrite : `bool`, optional
1491 If `True` allow transfers to overwrite existing files at the
1492 destination.
1494 Returns
1495 -------
1496 targets : `list` of `lsst.resources.ResourcePath`
1497 URIs of file artifacts in destination location. Order is not
1498 preserved.
1500 Notes
1501 -----
1502 For non-file datastores the artifacts written to the destination
1503 may not match the representation inside the datastore. For example
1504 a hierarchical data structure in a NoSQL database may well be stored
1505 as a JSON file.
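Examples
--------
A sketch copying the file artifacts behind a query result into a local
directory; the dataset type, collection, and destination are
placeholders::
    refs = butler.registry.queryDatasets(
        "calexp", collections="u/alice/DM-50000"
    )
    paths = butler.retrieveArtifacts(refs, "/tmp/extract",
                                     transfer="copy")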
1506 """
1507 return self.datastore.retrieveArtifacts(
1508 refs,
1509 ResourcePath(destination),
1510 transfer=transfer,
1511 preserve_path=preserve_path,
1512 overwrite=overwrite,
1513 )
1515 def datasetExists(
1516 self,
1517 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1518 dataId: Optional[DataId] = None,
1519 *,
1520 collections: Any = None,
1521 **kwargs: Any,
1522 ) -> bool:
1523 """Return True if the Dataset is actually present in the Datastore.
1525 Parameters
1526 ----------
1527 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1528 When `DatasetRef` the `dataId` should be `None`.
1529 Otherwise the `DatasetType` or name thereof.
1530 dataId : `dict` or `DataCoordinate`
1531 A `dict` of `Dimension` link name, value pairs that label the
1532 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1533 should be provided as the first argument.
1534 collections : Any, optional
1535 Collections to be searched, overriding ``self.collections``.
1536 Can be any of the types supported by the ``collections`` argument
1537 to butler construction.
1538 **kwargs
1539 Additional keyword arguments used to augment or construct a
1540 `DataCoordinate`. See `DataCoordinate.standardize`
1541 parameters.
1543 Raises
1544 ------
1545 LookupError
1546 Raised if the dataset is not even present in the Registry.
1547 ValueError
1548 Raised if a resolved `DatasetRef` was passed as an input, but it
1549 differs from the one found in the registry.
1550 TypeError
1551 Raised if no collections were provided.
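Examples
--------
An illustrative sketch; the dataset type, data ID, and collection
name are hypothetical::

    data_id = {"instrument": "HSC", "visit": 903334, "detector": 16}
    if butler.datasetExists("calexp", data_id,
                            collections="HSC/runs/test"):
        calexp = butler.get("calexp", data_id,
                            collections="HSC/runs/test")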
1552 """
1553 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1554 return self.datastore.exists(ref)
1556 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1557 """Remove one or more `~CollectionType.RUN` collections and the
1558 datasets within them.
1560 Parameters
1561 ----------
1562 names : `Iterable` [ `str` ]
1563 The names of the collections to remove.
1564 unstore : `bool`, optional
1565 If `True` (default), delete datasets from all datastores in which
1566 they are present, and attempt to roll back the registry deletions if
1567 datastore deletions fail (which may not always be possible). If
1568 `False`, datastore records for these datasets are still removed,
1569 but any artifacts (e.g. files) will not be.
1571 Raises
1572 ------
1573 TypeError
1574 Raised if one or more collections are not of type
1575 `~CollectionType.RUN`.
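Examples
--------
A sketch removing two output runs and their stored artifacts,
assuming a writeable butler; the run names are hypothetical::

    butler.removeRuns(["u/alice/scratch-1", "u/alice/scratch-2"],
                      unstore=True)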
1576 """
1577 if not self.isWriteable():
1578 raise TypeError("Butler is read-only.")
1579 names = list(names)
1580 refs: List[DatasetRef] = []
1581 for name in names:
1582 collectionType = self.registry.getCollectionType(name)
1583 if collectionType is not CollectionType.RUN:
1584 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1585 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1586 with self.registry.transaction():
1587 if unstore:
1588 self.datastore.trash(refs)
1589 else:
1590 self.datastore.forget(refs)
1591 for name in names:
1592 self.registry.removeCollection(name)
1593 if unstore:
1594 # Point of no return for removing artifacts
1595 self.datastore.emptyTrash()
1597 def pruneCollection(
1598 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1599 ) -> None:
1600 """Remove a collection and possibly prune datasets within it.
1602 Parameters
1603 ----------
1604 name : `str`
1605 Name of the collection to remove. If this is a
1606 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1607 datasets within the collection are not modified unless ``unstore``
1608 is `True`. If this is a `~CollectionType.RUN` collection,
1609 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1610 are fully removed from the data repository.
1611 purge : `bool`, optional
1612 If `True`, permit `~CollectionType.RUN` collections to be removed,
1613 fully removing datasets within them. Requires ``unstore=True`` as
1614 well, as an added precaution against accidental deletion. Must be
1615 `False` (default) if the collection is not a ``RUN``.
1616 unstore : `bool`, optional
1617 If `True`, remove all datasets in the collection from all
1618 datastores in which they appear.
1619 unlink : `list` [`str`], optional
1620 Before removing the named collection, unlink it from these
1621 parent collections.
1623 Raises
1624 ------
1625 TypeError
1626 Raised if the butler is read-only or arguments are mutually
1627 inconsistent.
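Examples
--------
Illustrative sketches with hypothetical collection names, assuming a
writeable butler: removing a TAGGED collection without touching its
datasets, and fully purging a RUN collection::

    butler.pruneCollection("my-tagged-collection")
    butler.pruneCollection("u/alice/scratch-run", purge=True,
                           unstore=True)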
1628 """
1629 # See pruneDatasets comments for more information about the logic here;
1630 # the cases are almost the same, but here we can rely on Registry to
1631 take care of everything but Datastore deletion when we remove the
1632 # collection.
1633 if not self.isWriteable():
1634 raise TypeError("Butler is read-only.")
1635 collectionType = self.registry.getCollectionType(name)
1636 if purge and not unstore:
1637 raise PurgeWithoutUnstorePruneCollectionsError()
1638 if collectionType is CollectionType.RUN and not purge:
1639 raise RunWithoutPurgePruneCollectionsError(collectionType)
1640 if collectionType is not CollectionType.RUN and purge:
1641 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1643 def remove(child: str, parent: str) -> None:
1644 """Remove a child collection from a parent collection."""
1645 # Remove child from parent.
1646 chain = list(self.registry.getCollectionChain(parent))
1647 try:
1648 chain.remove(child)
1649 except ValueError as e:
1650 raise RuntimeError(f"{child} is not a child of {parent}") from e
1651 self.registry.setCollectionChain(parent, chain)
1653 with self.registry.transaction():
1654 if unlink:
1655 for parent in unlink:
1656 remove(name, parent)
1657 if unstore:
1658 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1659 self.datastore.trash(refs)
1660 self.registry.removeCollection(name)
1662 if unstore:
1663 # Point of no return for removing artifacts
1664 self.datastore.emptyTrash()
1666 def pruneDatasets(
1667 self,
1668 refs: Iterable[DatasetRef],
1669 *,
1670 disassociate: bool = True,
1671 unstore: bool = False,
1672 tags: Iterable[str] = (),
1673 purge: bool = False,
1674 run: Optional[str] = None,
1675 ) -> None:
1676 """Remove one or more datasets from a collection and/or storage.
1678 Parameters
1679 ----------
1680 refs : `~collections.abc.Iterable` of `DatasetRef`
1681 Datasets to prune. These must be "resolved" references (not just
1682 a `DatasetType` and data ID).
1683 disassociate : `bool`, optional
1684 Disassociate pruned datasets from ``tags``, or from all collections
1685 if ``purge=True``.
1686 unstore : `bool`, optional
1687 If `True` (`False` is default) remove these datasets from all
1688 datastores known to this butler. Note that this will make it
1689 impossible to retrieve these datasets even via other collections.
1690 Datasets that are not currently stored are ignored by this option.
1691 tags : `Iterable` [ `str` ], optional
1692 `~CollectionType.TAGGED` collections to disassociate the datasets
1693 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1694 `True`.
1695 purge : `bool`, optional
1696 If `True` (`False` is default), completely remove the dataset from
1697 the `Registry`. To prevent accidental deletions, ``purge`` may
1698 only be `True` if all of the following conditions are met:
1700 - All given datasets are in the given run;
1701 - ``disassociate`` is `True`;
1702 - ``unstore`` is `True`.
1704 This mode may remove provenance information from datasets other
1705 than those provided, and should be used with extreme care.
1707 Raises
1708 ------
1709 TypeError
1710 Raised if the butler is read-only, if no collection was provided,
1711 or the conditions for ``purge=True`` were not met.
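Examples
--------
Illustrative sketches assuming a writeable butler; the dataset type
and collection names are hypothetical::

    refs = list(butler.registry.queryDatasets(
        "calexp", collections="u/alice/scratch"))
    # Disassociate the datasets from a TAGGED collection only.
    butler.pruneDatasets(refs, tags=["my-tag"], unstore=False)
    # Fully remove the datasets, including their stored artifacts.
    butler.pruneDatasets(refs, purge=True, unstore=True)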
1712 """
1713 if not self.isWriteable():
1714 raise TypeError("Butler is read-only.")
1715 if purge:
1716 if not disassociate:
1717 raise TypeError("Cannot pass purge=True without disassociate=True.")
1718 if not unstore:
1719 raise TypeError("Cannot pass purge=True without unstore=True.")
1720 elif disassociate:
1721 tags = tuple(tags)
1722 if not tags:
1723 raise TypeError("No tags provided but disassociate=True.")
1724 for tag in tags:
1725 collectionType = self.registry.getCollectionType(tag)
1726 if collectionType is not CollectionType.TAGGED:
1727 raise TypeError(
1728 f"Cannot disassociate from collection '{tag}' "
1729 f"of non-TAGGED type {collectionType.name}."
1730 )
1731 # Transform possibly-single-pass iterable into something we can iterate
1732 # over multiple times.
1733 refs = list(refs)
1734 # Pruning a component of a DatasetRef makes no sense since registry
1735 # doesn't know about components and datastore might not store
1736 # components in a separate file
1737 for ref in refs:
1738 if ref.datasetType.component():
1739 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1740 # We don't need an unreliable Datastore transaction for this, because
1741 # we've been extra careful to ensure that Datastore.trash only involves
1742 # mutating the Registry (it can _look_ at Datastore-specific things,
1743 # but shouldn't change them), and hence all operations here are
1744 # Registry operations.
1745 with self.registry.transaction():
1746 if unstore:
1747 self.datastore.trash(refs)
1748 if purge:
1749 self.registry.removeDatasets(refs)
1750 elif disassociate:
1751 assert tags, "Guaranteed by earlier logic in this function."
1752 for tag in tags:
1753 self.registry.disassociate(tag, refs)
1754 # We've exited the Registry transaction, and apparently committed.
1755 # (if there was an exception, everything rolled back, and it's as if
1756 # nothing happened - and we never get here).
1757 # Datastore artifacts are not yet gone, but they're clearly marked
1758 # as trash, so if we fail to delete now because of (e.g.) filesystem
1759 # problems we can try again later, and if manual administrative
1760 # intervention is required, it's pretty clear what that should entail:
1761 # deleting everything on disk and in private Datastore tables that is
1762 # in the dataset_location_trash table.
1763 if unstore:
1764 # Point of no return for removing artifacts
1765 self.datastore.emptyTrash()
1767 @transactional
1768 def ingest(
1769 self,
1770 *datasets: FileDataset,
1771 transfer: Optional[str] = "auto",
1772 run: Optional[str] = None,
1773 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1774 record_validation_info: bool = True,
1775 ) -> None:
1776 """Store and register one or more datasets that already exist on disk.
1778 Parameters
1779 ----------
1780 datasets : `FileDataset`
1781 Each positional argument is a struct containing information about
1782 a file to be ingested, including its URI (either absolute or
1783 relative to the datastore root, if applicable), a `DatasetRef`,
1784 and optionally a formatter class or its fully-qualified string
1785 name. If a formatter is not provided, the formatter that would be
1786 used for `put` is assumed. On successful return, all
1787 `FileDataset.ref` attributes will have their `DatasetRef.id`
1788 attribute populated and all `FileDataset.formatter` attributes will
1789 be set to the formatter class used. `FileDataset.path` attributes
1790 may be modified to put paths in whatever the datastore considers a
1791 standardized form.
1792 transfer : `str`, optional
1793 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1794 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1795 transfer the file.
1796 run : `str`, optional
1797 The name of the run ingested datasets should be added to,
1798 overriding ``self.run``.
1799 idGenerationMode : `DatasetIdGenEnum`, optional
1800 Specifies option for generating dataset IDs. By default unique IDs
1801 are generated for each inserted dataset.
1802 record_validation_info : `bool`, optional
1803 If `True`, the default, the datastore can record validation
1804 information associated with the file. If `False` the datastore
1805 will not attempt to track any information such as checksums
1806 or file sizes. This can be useful if such information is tracked
1807 in an external system or if the file is to be compressed in place.
1808 It is up to the datastore whether this parameter is relevant.
1810 Raises
1811 ------
1812 TypeError
1813 Raised if the butler is read-only or if no run was provided.
1814 NotImplementedError
1815 Raised if the `Datastore` does not support the given transfer mode.
1816 DatasetTypeNotSupportedError
1817 Raised if one or more files to be ingested have a dataset type that
1818 is not supported by the `Datastore`.
1819 FileNotFoundError
1820 Raised if one of the given files does not exist.
1821 FileExistsError
1822 Raised if transfer is not `None` but the (internal) location the
1823 file would be moved to is already occupied.
1825 Notes
1826 -----
1827 This operation is not fully exception safe: if a database operation
1828 fails, the given `FileDataset` instances may be only partially updated.
1830 It is atomic in terms of database operations (they will either all
1831 succeed or all fail) providing the database engine implements
1832 transactions correctly. It will attempt to be atomic in terms of
1833 filesystem operations as well, but this cannot be implemented
1834 rigorously for most datastores.
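Examples
--------
A minimal sketch, assuming a writeable butler, that the "raw" dataset
type is already registered, and that ``DatasetRef`` and ``FileDataset``
are imported from ``lsst.daf.butler``; the file path, data ID values,
and run name are hypothetical::

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType,
                     {"instrument": "HSC", "exposure": 903334,
                      "detector": 16})
    # Ensure the output run exists before ingesting into it.
    butler.registry.registerRun("HSC/raw/hypothetical")
    butler.ingest(FileDataset(path="/data/raw.fits", refs=[ref]),
                  transfer="copy", run="HSC/raw/hypothetical")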
1835 """
1836 if not self.isWriteable():
1837 raise TypeError("Butler is read-only.")
1838 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1839 # Reorganize the inputs so they're grouped by DatasetType and then
1840 # data ID. We also include a list of DatasetRefs for each FileDataset
1841 # to hold the resolved DatasetRefs returned by the Registry, before
1842 # it's safe to swap them into FileDataset.refs.
1843 # Some type annotation aliases to make that clearer:
1844 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1845 GroupedData = MutableMapping[DatasetType, GroupForType]
1846 # The actual data structure:
1847 groupedData: GroupedData = defaultdict(dict)
1848 # And the nested loop that populates it:
1849 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1850 # This list is intentionally shared across the inner loop, since it's
1851 # associated with `dataset`.
1852 resolvedRefs: List[DatasetRef] = []
1854 # Somewhere to store pre-existing refs if we have an
1855 # execution butler.
1856 existingRefs: List[DatasetRef] = []
1858 for ref in dataset.refs:
1859 if ref.dataId in groupedData[ref.datasetType]:
1860 raise ConflictingDefinitionError(
1861 f"Ingest conflict. Dataset {dataset.path} has same"
1862 " DataId as other ingest dataset"
1863 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1864 f" ({ref.dataId})"
1865 )
1866 if self._allow_put_of_predefined_dataset:
1867 existing_ref = self.registry.findDataset(
1868 ref.datasetType, dataId=ref.dataId, collections=run
1869 )
1870 if existing_ref:
1871 if self.datastore.knows(existing_ref):
1872 raise ConflictingDefinitionError(
1873 f"Dataset associated with path {dataset.path}"
1874 f" already exists as {existing_ref}."
1875 )
1876 # Store this ref elsewhere since it already exists
1877 # and we do not want to remake it but we do want
1878 # to store it in the datastore.
1879 existingRefs.append(existing_ref)
1881 # Nothing else to do until we have finished
1882 # iterating.
1883 continue
1885 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1887 if existingRefs:
1889 if len(dataset.refs) != len(existingRefs):
1890 # Keeping track of partially pre-existing datasets is hard
1891 # and should generally never happen. For now don't allow
1892 # it.
1893 raise ConflictingDefinitionError(
1894 f"For dataset {dataset.path} some dataIds already exist"
1895 " in registry but others do not. This is not supported."
1896 )
1898 # Attach the resolved refs if we found them.
1899 dataset.refs = existingRefs
1901 # Now we can bulk-insert into Registry for each DatasetType.
1902 for datasetType, groupForType in progress.iter_item_chunks(
1903 groupedData.items(), desc="Bulk-inserting datasets by type"
1904 ):
1905 refs = self.registry.insertDatasets(
1906 datasetType,
1907 dataIds=groupForType.keys(),
1908 run=run,
1909 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1910 idGenerationMode=idGenerationMode,
1911 )
1912 # Append those resolved DatasetRefs to the new lists we set up for
1913 # them.
1914 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1915 resolvedRefs.append(ref)
1917 # Go back to the original FileDatasets to replace their refs with the
1918 # new resolved ones.
1919 for groupForType in progress.iter_chunks(
1920 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1921 ):
1922 for dataset, resolvedRefs in groupForType.values():
1923 dataset.refs = resolvedRefs
1925 # Bulk-insert everything into Datastore.
1926 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1928 @contextlib.contextmanager
1929 def export(
1930 self,
1931 *,
1932 directory: Optional[str] = None,
1933 filename: Optional[str] = None,
1934 format: Optional[str] = None,
1935 transfer: Optional[str] = None,
1936 ) -> Iterator[RepoExportContext]:
1937 """Export datasets from the repository represented by this `Butler`.
1939 This method is a context manager that returns a helper object
1940 (`RepoExportContext`) that is used to indicate what information from
1941 the repository should be exported.
1943 Parameters
1944 ----------
1945 directory : `str`, optional
1946 Directory dataset files should be written to if ``transfer`` is not
1947 `None`.
1948 filename : `str`, optional
1949 Name for the file that will include database information associated
1950 with the exported datasets. If this is not an absolute path and
1951 ``directory`` is not `None`, it will be written to ``directory``
1952 instead of the current working directory. Defaults to
1953 "export.{format}".
1954 format : `str`, optional
1955 File format for the database information file. If `None`, the
1956 extension of ``filename`` will be used.
1957 transfer : `str`, optional
1958 Transfer mode passed to `Datastore.export`.
1960 Raises
1961 ------
1962 TypeError
1963 Raised if the set of arguments passed is inconsistent.
1965 Examples
1966 --------
1967 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1968 methods are used to provide the iterables over data IDs and/or datasets
1969 to be exported::
1971 with butler.export("exports.yaml") as export:
1972 # Export all flats, but none of the dimension element rows
1973 # (i.e. data ID information) associated with them.
1974 export.saveDatasets(butler.registry.queryDatasets("flat"),
1975 elements=())
1976 # Export all datasets that start with "deepCoadd_" and all of
1977 # their associated data ID information.
1978 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1979 """
1980 if directory is None and transfer is not None:
1981 raise TypeError("Cannot transfer without providing a directory.")
1982 if transfer == "move":
1983 raise TypeError("Transfer may not be 'move': export is read-only")
1984 if format is None:
1985 if filename is None:
1986 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1987 else:
1988 _, format = os.path.splitext(filename)
1989 elif filename is None:
1990 filename = f"export.{format}"
1991 if directory is not None:
1992 filename = os.path.join(directory, filename)
1993 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
1994 with open(filename, "w") as stream:
1995 backend = BackendClass(stream)
1996 try:
1997 helper = RepoExportContext(
1998 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
1999 )
2000 yield helper
2001 except BaseException:
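# Re-raise unchanged; the export is only finalized via
# helper._finish() in the else branch, i.e. on success.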
2002 raise
2003 else:
2004 helper._finish()
2006 def import_(
2007 self,
2008 *,
2009 directory: Optional[str] = None,
2010 filename: Union[str, TextIO, None] = None,
2011 format: Optional[str] = None,
2012 transfer: Optional[str] = None,
2013 skip_dimensions: Optional[Set] = None,
2014 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2015 reuseIds: bool = False,
2016 ) -> None:
2017 """Import datasets into this repository that were exported from a
2018 different butler repository via `~lsst.daf.butler.Butler.export`.
2020 Parameters
2021 ----------
2022 directory : `str`, optional
2023 Directory containing dataset files to import from. If `None`,
2024 ``filename`` and all dataset file paths specified therein must
2025 be absolute.
2026 filename : `str` or `TextIO`, optional
2027 A stream or name of file that contains database information
2028 associated with the exported datasets, typically generated by
2029 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
2030 is not an absolute path, does not exist in the current working
2031 directory, and ``directory`` is not `None`, it is assumed to be in
2032 ``directory``. Defaults to "export.{format}".
2033 format : `str`, optional
2034 File format for ``filename``. If `None`, the extension of
2035 ``filename`` will be used.
2036 transfer : `str`, optional
2037 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2038 skip_dimensions : `set`, optional
2039 Names of dimensions that should be skipped and not imported.
2040 idGenerationMode : `DatasetIdGenEnum`, optional
2041 Specifies option for generating dataset IDs when IDs are not
2042 provided or their type does not match backend type. By default
2043 unique IDs are generated for each inserted dataset.
2044 reuseIds : `bool`, optional
2045 If `True` then forces re-use of imported dataset IDs for integer
2046 IDs which are normally generated as auto-incremented; exception
2047 will be raised if imported IDs clash with existing ones. This
2048 option has no effect on the use of globally-unique IDs which are
2049 always re-used (or generated if integer IDs are being imported).
2051 Raises
2052 ------
2053 TypeError
2054 Raised if the set of arguments passed is inconsistent, or if the
2055 butler is read-only.
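Examples
--------
A sketch of importing a previously exported repository subset into a
writeable butler; the directory and file names are hypothetical::

    butler.import_(directory="/path/to/exported/files",
                   filename="export.yaml", format="yaml",
                   transfer="copy")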
2056 """
2057 if not self.isWriteable():
2058 raise TypeError("Butler is read-only.")
2059 if format is None:
2060 if filename is None:
2061 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2062 else:
2063 _, format = os.path.splitext(filename) # type: ignore
2064 elif filename is None:
2065 filename = f"export.{format}"
2066 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2067 filename = os.path.join(directory, filename)
2068 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2070 def doImport(importStream: TextIO) -> None:
2071 backend = BackendClass(importStream, self.registry)
2072 backend.register()
2073 with self.transaction():
2074 backend.load(
2075 self.datastore,
2076 directory=directory,
2077 transfer=transfer,
2078 skip_dimensions=skip_dimensions,
2079 idGenerationMode=idGenerationMode,
2080 reuseIds=reuseIds,
2081 )
2083 if isinstance(filename, str):
2084 with open(filename, "r") as stream:
2085 doImport(stream)
2086 else:
2087 doImport(filename)
2089 def transfer_from(
2090 self,
2091 source_butler: Butler,
2092 source_refs: Iterable[DatasetRef],
2093 transfer: str = "auto",
2094 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
2095 skip_missing: bool = True,
2096 register_dataset_types: bool = False,
2097 ) -> List[DatasetRef]:
2098 """Transfer datasets to this Butler from a run in another Butler.
2100 Parameters
2101 ----------
2102 source_butler : `Butler`
2103 Butler from which the datasets are to be transferred.
2104 source_refs : iterable of `DatasetRef`
2105 Datasets defined in the source butler that should be transferred to
2106 this butler.
2107 transfer : `str`, optional
2108 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2109 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
2110 A mapping of dataset type to ID generation mode. Only used if
2111 the source butler is using integer IDs. Should not be used
2112 if this receiving butler uses integer IDs. If not given, dataset
2113 import always uses `DatasetIdGenEnum.UNIQUE`.
2114 skip_missing : `bool`
2115 If `True`, datasets with no datastore artifact associated with
2116 them are not transferred. If `False` a registry entry will be
2117 created even if no datastore record is created (and so will
2118 look equivalent to the dataset being unstored).
2119 register_dataset_types : `bool`
2120 If `True` any missing dataset types are registered. Otherwise
2121 an exception is raised.
2123 Returns
2124 -------
2125 refs : `list` of `DatasetRef`
2126 The refs added to this Butler.
2128 Notes
2129 -----
2130 Requires that any dimension definitions are already present in the
2131 receiving Butler. A datastore artifact has to exist for a transfer
2132 to be made, but a missing artifact is not treated as an error.
2134 Datasets that already exist in this run will be skipped.
2136 The datasets are imported as part of a transaction, although
2137 dataset types are registered before the transaction is started.
2138 This means that it is possible for a dataset type to be registered
2139 even though transfer has failed.
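Examples
--------
A sketch transferring datasets between two repositories; the
repository paths, collection name, and dataset type name are
hypothetical::

    source = Butler("/path/to/source/repo")
    dest = Butler("/path/to/dest/repo", writeable=True)
    refs = source.registry.queryDatasets("calexp",
                                         collections="HSC/runs/test")
    dest.transfer_from(source, refs, transfer="copy",
                       register_dataset_types=True)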
2140 """
2141 if not self.isWriteable():
2142 raise TypeError("Butler is read-only.")
2143 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2145 # Will iterate through the refs multiple times so need to convert
2146 # to a list if this isn't a collection.
2147 if not isinstance(source_refs, collections.abc.Collection):
2148 source_refs = list(source_refs)
2150 original_count = len(source_refs)
2151 log.info("Transferring %d datasets into %s", original_count, str(self))
2153 if id_gen_map is None:
2154 id_gen_map = {}
2156 # In some situations the datastore artifact may be missing
2157 # and we do not want that registry entry to be imported.
2158 # Asking datastore is not sufficient, the records may have been
2159 # purged, we have to ask for the (predicted) URI and check
2160 # existence explicitly. Execution butler is set up exactly like
2161 # this with no datastore records.
2162 artifact_existence: Dict[ResourcePath, bool] = {}
2163 if skip_missing:
2164 dataset_existence = source_butler.datastore.mexists(
2165 source_refs, artifact_existence=artifact_existence
2166 )
2167 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2168 filtered_count = len(source_refs)
2169 log.verbose(
2170 "%d datasets removed because the artifact does not exist. Now have %d.",
2171 original_count - filtered_count,
2172 filtered_count,
2173 )
2175 # Importing requires that we group the refs by dataset type and run
2176 # before doing the import.
2177 source_dataset_types = set()
2178 grouped_refs = defaultdict(list)
2179 grouped_indices = defaultdict(list)
2180 for i, ref in enumerate(source_refs):
2181 grouped_refs[ref.datasetType, ref.run].append(ref)
2182 grouped_indices[ref.datasetType, ref.run].append(i)
2183 source_dataset_types.add(ref.datasetType)
2185 # Check to see if the dataset type in the source butler has
2186 # the same definition in the target butler and register missing
2187 # ones if requested. Registration must happen outside a transaction.
2188 newly_registered_dataset_types = set()
2189 for datasetType in source_dataset_types:
2190 if register_dataset_types:
2191 # Let this raise immediately if inconsistent. Continuing
2192 # on to find additional inconsistent dataset types
2193 # might result in additional unwanted dataset types being
2194 # registered.
2195 if self.registry.registerDatasetType(datasetType):
2196 newly_registered_dataset_types.add(datasetType)
2197 else:
2198 # If the dataset type is missing, let it fail immediately.
2199 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2200 if target_dataset_type != datasetType:
2201 raise ConflictingDefinitionError(
2202 "Source butler dataset type differs from definition"
2203 f" in target butler: {datasetType} !="
2204 f" {target_dataset_type}"
2205 )
2206 if newly_registered_dataset_types:
2207 # We may have registered some even if there were inconsistencies
2208 # but should let people know (or else remove them again).
2209 log.log(
2210 VERBOSE,
2211 "Registered the following dataset types in the target Butler: %s",
2212 ", ".join(d.name for d in newly_registered_dataset_types),
2213 )
2214 else:
2215 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2217 # The returned refs should be identical for UUIDs.
2218 # For now must also support integers and so need to retain the
2219 # newly-created refs from this registry.
2220 # Pre-size it so we can assign refs into the correct slots
2221 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2222 default_id_gen = DatasetIdGenEnum.UNIQUE
2224 handled_collections: Set[str] = set()
2226 # Do all the importing in a single transaction.
2227 with self.transaction():
2228 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2229 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2230 ):
2231 if run not in handled_collections:
2232 run_doc = source_butler.registry.getCollectionDocumentation(run)
2233 registered = self.registry.registerRun(run, doc=run_doc)
2234 handled_collections.add(run)
2235 if registered:
2236 log.log(VERBOSE, "Creating output run %s", run)
2238 id_generation_mode = default_id_gen
2239 if isinstance(refs_to_import[0].id, int):
2240 # ID generation mode might need to be overridden when
2241 # targeting UUID
2242 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2244 n_refs = len(refs_to_import)
2245 log.verbose(
2246 "Importing %d ref%s of dataset type %s into run %s",
2247 n_refs,
2248 "" if n_refs == 1 else "s",
2249 datasetType.name,
2250 run,
2251 )
2253 # No way to know if this butler's registry uses UUID.
2254 # We have to trust the caller on this. If it fails they will
2255 # have to change their approach. We can't catch the exception
2256 # and retry with unique because that will mess up the
2257 # transaction handling. We aren't allowed to ask the registry
2258 # manager what type of ID it is using.
2259 imported_refs = self.registry._importDatasets(
2260 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2261 )
2263 # Map them into the correct slots to match the initial order
2264 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2265 transferred_refs_tmp[i] = ref
2267 # Mypy insists that we might have None in here so we have to make
2268 # that explicit by assigning to a new variable and filtering out
2269 # something that won't be there.
2270 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2272 # Check consistency
2273 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2275 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2277 # The transferred refs need to be reordered to match the original
2278 # ordering given by the caller. Without this the datastore transfer
2279 # will be broken.
2281 # Ask the datastore to transfer. The datastore has to check that
2282 # the source datastore is compatible with the target datastore.
2283 self.datastore.transfer_from(
2284 source_butler.datastore,
2285 source_refs,
2286 local_refs=transferred_refs,
2287 transfer=transfer,
2288 artifact_existence=artifact_existence,
2289 )
2291 return transferred_refs
2293 def validateConfiguration(
2294 self,
2295 logFailures: bool = False,
2296 datasetTypeNames: Optional[Iterable[str]] = None,
2297 ignore: Optional[Iterable[str]] = None,
2298 ) -> None:
2299 """Validate butler configuration.
2301 Checks that each `DatasetType` can be stored in the `Datastore`.
2303 Parameters
2304 ----------
2305 logFailures : `bool`, optional
2306 If `True`, output a log message for every validation error
2307 detected.
2308 datasetTypeNames : iterable of `str`, optional
2309 The `DatasetType` names that should be checked. This allows
2310 only a subset to be selected.
2311 ignore : iterable of `str`, optional
2312 Names of DatasetTypes to skip over. This can be used to skip
2313 known problems. If a named `DatasetType` corresponds to a
2314 composite, all components of that `DatasetType` will also be
2315 ignored.
2317 Raises
2318 ------
2319 ButlerValidationError
2320 Raised if there is some inconsistency with how this Butler
2321 is configured.
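Examples
--------
A sketch that validates a subset of dataset types and logs each
problem found, catching the base `ValidationError`; the dataset type
names are hypothetical::

    try:
        butler.validateConfiguration(logFailures=True,
                                     datasetTypeNames=["calexp", "src"])
    except ValidationError as err:
        print(f"Butler configuration problems: {err}")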
2322 """
2323 if datasetTypeNames:
2324 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2325 else:
2326 datasetTypes = list(self.registry.queryDatasetTypes())
2328 # filter out anything from the ignore list
2329 if ignore:
2330 ignore = set(ignore)
2331 datasetTypes = [
2332 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2333 ]
2334 else:
2335 ignore = set()
2337 # Find all the registered instruments
2338 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2340 # For each datasetType that has an instrument dimension, create
2341 # a DatasetRef for each defined instrument
2342 datasetRefs = []
2344 for datasetType in datasetTypes:
2345 if "instrument" in datasetType.dimensions:
2346 for instrument in instruments:
2347 datasetRef = DatasetRef(
2348 datasetType, {"instrument": instrument}, conform=False # type: ignore
2349 )
2350 datasetRefs.append(datasetRef)
2352 entities: List[Union[DatasetType, DatasetRef]] = []
2353 entities.extend(datasetTypes)
2354 entities.extend(datasetRefs)
2356 datastoreErrorStr = None
2357 try:
2358 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2359 except ValidationError as e:
2360 datastoreErrorStr = str(e)
2362 # Also check that the LookupKeys used by the datastores match
2363 # registry and storage class definitions
2364 keys = self.datastore.getLookupKeys()
2366 failedNames = set()
2367 failedDataId = set()
2368 for key in keys:
2369 if key.name is not None:
2370 if key.name in ignore:
2371 continue
2373 # skip if specific datasetType names were requested and this
2374 # name does not match
2375 if datasetTypeNames and key.name not in datasetTypeNames:
2376 continue
2378 # See if it is a StorageClass or a DatasetType
2379 if key.name in self.storageClasses:
2380 pass
2381 else:
2382 try:
2383 self.registry.getDatasetType(key.name)
2384 except KeyError:
2385 if logFailures:
2386 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2387 failedNames.add(key)
2388 else:
2389 # Dimensions are checked for consistency when the Butler
2390 # is created and rendezvoused with a universe.
2391 pass
2393 # Check that the instrument is a valid instrument
2394 # Currently only 'instrument' is supported, so check for that
2395 if key.dataId:
2396 dataIdKeys = set(key.dataId)
2397 if set(["instrument"]) != dataIdKeys:
2398 if logFailures:
2399 log.critical("Key '%s' has unsupported DataId override", key)
2400 failedDataId.add(key)
2401 elif key.dataId["instrument"] not in instruments:
2402 if logFailures:
2403 log.critical("Key '%s' has unknown instrument", key)
2404 failedDataId.add(key)
2406 messages = []
2408 if datastoreErrorStr:
2409 messages.append(datastoreErrorStr)
2411 for failed, msg in (
2412 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2413 (failedDataId, "Keys with bad DataId entries: "),
2414 ):
2415 if failed:
2416 msg += ", ".join(str(k) for k in failed)
2417 messages.append(msg)
2419 if messages:
2420 raise ValidationError(";\n".join(messages))
2422 @property
2423 def collections(self) -> CollectionSearch:
2424 """The collections to search by default, in order (`CollectionSearch`).
2426 This is an alias for ``self.registry.defaults.collections``. It cannot
2427 be set directly in isolation, but all defaults may be changed together
2428 by assigning a new `RegistryDefaults` instance to
2429 ``self.registry.defaults``.
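For example, all defaults can be replaced at once (a sketch; the
collection and run names here are hypothetical)::

    from lsst.daf.butler.registry import RegistryDefaults

    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/alice/scratch"
    )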
2430 """
2431 return self.registry.defaults.collections
2433 @property
2434 def run(self) -> Optional[str]:
2435 """Name of the run this butler writes outputs to by default (`str` or
2436 `None`).
2438 This is an alias for ``self.registry.defaults.run``. It cannot be set
2439 directly in isolation, but all defaults may be changed together by
2440 assigning a new `RegistryDefaults` instance to
2441 ``self.registry.defaults``.
2442 """
2443 return self.registry.defaults.run
2445 @property
2446 def dimensions(self) -> DimensionUniverse:
2447 # Docstring inherited.
2448 return self.registry.dimensions
2450 registry: Registry
2451 """The object that manages dataset metadata and relationships (`Registry`).
2453 Most operations that don't involve reading or writing butler datasets are
2454 accessible only via `Registry` methods.
2455 """
2457 datastore: Datastore
2458 """The object that manages actual dataset storage (`Datastore`).
2460 Direct user access to the datastore should rarely be necessary; the primary
2461 exception is the case where a `Datastore` implementation provides extra
2462 functionality beyond what the base class defines.
2463 """
2465 storageClasses: StorageClassFactory
2466 """An object that maps known storage class names to objects that fully
2467 describe them (`StorageClassFactory`).
2468 """
2470 _allow_put_of_predefined_dataset: bool
2471 """Allow a put to succeed even if there is already a registry entry for it
2472 but not a datastore record. (`bool`)."""