Coverage for python/lsst/daf/butler/_butler.py: 10%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41from collections import defaultdict
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_class_of
62from lsst.utils.logging import VERBOSE, getLogger
64from ._butlerConfig import ButlerConfig
65from ._butlerRepoIndex import ButlerRepoIndex
66from ._deferredDatasetHandle import DeferredDatasetHandle
67from ._limited_butler import LimitedButler
68from .core import (
69 AmbiguousDatasetError,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetType,
77 Datastore,
78 Dimension,
79 DimensionConfig,
80 DimensionUniverse,
81 FileDataset,
82 Progress,
83 StorageClassFactory,
84 Timespan,
85 ValidationError,
86)
87from .core.repoRelocation import BUTLER_ROOT_TAG
88from .core.utils import transactional
89from .registry import (
90 CollectionSearch,
91 CollectionType,
92 ConflictingDefinitionError,
93 DataIdError,
94 DataIdValueError,
95 DatasetIdGenEnum,
96 DimensionNameError,
97 InconsistentDataIdError,
98 Registry,
99 RegistryConfig,
100 RegistryDefaults,
101)
102from .transfers import RepoExportContext
104log = getLogger(__name__)
107class ButlerValidationError(ValidationError):
108 """There is a problem with the Butler configuration."""
110 pass
113class PruneCollectionsArgsError(TypeError):
114 """Base class for errors relating to Butler.pruneCollections input
115 arguments.
116 """
118 pass
121class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
122 """Raised when purge and unstore are both required to be True, and
123 purge is True but unstore is False.
124 """
126 def __init__(self) -> None:
127 super().__init__("Cannot pass purge=True without unstore=True.")
130class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
131 """Raised when pruning a RUN collection but purge is False."""
133 def __init__(self, collectionType: CollectionType):
134 self.collectionType = collectionType
135 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
138class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
139 """Raised when purge is True but is not supported for the given
140 collection."""
142 def __init__(self, collectionType: CollectionType):
143 self.collectionType = collectionType
144 super().__init__(
145 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
146 )
149class Butler(LimitedButler):
150 """Main entry point for the data access system.
152 Parameters
153 ----------
154 config : `ButlerConfig`, `Config`, or `str`, optional
155 Configuration. Anything acceptable to the
156 `ButlerConfig` constructor. If a directory path
157 is given the configuration will be read from a ``butler.yaml`` file in
158 that location. If `None` is given default values will be used.
159 butler : `Butler`, optional
160 If provided, construct a new Butler that uses the same registry and
161 datastore as the given one, but with the given collection and run.
162 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
163 arguments.
164 collections : `str` or `Iterable` [ `str` ], optional
165 An expression specifying the collections to be searched (in order) when
166 reading datasets.
167 This may be a `str` collection name or an iterable thereof.
168 See :ref:`daf_butler_collection_expressions` for more information.
169 These collections are not registered automatically; they must be
170 registered before any method uses them, but that registration may
171 happen after the `Butler` is initialized.
172 run : `str`, optional
173 Name of the `~CollectionType.RUN` collection new datasets should be
174 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
175 ``collections`` will be set to ``[run]``. If not `None`, this
176 collection will automatically be registered. If this is not set (and
177 ``writeable`` is not set either), a read-only butler will be created.
178 searchPaths : `list` of `str`, optional
179 Directory paths to search when calculating the full Butler
180 configuration. Not used if the supplied config is already a
181 `ButlerConfig`.
182 writeable : `bool`, optional
183 Explicitly sets whether the butler supports write operations. If not
184 provided, a read-write butler is created if ``run`` is not `None`;
185 otherwise the butler is read-only.
186 inferDefaults : `bool`, optional
187 If `True` (default) infer default data ID values from the values
188 present in the datasets in ``collections``: if all collections have the
189 same value (or no value) for a governor dimension, that value will be
190 the default for that dimension. Nonexistent collections are ignored.
191 If a default value is provided explicitly for a governor dimension via
192 ``**kwargs``, no default will be inferred for that dimension.
193 **kwargs : `str`
194 Default data ID key-value pairs. These may only identify "governor"
195 dimensions like ``instrument`` and ``skymap``.
197 Examples
198 --------
199 While there are many ways to control exactly how a `Butler` interacts with
200 the collections in its `Registry`, the most common cases are still simple.
202 For a read-only `Butler` that searches one collection, do::
204 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
206 For a read-write `Butler` that writes to and reads from a
207 `~CollectionType.RUN` collection::
209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
211 The `Butler` passed to a ``PipelineTask`` is often much more complex,
212 because we want to write to one `~CollectionType.RUN` collection but read
213 from several others (as well)::
215 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
216 collections=["u/alice/DM-50000/a",
217 "u/bob/DM-49998",
218 "HSC/defaults"])
220 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
221 Datasets will be read first from that run (since it appears first in the
222 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
224 Finally, one can always create a `Butler` with no collections::
226 butler = Butler("/path/to/repo", writeable=True)
228 This can be extremely useful when you just want to use ``butler.registry``,
229 e.g. for inserting dimension data or managing collections, or when the
230 collections you want to use with the butler are not consistent.
231 Passing ``writeable`` explicitly here is only necessary if you want to be
232 able to make changes to the repo - usually the value for ``writeable`` can
233 be guessed from the collection arguments provided, but it defaults to
234 `False` when there are no collection arguments.
235 """
237 def __init__(
238 self,
239 config: Union[Config, str, None] = None,
240 *,
241 butler: Optional[Butler] = None,
242 collections: Any = None,
243 run: Optional[str] = None,
244 searchPaths: Optional[List[str]] = None,
245 writeable: Optional[bool] = None,
246 inferDefaults: bool = True,
247 **kwargs: str,
248 ):
249 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
250 # Load registry, datastore, etc. from config or existing butler.
251 if butler is not None:
252 if config is not None or searchPaths is not None or writeable is not None:
253 raise TypeError(
254 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
255 )
256 self.registry = butler.registry.copy(defaults)
257 self.datastore = butler.datastore
258 self.storageClasses = butler.storageClasses
259 self._config: ButlerConfig = butler._config
260 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
261 else:
262 self._config = ButlerConfig(config, searchPaths=searchPaths)
263 try:
264 if "root" in self._config:
265 butlerRoot = self._config["root"]
266 else:
267 butlerRoot = self._config.configDir
268 if writeable is None:
269 writeable = run is not None
270 self.registry = Registry.fromConfig(
271 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
272 )
273 self.datastore = Datastore.fromConfig(
274 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
275 )
276 self.storageClasses = StorageClassFactory()
277 self.storageClasses.addFromConfig(self._config)
278 self._allow_put_of_predefined_dataset = self._config.get(
279 "allow_put_of_predefined_dataset", False
280 )
281 except Exception:
282 # Failures here usually mean that configuration is incomplete,
283 # just issue an error message which includes config file URI.
284 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
285 raise
287 if "run" in self._config or "collection" in self._config:
288 raise ValueError("Passing a run or collection via configuration is no longer supported.")
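# Illustrative sketch only (comments, not executed): reusing an existing
# Butler's registry and datastore via the ``butler`` argument.  The repository
# path and run name are hypothetical.
#
#     base = Butler("/path/to/repo", writeable=True)
#     writer = Butler(butler=base, run="u/alice/DM-50000/b")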
290 GENERATION: ClassVar[int] = 3
291 """This is a Generation 3 Butler.
293 This attribute may be removed in the future, once the Generation 2 Butler
294 interface has been fully retired; it should only be used in transitional
295 code.
296 """
298 @classmethod
299 def get_repo_uri(cls, label: str) -> ResourcePath:
300 """Look up the label in a butler repository index.
302 Parameters
303 ----------
304 label : `str`
305 Label of the Butler repository to look up.
307 Returns
308 -------
309 uri : `lsst.resources.ResourcePath`
310 URI to the Butler repository associated with the given label.
312 Raises
313 ------
314 KeyError
315 Raised if the label is not found in the index, or if an index
316 can not be found at all.
318 Notes
319 -----
320 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
321 information is discovered.
322 """
323 return ButlerRepoIndex.get_repo_uri(label)
325 @classmethod
326 def get_known_repos(cls) -> Set[str]:
327 """Retrieve the list of known repository labels.
329 Returns
330 -------
331 repos : `set` of `str`
332 All the known labels. Can be empty if no index can be found.
334 Notes
335 -----
336 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
337 information is discovered.
338 """
339 return ButlerRepoIndex.get_known_repos()
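# Illustrative sketch only: resolving a repository label through the butler
# repository index (see ButlerRepoIndex).  The "main" label is hypothetical
# and must exist in the configured index for this to work.
#
#     if "main" in Butler.get_known_repos():
#         uri = Butler.get_repo_uri("main")
#         butler = Butler(str(uri), collections=["HSC/defaults"])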
341 @staticmethod
342 def makeRepo(
343 root: ResourcePathExpression,
344 config: Union[Config, str, None] = None,
345 dimensionConfig: Union[Config, str, None] = None,
346 standalone: bool = False,
347 searchPaths: Optional[List[str]] = None,
348 forceConfigRoot: bool = True,
349 outfile: Optional[ResourcePathExpression] = None,
350 overwrite: bool = False,
351 ) -> Config:
352 """Create an empty data repository by adding a butler.yaml config
353 to a repository root directory.
355 Parameters
356 ----------
357 root : `lsst.resources.ResourcePathExpression`
358 Path or URI to the root location of the new repository. Will be
359 created if it does not exist.
360 config : `Config` or `str`, optional
361 Configuration to write to the repository, after setting any
362 root-dependent Registry or Datastore config options. Can not
363 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
364 configuration will be used. Root-dependent config options
365 specified in this config are overwritten if ``forceConfigRoot``
366 is `True`.
367 dimensionConfig : `Config` or `str`, optional
368 Configuration for dimensions, will be used to initialize registry
369 database.
370 standalone : `bool`
371 If True, write all expanded defaults, not just customized or
372 repository-specific settings.
373 This (mostly) decouples the repository from the default
374 configuration, insulating it from changes to the defaults (which
375 may be good or bad, depending on the nature of the changes).
376 Future *additions* to the defaults will still be picked up when
377 initializing `Butlers` to repos created with ``standalone=True``.
378 searchPaths : `list` of `str`, optional
379 Directory paths to search when calculating the full butler
380 configuration.
381 forceConfigRoot : `bool`, optional
382 If `False`, any values present in the supplied ``config`` that
383 would normally be reset are not overridden and will appear
384 directly in the output config. This allows non-standard overrides
385 of the root directory for a datastore or registry to be given.
386 If this parameter is `True` the values for ``root`` will be
387 forced into the resulting config if appropriate.
388 outfile : `lsst.resources.ResourcePathExpression`, optional
389 If not-`None`, the output configuration will be written to this
390 location rather than into the repository itself. Can be a URI
391 string. Can refer to a directory that will be used to write
392 ``butler.yaml``.
393 overwrite : `bool`, optional
394 Create a new configuration file even if one already exists
395 in the specified output location. Default is to raise
396 an exception.
398 Returns
399 -------
400 config : `Config`
401 The updated `Config` instance written to the repo.
403 Raises
404 ------
405 ValueError
406 Raised if a ButlerConfig or ConfigSubset is passed instead of a
407 regular Config (as these subclasses would make it impossible to
408 support ``standalone=False``).
409 FileExistsError
410 Raised if the output config file already exists.
411 os.error
412 Raised if the directory does not exist, exists but is not a
413 directory, or cannot be created.
415 Notes
416 -----
417 Note that when ``standalone=False`` (the default), the configuration
418 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
419 construct the repository should also be used to construct any Butlers
420 to avoid configuration inconsistencies.
421 """
422 if isinstance(config, (ButlerConfig, ConfigSubset)):
423 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
425 # Ensure that the root of the repository exists or can be made
426 root_uri = ResourcePath(root, forceDirectory=True)
427 root_uri.mkdir()
429 config = Config(config)
431 # If we are creating a new repo from scratch with relative roots,
432 # do not propagate an explicit root from the config file
433 if "root" in config:
434 del config["root"]
436 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
437 imported_class = doImportType(full["datastore", "cls"])
438 if not issubclass(imported_class, Datastore):
439 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
440 datastoreClass: Type[Datastore] = imported_class
441 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
443 # if key exists in given config, parse it, otherwise parse the defaults
444 # in the expanded config
445 if config.get(("registry", "db")):
446 registryConfig = RegistryConfig(config)
447 else:
448 registryConfig = RegistryConfig(full)
449 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
450 if defaultDatabaseUri is not None:
451 Config.updateParameters(
452 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
453 )
454 else:
455 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
457 if standalone:
458 config.merge(full)
459 else:
460 # Always expand the registry.managers section into the per-repo
461 # config, because after the database schema is created, it's not
462 # allowed to change anymore. Note that in the standalone=True
463 # branch, _everything_ in the config is expanded, so there's no
464 # need to special case this.
465 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
466 configURI: ResourcePathExpression
467 if outfile is not None:
468 # When writing to a separate location we must include
469 # the root of the butler repo in the config else it won't know
470 # where to look.
471 config["root"] = root_uri.geturl()
472 configURI = outfile
473 else:
474 configURI = root_uri
475 config.dumpToUri(configURI, overwrite=overwrite)
477 # Create Registry and populate tables
478 registryConfig = RegistryConfig(config.get("registry"))
479 dimensionConfig = DimensionConfig(dimensionConfig)
480 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
482 log.verbose("Wrote new Butler configuration file to %s", configURI)
484 return config
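# Illustrative sketch only: creating an empty repository with makeRepo and
# then opening a writeable Butler against it.  The root path and run name are
# hypothetical.
#
#     Butler.makeRepo("/path/to/new/repo")
#     butler = Butler("/path/to/new/repo", run="u/alice/ingest")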
486 @classmethod
487 def _unpickle(
488 cls,
489 config: ButlerConfig,
490 collections: Optional[CollectionSearch],
491 run: Optional[str],
492 defaultDataId: Dict[str, str],
493 writeable: bool,
494 ) -> Butler:
495 """Callable used to unpickle a Butler.
497 We prefer not to use ``Butler.__init__`` directly so we can force some
498 of its many arguments to be keyword-only (note that ``__reduce__``
499 can only invoke callables with positional arguments).
501 Parameters
502 ----------
503 config : `ButlerConfig`
504 Butler configuration, already coerced into a true `ButlerConfig`
505 instance (and hence after any search paths for overrides have been
506 utilized).
507 collections : `CollectionSearch`
508 Names of the default collections to read from.
509 run : `str`, optional
510 Name of the default `~CollectionType.RUN` collection to write to.
511 defaultDataId : `dict` [ `str`, `str` ]
512 Default data ID values.
513 writeable : `bool`
514 Whether the Butler should support write operations.
516 Returns
517 -------
518 butler : `Butler`
519 A new `Butler` instance.
520 """
521 # MyPy doesn't recognize that the kwargs below are totally valid; it
522 # seems to think ``**defaultDataId`` is a _positional_ argument!
523 return cls(
524 config=config,
525 collections=collections,
526 run=run,
527 writeable=writeable,
528 **defaultDataId, # type: ignore
529 )
531 def __reduce__(self) -> tuple:
532 """Support pickling."""
533 return (
534 Butler._unpickle,
535 (
536 self._config,
537 self.collections,
538 self.run,
539 self.registry.defaults.dataId.byName(),
540 self.registry.isWriteable(),
541 ),
542 )
544 def __str__(self) -> str:
545 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
546 self.collections, self.run, self.datastore, self.registry
547 )
549 def isWriteable(self) -> bool:
550 """Return `True` if this `Butler` supports write operations."""
551 return self.registry.isWriteable()
553 @contextlib.contextmanager
554 def transaction(self) -> Iterator[None]:
555 """Context manager supporting `Butler` transactions.
557 Transactions can be nested.
558 """
559 with self.registry.transaction():
560 with self.datastore.transaction():
561 yield
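# Illustrative sketch only: registry and datastore changes made inside
# ``Butler.transaction`` are rolled back together if an exception escapes the
# block.  The dataset type and data ID below are hypothetical.
#
#     with butler.transaction():
#         butler.put(catalog, "objectTable", tract=9813, patch=42, skymap="hsc_rings_v1")
#         raise RuntimeError("neither registry nor datastore keeps the put above")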
563 def _standardizeArgs(
564 self,
565 datasetRefOrType: Union[DatasetRef, DatasetType, str],
566 dataId: Optional[DataId] = None,
567 for_put: bool = True,
568 **kwargs: Any,
569 ) -> Tuple[DatasetType, Optional[DataId]]:
570 """Standardize the arguments passed to several Butler APIs.
572 Parameters
573 ----------
574 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
575 When `DatasetRef` the `dataId` should be `None`.
576 Otherwise the `DatasetType` or name thereof.
577 dataId : `dict` or `DataCoordinate`
578 A `dict` of `Dimension` link name, value pairs that label the
579 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
580 should be provided as the second argument.
581 for_put : `bool`, optional
582 If `True` this call is invoked as part of a `Butler.put()`.
583 Otherwise it is assumed to be part of a `Butler.get()`. This
584 parameter is only relevant if there is dataset type
585 inconsistency.
586 **kwargs
587 Additional keyword arguments used to augment or construct a
588 `DataCoordinate`. See `DataCoordinate.standardize`
589 parameters.
591 Returns
592 -------
593 datasetType : `DatasetType`
594 A `DatasetType` instance extracted from ``datasetRefOrType``.
595 dataId : `dict` or `DataId`, optional
596 Argument that can be used (along with ``kwargs``) to construct a
597 `DataId`.
599 Notes
600 -----
601 Butler APIs that conceptually need a DatasetRef also allow passing a
602 `DatasetType` (or the name of one) and a `DataId` (or a dict and
603 keyword arguments that can be used to construct one) separately. This
604 method accepts those arguments and always returns a true `DatasetType`
605 and a `DataId` or `dict`.
607 Standardization of `dict` vs `DataId` is best handled by passing the
608 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
609 generally similarly flexible.
610 """
611 externalDatasetType: Optional[DatasetType] = None
612 internalDatasetType: Optional[DatasetType] = None
613 if isinstance(datasetRefOrType, DatasetRef):
614 if dataId is not None or kwargs:
615 raise ValueError("DatasetRef given, cannot use dataId as well")
616 externalDatasetType = datasetRefOrType.datasetType
617 dataId = datasetRefOrType.dataId
618 else:
619 # Don't check whether DataId is provided, because Registry APIs
620 # can usually construct a better error message when it isn't.
621 if isinstance(datasetRefOrType, DatasetType):
622 externalDatasetType = datasetRefOrType
623 else:
624 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
626 # Check that they are self-consistent
627 if externalDatasetType is not None:
628 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
629 if externalDatasetType != internalDatasetType:
630 # We can allow differences if they are compatible, depending
631 # on whether this is a get or a put. A get requires that
632 # the python type associated with the datastore can be
633 # converted to the user type. A put requires that the user
634 # supplied python type can be converted to the internal
635 # type expected by registry.
636 relevantDatasetType = internalDatasetType
637 if for_put:
638 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
639 else:
640 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
641 relevantDatasetType = externalDatasetType
642 if not is_compatible:
643 raise ValueError(
644 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
645 f"registry definition ({internalDatasetType})"
646 )
647 # Override the internal definition.
648 internalDatasetType = relevantDatasetType
650 assert internalDatasetType is not None
651 return internalDatasetType, dataId
653 def _rewrite_data_id(
654 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
655 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
656 """Rewrite a data ID taking into account dimension records.
658 Take a Data ID and keyword args and rewrite it if necessary to
659 allow the user to specify dimension records rather than dimension
660 primary values.
662 This allows a user to include a dataId dict with keys of
663 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
664 the integer exposure ID. It also allows a string to be given
665 for a dimension value rather than the integer ID if that is more
666 convenient. For example, rather than having to specify the
667 detector with ``detector.full_name``, a string given for ``detector``
668 will be interpreted as the full name and converted to the integer
669 value.
671 Keyword arguments can also use strings for dimensions like detector
672 and exposure but python does not allow them to include ``.`` and
673 so the ``exposure.day_obs`` syntax can not be used in a keyword
674 argument.
676 Parameters
677 ----------
678 dataId : `dict` or `DataCoordinate`
679 A `dict` of `Dimension` link name, value pairs that will label the
680 `DatasetRef` within a Collection.
681 datasetType : `DatasetType`
682 The dataset type associated with this dataId. Required to
683 determine the relevant dimensions.
684 **kwargs
685 Additional keyword arguments used to augment or construct a
686 `DataId`. See `DataId` parameters.
688 Returns
689 -------
690 dataId : `dict` or `DataCoordinate`
691 The possibly-rewritten dataId. If given a `DataCoordinate` and
692 no keyword arguments, the original dataId will be returned
693 unchanged.
694 **kwargs : `dict`
695 Any unused keyword arguments (would normally be empty dict).
696 """
697 # Do nothing if we have a standalone DataCoordinate.
698 if isinstance(dataId, DataCoordinate) and not kwargs:
699 return dataId, kwargs
701 # Process dimension records that are using record information
702 # rather than ids
703 newDataId: Dict[str, DataIdValue] = {}
704 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
706 # If all of the dataId comes from keyword parameters we do not need
707 # to do anything here, because keyword parameter names cannot contain
708 # a "." and so cannot be of the form exposure.obs_id.
709 if dataId:
710 for k, v in dataId.items():
711 # If we have a Dimension we do not need to do anything
712 # because it cannot be a compound key.
713 if isinstance(k, str) and "." in k:
714 # Someone is using a more human-readable dataId
715 dimensionName, record = k.split(".", 1)
716 byRecord[dimensionName][record] = v
717 elif isinstance(k, Dimension):
718 newDataId[k.name] = v
719 else:
720 newDataId[k] = v
722 # Go through the updated dataId and check the type in case someone is
723 # using an alternate key. We have already filtered out the compound
724 # keys dimensions.record format.
725 not_dimensions = {}
727 # Will need to look in the dataId and the keyword arguments
728 # and will remove them if they need to be fixed or are unrecognized.
729 for dataIdDict in (newDataId, kwargs):
730 # Use a list so we can adjust the dict safely in the loop
731 for dimensionName in list(dataIdDict):
732 value = dataIdDict[dimensionName]
733 try:
734 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
735 except KeyError:
736 # This is not a real dimension
737 not_dimensions[dimensionName] = value
738 del dataIdDict[dimensionName]
739 continue
741 # Convert an integral type to an explicit int to simplify
742 # comparisons here
743 if isinstance(value, numbers.Integral):
744 value = int(value)
746 if not isinstance(value, dimension.primaryKey.getPythonType()):
747 for alternate in dimension.alternateKeys:
748 if isinstance(value, alternate.getPythonType()):
749 byRecord[dimensionName][alternate.name] = value
750 del dataIdDict[dimensionName]
751 log.debug(
752 "Converting dimension %s to %s.%s=%s",
753 dimensionName,
754 dimensionName,
755 alternate.name,
756 value,
757 )
758 break
759 else:
760 log.warning(
761 "Type mismatch found for value '%r' provided for dimension %s. "
762 "Could not find matching alternative (primary key has type %s) "
763 "so attempting to use as-is.",
764 value,
765 dimensionName,
766 dimension.primaryKey.getPythonType(),
767 )
769 # By this point kwargs and newDataId should only include valid
770 # dimensions. Merge kwargs in to the new dataId and log if there
771 # are dimensions in both (rather than calling update).
772 for k, v in kwargs.items():
773 if k in newDataId and newDataId[k] != v:
774 log.debug(
775 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
776 )
777 newDataId[k] = v
778 # No need to retain any values in kwargs now.
779 kwargs = {}
781 # If we have some unrecognized dimensions we have to try to connect
782 # them to records in other dimensions. This is made more complicated
783 # by some dimensions having records with clashing names. A mitigation
784 # is that we can tell by this point which dimensions are missing
785 # for the DatasetType but this does not work for calibrations
786 # where additional dimensions can be used to constrain the temporal
787 # axis.
788 if not_dimensions:
789 # Search for all dimensions even if we have been given a value
790 # explicitly. In some cases records are given as well as the
791 # actual dimension and this should not be an error if they
792 # match.
793 mandatoryDimensions = datasetType.dimensions.names # - provided
795 candidateDimensions: Set[str] = set()
796 candidateDimensions.update(mandatoryDimensions)
798 # For calibrations we may well be needing temporal dimensions
799 # so rather than always including all dimensions in the scan
800 # restrict things a little. It is still possible for there
801 # to be confusion over day_obs in visit vs exposure for example.
802 # If we are not searching calibration collections things may
803 # fail but they are going to fail anyway because of the
804 # ambiguity of the dataId...
805 if datasetType.isCalibration():
806 for dim in self.registry.dimensions.getStaticDimensions():
807 if dim.temporal:
808 candidateDimensions.add(str(dim))
810 # Look up table for the first association with a dimension
811 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
813 # Keep track of whether an item is associated with multiple
814 # dimensions.
815 counter: Counter[str] = Counter()
816 assigned: Dict[str, Set[str]] = defaultdict(set)
818 # Go through the missing dimensions and associate the
819 # given names with records within those dimensions
820 matched_dims = set()
821 for dimensionName in candidateDimensions:
822 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
823 fields = dimension.metadata.names | dimension.uniqueKeys.names
824 for field in not_dimensions:
825 if field in fields:
826 guessedAssociation[dimensionName][field] = not_dimensions[field]
827 counter[dimensionName] += 1
828 assigned[field].add(dimensionName)
829 matched_dims.add(field)
831 # Calculate the fields that matched nothing.
832 never_found = set(not_dimensions) - matched_dims
834 if never_found:
835 raise DimensionNameError(f"Unrecognized keyword args given: {never_found}")
837 # There is a chance we have allocated a single dataId item
838 # to multiple dimensions. Need to decide which should be retained.
839 # For now assume that the most popular alternative wins.
840 # This means that day_obs with seq_num will result in
841 # exposure.day_obs and not visit.day_obs
842 # Also prefer an explicitly missing dimension over an inferred
843 # temporal dimension.
844 for fieldName, assignedDimensions in assigned.items():
845 if len(assignedDimensions) > 1:
846 # Pick the most popular (preferring mandatory dimensions)
847 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
848 if requiredButMissing:
849 candidateDimensions = requiredButMissing
850 else:
851 candidateDimensions = assignedDimensions
853 # Select the relevant items and get a new restricted
854 # counter.
855 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
856 duplicatesCounter: Counter[str] = Counter()
857 duplicatesCounter.update(theseCounts)
859 # Choose the most common. If they are equally common
860 # we will pick the one that was found first.
861 # Returns a list of tuples
862 selected = duplicatesCounter.most_common(1)[0][0]
864 log.debug(
865 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
866 " Removed ambiguity by choosing dimension %s.",
867 fieldName,
868 ", ".join(assignedDimensions),
869 selected,
870 )
872 for candidateDimension in assignedDimensions:
873 if candidateDimension != selected:
874 del guessedAssociation[candidateDimension][fieldName]
876 # Update the record look up dict with the new associations
877 for dimensionName, values in guessedAssociation.items():
878 if values: # A dict might now be empty
879 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
880 byRecord[dimensionName].update(values)
882 if byRecord:
883 # Some record specifiers were found so we need to convert
884 # them to the Id form
885 for dimensionName, values in byRecord.items():
886 if dimensionName in newDataId:
887 log.debug(
888 "DataId specified explicit %s dimension value of %s in addition to"
889 " general record specifiers for it of %s. Ignoring record information.",
890 dimensionName,
891 newDataId[dimensionName],
892 str(values),
893 )
894 # Get the actual record and compare with these values.
895 try:
896 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
897 except DataIdError:
898 raise DataIdValueError(
899 f"Could not find dimension '{dimensionName}'"
900 f" with dataId {newDataId} as part of comparing with"
901 f" record values {byRecord[dimensionName]}"
902 ) from None
903 if len(recs) == 1:
904 errmsg: List[str] = []
905 for k, v in values.items():
906 if (recval := getattr(recs[0], k)) != v:
907 errmsg.append(f"{k}({recval} != {v})")
908 if errmsg:
909 raise InconsistentDataIdError(
910 f"Dimension {dimensionName} in dataId has explicit value"
911 " inconsistent with records: " + ", ".join(errmsg)
912 )
913 else:
914 # Multiple matches for an explicit dimension
915 # should never happen but let downstream complain.
916 pass
917 continue
919 # Build up a WHERE expression
920 bind = {k: v for k, v in values.items()}
921 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
923 # Hopefully we get a single record that matches
924 records = set(
925 self.registry.queryDimensionRecords(
926 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
927 )
928 )
930 if len(records) != 1:
931 if len(records) > 1:
932 log.debug("Received %d records from constraints of %s", len(records), str(values))
933 for r in records:
934 log.debug("- %s", str(r))
935 raise InconsistentDataIdError(
936 f"DataId specification for dimension {dimensionName} is not"
937 f" uniquely constrained to a single dataset by {values}."
938 f" Got {len(records)} results."
939 )
940 raise InconsistentDataIdError(
941 f"DataId specification for dimension {dimensionName} matched no"
942 f" records when constrained by {values}"
943 )
945 # Get the primary key from the real dimension object
946 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
947 if not isinstance(dimension, Dimension):
948 raise RuntimeError(
949 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
950 )
951 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
953 return newDataId, kwargs
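# Illustrative sketch only: the rewriting above lets callers use
# dimension-record keys (``exposure.day_obs``/``exposure.seq_num``) or an
# alternate string key for ``detector`` instead of primary-key values.  The
# dataset type, instrument, and values are hypothetical.
#
#     raw = butler.get(
#         "raw",
#         {"exposure.day_obs": 20210405, "exposure.seq_num": 42},
#         instrument="HSC",
#         detector="1_53",
#     )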
955 def _findDatasetRef(
956 self,
957 datasetRefOrType: Union[DatasetRef, DatasetType, str],
958 dataId: Optional[DataId] = None,
959 *,
960 collections: Any = None,
961 allowUnresolved: bool = False,
962 **kwargs: Any,
963 ) -> DatasetRef:
964 """Shared logic for methods that start with a search for a dataset in
965 the registry.
967 Parameters
968 ----------
969 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
970 When `DatasetRef` the `dataId` should be `None`.
971 Otherwise the `DatasetType` or name thereof.
972 dataId : `dict` or `DataCoordinate`, optional
973 A `dict` of `Dimension` link name, value pairs that label the
974 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
975 should be provided as the first argument.
976 collections : Any, optional
977 Collections to be searched, overriding ``self.collections``.
978 Can be any of the types supported by the ``collections`` argument
979 to butler construction.
980 allowUnresolved : `bool`, optional
981 If `True`, return an unresolved `DatasetRef` if finding a resolved
982 one in the `Registry` fails. Defaults to `False`.
983 **kwargs
984 Additional keyword arguments used to augment or construct a
985 `DataId`. See `DataId` parameters.
987 Returns
988 -------
989 ref : `DatasetRef`
990 A reference to the dataset identified by the given arguments.
992 Raises
993 ------
994 LookupError
995 Raised if no matching dataset exists in the `Registry` (and
996 ``allowUnresolved is False``).
997 ValueError
998 Raised if a resolved `DatasetRef` was passed as an input, but it
999 differs from the one found in the registry.
1000 TypeError
1001 Raised if no collections were provided.
1002 """
1003 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1004 if isinstance(datasetRefOrType, DatasetRef):
1005 idNumber = datasetRefOrType.id
1006 else:
1007 idNumber = None
1008 timespan: Optional[Timespan] = None
1010 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1012 if datasetType.isCalibration():
1013 # Because this is a calibration dataset, first try to
1014 # standardize the data ID without restricting the dimensions to
1015 # those of the dataset type requested, because there may be extra
1016 # dimensions that provide temporal information for a validity-range
1017 # lookup.
1018 dataId = DataCoordinate.standardize(
1019 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1020 )
1021 if dataId.graph.temporal:
1022 dataId = self.registry.expandDataId(dataId)
1023 timespan = dataId.timespan
1024 else:
1025 # Standardize the data ID to just the dimensions of the dataset
1026 # type instead of letting registry.findDataset do it, so we get the
1027 # result even if no dataset is found.
1028 dataId = DataCoordinate.standardize(
1029 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1030 )
1031 # Always lookup the DatasetRef, even if one is given, to ensure it is
1032 # present in the current collection.
1033 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1034 if ref is None:
1035 if allowUnresolved:
1036 return DatasetRef(datasetType, dataId)
1037 else:
1038 if collections is None:
1039 collections = self.registry.defaults.collections
1040 raise LookupError(
1041 f"Dataset {datasetType.name} with data ID {dataId} "
1042 f"could not be found in collections {collections}."
1043 )
1044 if idNumber is not None and idNumber != ref.id:
1045 if collections is None:
1046 collections = self.registry.defaults.collections
1047 raise ValueError(
1048 f"DatasetRef.id provided ({idNumber}) does not match "
1049 f"id ({ref.id}) in registry in collections {collections}."
1050 )
1051 if datasetType != ref.datasetType:
1052 # If they differ it is because the user explicitly specified
1053 # a compatible dataset type to this call rather than using the
1054 # registry definition. The DatasetRef must therefore be recreated
1055 # using the user definition such that the expected type is
1056 # returned.
1057 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1059 return ref
1061 @transactional
1062 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
1063 # Docstring inherited.
1064 (imported_ref,) = self.registry._importDatasets(
1065 [ref],
1066 expand=True,
1067 )
1068 if imported_ref.id != ref.getCheckedId():
1069 raise RuntimeError("This registry configuration does not support putDirect.")
1070 self.datastore.put(obj, ref)
1071 return ref
1073 @transactional
1074 def put(
1075 self,
1076 obj: Any,
1077 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1078 dataId: Optional[DataId] = None,
1079 *,
1080 run: Optional[str] = None,
1081 **kwargs: Any,
1082 ) -> DatasetRef:
1083 """Store and register a dataset.
1085 Parameters
1086 ----------
1087 obj : `object`
1088 The dataset.
1089 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1090 When `DatasetRef` is provided, ``dataId`` should be `None`.
1091 Otherwise the `DatasetType` or name thereof.
1092 dataId : `dict` or `DataCoordinate`
1093 A `dict` of `Dimension` link name, value pairs that label the
1094 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1095 should be provided as the second argument.
1096 run : `str`, optional
1097 The name of the run the dataset should be added to, overriding
1098 ``self.run``.
1099 **kwargs
1100 Additional keyword arguments used to augment or construct a
1101 `DataCoordinate`. See `DataCoordinate.standardize`
1102 parameters.
1104 Returns
1105 -------
1106 ref : `DatasetRef`
1107 A reference to the stored dataset, updated with the correct id if
1108 given.
1110 Raises
1111 ------
1112 TypeError
1113 Raised if the butler is read-only or if no run has been provided.
1114 """
1115 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1116 if not self.isWriteable():
1117 raise TypeError("Butler is read-only.")
1118 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1119 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1120 raise ValueError("DatasetRef must not be in registry, must have None id")
1122 # Handle dimension records in dataId
1123 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1125 # Add Registry Dataset entry.
1126 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1128 # For an execution butler the datasets will be pre-defined.
1129 # If the butler is configured that way, datasets should only be inserted
1130 # if they do not already exist in registry. Trying and catching
1131 # ConflictingDefinitionError will not work because the transaction
1132 # will be corrupted. Instead, in this mode always check first.
1133 ref = None
1134 ref_is_predefined = False
1135 if self._allow_put_of_predefined_dataset:
1136 # Get the matching ref for this run.
1137 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1139 if ref:
1140 # Must be expanded form for datastore templating
1141 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1142 ref = ref.expanded(dataId)
1143 ref_is_predefined = True
1145 if not ref:
1146 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1148 # If the ref is predefined it is possible that the datastore also
1149 # has the record. Asking datastore to put it again will result in
1150 # the artifact being recreated, overwriting previous, then will cause
1151 # a failure in writing the record which will cause the artifact
1152 # to be removed. Much safer to ask first before attempting to
1153 # overwrite. Race conditions should not be an issue for the
1154 # execution butler environment.
1155 if ref_is_predefined:
1156 if self.datastore.knows(ref):
1156 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
1159 self.datastore.put(obj, ref)
1161 return ref
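# Illustrative sketch only: storing a dataset in an explicit RUN collection.
# The dataset type, data ID keys, and run name are hypothetical.
#
#     ref = butler.put(exposure, "calexp", visit=12345, detector=42,
#                      instrument="HSC", run="u/alice/DM-50000/a")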
1163 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1164 """Retrieve a stored dataset.
1166 Unlike `Butler.get`, this method allows datasets outside the Butler's
1167 collection to be read as long as the `DatasetRef` that identifies them
1168 can be obtained separately.
1170 Parameters
1171 ----------
1172 ref : `DatasetRef`
1173 Resolved reference to an already stored dataset.
1174 parameters : `dict`
1175 Additional StorageClass-defined options to control reading,
1176 typically used to efficiently read only a subset of the dataset.
1178 Returns
1179 -------
1180 obj : `object`
1181 The dataset.
1182 """
1183 return self.datastore.get(ref, parameters=parameters)
1185 def getDirectDeferred(
1186 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
1187 ) -> DeferredDatasetHandle:
1188 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1189 from a resolved `DatasetRef`.
1191 Parameters
1192 ----------
1193 ref : `DatasetRef`
1194 Resolved reference to an already stored dataset.
1195 parameters : `dict`
1196 Additional StorageClass-defined options to control reading,
1197 typically used to efficiently read only a subset of the dataset.
1199 Returns
1200 -------
1201 obj : `DeferredDatasetHandle`
1202 A handle which can be used to retrieve a dataset at a later time.
1204 Raises
1205 ------
1206 AmbiguousDatasetError
1207 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1208 """
1209 if ref.id is None:
1210 raise AmbiguousDatasetError(
1211 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1212 )
1213 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1215 def getDeferred(
1216 self,
1217 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1218 dataId: Optional[DataId] = None,
1219 *,
1220 parameters: Union[dict, None] = None,
1221 collections: Any = None,
1222 **kwargs: Any,
1223 ) -> DeferredDatasetHandle:
1224 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1225 after an immediate registry lookup.
1227 Parameters
1228 ----------
1229 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1230 When `DatasetRef` the `dataId` should be `None`.
1231 Otherwise the `DatasetType` or name thereof.
1232 dataId : `dict` or `DataCoordinate`, optional
1233 A `dict` of `Dimension` link name, value pairs that label the
1234 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1235 should be provided as the first argument.
1236 parameters : `dict`
1237 Additional StorageClass-defined options to control reading,
1238 typically used to efficiently read only a subset of the dataset.
1239 collections : Any, optional
1240 Collections to be searched, overriding ``self.collections``.
1241 Can be any of the types supported by the ``collections`` argument
1242 to butler construction.
1243 **kwargs
1244 Additional keyword arguments used to augment or construct a
1245 `DataId`. See `DataId` parameters.
1247 Returns
1248 -------
1249 obj : `DeferredDatasetHandle`
1250 A handle which can be used to retrieve a dataset at a later time.
1252 Raises
1253 ------
1254 LookupError
1255 Raised if no matching dataset exists in the `Registry` (and
1256 ``allowUnresolved is False``).
1257 ValueError
1258 Raised if a resolved `DatasetRef` was passed as an input, but it
1259 differs from the one found in the registry.
1260 TypeError
1261 Raised if no collections were provided.
1262 """
1263 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1264 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
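# Illustrative sketch only: the registry lookup happens immediately, but the
# datastore read is deferred until the handle is used.  The names below and
# the DeferredDatasetHandle.get() call are assumptions for illustration.
#
#     handle = butler.getDeferred("calexp", visit=12345, detector=42, instrument="HSC")
#     image = handle.get(parameters={"bbox": bbox})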
1266 def get(
1267 self,
1268 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1269 dataId: Optional[DataId] = None,
1270 *,
1271 parameters: Optional[Dict[str, Any]] = None,
1272 collections: Any = None,
1273 **kwargs: Any,
1274 ) -> Any:
1275 """Retrieve a stored dataset.
1277 Parameters
1278 ----------
1279 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1280 When `DatasetRef` the `dataId` should be `None`.
1281 Otherwise the `DatasetType` or name thereof.
1282 dataId : `dict` or `DataCoordinate`
1283 A `dict` of `Dimension` link name, value pairs that label the
1284 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1285 should be provided as the first argument.
1286 parameters : `dict`
1287 Additional StorageClass-defined options to control reading,
1288 typically used to efficiently read only a subset of the dataset.
1289 collections : Any, optional
1290 Collections to be searched, overriding ``self.collections``.
1291 Can be any of the types supported by the ``collections`` argument
1292 to butler construction.
1293 **kwargs
1294 Additional keyword arguments used to augment or construct a
1295 `DataCoordinate`. See `DataCoordinate.standardize`
1296 parameters.
1298 Returns
1299 -------
1300 obj : `object`
1301 The dataset.
1303 Raises
1304 ------
1305 ValueError
1306 Raised if a resolved `DatasetRef` was passed as an input, but it
1307 differs from the one found in the registry.
1308 LookupError
1309 Raised if no matching dataset exists in the `Registry`.
1310 TypeError
1311 Raised if no collections were provided.
1313 Notes
1314 -----
1315 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1316 this method requires that the given data ID include temporal dimensions
1317 beyond the dimensions of the dataset type itself, in order to find the
1318 dataset with the appropriate validity range. For example, a "bias"
1319 dataset with native dimensions ``{instrument, detector}`` could be
1320 fetched with a ``{instrument, detector, exposure}`` data ID, because
1321 ``exposure`` is a temporal dimension.
1322 """
1323 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1324 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1325 return self.getDirect(ref, parameters=parameters)
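# Illustrative sketch only: per the Notes in ``get`` above, a calibration
# lookup needs a temporal dimension (here ``exposure``) beyond the dataset
# type's own dimensions.  Dataset type, collection, and values are
# hypothetical.
#
#     bias = butler.get("bias", instrument="HSC", detector=42, exposure=903334,
#                       collections="HSC/calib")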
1327 def getURIs(
1328 self,
1329 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1330 dataId: Optional[DataId] = None,
1331 *,
1332 predict: bool = False,
1333 collections: Any = None,
1334 run: Optional[str] = None,
1335 **kwargs: Any,
1336 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
1337 """Returns the URIs associated with the dataset.
1339 Parameters
1340 ----------
1341 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1342 When `DatasetRef` the `dataId` should be `None`.
1343 Otherwise the `DatasetType` or name thereof.
1344 dataId : `dict` or `DataCoordinate`
1345 A `dict` of `Dimension` link name, value pairs that label the
1346 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1347 should be provided as the first argument.
1348 predict : `bool`
1349 If `True`, allow URIs to be returned of datasets that have not
1350 been written.
1351 collections : Any, optional
1352 Collections to be searched, overriding ``self.collections``.
1353 Can be any of the types supported by the ``collections`` argument
1354 to butler construction.
1355 run : `str`, optional
1356 Run to use for predictions, overriding ``self.run``.
1357 **kwargs
1358 Additional keyword arguments used to augment or construct a
1359 `DataCoordinate`. See `DataCoordinate.standardize`
1360 parameters.
1362 Returns
1363 -------
1364 primary : `lsst.resources.ResourcePath`
1365 The URI to the primary artifact associated with this dataset.
1366 If the dataset was disassembled within the datastore this
1367 may be `None`.
1368 components : `dict`
1369 URIs to any components associated with the dataset artifact.
1370 Can be empty if there are no components.
1371 """
1372 ref = self._findDatasetRef(
1373 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1374 )
1375 if ref.id is None: # only possible if predict is True
1376 if run is None:
1377 run = self.run
1378 if run is None:
1379 raise TypeError("Cannot predict location with run=None.")
1380 # Lie about ID, because we can't guess it, and only
1381 # Datastore.getURIs() will ever see it (and it doesn't use it).
1382 ref = ref.resolved(id=0, run=run)
1383 return self.datastore.getURIs(ref, predict)
1385 def getURI(
1386 self,
1387 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1388 dataId: Optional[DataId] = None,
1389 *,
1390 predict: bool = False,
1391 collections: Any = None,
1392 run: Optional[str] = None,
1393 **kwargs: Any,
1394 ) -> ResourcePath:
1395 """Return the URI to the Dataset.
1397 Parameters
1398 ----------
1399 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1400 When `DatasetRef` the `dataId` should be `None`.
1401 Otherwise the `DatasetType` or name thereof.
1402 dataId : `dict` or `DataCoordinate`
1403 A `dict` of `Dimension` link name, value pairs that label the
1404 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1405 should be provided as the first argument.
1406 predict : `bool`
1407 If `True`, allow URIs to be returned of datasets that have not
1408 been written.
1409 collections : Any, optional
1410 Collections to be searched, overriding ``self.collections``.
1411 Can be any of the types supported by the ``collections`` argument
1412 to butler construction.
1413 run : `str`, optional
1414 Run to use for predictions, overriding ``self.run``.
1415 **kwargs
1416 Additional keyword arguments used to augment or construct a
1417 `DataCoordinate`. See `DataCoordinate.standardize`
1418 parameters.
1420 Returns
1421 -------
1422 uri : `lsst.resources.ResourcePath`
1423 URI pointing to the Dataset within the datastore. If the
1424 Dataset does not exist in the datastore, and if ``predict`` is
1425 `True`, the URI will be a prediction and will include a URI
1426 fragment "#predicted".
1427 If the datastore does not have entities that relate well
1428 to the concept of a URI the returned URI string will be
1429 descriptive. The returned URI is not guaranteed to be obtainable.
1431 Raises
1432 ------
1433 LookupError
1434 A URI has been requested for a dataset that does not exist and
1435 guessing is not allowed.
1436 ValueError
1437 Raised if a resolved `DatasetRef` was passed as an input, but it
1438 differs from the one found in the registry.
1439 TypeError
1440 Raised if no collections were provided.
1441 RuntimeError
1442 Raised if a URI is requested for a dataset that consists of
1443 multiple artifacts.
1444 """
1445 primary, components = self.getURIs(
1446 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1447 )
1449 if primary is None or components:
1450 raise RuntimeError(
1451 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1452 "Use Butler.getURIs() instead."
1453 )
1454 return primary
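# Illustrative sketch only: predicting the URI of a dataset that has not been
# written yet; the predicted URI carries a "#predicted" fragment.  Names are
# hypothetical.
#
#     uri = butler.getURI("calexp", visit=12345, detector=42, instrument="HSC",
#                         predict=True, run="u/alice/DM-50000/a")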
1456 def retrieveArtifacts(
1457 self,
1458 refs: Iterable[DatasetRef],
1459 destination: ResourcePathExpression,
1460 transfer: str = "auto",
1461 preserve_path: bool = True,
1462 overwrite: bool = False,
1463 ) -> List[ResourcePath]:
1464 """Retrieve the artifacts associated with the supplied refs.
1466 Parameters
1467 ----------
1468 refs : iterable of `DatasetRef`
1469 The datasets for which artifacts are to be retrieved.
1470 A single ref can result in multiple artifacts. The refs must
1471 be resolved.
1472 destination : `lsst.resources.ResourcePath` or `str`
1473 Location to write the artifacts.
1474 transfer : `str`, optional
1475 Method to use to transfer the artifacts. Must be one of the options
1476 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1477 "move" is not allowed.
1478 preserve_path : `bool`, optional
1479 If `True` the full path of the artifact within the datastore
1480 is preserved. If `False` the final file component of the path
1481 is used.
1482 overwrite : `bool`, optional
1483 If `True` allow transfers to overwrite existing files at the
1484 destination.
1486 Returns
1487 -------
1488 targets : `list` of `lsst.resources.ResourcePath`
1489 URIs of file artifacts in destination location. Order is not
1490 preserved.
1492 Notes
1493 -----
1494 For non-file datastores the artifacts written to the destination
1495 may not match the representation inside the datastore. For example
1496 a hierarchical data structure in a NoSQL database may well be stored
1497 as a JSON file.
1498 """
1499 return self.datastore.retrieveArtifacts(
1500 refs,
1501 ResourcePath(destination),
1502 transfer=transfer,
1503 preserve_path=preserve_path,
1504 overwrite=overwrite,
1505 )
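# Illustrative sketch only: copying the file artifacts behind a query result
# into a local directory.  The query and destination are hypothetical.
#
#     refs = butler.registry.queryDatasets("calexp", collections="u/alice/DM-50000/a")
#     paths = butler.retrieveArtifacts(refs, "/tmp/export", transfer="copy")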
1507 def datasetExists(
1508 self,
1509 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1510 dataId: Optional[DataId] = None,
1511 *,
1512 collections: Any = None,
1513 **kwargs: Any,
1514 ) -> bool:
1515 """Return True if the Dataset is actually present in the Datastore.
1517 Parameters
1518 ----------
1519 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1520 When `DatasetRef` the `dataId` should be `None`.
1521 Otherwise the `DatasetType` or name thereof.
1522 dataId : `dict` or `DataCoordinate`
1523 A `dict` of `Dimension` link name, value pairs that label the
1524 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1525 should be provided as the first argument.
1526 collections : Any, optional
1527 Collections to be searched, overriding ``self.collections``.
1528 Can be any of the types supported by the ``collections`` argument
1529 to butler construction.
1530 **kwargs
1531 Additional keyword arguments used to augment or construct a
1532 `DataCoordinate`. See `DataCoordinate.standardize`
1533 parameters.
1535 Raises
1536 ------
1537 LookupError
1538 Raised if the dataset is not even present in the Registry.
1539 ValueError
1540 Raised if a resolved `DatasetRef` was passed as an input, but it
1541 differs from the one found in the registry.
1542 TypeError
1543 Raised if no collections were provided.
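Examples
--------
A minimal sketch; the dataset type name, dimension names, and
collection are hypothetical::
    exists = butler.datasetExists("calexp", instrument="HSC", visit=903334,
                                  detector=42, collections="HSC/runs/example")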
1544 """
1545 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1546 return self.datastore.exists(ref)
1548 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1549 """Remove one or more `~CollectionType.RUN` collections and the
1550 datasets within them.
1552 Parameters
1553 ----------
1554 names : `Iterable` [ `str` ]
1555 The names of the collections to remove.
1556 unstore : `bool`, optional
1557 If `True` (default), delete datasets from all datastores in which
1558 they are present, and attempt to rollback the registry deletions if
1559 datastore deletions fail (which may not always be possible). If
1560 `False`, datastore records for these datasets are still removed,
1561 but any artifacts (e.g. files) will not be.
1563 Raises
1564 ------
1565 TypeError
1566 Raised if one or more collections are not of type
1567 `~CollectionType.RUN`.
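Examples
--------
A minimal sketch; the run name is hypothetical and the butler must be
writeable::
    butler.removeRuns(["u/someone/scratch-run"], unstore=True)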
1568 """
1569 if not self.isWriteable():
1570 raise TypeError("Butler is read-only.")
1571 names = list(names)
1572 refs: List[DatasetRef] = []
1573 for name in names:
1574 collectionType = self.registry.getCollectionType(name)
1575 if collectionType is not CollectionType.RUN:
1576 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1577 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1578 with self.registry.transaction():
1579 if unstore:
1580 self.datastore.trash(refs)
1581 else:
1582 self.datastore.forget(refs)
1583 for name in names:
1584 self.registry.removeCollection(name)
1585 if unstore:
1586 # Point of no return for removing artifacts
1587 self.datastore.emptyTrash()
1589 def pruneCollection(
1590 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1591 ) -> None:
1592 """Remove a collection and possibly prune datasets within it.
1594 Parameters
1595 ----------
1596 name : `str`
1597 Name of the collection to remove. If this is a
1598 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1599 datasets within the collection are not modified unless ``unstore``
1600 is `True`. If this is a `~CollectionType.RUN` collection,
1601 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1602 are fully removed from the data repository.
1603 purge : `bool`, optional
1604 If `True`, permit `~CollectionType.RUN` collections to be removed,
1605 fully removing datasets within them. Requires ``unstore=True`` as
1606 well as an added precaution against accidental deletion. Must be
1607 `False` (default) if the collection is not a ``RUN``.
1608 unstore : `bool`, optional
1609 If `True`, remove all datasets in the collection from all
1610 datastores in which they appear.
1611 unlink : `list` [ `str` ], optional
1612 Before removing the collection given by ``name``, unlink it from
1613 these parent collections.
1615 Raises
1616 ------
1617 TypeError
1618 Raised if the butler is read-only or arguments are mutually
1619 inconsistent.
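Examples
--------
A minimal sketch; the collection names are hypothetical and the butler
must be writeable::
    # Fully remove a RUN collection and the datasets within it.
    butler.pruneCollection("u/someone/scratch-run", purge=True, unstore=True)
    # Remove a collection after unlinking it from a parent CHAINED
    # collection.
    butler.pruneCollection("u/someone/old-chain",
                           unlink=["u/someone/parent-chain"])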
1620 """
1621 # See pruneDatasets comments for more information about the logic here;
1622 # the cases are almost the same, but here we can rely on Registry to
1623 # take care of everything but Datastore deletion when we remove the
1624 # collection.
1625 if not self.isWriteable():
1626 raise TypeError("Butler is read-only.")
1627 collectionType = self.registry.getCollectionType(name)
1628 if purge and not unstore:
1629 raise PurgeWithoutUnstorePruneCollectionsError()
1630 if collectionType is CollectionType.RUN and not purge:
1631 raise RunWithoutPurgePruneCollectionsError(collectionType)
1632 if collectionType is not CollectionType.RUN and purge:
1633 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1635 def remove(child: str, parent: str) -> None:
1636 """Remove a child collection from a parent collection."""
1637 # Remove child from parent.
1638 chain = list(self.registry.getCollectionChain(parent))
1639 try:
1640 chain.remove(child)
1641 except ValueError as e:
1642 raise RuntimeError(f"{child} is not a child of {parent}") from e
1643 self.registry.setCollectionChain(parent, chain)
1645 with self.registry.transaction():
1646 if unlink:
1647 for parent in unlink:
1648 remove(name, parent)
1649 if unstore:
1650 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1651 self.datastore.trash(refs)
1652 self.registry.removeCollection(name)
1654 if unstore:
1655 # Point of no return for removing artifacts
1656 self.datastore.emptyTrash()
1658 def pruneDatasets(
1659 self,
1660 refs: Iterable[DatasetRef],
1661 *,
1662 disassociate: bool = True,
1663 unstore: bool = False,
1664 tags: Iterable[str] = (),
1665 purge: bool = False,
1666 run: Optional[str] = None,
1667 ) -> None:
1668 """Remove one or more datasets from a collection and/or storage.
1670 Parameters
1671 ----------
1672 refs : `~collections.abc.Iterable` of `DatasetRef`
1673 Datasets to prune. These must be "resolved" references (not just
1674 a `DatasetType` and data ID).
1675 disassociate : `bool`, optional
1676 Disassociate pruned datasets from ``tags``, or from all collections
1677 if ``purge=True``.
1678 unstore : `bool`, optional
1679 If `True` (`False` is default) remove these datasets from all
1680 datastores known to this butler. Note that this will make it
1681 impossible to retrieve these datasets even via other collections.
1682 Datasets that are already not stored are ignored by this option.
1683 tags : `Iterable` [ `str` ], optional
1684 `~CollectionType.TAGGED` collections to disassociate the datasets
1685 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1686 `True`.
1687 purge : `bool`, optional
1688 If `True` (`False` is default), completely remove the dataset from
1689 the `Registry`. To prevent accidental deletions, ``purge`` may
1690 only be `True` if all of the following conditions are met:
1692 - All given datasets are in the given run;
1693 - ``disassociate`` is `True`;
1694 - ``unstore`` is `True`.
1696 This mode may remove provenance information from datasets other
1697 than those provided, and should be used with extreme care.
1699 Raises
1700 ------
1701 TypeError
1702 Raised if the butler is read-only, if no collection was provided,
1703 or the conditions for ``purge=True`` were not met.
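Examples
--------
A minimal sketch; the dataset type and TAGGED collection names are
hypothetical and the butler must be writeable::
    refs = list(butler.registry.queryDatasets("calexp",
                                              collections="u/someone/tagged"))
    # Disassociate the datasets from the tagged collection and delete
    # their stored artifacts, keeping the registry entries.
    butler.pruneDatasets(refs, disassociate=True, tags=["u/someone/tagged"],
                         unstore=True)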
1704 """
1705 if not self.isWriteable():
1706 raise TypeError("Butler is read-only.")
1707 if purge:
1708 if not disassociate:
1709 raise TypeError("Cannot pass purge=True without disassociate=True.")
1710 if not unstore:
1711 raise TypeError("Cannot pass purge=True without unstore=True.")
1712 elif disassociate:
1713 tags = tuple(tags)
1714 if not tags:
1715 raise TypeError("No tags provided but disassociate=True.")
1716 for tag in tags:
1717 collectionType = self.registry.getCollectionType(tag)
1718 if collectionType is not CollectionType.TAGGED:
1719 raise TypeError(
1720 f"Cannot disassociate from collection '{tag}' "
1721 f"of non-TAGGED type {collectionType.name}."
1722 )
1723 # Transform possibly-single-pass iterable into something we can iterate
1724 # over multiple times.
1725 refs = list(refs)
1726 # Pruning a component of a DatasetRef makes no sense since registry
1727 # doesn't know about components and datastore might not store
1728 # components in a separate file
1729 for ref in refs:
1730 if ref.datasetType.component():
1731 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1732 # We don't need an unreliable Datastore transaction for this, because
1733 # we've been extra careful to ensure that Datastore.trash only involves
1734 # mutating the Registry (it can _look_ at Datastore-specific things,
1735 # but shouldn't change them), and hence all operations here are
1736 # Registry operations.
1737 with self.registry.transaction():
1738 if unstore:
1739 self.datastore.trash(refs)
1740 if purge:
1741 self.registry.removeDatasets(refs)
1742 elif disassociate:
1743 assert tags, "Guaranteed by earlier logic in this function."
1744 for tag in tags:
1745 self.registry.disassociate(tag, refs)
1746 # We've exited the Registry transaction, and apparently committed.
1747 # (if there was an exception, everything rolled back, and it's as if
1748 # nothing happened - and we never get here).
1749 # Datastore artifacts are not yet gone, but they're clearly marked
1750 # as trash, so if we fail to delete now because of (e.g.) filesystem
1751 # problems we can try again later, and if manual administrative
1752 # intervention is required, it's pretty clear what that should entail:
1753 # deleting everything on disk and in private Datastore tables that is
1754 # in the dataset_location_trash table.
1755 if unstore:
1756 # Point of no return for removing artifacts
1757 self.datastore.emptyTrash()
1759 @transactional
1760 def ingest(
1761 self,
1762 *datasets: FileDataset,
1763 transfer: Optional[str] = "auto",
1764 run: Optional[str] = None,
1765 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1766 record_validation_info: bool = True,
1767 ) -> None:
1768 """Store and register one or more datasets that already exist on disk.
1770 Parameters
1771 ----------
1772 datasets : `FileDataset`
1773 Each positional argument is a struct containing information about
1774 a file to be ingested, including its URI (either absolute or
1775 relative to the datastore root, if applicable), a `DatasetRef`,
1776 and optionally a formatter class or its fully-qualified string
1777 name. If a formatter is not provided, the formatter that would be
1778 used for `put` is assumed. On successful return, all
1779 `FileDataset.ref` attributes will have their `DatasetRef.id`
1780 attribute populated and all `FileDataset.formatter` attributes will
1781 be set to the formatter class used. `FileDataset.path` attributes
1782 may be modified to put paths in whatever the datastore considers a
1783 standardized form.
1784 transfer : `str`, optional
1785 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1786 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1787 transfer the file.
1788 run : `str`, optional
1789 The name of the run ingested datasets should be added to,
1790 overriding ``self.run``.
1791 idGenerationMode : `DatasetIdGenEnum`, optional
1792 Specifies option for generating dataset IDs. By default unique IDs
1793 are generated for each inserted dataset.
1794 record_validation_info : `bool`, optional
1795 If `True`, the default, the datastore can record validation
1796 information associated with the file. If `False` the datastore
1797 will not attempt to track any information such as checksums
1798 or file sizes. This can be useful if such information is tracked
1799 in an external system or if the file is to be compressed in place.
1800 It is up to the datastore whether this parameter is relevant.
1802 Raises
1803 ------
1804 TypeError
1805 Raised if the butler is read-only or if no run was provided.
1806 NotImplementedError
1807 Raised if the `Datastore` does not support the given transfer mode.
1808 DatasetTypeNotSupportedError
1809 Raised if one or more files to be ingested have a dataset type that
1810 is not supported by the `Datastore`.
1811 FileNotFoundError
1812 Raised if one of the given files does not exist.
1813 FileExistsError
1814 Raised if transfer is not `None` but the (internal) location the
1815 file would be moved to is already occupied.
1817 Notes
1818 -----
1819 This operation is not fully exception safe: if a database operation
1820 fails, the given `FileDataset` instances may be only partially updated.
1822 It is atomic in terms of database operations (they will either all
1823 succeed or all fail) providing the database engine implements
1824 transactions correctly. It will attempt to be atomic in terms of
1825 filesystem operations as well, but this cannot be implemented
1826 rigorously for most datastores.
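Examples
--------
A minimal sketch; the dataset type, dimension names, file path, and run
are hypothetical, and the dataset type must already be registered::
    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType,
                     {"instrument": "HSC", "exposure": 903334, "detector": 42},
                     conform=False)
    butler.ingest(FileDataset(path="/data/raw.fits", refs=[ref]),
                  transfer="copy", run="HSC/raw/example")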
1827 """
1828 if not self.isWriteable():
1829 raise TypeError("Butler is read-only.")
1830 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1831 # Reorganize the inputs so they're grouped by DatasetType and then
1832 # data ID. We also include a list of DatasetRefs for each FileDataset
1833 # to hold the resolved DatasetRefs returned by the Registry, before
1834 # it's safe to swap them into FileDataset.refs.
1835 # Some type annotation aliases to make that clearer:
1836 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1837 GroupedData = MutableMapping[DatasetType, GroupForType]
1838 # The actual data structure:
1839 groupedData: GroupedData = defaultdict(dict)
1840 # And the nested loop that populates it:
1841 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1842 # This list intentionally shared across the inner loop, since it's
1843 # associated with `dataset`.
1844 resolvedRefs: List[DatasetRef] = []
1846 # Somewhere to store pre-existing refs if we have an
1847 # execution butler.
1848 existingRefs: List[DatasetRef] = []
1850 for ref in dataset.refs:
1851 if ref.dataId in groupedData[ref.datasetType]:
1852 raise ConflictingDefinitionError(
1853 f"Ingest conflict. Dataset {dataset.path} has same"
1854 " DataId as other ingest dataset"
1855 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1856 f" ({ref.dataId})"
1857 )
1858 if self._allow_put_of_predefined_dataset:
1859 existing_ref = self.registry.findDataset(
1860 ref.datasetType, dataId=ref.dataId, collections=run
1861 )
1862 if existing_ref:
1863 if self.datastore.knows(existing_ref):
1864 raise ConflictingDefinitionError(
1865 f"Dataset associated with path {dataset.path}"
1866 f" already exists as {existing_ref}."
1867 )
1868 # Store this ref elsewhere since it already exists
1869 # and we do not want to remake it but we do want
1870 # to store it in the datastore.
1871 existingRefs.append(existing_ref)
1873 # Nothing else to do until we have finished
1874 # iterating.
1875 continue
1877 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1879 if existingRefs:
1881 if len(dataset.refs) != len(existingRefs):
1882 # Keeping track of partially pre-existing datasets is hard
1883 # and should generally never happen. For now don't allow
1884 # it.
1885 raise ConflictingDefinitionError(
1886 f"For dataset {dataset.path} some dataIds already exist"
1887 " in registry but others do not. This is not supported."
1888 )
1890 # Attach the resolved refs if we found them.
1891 dataset.refs = existingRefs
1893 # Now we can bulk-insert into Registry for each DatasetType.
1894 for datasetType, groupForType in progress.iter_item_chunks(
1895 groupedData.items(), desc="Bulk-inserting datasets by type"
1896 ):
1897 refs = self.registry.insertDatasets(
1898 datasetType,
1899 dataIds=groupForType.keys(),
1900 run=run,
1901 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1902 idGenerationMode=idGenerationMode,
1903 )
1904 # Append those resolved DatasetRefs to the new lists we set up for
1905 # them.
1906 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1907 resolvedRefs.append(ref)
1909 # Go back to the original FileDatasets to replace their refs with the
1910 # new resolved ones.
1911 for groupForType in progress.iter_chunks(
1912 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1913 ):
1914 for dataset, resolvedRefs in groupForType.values():
1915 dataset.refs = resolvedRefs
1917 # Bulk-insert everything into Datastore.
1918 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1920 @contextlib.contextmanager
1921 def export(
1922 self,
1923 *,
1924 directory: Optional[str] = None,
1925 filename: Optional[str] = None,
1926 format: Optional[str] = None,
1927 transfer: Optional[str] = None,
1928 ) -> Iterator[RepoExportContext]:
1929 """Export datasets from the repository represented by this `Butler`.
1931 This method is a context manager that returns a helper object
1932 (`RepoExportContext`) that is used to indicate what information from
1933 the repository should be exported.
1935 Parameters
1936 ----------
1937 directory : `str`, optional
1938 Directory dataset files should be written to if ``transfer`` is not
1939 `None`.
1940 filename : `str`, optional
1941 Name for the file that will include database information associated
1942 with the exported datasets. If this is not an absolute path and
1943 ``directory`` is not `None`, it will be written to ``directory``
1944 instead of the current working directory. Defaults to
1945 "export.{format}".
1946 format : `str`, optional
1947 File format for the database information file. If `None`, the
1948 extension of ``filename`` will be used.
1949 transfer : `str`, optional
1950 Transfer mode passed to `Datastore.export`.
1952 Raises
1953 ------
1954 TypeError
1955 Raised if the set of arguments passed is inconsistent.
1957 Examples
1958 --------
1959 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1960 methods are used to provide the iterables over data IDs and/or datasets
1961 to be exported::
1963 with butler.export("exports.yaml") as export:
1964 # Export all flats, but none of the dimension element rows
1965 # (i.e. data ID information) associated with them.
1966 export.saveDatasets(butler.registry.queryDatasets("flat"),
1967 elements=())
1968 # Export all datasets that start with "deepCoadd_" and all of
1969 # their associated data ID information.
1970 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1971 """
1972 if directory is None and transfer is not None:
1973 raise TypeError("Cannot transfer without providing a directory.")
1974 if transfer == "move":
1975 raise TypeError("Transfer may not be 'move': export is read-only")
1976 if format is None:
1977 if filename is None:
1978 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1979 else:
1980 _, format = os.path.splitext(filename)
1981 elif filename is None:
1982 filename = f"export.{format}"
1983 if directory is not None:
1984 filename = os.path.join(directory, filename)
1985 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
1986 with open(filename, "w") as stream:
1987 backend = BackendClass(stream)
1988 try:
1989 helper = RepoExportContext(
1990 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
1991 )
1992 yield helper
1993 except BaseException:
1994 raise
1995 else:
1996 helper._finish()
1998 def import_(
1999 self,
2000 *,
2001 directory: Optional[str] = None,
2002 filename: Union[str, TextIO, None] = None,
2003 format: Optional[str] = None,
2004 transfer: Optional[str] = None,
2005 skip_dimensions: Optional[Set] = None,
2006 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2007 reuseIds: bool = False,
2008 ) -> None:
2009 """Import datasets into this repository that were exported from a
2010 different butler repository via `~lsst.daf.butler.Butler.export`.
2012 Parameters
2013 ----------
2014 directory : `str`, optional
2015 Directory containing dataset files to import from. If `None`,
2016 ``filename`` and all dataset file paths specified therein must
2017 be absolute.
2018 filename : `str` or `TextIO`, optional
2019 A stream or name of file that contains database information
2020 associated with the exported datasets, typically generated by
2021 `~lsst.daf.butler.Butler.export`. If this a string (name) and
2022 is not an absolute path, does not exist in the current working
2023 directory, and ``directory`` is not `None`, it is assumed to be in
2024 ``directory``. Defaults to "export.{format}".
2025 format : `str`, optional
2026 File format for ``filename``. If `None`, the extension of
2027 ``filename`` will be used.
2028 transfer : `str`, optional
2029 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2030 skip_dimensions : `set`, optional
2031 Names of dimensions that should be skipped and not imported.
2032 idGenerationMode : `DatasetIdGenEnum`, optional
2033 Specifies option for generating dataset IDs when IDs are not
2034 provided or their type does not match backend type. By default
2035 unique IDs are generated for each inserted dataset.
2036 reuseIds : `bool`, optional
2037 If `True` then forces re-use of imported dataset IDs for integer
2038 IDs which are normally generated as auto-incremented; exception
2039 will be raised if imported IDs clash with existing ones. This
2040 option has no effect on the use of globally-unique IDs which are
2041 always re-used (or generated if integer IDs are being imported).
2043 Raises
2044 ------
2045 TypeError
2046 Raised if the set of arguments passed is inconsistent, or if the
2047 butler is read-only.
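Examples
--------
A minimal sketch; the directory and file names are hypothetical and the
butler must be writeable::
    butler.import_(directory="exports", filename="exports.yaml",
                   format="yaml", transfer="copy")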
2048 """
2049 if not self.isWriteable():
2050 raise TypeError("Butler is read-only.")
2051 if format is None:
2052 if filename is None:
2053 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2054 else:
2055 _, format = os.path.splitext(filename) # type: ignore
2056 elif filename is None:
2057 filename = f"export.{format}"
2058 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2059 filename = os.path.join(directory, filename)
2060 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2062 def doImport(importStream: TextIO) -> None:
2063 backend = BackendClass(importStream, self.registry)
2064 backend.register()
2065 with self.transaction():
2066 backend.load(
2067 self.datastore,
2068 directory=directory,
2069 transfer=transfer,
2070 skip_dimensions=skip_dimensions,
2071 idGenerationMode=idGenerationMode,
2072 reuseIds=reuseIds,
2073 )
2075 if isinstance(filename, str):
2076 with open(filename, "r") as stream:
2077 doImport(stream)
2078 else:
2079 doImport(filename)
2081 def transfer_from(
2082 self,
2083 source_butler: Butler,
2084 source_refs: Iterable[DatasetRef],
2085 transfer: str = "auto",
2086 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
2087 skip_missing: bool = True,
2088 register_dataset_types: bool = False,
2089 ) -> List[DatasetRef]:
2090 """Transfer datasets to this Butler from a run in another Butler.
2092 Parameters
2093 ----------
2094 source_butler : `Butler`
2095 Butler from which the datasets are to be transferred.
2096 source_refs : iterable of `DatasetRef`
2097 Datasets defined in the source butler that should be transferred to
2098 this butler.
2099 transfer : `str`, optional
2100 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2101 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
2102 A mapping of dataset type to ID generation mode. Only used if
2103 the source butler is using integer IDs. Should not be used
2104 if this receiving butler uses integer IDs. If not given, dataset
2105 import always uses `DatasetIdGenEnum.UNIQUE`.
2106 skip_missing : `bool`, optional
2107 If `True`, datasets with no datastore artifact associated with
2108 them are not transferred. If `False` a registry entry will be
2109 created even if no datastore record is created (and so will
2110 look equivalent to the dataset being unstored).
2111 register_dataset_types : `bool`, optional
2112 If `True` any missing dataset types are registered. Otherwise
2113 an exception is raised.
2115 Returns
2116 -------
2117 refs : `list` of `DatasetRef`
2118 The refs added to this Butler.
2120 Notes
2121 -----
2122 Requires that any dimension definitions are already present in the
2123 receiving Butler. The datastore artifact has to exist for a transfer
2124 to be made but non-existence is not an error.
2126 Datasets that already exist in this run will be skipped.
2128 The datasets are imported as part of a transaction, although
2129 dataset types are registered before the transaction is started.
2130 This means that it is possible for a dataset type to be registered
2131 even though transfer has failed.
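Examples
--------
A minimal sketch; ``source_butler``, the dataset type name, and the
collection are hypothetical::
    refs = source_butler.registry.queryDatasets("calexp",
                                                collections="HSC/runs/example")
    transferred = butler.transfer_from(source_butler, refs, transfer="copy",
                                       register_dataset_types=True)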
2132 """
2133 if not self.isWriteable():
2134 raise TypeError("Butler is read-only.")
2135 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2137 # Will iterate through the refs multiple times so need to convert
2138 # to a list if this isn't a collection.
2139 if not isinstance(source_refs, collections.abc.Collection):
2140 source_refs = list(source_refs)
2142 original_count = len(source_refs)
2143 log.info("Transferring %d datasets into %s", original_count, str(self))
2145 if id_gen_map is None:
2146 id_gen_map = {}
2148 # In some situations the datastore artifact may be missing
2149 # and we do not want that registry entry to be imported.
2150 # Asking datastore is not sufficient, the records may have been
2151 # purged, we have to ask for the (predicted) URI and check
2152 # existence explicitly. Execution butler is set up exactly like
2153 # this with no datastore records.
2154 artifact_existence: Dict[ResourcePath, bool] = {}
2155 if skip_missing:
2156 dataset_existence = source_butler.datastore.mexists(
2157 source_refs, artifact_existence=artifact_existence
2158 )
2159 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2160 filtered_count = len(source_refs)
2161 log.verbose(
2162 "%d datasets removed because the artifact does not exist. Now have %d.",
2163 original_count - filtered_count,
2164 filtered_count,
2165 )
2167 # Importing requires that we group the refs by dataset type and run
2168 # before doing the import.
2169 source_dataset_types = set()
2170 grouped_refs = defaultdict(list)
2171 grouped_indices = defaultdict(list)
2172 for i, ref in enumerate(source_refs):
2173 grouped_refs[ref.datasetType, ref.run].append(ref)
2174 grouped_indices[ref.datasetType, ref.run].append(i)
2175 source_dataset_types.add(ref.datasetType)
2177 # Check to see if the dataset type in the source butler has
2178 # the same definition in the target butler and register missing
2179 # ones if requested. Registration must happen outside a transaction.
2180 newly_registered_dataset_types = set()
2181 for datasetType in source_dataset_types:
2182 if register_dataset_types:
2183 # Let this raise immediately if inconsistent. Continuing
2184 # on to find additional inconsistent dataset types
2185 # might result in additional unwanted dataset types being
2186 # registered.
2187 if self.registry.registerDatasetType(datasetType):
2188 newly_registered_dataset_types.add(datasetType)
2189 else:
2190 # If the dataset type is missing, let it fail immediately.
2191 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2192 if target_dataset_type != datasetType:
2193 raise ConflictingDefinitionError(
2194 "Source butler dataset type differs from definition"
2195 f" in target butler: {datasetType} !="
2196 f" {target_dataset_type}"
2197 )
2198 if newly_registered_dataset_types:
2199 # We may have registered some even if there were inconsistencies
2200 # but should let people know (or else remove them again).
2201 log.log(
2202 VERBOSE,
2203 "Registered the following dataset types in the target Butler: %s",
2204 ", ".join(d.name for d in newly_registered_dataset_types),
2205 )
2206 else:
2207 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2209 # The returned refs should be identical for UUIDs.
2210 # For now must also support integers and so need to retain the
2211 # newly-created refs from this registry.
2212 # Pre-size it so we can assign refs into the correct slots
2213 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2214 default_id_gen = DatasetIdGenEnum.UNIQUE
2216 handled_collections: Set[str] = set()
2218 # Do all the importing in a single transaction.
2219 with self.transaction():
2220 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2221 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2222 ):
2223 if run not in handled_collections:
2224 run_doc = source_butler.registry.getCollectionDocumentation(run)
2225 registered = self.registry.registerRun(run, doc=run_doc)
2226 handled_collections.add(run)
2227 if registered:
2228 log.log(VERBOSE, "Creating output run %s", run)
2230 id_generation_mode = default_id_gen
2231 if isinstance(refs_to_import[0].id, int):
2232 # ID generation mode might need to be overridden when
2233 # targeting UUIDs.
2234 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2236 n_refs = len(refs_to_import)
2237 log.verbose(
2238 "Importing %d ref%s of dataset type %s into run %s",
2239 n_refs,
2240 "" if n_refs == 1 else "s",
2241 datasetType.name,
2242 run,
2243 )
2245 # No way to know if this butler's registry uses UUID.
2246 # We have to trust the caller on this. If it fails they will
2247 # have to change their approach. We can't catch the exception
2248 # and retry with unique because that will mess up the
2249 # transaction handling. We aren't allowed to ask the registry
2250 # manager what type of ID it is using.
2251 imported_refs = self.registry._importDatasets(
2252 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2253 )
2255 # Map them into the correct slots to match the initial order
2256 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2257 transferred_refs_tmp[i] = ref
2259 # Mypy insists that we might have None in here so we have to make
2260 # that explicit by assigning to a new variable and filtering out
2261 # something that won't be there.
2262 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2264 # Check consistency
2265 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2267 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2269 # The transferred refs need to be reordered to match the original
2270 # ordering given by the caller. Without this the datastore transfer
2271 # will be broken.
2273 # Ask the datastore to transfer. The datastore has to check that
2274 # the source datastore is compatible with the target datastore.
2275 self.datastore.transfer_from(
2276 source_butler.datastore,
2277 source_refs,
2278 local_refs=transferred_refs,
2279 transfer=transfer,
2280 artifact_existence=artifact_existence,
2281 )
2283 return transferred_refs
2285 def validateConfiguration(
2286 self,
2287 logFailures: bool = False,
2288 datasetTypeNames: Optional[Iterable[str]] = None,
2289 ignore: Optional[Iterable[str]] = None,
2290 ) -> None:
2291 """Validate butler configuration.
2293 Checks that each `DatasetType` can be stored in the `Datastore`.
2295 Parameters
2296 ----------
2297 logFailures : `bool`, optional
2298 If `True`, output a log message for every validation error
2299 detected.
2300 datasetTypeNames : iterable of `str`, optional
2301 The `DatasetType` names that should be checked. This allows
2302 only a subset to be selected.
2303 ignore : iterable of `str`, optional
2304 Names of DatasetTypes to skip over. This can be used to skip
2305 known problems. If a named `DatasetType` corresponds to a
2306 composite, all components of that `DatasetType` will also be
2307 ignored.
2309 Raises
2310 ------
2311 ButlerValidationError
2312 Raised if there is some inconsistency with how this Butler
2313 is configured.
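Examples
--------
A minimal sketch; the ignored dataset type name is hypothetical::
    butler.validateConfiguration(logFailures=True, ignore=["raw"])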
2314 """
2315 if datasetTypeNames:
2316 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2317 else:
2318 datasetTypes = list(self.registry.queryDatasetTypes())
2320 # filter out anything from the ignore list
2321 if ignore:
2322 ignore = set(ignore)
2323 datasetTypes = [
2324 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2325 ]
2326 else:
2327 ignore = set()
2329 # Find all the registered instruments
2330 instruments = {record.name for record in self.registry.queryDimensionRecords("instrument")}
2332 # For each datasetType that has an instrument dimension, create
2333 # a DatasetRef for each defined instrument
2334 datasetRefs = []
2336 for datasetType in datasetTypes:
2337 if "instrument" in datasetType.dimensions:
2338 for instrument in instruments:
2339 datasetRef = DatasetRef(
2340 datasetType, {"instrument": instrument}, conform=False # type: ignore
2341 )
2342 datasetRefs.append(datasetRef)
2344 entities: List[Union[DatasetType, DatasetRef]] = []
2345 entities.extend(datasetTypes)
2346 entities.extend(datasetRefs)
2348 datastoreErrorStr = None
2349 try:
2350 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2351 except ValidationError as e:
2352 datastoreErrorStr = str(e)
2354 # Also check that the LookupKeys used by the datastores match
2355 # registry and storage class definitions
2356 keys = self.datastore.getLookupKeys()
2358 failedNames = set()
2359 failedDataId = set()
2360 for key in keys:
2361 if key.name is not None:
2362 if key.name in ignore:
2363 continue
2365 # skip if specific datasetType names were requested and this
2366 # name does not match
2367 if datasetTypeNames and key.name not in datasetTypeNames:
2368 continue
2370 # See if it is a StorageClass or a DatasetType
2371 if key.name in self.storageClasses:
2372 pass
2373 else:
2374 try:
2375 self.registry.getDatasetType(key.name)
2376 except KeyError:
2377 if logFailures:
2378 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2379 failedNames.add(key)
2380 else:
2381 # Dimensions are checked for consistency when the Butler
2382 # is created and rendezvoused with a universe.
2383 pass
2385 # Check that the instrument is a valid instrument
2386 # Currently only support instrument so check for that
2387 if key.dataId:
2388 dataIdKeys = set(key.dataId)
2389 if set(["instrument"]) != dataIdKeys:
2390 if logFailures:
2391 log.critical("Key '%s' has unsupported DataId override", key)
2392 failedDataId.add(key)
2393 elif key.dataId["instrument"] not in instruments:
2394 if logFailures:
2395 log.critical("Key '%s' has unknown instrument", key)
2396 failedDataId.add(key)
2398 messages = []
2400 if datastoreErrorStr:
2401 messages.append(datastoreErrorStr)
2403 for failed, msg in (
2404 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2405 (failedDataId, "Keys with bad DataId entries: "),
2406 ):
2407 if failed:
2408 msg += ", ".join(str(k) for k in failed)
2409 messages.append(msg)
2411 if messages:
2412 raise ValidationError(";\n".join(messages))
2414 @property
2415 def collections(self) -> CollectionSearch:
2416 """The collections to search by default, in order (`CollectionSearch`).
2418 This is an alias for ``self.registry.defaults.collections``. It cannot
2419 be set directly in isolation, but all defaults may be changed together
2420 by assigning a new `RegistryDefaults` instance to
2421 ``self.registry.defaults``.
2422 """
2423 return self.registry.defaults.collections
2425 @property
2426 def run(self) -> Optional[str]:
2427 """Name of the run this butler writes outputs to by default (`str` or
2428 `None`).
2430 This is an alias for ``self.registry.defaults.run``. It cannot be set
2431 directly in isolation, but all defaults may be changed together by
2432 assigning a new `RegistryDefaults` instance to
2433 ``self.registry.defaults``.
2434 """
2435 return self.registry.defaults.run
2437 @property
2438 def dimensions(self) -> DimensionUniverse:
2439 # Docstring inherited.
2440 return self.registry.dimensions
2442 registry: Registry
2443 """The object that manages dataset metadata and relationships (`Registry`).
2445 Most operations that don't involve reading or writing butler datasets are
2446 accessible only via `Registry` methods.
2447 """
2449 datastore: Datastore
2450 """The object that manages actual dataset storage (`Datastore`).
2452 Direct user access to the datastore should rarely be necessary; the primary
2453 exception is the case where a `Datastore` implementation provides extra
2454 functionality beyond what the base class defines.
2455 """
2457 storageClasses: StorageClassFactory
2458 """An object that maps known storage class names to objects that fully
2459 describe them (`StorageClassFactory`).
2460 """
2462 _allow_put_of_predefined_dataset: bool
2463 """Allow a put to succeed even if there is already a registry entry for it
2464 but not a datastore record. (`bool`)."""