Coverage for python/lsst/daf/butler/_butler.py: 9%
683 statements
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-15 09:41 +0000
« prev ^ index » next coverage.py v6.4.4, created at 2022-09-15 09:41 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41from collections import defaultdict
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_class_of
62from lsst.utils.logging import VERBOSE, getLogger
64from ._butlerConfig import ButlerConfig
65from ._butlerRepoIndex import ButlerRepoIndex
66from ._deferredDatasetHandle import DeferredDatasetHandle
67from ._limited_butler import LimitedButler
68from .core import (
69 AmbiguousDatasetError,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetRefURIs,
77 DatasetType,
78 Datastore,
79 Dimension,
80 DimensionConfig,
81 DimensionElement,
82 DimensionRecord,
83 DimensionUniverse,
84 FileDataset,
85 Progress,
86 StorageClassFactory,
87 Timespan,
88 ValidationError,
89)
90from .core.repoRelocation import BUTLER_ROOT_TAG
91from .core.utils import transactional
92from .registry import (
93 CollectionSearch,
94 CollectionType,
95 ConflictingDefinitionError,
96 DataIdError,
97 DatasetIdGenEnum,
98 Registry,
99 RegistryConfig,
100 RegistryDefaults,
101)
102from .transfers import RepoExportContext
104log = getLogger(__name__)
107class ButlerValidationError(ValidationError):
108 """There is a problem with the Butler configuration."""
110 pass
113class PruneCollectionsArgsError(TypeError):
114 """Base class for errors relating to Butler.pruneCollections input
115 arguments.
116 """
118 pass
121class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
122 """Raised when purge and unstore are both required to be True, and
123 purge is True but unstore is False.
124 """
126 def __init__(self) -> None:
127 super().__init__("Cannot pass purge=True without unstore=True.")
130class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
131 """Raised when pruning a RUN collection but purge is False."""
133 def __init__(self, collectionType: CollectionType):
134 self.collectionType = collectionType
135 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
138class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
139 """Raised when purge is True but is not supported for the given
140 collection."""
142 def __init__(self, collectionType: CollectionType):
143 self.collectionType = collectionType
144 super().__init__(
145 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
146 )
149class Butler(LimitedButler):
150 """Main entry point for the data access system.
152 Parameters
153 ----------
154 config : `ButlerConfig`, `Config` or `str`, optional.
155 Configuration. Anything acceptable to the
156 `ButlerConfig` constructor. If a directory path
157 is given the configuration will be read from a ``butler.yaml`` file in
158 that location. If `None` is given default values will be used.
159 butler : `Butler`, optional.
160 If provided, construct a new Butler that uses the same registry and
161 datastore as the given one, but with the given collection and run.
162 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
163 arguments.
164 collections : `str` or `Iterable` [ `str` ], optional
165 An expression specifying the collections to be searched (in order) when
166 reading datasets.
167 This may be a `str` collection name or an iterable thereof.
168 See :ref:`daf_butler_collection_expressions` for more information.
169 These collections are not registered automatically and must be
170 manually registered before they are used by any method, but they may be
171 manually registered after the `Butler` is initialized.
172 run : `str`, optional
173 Name of the `~CollectionType.RUN` collection new datasets should be
174 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
175 ``collections`` will be set to ``[run]``. If not `None`, this
176 collection will automatically be registered. If this is not set (and
177 ``writeable`` is not set either), a read-only butler will be created.
178 searchPaths : `list` of `str`, optional
179 Directory paths to search when calculating the full Butler
180 configuration. Not used if the supplied config is already a
181 `ButlerConfig`.
182 writeable : `bool`, optional
183 Explicitly sets whether the butler supports write operations. If not
184 provided, a read-write butler is created if any of ``run``, ``tags``,
185 or ``chains`` is non-empty.
186 inferDefaults : `bool`, optional
187 If `True` (default) infer default data ID values from the values
188 present in the datasets in ``collections``: if all collections have the
189 same value (or no value) for a governor dimension, that value will be
190 the default for that dimension. Nonexistent collections are ignored.
191 If a default value is provided explicitly for a governor dimension via
192 ``**kwargs``, no default will be inferred for that dimension.
193 **kwargs : `str`
194 Default data ID key-value pairs. These may only identify "governor"
195 dimensions like ``instrument`` and ``skymap``.
197 Examples
198 --------
199 While there are many ways to control exactly how a `Butler` interacts with
200 the collections in its `Registry`, the most common cases are still simple.
202 For a read-only `Butler` that searches one collection, do::
204 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
206 For a read-write `Butler` that writes to and reads from a
207 `~CollectionType.RUN` collection::
209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
211 The `Butler` passed to a ``PipelineTask`` is often much more complex,
212 because we want to write to one `~CollectionType.RUN` collection but read
213 from several others (as well)::
215 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
216 collections=["u/alice/DM-50000/a",
217 "u/bob/DM-49998",
218 "HSC/defaults"])
220 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
221 Datasets will be read first from that run (since it appears first in the
222 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
224 Finally, one can always create a `Butler` with no collections::
226 butler = Butler("/path/to/repo", writeable=True)
228 This can be extremely useful when you just want to use ``butler.registry``,
229 e.g. for inserting dimension data or managing collections, or when the
230 collections you want to use with the butler are not consistent.
231 Passing ``writeable`` explicitly here is only necessary if you want to be
232 able to make changes to the repo - usually the value for ``writeable`` can
233 be guessed from the collection arguments provided, but it defaults to
234 `False` when there are not collection arguments.
235 """
237 def __init__(
238 self,
239 config: Union[Config, str, None] = None,
240 *,
241 butler: Optional[Butler] = None,
242 collections: Any = None,
243 run: Optional[str] = None,
244 searchPaths: Optional[List[str]] = None,
245 writeable: Optional[bool] = None,
246 inferDefaults: bool = True,
247 **kwargs: str,
248 ):
249 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
250 # Load registry, datastore, etc. from config or existing butler.
251 if butler is not None:
252 if config is not None or searchPaths is not None or writeable is not None:
253 raise TypeError(
254 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
255 )
256 self.registry = butler.registry.copy(defaults)
257 self.datastore = butler.datastore
258 self.storageClasses = butler.storageClasses
259 self._config: ButlerConfig = butler._config
260 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
261 else:
262 # Can only look for strings in the known repos list.
263 if isinstance(config, str) and config in self.get_known_repos():
264 config = str(self.get_repo_uri(config))
265 try:
266 self._config = ButlerConfig(config, searchPaths=searchPaths)
267 except FileNotFoundError as e:
268 if known := self.get_known_repos():
269 aliases = f"(known aliases: {', '.join(known)})"
270 else:
271 aliases = "(no known aliases)"
272 raise FileNotFoundError(f"{e} {aliases}") from e
273 self._config = ButlerConfig(config, searchPaths=searchPaths)
274 try:
275 if "root" in self._config:
276 butlerRoot = self._config["root"]
277 else:
278 butlerRoot = self._config.configDir
279 if writeable is None:
280 writeable = run is not None
281 self.registry = Registry.fromConfig(
282 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
283 )
284 self.datastore = Datastore.fromConfig(
285 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
286 )
287 self.storageClasses = StorageClassFactory()
288 self.storageClasses.addFromConfig(self._config)
289 self._allow_put_of_predefined_dataset = self._config.get(
290 "allow_put_of_predefined_dataset", False
291 )
292 except Exception:
293 # Failures here usually mean that configuration is incomplete,
294 # just issue an error message which includes config file URI.
295 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
296 raise
298 if "run" in self._config or "collection" in self._config:
299 raise ValueError("Passing a run or collection via configuration is no longer supported.")
301 GENERATION: ClassVar[int] = 3
302 """This is a Generation 3 Butler.
304 This attribute may be removed in the future, once the Generation 2 Butler
305 interface has been fully retired; it should only be used in transitional
306 code.
307 """
309 @classmethod
310 def get_repo_uri(cls, label: str) -> ResourcePath:
311 """Look up the label in a butler repository index.
313 Parameters
314 ----------
315 label : `str`
316 Label of the Butler repository to look up.
318 Returns
319 -------
320 uri : `lsst.resources.ResourcePath`
321 URI to the Butler repository associated with the given label.
323 Raises
324 ------
325 KeyError
326 Raised if the label is not found in the index, or if an index
327 can not be found at all.
329 Notes
330 -----
331 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
332 information is discovered.
333 """
334 return ButlerRepoIndex.get_repo_uri(label)
336 @classmethod
337 def get_known_repos(cls) -> Set[str]:
338 """Retrieve the list of known repository labels.
340 Returns
341 -------
342 repos : `set` of `str`
343 All the known labels. Can be empty if no index can be found.
345 Notes
346 -----
347 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
348 information is discovered.
349 """
350 return ButlerRepoIndex.get_known_repos()
352 @staticmethod
353 def makeRepo(
354 root: ResourcePathExpression,
355 config: Union[Config, str, None] = None,
356 dimensionConfig: Union[Config, str, None] = None,
357 standalone: bool = False,
358 searchPaths: Optional[List[str]] = None,
359 forceConfigRoot: bool = True,
360 outfile: Optional[ResourcePathExpression] = None,
361 overwrite: bool = False,
362 ) -> Config:
363 """Create an empty data repository by adding a butler.yaml config
364 to a repository root directory.
366 Parameters
367 ----------
368 root : `lsst.resources.ResourcePathExpression`
369 Path or URI to the root location of the new repository. Will be
370 created if it does not exist.
371 config : `Config` or `str`, optional
372 Configuration to write to the repository, after setting any
373 root-dependent Registry or Datastore config options. Can not
374 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
375 configuration will be used. Root-dependent config options
376 specified in this config are overwritten if ``forceConfigRoot``
377 is `True`.
378 dimensionConfig : `Config` or `str`, optional
379 Configuration for dimensions, will be used to initialize registry
380 database.
381 standalone : `bool`
382 If True, write all expanded defaults, not just customized or
383 repository-specific settings.
384 This (mostly) decouples the repository from the default
385 configuration, insulating it from changes to the defaults (which
386 may be good or bad, depending on the nature of the changes).
387 Future *additions* to the defaults will still be picked up when
388 initializing `Butlers` to repos created with ``standalone=True``.
389 searchPaths : `list` of `str`, optional
390 Directory paths to search when calculating the full butler
391 configuration.
392 forceConfigRoot : `bool`, optional
393 If `False`, any values present in the supplied ``config`` that
394 would normally be reset are not overridden and will appear
395 directly in the output config. This allows non-standard overrides
396 of the root directory for a datastore or registry to be given.
397 If this parameter is `True` the values for ``root`` will be
398 forced into the resulting config if appropriate.
399 outfile : `lss.resources.ResourcePathExpression`, optional
400 If not-`None`, the output configuration will be written to this
401 location rather than into the repository itself. Can be a URI
402 string. Can refer to a directory that will be used to write
403 ``butler.yaml``.
404 overwrite : `bool`, optional
405 Create a new configuration file even if one already exists
406 in the specified output location. Default is to raise
407 an exception.
409 Returns
410 -------
411 config : `Config`
412 The updated `Config` instance written to the repo.
414 Raises
415 ------
416 ValueError
417 Raised if a ButlerConfig or ConfigSubset is passed instead of a
418 regular Config (as these subclasses would make it impossible to
419 support ``standalone=False``).
420 FileExistsError
421 Raised if the output config file already exists.
422 os.error
423 Raised if the directory does not exist, exists but is not a
424 directory, or cannot be created.
426 Notes
427 -----
428 Note that when ``standalone=False`` (the default), the configuration
429 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
430 construct the repository should also be used to construct any Butlers
431 to avoid configuration inconsistencies.
432 """
433 if isinstance(config, (ButlerConfig, ConfigSubset)):
434 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
436 # Ensure that the root of the repository exists or can be made
437 root_uri = ResourcePath(root, forceDirectory=True)
438 root_uri.mkdir()
440 config = Config(config)
442 # If we are creating a new repo from scratch with relative roots,
443 # do not propagate an explicit root from the config file
444 if "root" in config:
445 del config["root"]
447 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
448 imported_class = doImportType(full["datastore", "cls"])
449 if not issubclass(imported_class, Datastore):
450 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
451 datastoreClass: Type[Datastore] = imported_class
452 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
454 # if key exists in given config, parse it, otherwise parse the defaults
455 # in the expanded config
456 if config.get(("registry", "db")):
457 registryConfig = RegistryConfig(config)
458 else:
459 registryConfig = RegistryConfig(full)
460 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
461 if defaultDatabaseUri is not None:
462 Config.updateParameters(
463 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
464 )
465 else:
466 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
468 if standalone:
469 config.merge(full)
470 else:
471 # Always expand the registry.managers section into the per-repo
472 # config, because after the database schema is created, it's not
473 # allowed to change anymore. Note that in the standalone=True
474 # branch, _everything_ in the config is expanded, so there's no
475 # need to special case this.
476 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
477 configURI: ResourcePathExpression
478 if outfile is not None:
479 # When writing to a separate location we must include
480 # the root of the butler repo in the config else it won't know
481 # where to look.
482 config["root"] = root_uri.geturl()
483 configURI = outfile
484 else:
485 configURI = root_uri
486 config.dumpToUri(configURI, overwrite=overwrite)
488 # Create Registry and populate tables
489 registryConfig = RegistryConfig(config.get("registry"))
490 dimensionConfig = DimensionConfig(dimensionConfig)
491 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
493 log.verbose("Wrote new Butler configuration file to %s", configURI)
495 return config
497 @classmethod
498 def _unpickle(
499 cls,
500 config: ButlerConfig,
501 collections: Optional[CollectionSearch],
502 run: Optional[str],
503 defaultDataId: Dict[str, str],
504 writeable: bool,
505 ) -> Butler:
506 """Callable used to unpickle a Butler.
508 We prefer not to use ``Butler.__init__`` directly so we can force some
509 of its many arguments to be keyword-only (note that ``__reduce__``
510 can only invoke callables with positional arguments).
512 Parameters
513 ----------
514 config : `ButlerConfig`
515 Butler configuration, already coerced into a true `ButlerConfig`
516 instance (and hence after any search paths for overrides have been
517 utilized).
518 collections : `CollectionSearch`
519 Names of the default collections to read from.
520 run : `str`, optional
521 Name of the default `~CollectionType.RUN` collection to write to.
522 defaultDataId : `dict` [ `str`, `str` ]
523 Default data ID values.
524 writeable : `bool`
525 Whether the Butler should support write operations.
527 Returns
528 -------
529 butler : `Butler`
530 A new `Butler` instance.
531 """
532 # MyPy doesn't recognize that the kwargs below are totally valid; it
533 # seems to think '**defaultDataId* is a _positional_ argument!
534 return cls(
535 config=config,
536 collections=collections,
537 run=run,
538 writeable=writeable,
539 **defaultDataId, # type: ignore
540 )
542 def __reduce__(self) -> tuple:
543 """Support pickling."""
544 return (
545 Butler._unpickle,
546 (
547 self._config,
548 self.collections,
549 self.run,
550 self.registry.defaults.dataId.byName(),
551 self.registry.isWriteable(),
552 ),
553 )
555 def __str__(self) -> str:
556 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
557 self.collections, self.run, self.datastore, self.registry
558 )
560 def isWriteable(self) -> bool:
561 """Return `True` if this `Butler` supports write operations."""
562 return self.registry.isWriteable()
564 @contextlib.contextmanager
565 def transaction(self) -> Iterator[None]:
566 """Context manager supporting `Butler` transactions.
568 Transactions can be nested.
569 """
570 with self.registry.transaction():
571 with self.datastore.transaction():
572 yield
574 def _standardizeArgs(
575 self,
576 datasetRefOrType: Union[DatasetRef, DatasetType, str],
577 dataId: Optional[DataId] = None,
578 for_put: bool = True,
579 **kwargs: Any,
580 ) -> Tuple[DatasetType, Optional[DataId]]:
581 """Standardize the arguments passed to several Butler APIs.
583 Parameters
584 ----------
585 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
586 When `DatasetRef` the `dataId` should be `None`.
587 Otherwise the `DatasetType` or name thereof.
588 dataId : `dict` or `DataCoordinate`
589 A `dict` of `Dimension` link name, value pairs that label the
590 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
591 should be provided as the second argument.
592 for_put : `bool`, optional
593 If `True` this call is invoked as part of a `Butler.put()`.
594 Otherwise it is assumed to be part of a `Butler.get()`. This
595 parameter is only relevant if there is dataset type
596 inconsistency.
597 **kwargs
598 Additional keyword arguments used to augment or construct a
599 `DataCoordinate`. See `DataCoordinate.standardize`
600 parameters.
602 Returns
603 -------
604 datasetType : `DatasetType`
605 A `DatasetType` instance extracted from ``datasetRefOrType``.
606 dataId : `dict` or `DataId`, optional
607 Argument that can be used (along with ``kwargs``) to construct a
608 `DataId`.
610 Notes
611 -----
612 Butler APIs that conceptually need a DatasetRef also allow passing a
613 `DatasetType` (or the name of one) and a `DataId` (or a dict and
614 keyword arguments that can be used to construct one) separately. This
615 method accepts those arguments and always returns a true `DatasetType`
616 and a `DataId` or `dict`.
618 Standardization of `dict` vs `DataId` is best handled by passing the
619 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
620 generally similarly flexible.
621 """
622 externalDatasetType: Optional[DatasetType] = None
623 internalDatasetType: Optional[DatasetType] = None
624 if isinstance(datasetRefOrType, DatasetRef):
625 if dataId is not None or kwargs:
626 raise ValueError("DatasetRef given, cannot use dataId as well")
627 externalDatasetType = datasetRefOrType.datasetType
628 dataId = datasetRefOrType.dataId
629 else:
630 # Don't check whether DataId is provided, because Registry APIs
631 # can usually construct a better error message when it wasn't.
632 if isinstance(datasetRefOrType, DatasetType):
633 externalDatasetType = datasetRefOrType
634 else:
635 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
637 # Check that they are self-consistent
638 if externalDatasetType is not None:
639 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
640 if externalDatasetType != internalDatasetType:
641 # We can allow differences if they are compatible, depending
642 # on whether this is a get or a put. A get requires that
643 # the python type associated with the datastore can be
644 # converted to the user type. A put requires that the user
645 # supplied python type can be converted to the internal
646 # type expected by registry.
647 relevantDatasetType = internalDatasetType
648 if for_put:
649 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
650 else:
651 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
652 relevantDatasetType = externalDatasetType
653 if not is_compatible:
654 raise ValueError(
655 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
656 f"registry definition ({internalDatasetType})"
657 )
658 # Override the internal definition.
659 internalDatasetType = relevantDatasetType
661 assert internalDatasetType is not None
662 return internalDatasetType, dataId
664 def _rewrite_data_id(
665 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
666 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
667 """Rewrite a data ID taking into account dimension records.
669 Take a Data ID and keyword args and rewrite it if necessary to
670 allow the user to specify dimension records rather than dimension
671 primary values.
673 This allows a user to include a dataId dict with keys of
674 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
675 the integer exposure ID. It also allows a string to be given
676 for a dimension value rather than the integer ID if that is more
677 convenient. For example, rather than having to specifyin the
678 detector with ``detector.full_name``, a string given for ``detector``
679 will be interpreted as the full name and converted to the integer
680 value.
682 Keyword arguments can also use strings for dimensions like detector
683 and exposure but python does not allow them to include ``.`` and
684 so the ``exposure.day_obs`` syntax can not be used in a keyword
685 argument.
687 Parameters
688 ----------
689 dataId : `dict` or `DataCoordinate`
690 A `dict` of `Dimension` link name, value pairs that will label the
691 `DatasetRef` within a Collection.
692 datasetType : `DatasetType`
693 The dataset type associated with this dataId. Required to
694 determine the relevant dimensions.
695 **kwargs
696 Additional keyword arguments used to augment or construct a
697 `DataId`. See `DataId` parameters.
699 Returns
700 -------
701 dataId : `dict` or `DataCoordinate`
702 The, possibly rewritten, dataId. If given a `DataCoordinate` and
703 no keyword arguments, the original dataId will be returned
704 unchanged.
705 **kwargs : `dict`
706 Any unused keyword arguments (would normally be empty dict).
707 """
708 # Do nothing if we have a standalone DataCoordinate.
709 if isinstance(dataId, DataCoordinate) and not kwargs:
710 return dataId, kwargs
712 # Process dimension records that are using record information
713 # rather than ids
714 newDataId: Dict[str, DataIdValue] = {}
715 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
717 # if all the dataId comes from keyword parameters we do not need
718 # to do anything here because they can't be of the form
719 # exposure.obs_id because a "." is not allowed in a keyword parameter.
720 if dataId:
721 for k, v in dataId.items():
722 # If we have a Dimension we do not need to do anything
723 # because it cannot be a compound key.
724 if isinstance(k, str) and "." in k:
725 # Someone is using a more human-readable dataId
726 dimensionName, record = k.split(".", 1)
727 byRecord[dimensionName][record] = v
728 elif isinstance(k, Dimension):
729 newDataId[k.name] = v
730 else:
731 newDataId[k] = v
733 # Go through the updated dataId and check the type in case someone is
734 # using an alternate key. We have already filtered out the compound
735 # keys dimensions.record format.
736 not_dimensions = {}
738 # Will need to look in the dataId and the keyword arguments
739 # and will remove them if they need to be fixed or are unrecognized.
740 for dataIdDict in (newDataId, kwargs):
741 # Use a list so we can adjust the dict safely in the loop
742 for dimensionName in list(dataIdDict):
743 value = dataIdDict[dimensionName]
744 try:
745 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
746 except KeyError:
747 # This is not a real dimension
748 not_dimensions[dimensionName] = value
749 del dataIdDict[dimensionName]
750 continue
752 # Convert an integral type to an explicit int to simplify
753 # comparisons here
754 if isinstance(value, numbers.Integral):
755 value = int(value)
757 if not isinstance(value, dimension.primaryKey.getPythonType()):
758 for alternate in dimension.alternateKeys:
759 if isinstance(value, alternate.getPythonType()):
760 byRecord[dimensionName][alternate.name] = value
761 del dataIdDict[dimensionName]
762 log.debug(
763 "Converting dimension %s to %s.%s=%s",
764 dimensionName,
765 dimensionName,
766 alternate.name,
767 value,
768 )
769 break
770 else:
771 log.warning(
772 "Type mismatch found for value '%r' provided for dimension %s. "
773 "Could not find matching alternative (primary key has type %s) "
774 "so attempting to use as-is.",
775 value,
776 dimensionName,
777 dimension.primaryKey.getPythonType(),
778 )
780 # By this point kwargs and newDataId should only include valid
781 # dimensions. Merge kwargs in to the new dataId and log if there
782 # are dimensions in both (rather than calling update).
783 for k, v in kwargs.items():
784 if k in newDataId and newDataId[k] != v:
785 log.debug(
786 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
787 )
788 newDataId[k] = v
789 # No need to retain any values in kwargs now.
790 kwargs = {}
792 # If we have some unrecognized dimensions we have to try to connect
793 # them to records in other dimensions. This is made more complicated
794 # by some dimensions having records with clashing names. A mitigation
795 # is that we can tell by this point which dimensions are missing
796 # for the DatasetType but this does not work for calibrations
797 # where additional dimensions can be used to constrain the temporal
798 # axis.
799 if not_dimensions:
800 # Search for all dimensions even if we have been given a value
801 # explicitly. In some cases records are given as well as the
802 # actually dimension and this should not be an error if they
803 # match.
804 mandatoryDimensions = datasetType.dimensions.names # - provided
806 candidateDimensions: Set[str] = set()
807 candidateDimensions.update(mandatoryDimensions)
809 # For calibrations we may well be needing temporal dimensions
810 # so rather than always including all dimensions in the scan
811 # restrict things a little. It is still possible for there
812 # to be confusion over day_obs in visit vs exposure for example.
813 # If we are not searching calibration collections things may
814 # fail but they are going to fail anyway because of the
815 # ambiguousness of the dataId...
816 if datasetType.isCalibration():
817 for dim in self.registry.dimensions.getStaticDimensions():
818 if dim.temporal:
819 candidateDimensions.add(str(dim))
821 # Look up table for the first association with a dimension
822 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
824 # Keep track of whether an item is associated with multiple
825 # dimensions.
826 counter: Counter[str] = Counter()
827 assigned: Dict[str, Set[str]] = defaultdict(set)
829 # Go through the missing dimensions and associate the
830 # given names with records within those dimensions
831 matched_dims = set()
832 for dimensionName in candidateDimensions:
833 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
834 fields = dimension.metadata.names | dimension.uniqueKeys.names
835 for field in not_dimensions:
836 if field in fields:
837 guessedAssociation[dimensionName][field] = not_dimensions[field]
838 counter[dimensionName] += 1
839 assigned[field].add(dimensionName)
840 matched_dims.add(field)
842 # Calculate the fields that matched nothing.
843 never_found = set(not_dimensions) - matched_dims
845 if never_found:
846 raise ValueError(f"Unrecognized keyword args given: {never_found}")
848 # There is a chance we have allocated a single dataId item
849 # to multiple dimensions. Need to decide which should be retained.
850 # For now assume that the most popular alternative wins.
851 # This means that day_obs with seq_num will result in
852 # exposure.day_obs and not visit.day_obs
853 # Also prefer an explicitly missing dimension over an inferred
854 # temporal dimension.
855 for fieldName, assignedDimensions in assigned.items():
856 if len(assignedDimensions) > 1:
857 # Pick the most popular (preferring mandatory dimensions)
858 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
859 if requiredButMissing:
860 candidateDimensions = requiredButMissing
861 else:
862 candidateDimensions = assignedDimensions
864 # If this is a choice between visit and exposure and
865 # neither was a required part of the dataset type,
866 # (hence in this branch) always prefer exposure over
867 # visit since exposures are always defined and visits
868 # are defined from exposures.
869 if candidateDimensions == {"exposure", "visit"}:
870 candidateDimensions = {"exposure"}
872 # Select the relevant items and get a new restricted
873 # counter.
874 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
875 duplicatesCounter: Counter[str] = Counter()
876 duplicatesCounter.update(theseCounts)
878 # Choose the most common. If they are equally common
879 # we will pick the one that was found first.
880 # Returns a list of tuples
881 selected = duplicatesCounter.most_common(1)[0][0]
883 log.debug(
884 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
885 " Removed ambiguity by choosing dimension %s.",
886 fieldName,
887 ", ".join(assignedDimensions),
888 selected,
889 )
891 for candidateDimension in assignedDimensions:
892 if candidateDimension != selected:
893 del guessedAssociation[candidateDimension][fieldName]
895 # Update the record look up dict with the new associations
896 for dimensionName, values in guessedAssociation.items():
897 if values: # A dict might now be empty
898 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
899 byRecord[dimensionName].update(values)
901 if byRecord:
902 # Some record specifiers were found so we need to convert
903 # them to the Id form
904 for dimensionName, values in byRecord.items():
905 if dimensionName in newDataId:
906 log.debug(
907 "DataId specified explicit %s dimension value of %s in addition to"
908 " general record specifiers for it of %s. Ignoring record information.",
909 dimensionName,
910 newDataId[dimensionName],
911 str(values),
912 )
913 # Get the actual record and compare with these values.
914 try:
915 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
916 except DataIdError:
917 raise ValueError(
918 f"Could not find dimension '{dimensionName}'"
919 f" with dataId {newDataId} as part of comparing with"
920 f" record values {byRecord[dimensionName]}"
921 ) from None
922 if len(recs) == 1:
923 errmsg: List[str] = []
924 for k, v in values.items():
925 if (recval := getattr(recs[0], k)) != v:
926 errmsg.append(f"{k}({recval} != {v})")
927 if errmsg:
928 raise ValueError(
929 f"Dimension {dimensionName} in dataId has explicit value"
930 " inconsistent with records: " + ", ".join(errmsg)
931 )
932 else:
933 # Multiple matches for an explicit dimension
934 # should never happen but let downstream complain.
935 pass
936 continue
938 # Build up a WHERE expression
939 bind = {k: v for k, v in values.items()}
940 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
942 # Hopefully we get a single record that matches
943 records = set(
944 self.registry.queryDimensionRecords(
945 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
946 )
947 )
949 if len(records) != 1:
950 if len(records) > 1:
951 # visit can have an ambiguous answer without involving
952 # visit_system. The default visit_system is defined
953 # by the instrument.
954 if (
955 dimensionName == "visit"
956 and "visit_system_membership" in self.registry.dimensions
957 and "visit_system"
958 in self.registry.dimensions["instrument"].metadata # type: ignore
959 ):
960 instrument_records = list(
961 self.registry.queryDimensionRecords(
962 "instrument",
963 dataId=newDataId,
964 **kwargs,
965 )
966 )
967 if len(instrument_records) == 1:
968 visit_system = instrument_records[0].visit_system
969 if visit_system is None:
970 # Set to a value that will never match.
971 visit_system = -1
973 # Look up each visit in the
974 # visit_system_membership records.
975 for rec in records:
976 membership = list(
977 self.registry.queryDimensionRecords(
978 # Use bind to allow zero results.
979 # This is a fully-specified query.
980 "visit_system_membership",
981 where="instrument = inst AND visit_system = system AND visit = v",
982 bind=dict(
983 inst=instrument_records[0].name, system=visit_system, v=rec.id
984 ),
985 )
986 )
987 if membership:
988 # This record is the right answer.
989 records = set([rec])
990 break
992 # The ambiguity may have been resolved so check again.
993 if len(records) > 1:
994 log.debug("Received %d records from constraints of %s", len(records), str(values))
995 for r in records:
996 log.debug("- %s", str(r))
997 raise ValueError(
998 f"DataId specification for dimension {dimensionName} is not"
999 f" uniquely constrained to a single dataset by {values}."
1000 f" Got {len(records)} results."
1001 )
1002 else:
1003 raise ValueError(
1004 f"DataId specification for dimension {dimensionName} matched no"
1005 f" records when constrained by {values}"
1006 )
1008 # Get the primary key from the real dimension object
1009 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
1010 if not isinstance(dimension, Dimension):
1011 raise RuntimeError(
1012 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
1013 )
1014 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
1016 return newDataId, kwargs
1018 def _findDatasetRef(
1019 self,
1020 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1021 dataId: Optional[DataId] = None,
1022 *,
1023 collections: Any = None,
1024 allowUnresolved: bool = False,
1025 **kwargs: Any,
1026 ) -> DatasetRef:
1027 """Shared logic for methods that start with a search for a dataset in
1028 the registry.
1030 Parameters
1031 ----------
1032 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1033 When `DatasetRef` the `dataId` should be `None`.
1034 Otherwise the `DatasetType` or name thereof.
1035 dataId : `dict` or `DataCoordinate`, optional
1036 A `dict` of `Dimension` link name, value pairs that label the
1037 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1038 should be provided as the first argument.
1039 collections : Any, optional
1040 Collections to be searched, overriding ``self.collections``.
1041 Can be any of the types supported by the ``collections`` argument
1042 to butler construction.
1043 allowUnresolved : `bool`, optional
1044 If `True`, return an unresolved `DatasetRef` if finding a resolved
1045 one in the `Registry` fails. Defaults to `False`.
1046 **kwargs
1047 Additional keyword arguments used to augment or construct a
1048 `DataId`. See `DataId` parameters.
1050 Returns
1051 -------
1052 ref : `DatasetRef`
1053 A reference to the dataset identified by the given arguments.
1055 Raises
1056 ------
1057 LookupError
1058 Raised if no matching dataset exists in the `Registry` (and
1059 ``allowUnresolved is False``).
1060 ValueError
1061 Raised if a resolved `DatasetRef` was passed as an input, but it
1062 differs from the one found in the registry.
1063 TypeError
1064 Raised if no collections were provided.
1065 """
1066 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1067 if isinstance(datasetRefOrType, DatasetRef):
1068 idNumber = datasetRefOrType.id
1069 else:
1070 idNumber = None
1071 timespan: Optional[Timespan] = None
1073 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1075 if datasetType.isCalibration():
1076 # Because this is a calibration dataset, first try to make a
1077 # standardize the data ID without restricting the dimensions to
1078 # those of the dataset type requested, because there may be extra
1079 # dimensions that provide temporal information for a validity-range
1080 # lookup.
1081 dataId = DataCoordinate.standardize(
1082 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1083 )
1084 if dataId.graph.temporal:
1085 dataId = self.registry.expandDataId(dataId)
1086 timespan = dataId.timespan
1087 else:
1088 # Standardize the data ID to just the dimensions of the dataset
1089 # type instead of letting registry.findDataset do it, so we get the
1090 # result even if no dataset is found.
1091 dataId = DataCoordinate.standardize(
1092 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1093 )
1094 # Always lookup the DatasetRef, even if one is given, to ensure it is
1095 # present in the current collection.
1096 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1097 if ref is None:
1098 if allowUnresolved:
1099 return DatasetRef(datasetType, dataId)
1100 else:
1101 if collections is None:
1102 collections = self.registry.defaults.collections
1103 raise LookupError(
1104 f"Dataset {datasetType.name} with data ID {dataId} "
1105 f"could not be found in collections {collections}."
1106 )
1107 if idNumber is not None and idNumber != ref.id:
1108 if collections is None:
1109 collections = self.registry.defaults.collections
1110 raise ValueError(
1111 f"DatasetRef.id provided ({idNumber}) does not match "
1112 f"id ({ref.id}) in registry in collections {collections}."
1113 )
1114 if datasetType != ref.datasetType:
1115 # If they differ it is because the user explicitly specified
1116 # a compatible dataset type to this call rather than using the
1117 # registry definition. The DatasetRef must therefore be recreated
1118 # using the user definition such that the expected type is
1119 # returned.
1120 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1122 return ref
1124 @transactional
1125 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
1126 # Docstring inherited.
1127 (imported_ref,) = self.registry._importDatasets(
1128 [ref],
1129 expand=True,
1130 )
1131 if imported_ref.id != ref.getCheckedId():
1132 raise RuntimeError("This registry configuration does not support putDirect.")
1133 self.datastore.put(obj, ref)
1134 return ref
1136 @transactional
1137 def put(
1138 self,
1139 obj: Any,
1140 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1141 dataId: Optional[DataId] = None,
1142 *,
1143 run: Optional[str] = None,
1144 **kwargs: Any,
1145 ) -> DatasetRef:
1146 """Store and register a dataset.
1148 Parameters
1149 ----------
1150 obj : `object`
1151 The dataset.
1152 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1153 When `DatasetRef` is provided, ``dataId`` should be `None`.
1154 Otherwise the `DatasetType` or name thereof.
1155 dataId : `dict` or `DataCoordinate`
1156 A `dict` of `Dimension` link name, value pairs that label the
1157 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1158 should be provided as the second argument.
1159 run : `str`, optional
1160 The name of the run the dataset should be added to, overriding
1161 ``self.run``.
1162 **kwargs
1163 Additional keyword arguments used to augment or construct a
1164 `DataCoordinate`. See `DataCoordinate.standardize`
1165 parameters.
1167 Returns
1168 -------
1169 ref : `DatasetRef`
1170 A reference to the stored dataset, updated with the correct id if
1171 given.
1173 Raises
1174 ------
1175 TypeError
1176 Raised if the butler is read-only or if no run has been provided.
1177 """
1178 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1179 if not self.isWriteable():
1180 raise TypeError("Butler is read-only.")
1181 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1182 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1183 raise ValueError("DatasetRef must not be in registry, must have None id")
1185 # Handle dimension records in dataId
1186 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1188 # Add Registry Dataset entry.
1189 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1191 # For an execution butler the datasets will be pre-defined.
1192 # If the butler is configured that way datasets should only be inserted
1193 # if they do not already exist in registry. Trying and catching
1194 # ConflictingDefinitionError will not work because the transaction
1195 # will be corrupted. Instead, in this mode always check first.
1196 ref = None
1197 ref_is_predefined = False
1198 if self._allow_put_of_predefined_dataset:
1199 # Get the matching ref for this run.
1200 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1202 if ref:
1203 # Must be expanded form for datastore templating
1204 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1205 ref = ref.expanded(dataId)
1206 ref_is_predefined = True
1208 if not ref:
1209 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1211 # If the ref is predefined it is possible that the datastore also
1212 # has the record. Asking datastore to put it again will result in
1213 # the artifact being recreated, overwriting previous, then will cause
1214 # a failure in writing the record which will cause the artifact
1215 # to be removed. Much safer to ask first before attempting to
1216 # overwrite. Race conditions should not be an issue for the
1217 # execution butler environment.
1218 if ref_is_predefined:
1219 if self.datastore.knows(ref):
1220 raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.")
1222 self.datastore.put(obj, ref)
1224 return ref
1226 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1227 """Retrieve a stored dataset.
1229 Unlike `Butler.get`, this method allows datasets outside the Butler's
1230 collection to be read as long as the `DatasetRef` that identifies them
1231 can be obtained separately.
1233 Parameters
1234 ----------
1235 ref : `DatasetRef`
1236 Resolved reference to an already stored dataset.
1237 parameters : `dict`
1238 Additional StorageClass-defined options to control reading,
1239 typically used to efficiently read only a subset of the dataset.
1241 Returns
1242 -------
1243 obj : `object`
1244 The dataset.
1245 """
1246 return self.datastore.get(ref, parameters=parameters)
1248 def getDirectDeferred(
1249 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
1250 ) -> DeferredDatasetHandle:
1251 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1252 from a resolved `DatasetRef`.
1254 Parameters
1255 ----------
1256 ref : `DatasetRef`
1257 Resolved reference to an already stored dataset.
1258 parameters : `dict`
1259 Additional StorageClass-defined options to control reading,
1260 typically used to efficiently read only a subset of the dataset.
1262 Returns
1263 -------
1264 obj : `DeferredDatasetHandle`
1265 A handle which can be used to retrieve a dataset at a later time.
1267 Raises
1268 ------
1269 AmbiguousDatasetError
1270 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1271 """
1272 if ref.id is None:
1273 raise AmbiguousDatasetError(
1274 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1275 )
1276 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1278 def getDeferred(
1279 self,
1280 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1281 dataId: Optional[DataId] = None,
1282 *,
1283 parameters: Union[dict, None] = None,
1284 collections: Any = None,
1285 **kwargs: Any,
1286 ) -> DeferredDatasetHandle:
1287 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1288 after an immediate registry lookup.
1290 Parameters
1291 ----------
1292 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1293 When `DatasetRef` the `dataId` should be `None`.
1294 Otherwise the `DatasetType` or name thereof.
1295 dataId : `dict` or `DataCoordinate`, optional
1296 A `dict` of `Dimension` link name, value pairs that label the
1297 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1298 should be provided as the first argument.
1299 parameters : `dict`
1300 Additional StorageClass-defined options to control reading,
1301 typically used to efficiently read only a subset of the dataset.
1302 collections : Any, optional
1303 Collections to be searched, overriding ``self.collections``.
1304 Can be any of the types supported by the ``collections`` argument
1305 to butler construction.
1306 **kwargs
1307 Additional keyword arguments used to augment or construct a
1308 `DataId`. See `DataId` parameters.
1310 Returns
1311 -------
1312 obj : `DeferredDatasetHandle`
1313 A handle which can be used to retrieve a dataset at a later time.
1315 Raises
1316 ------
1317 LookupError
1318 Raised if no matching dataset exists in the `Registry` (and
1319 ``allowUnresolved is False``).
1320 ValueError
1321 Raised if a resolved `DatasetRef` was passed as an input, but it
1322 differs from the one found in the registry.
1323 TypeError
1324 Raised if no collections were provided.
1325 """
1326 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1327 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1329 def get(
1330 self,
1331 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1332 dataId: Optional[DataId] = None,
1333 *,
1334 parameters: Optional[Dict[str, Any]] = None,
1335 collections: Any = None,
1336 **kwargs: Any,
1337 ) -> Any:
1338 """Retrieve a stored dataset.
1340 Parameters
1341 ----------
1342 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1343 When `DatasetRef` the `dataId` should be `None`.
1344 Otherwise the `DatasetType` or name thereof.
1345 dataId : `dict` or `DataCoordinate`
1346 A `dict` of `Dimension` link name, value pairs that label the
1347 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1348 should be provided as the first argument.
1349 parameters : `dict`
1350 Additional StorageClass-defined options to control reading,
1351 typically used to efficiently read only a subset of the dataset.
1352 collections : Any, optional
1353 Collections to be searched, overriding ``self.collections``.
1354 Can be any of the types supported by the ``collections`` argument
1355 to butler construction.
1356 **kwargs
1357 Additional keyword arguments used to augment or construct a
1358 `DataCoordinate`. See `DataCoordinate.standardize`
1359 parameters.
1361 Returns
1362 -------
1363 obj : `object`
1364 The dataset.
1366 Raises
1367 ------
1368 ValueError
1369 Raised if a resolved `DatasetRef` was passed as an input, but it
1370 differs from the one found in the registry.
1371 LookupError
1372 Raised if no matching dataset exists in the `Registry`.
1373 TypeError
1374 Raised if no collections were provided.
1376 Notes
1377 -----
1378 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1379 this method requires that the given data ID include temporal dimensions
1380 beyond the dimensions of the dataset type itself, in order to find the
1381 dataset with the appropriate validity range. For example, a "bias"
1382 dataset with native dimensions ``{instrument, detector}`` could be
1383 fetched with a ``{instrument, detector, exposure}`` data ID, because
1384 ``exposure`` is a temporal dimension.
1385 """
1386 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1387 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1388 return self.getDirect(ref, parameters=parameters)
1390 def getURIs(
1391 self,
1392 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1393 dataId: Optional[DataId] = None,
1394 *,
1395 predict: bool = False,
1396 collections: Any = None,
1397 run: Optional[str] = None,
1398 **kwargs: Any,
1399 ) -> DatasetRefURIs:
1400 """Returns the URIs associated with the dataset.
1402 Parameters
1403 ----------
1404 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1405 When `DatasetRef` the `dataId` should be `None`.
1406 Otherwise the `DatasetType` or name thereof.
1407 dataId : `dict` or `DataCoordinate`
1408 A `dict` of `Dimension` link name, value pairs that label the
1409 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1410 should be provided as the first argument.
1411 predict : `bool`
1412 If `True`, allow URIs to be returned of datasets that have not
1413 been written.
1414 collections : Any, optional
1415 Collections to be searched, overriding ``self.collections``.
1416 Can be any of the types supported by the ``collections`` argument
1417 to butler construction.
1418 run : `str`, optional
1419 Run to use for predictions, overriding ``self.run``.
1420 **kwargs
1421 Additional keyword arguments used to augment or construct a
1422 `DataCoordinate`. See `DataCoordinate.standardize`
1423 parameters.
1425 Returns
1426 -------
1427 uris : `DatasetRefURIs`
1428 The URI to the primary artifact associated with this dataset (if
1429 the dataset was disassembled within the datastore this may be
1430 `None`), and the URIs to any components associated with the dataset
1431 artifact. (can be empty if there are no components).
1432 """
1433 ref = self._findDatasetRef(
1434 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1435 )
1436 if ref.id is None: # only possible if predict is True
1437 if run is None:
1438 run = self.run
1439 if run is None:
1440 raise TypeError("Cannot predict location with run=None.")
1441 # Lie about ID, because we can't guess it, and only
1442 # Datastore.getURIs() will ever see it (and it doesn't use it).
1443 ref = ref.resolved(id=0, run=run)
1444 return self.datastore.getURIs(ref, predict)
1446 def getURI(
1447 self,
1448 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1449 dataId: Optional[DataId] = None,
1450 *,
1451 predict: bool = False,
1452 collections: Any = None,
1453 run: Optional[str] = None,
1454 **kwargs: Any,
1455 ) -> ResourcePath:
1456 """Return the URI to the Dataset.
1458 Parameters
1459 ----------
1460 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1461 When `DatasetRef` the `dataId` should be `None`.
1462 Otherwise the `DatasetType` or name thereof.
1463 dataId : `dict` or `DataCoordinate`
1464 A `dict` of `Dimension` link name, value pairs that label the
1465 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1466 should be provided as the first argument.
1467 predict : `bool`
1468 If `True`, allow URIs to be returned of datasets that have not
1469 been written.
1470 collections : Any, optional
1471 Collections to be searched, overriding ``self.collections``.
1472 Can be any of the types supported by the ``collections`` argument
1473 to butler construction.
1474 run : `str`, optional
1475 Run to use for predictions, overriding ``self.run``.
1476 **kwargs
1477 Additional keyword arguments used to augment or construct a
1478 `DataCoordinate`. See `DataCoordinate.standardize`
1479 parameters.
1481 Returns
1482 -------
1483 uri : `lsst.resources.ResourcePath`
1484 URI pointing to the Dataset within the datastore. If the
1485 Dataset does not exist in the datastore, and if ``predict`` is
1486 `True`, the URI will be a prediction and will include a URI
1487 fragment "#predicted".
1488 If the datastore does not have entities that relate well
1489 to the concept of a URI the returned URI string will be
1490 descriptive. The returned URI is not guaranteed to be obtainable.
1492 Raises
1493 ------
1494 LookupError
1495 A URI has been requested for a dataset that does not exist and
1496 guessing is not allowed.
1497 ValueError
1498 Raised if a resolved `DatasetRef` was passed as an input, but it
1499 differs from the one found in the registry.
1500 TypeError
1501 Raised if no collections were provided.
1502 RuntimeError
1503 Raised if a URI is requested for a dataset that consists of
1504 multiple artifacts.
1505 """
1506 primary, components = self.getURIs(
1507 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1508 )
1510 if primary is None or components:
1511 raise RuntimeError(
1512 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1513 "Use Butler.getURIs() instead."
1514 )
1515 return primary
1517 def retrieveArtifacts(
1518 self,
1519 refs: Iterable[DatasetRef],
1520 destination: ResourcePathExpression,
1521 transfer: str = "auto",
1522 preserve_path: bool = True,
1523 overwrite: bool = False,
1524 ) -> List[ResourcePath]:
1525 """Retrieve the artifacts associated with the supplied refs.
1527 Parameters
1528 ----------
1529 refs : iterable of `DatasetRef`
1530 The datasets for which artifacts are to be retrieved.
1531 A single ref can result in multiple artifacts. The refs must
1532 be resolved.
1533 destination : `lsst.resources.ResourcePath` or `str`
1534 Location to write the artifacts.
1535 transfer : `str`, optional
1536 Method to use to transfer the artifacts. Must be one of the options
1537 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1538 "move" is not allowed.
1539 preserve_path : `bool`, optional
1540 If `True` the full path of the artifact within the datastore
1541 is preserved. If `False` the final file component of the path
1542 is used.
1543 overwrite : `bool`, optional
1544 If `True` allow transfers to overwrite existing files at the
1545 destination.
1547 Returns
1548 -------
1549 targets : `list` of `lsst.resources.ResourcePath`
1550 URIs of file artifacts in destination location. Order is not
1551 preserved.
1553 Notes
1554 -----
1555 For non-file datastores the artifacts written to the destination
1556 may not match the representation inside the datastore. For example
1557 a hierarchical data structure in a NoSQL database may well be stored
1558 as a JSON file.
1559 """
1560 return self.datastore.retrieveArtifacts(
1561 refs,
1562 ResourcePath(destination),
1563 transfer=transfer,
1564 preserve_path=preserve_path,
1565 overwrite=overwrite,
1566 )
1568 def datasetExists(
1569 self,
1570 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1571 dataId: Optional[DataId] = None,
1572 *,
1573 collections: Any = None,
1574 **kwargs: Any,
1575 ) -> bool:
1576 """Return True if the Dataset is actually present in the Datastore.
1578 Parameters
1579 ----------
1580 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1581 When `DatasetRef` the `dataId` should be `None`.
1582 Otherwise the `DatasetType` or name thereof.
1583 dataId : `dict` or `DataCoordinate`
1584 A `dict` of `Dimension` link name, value pairs that label the
1585 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1586 should be provided as the first argument.
1587 collections : Any, optional
1588 Collections to be searched, overriding ``self.collections``.
1589 Can be any of the types supported by the ``collections`` argument
1590 to butler construction.
1591 **kwargs
1592 Additional keyword arguments used to augment or construct a
1593 `DataCoordinate`. See `DataCoordinate.standardize`
1594 parameters.
1596 Raises
1597 ------
1598 LookupError
1599 Raised if the dataset is not even present in the Registry.
1600 ValueError
1601 Raised if a resolved `DatasetRef` was passed as an input, but it
1602 differs from the one found in the registry.
1603 TypeError
1604 Raised if no collections were provided.
1605 """
1606 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1607 return self.datastore.exists(ref)
1609 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1610 """Remove one or more `~CollectionType.RUN` collections and the
1611 datasets within them.
1613 Parameters
1614 ----------
1615 names : `Iterable` [ `str` ]
1616 The names of the collections to remove.
1617 unstore : `bool`, optional
1618 If `True` (default), delete datasets from all datastores in which
1619 they are present, and attempt to rollback the registry deletions if
1620 datastore deletions fail (which may not always be possible). If
1621 `False`, datastore records for these datasets are still removed,
1622 but any artifacts (e.g. files) will not be.
1624 Raises
1625 ------
1626 TypeError
1627 Raised if one or more collections are not of type
1628 `~CollectionType.RUN`.
1629 """
1630 if not self.isWriteable():
1631 raise TypeError("Butler is read-only.")
1632 names = list(names)
1633 refs: List[DatasetRef] = []
1634 for name in names:
1635 collectionType = self.registry.getCollectionType(name)
1636 if collectionType is not CollectionType.RUN:
1637 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1638 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1639 with self.datastore.transaction():
1640 with self.registry.transaction():
1641 if unstore:
1642 self.datastore.trash(refs)
1643 else:
1644 self.datastore.forget(refs)
1645 for name in names:
1646 self.registry.removeCollection(name)
1647 if unstore:
1648 # Point of no return for removing artifacts
1649 self.datastore.emptyTrash()
1651 def pruneCollection(
1652 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1653 ) -> None:
1654 """Remove a collection and possibly prune datasets within it.
1656 Parameters
1657 ----------
1658 name : `str`
1659 Name of the collection to remove. If this is a
1660 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1661 datasets within the collection are not modified unless ``unstore``
1662 is `True`. If this is a `~CollectionType.RUN` collection,
1663 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1664 are fully removed from the data repository.
1665 purge : `bool`, optional
1666 If `True`, permit `~CollectionType.RUN` collections to be removed,
1667 fully removing datasets within them. Requires ``unstore=True`` as
1668 well as an added precaution against accidental deletion. Must be
1669 `False` (default) if the collection is not a ``RUN``.
1670 unstore: `bool`, optional
1671 If `True`, remove all datasets in the collection from all
1672 datastores in which they appear.
1673 unlink: `list` [`str`], optional
1674 Before removing the given `collection` unlink it from from these
1675 parent collections.
1677 Raises
1678 ------
1679 TypeError
1680 Raised if the butler is read-only or arguments are mutually
1681 inconsistent.
1682 """
1683 # See pruneDatasets comments for more information about the logic here;
1684 # the cases are almost the same, but here we can rely on Registry to
1685 # take care everything but Datastore deletion when we remove the
1686 # collection.
1687 if not self.isWriteable():
1688 raise TypeError("Butler is read-only.")
1689 collectionType = self.registry.getCollectionType(name)
1690 if purge and not unstore:
1691 raise PurgeWithoutUnstorePruneCollectionsError()
1692 if collectionType is CollectionType.RUN and not purge:
1693 raise RunWithoutPurgePruneCollectionsError(collectionType)
1694 if collectionType is not CollectionType.RUN and purge:
1695 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1697 def remove(child: str, parent: str) -> None:
1698 """Remove a child collection from a parent collection."""
1699 # Remove child from parent.
1700 chain = list(self.registry.getCollectionChain(parent))
1701 try:
1702 chain.remove(name)
1703 except ValueError as e:
1704 raise RuntimeError(f"{name} is not a child of {parent}") from e
1705 self.registry.setCollectionChain(parent, chain)
1707 with self.datastore.transaction():
1708 with self.registry.transaction():
1709 if unlink:
1710 for parent in unlink:
1711 remove(name, parent)
1712 if unstore:
1713 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1714 self.datastore.trash(refs)
1715 self.registry.removeCollection(name)
1717 if unstore:
1718 # Point of no return for removing artifacts
1719 self.datastore.emptyTrash()
1721 def pruneDatasets(
1722 self,
1723 refs: Iterable[DatasetRef],
1724 *,
1725 disassociate: bool = True,
1726 unstore: bool = False,
1727 tags: Iterable[str] = (),
1728 purge: bool = False,
1729 ) -> None:
1730 # docstring inherited from LimitedButler
1732 if not self.isWriteable():
1733 raise TypeError("Butler is read-only.")
1734 if purge:
1735 if not disassociate:
1736 raise TypeError("Cannot pass purge=True without disassociate=True.")
1737 if not unstore:
1738 raise TypeError("Cannot pass purge=True without unstore=True.")
1739 elif disassociate:
1740 tags = tuple(tags)
1741 if not tags:
1742 raise TypeError("No tags provided but disassociate=True.")
1743 for tag in tags:
1744 collectionType = self.registry.getCollectionType(tag)
1745 if collectionType is not CollectionType.TAGGED:
1746 raise TypeError(
1747 f"Cannot disassociate from collection '{tag}' "
1748 f"of non-TAGGED type {collectionType.name}."
1749 )
1750 # For an execution butler we want to keep existing UUIDs for the
1751 # datasets, for that we need to keep them in the collections but
1752 # remove from datastore.
1753 if self._allow_put_of_predefined_dataset and purge:
1754 purge = False
1755 disassociate = False
1756 # Transform possibly-single-pass iterable into something we can iterate
1757 # over multiple times.
1758 refs = list(refs)
1759 # Pruning a component of a DatasetRef makes no sense since registry
1760 # doesn't know about components and datastore might not store
1761 # components in a separate file
1762 for ref in refs:
1763 if ref.datasetType.component():
1764 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1765 # We don't need an unreliable Datastore transaction for this, because
1766 # we've been extra careful to ensure that Datastore.trash only involves
1767 # mutating the Registry (it can _look_ at Datastore-specific things,
1768 # but shouldn't change them), and hence all operations here are
1769 # Registry operations.
1770 with self.datastore.transaction():
1771 with self.registry.transaction():
1772 if unstore:
1773 self.datastore.trash(refs)
1774 if purge:
1775 self.registry.removeDatasets(refs)
1776 elif disassociate:
1777 assert tags, "Guaranteed by earlier logic in this function."
1778 for tag in tags:
1779 self.registry.disassociate(tag, refs)
1780 # We've exited the Registry transaction, and apparently committed.
1781 # (if there was an exception, everything rolled back, and it's as if
1782 # nothing happened - and we never get here).
1783 # Datastore artifacts are not yet gone, but they're clearly marked
1784 # as trash, so if we fail to delete now because of (e.g.) filesystem
1785 # problems we can try again later, and if manual administrative
1786 # intervention is required, it's pretty clear what that should entail:
1787 # deleting everything on disk and in private Datastore tables that is
1788 # in the dataset_location_trash table.
1789 if unstore:
1790 # Point of no return for removing artifacts
1791 self.datastore.emptyTrash()
1793 @transactional
1794 def ingest(
1795 self,
1796 *datasets: FileDataset,
1797 transfer: Optional[str] = "auto",
1798 run: Optional[str] = None,
1799 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1800 record_validation_info: bool = True,
1801 ) -> None:
1802 """Store and register one or more datasets that already exist on disk.
1804 Parameters
1805 ----------
1806 datasets : `FileDataset`
1807 Each positional argument is a struct containing information about
1808 a file to be ingested, including its URI (either absolute or
1809 relative to the datastore root, if applicable), a `DatasetRef`,
1810 and optionally a formatter class or its fully-qualified string
1811 name. If a formatter is not provided, the formatter that would be
1812 used for `put` is assumed. On successful return, all
1813 `FileDataset.ref` attributes will have their `DatasetRef.id`
1814 attribute populated and all `FileDataset.formatter` attributes will
1815 be set to the formatter class used. `FileDataset.path` attributes
1816 may be modified to put paths in whatever the datastore considers a
1817 standardized form.
1818 transfer : `str`, optional
1819 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1820 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1821 transfer the file.
1822 run : `str`, optional
1823 The name of the run ingested datasets should be added to,
1824 overriding ``self.run``.
1825 idGenerationMode : `DatasetIdGenEnum`, optional
1826 Specifies option for generating dataset IDs. By default unique IDs
1827 are generated for each inserted dataset.
1828 record_validation_info : `bool`, optional
1829 If `True`, the default, the datastore can record validation
1830 information associated with the file. If `False` the datastore
1831 will not attempt to track any information such as checksums
1832 or file sizes. This can be useful if such information is tracked
1833 in an external system or if the file is to be compressed in place.
1834 It is up to the datastore whether this parameter is relevant.
1836 Raises
1837 ------
1838 TypeError
1839 Raised if the butler is read-only or if no run was provided.
1840 NotImplementedError
1841 Raised if the `Datastore` does not support the given transfer mode.
1842 DatasetTypeNotSupportedError
1843 Raised if one or more files to be ingested have a dataset type that
1844 is not supported by the `Datastore`..
1845 FileNotFoundError
1846 Raised if one of the given files does not exist.
1847 FileExistsError
1848 Raised if transfer is not `None` but the (internal) location the
1849 file would be moved to is already occupied.
1851 Notes
1852 -----
1853 This operation is not fully exception safe: if a database operation
1854 fails, the given `FileDataset` instances may be only partially updated.
1856 It is atomic in terms of database operations (they will either all
1857 succeed or all fail) providing the database engine implements
1858 transactions correctly. It will attempt to be atomic in terms of
1859 filesystem operations as well, but this cannot be implemented
1860 rigorously for most datastores.
1861 """
1862 if not self.isWriteable():
1863 raise TypeError("Butler is read-only.")
1864 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1865 # Reorganize the inputs so they're grouped by DatasetType and then
1866 # data ID. We also include a list of DatasetRefs for each FileDataset
1867 # to hold the resolved DatasetRefs returned by the Registry, before
1868 # it's safe to swap them into FileDataset.refs.
1869 # Some type annotation aliases to make that clearer:
1870 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1871 GroupedData = MutableMapping[DatasetType, GroupForType]
1872 # The actual data structure:
1873 groupedData: GroupedData = defaultdict(dict)
1874 # And the nested loop that populates it:
1875 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1876 # This list intentionally shared across the inner loop, since it's
1877 # associated with `dataset`.
1878 resolvedRefs: List[DatasetRef] = []
1880 # Somewhere to store pre-existing refs if we have an
1881 # execution butler.
1882 existingRefs: List[DatasetRef] = []
1884 for ref in dataset.refs:
1885 if ref.dataId in groupedData[ref.datasetType]:
1886 raise ConflictingDefinitionError(
1887 f"Ingest conflict. Dataset {dataset.path} has same"
1888 " DataId as other ingest dataset"
1889 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1890 f" ({ref.dataId})"
1891 )
1892 if self._allow_put_of_predefined_dataset:
1893 existing_ref = self.registry.findDataset(
1894 ref.datasetType, dataId=ref.dataId, collections=run
1895 )
1896 if existing_ref:
1897 if self.datastore.knows(existing_ref):
1898 raise ConflictingDefinitionError(
1899 f"Dataset associated with path {dataset.path}"
1900 f" already exists as {existing_ref}."
1901 )
1902 # Store this ref elsewhere since it already exists
1903 # and we do not want to remake it but we do want
1904 # to store it in the datastore.
1905 existingRefs.append(existing_ref)
1907 # Nothing else to do until we have finished
1908 # iterating.
1909 continue
1911 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1913 if existingRefs:
1915 if len(dataset.refs) != len(existingRefs):
1916 # Keeping track of partially pre-existing datasets is hard
1917 # and should generally never happen. For now don't allow
1918 # it.
1919 raise ConflictingDefinitionError(
1920 f"For dataset {dataset.path} some dataIds already exist"
1921 " in registry but others do not. This is not supported."
1922 )
1924 # Attach the resolved refs if we found them.
1925 dataset.refs = existingRefs
1927 # Now we can bulk-insert into Registry for each DatasetType.
1928 for datasetType, groupForType in progress.iter_item_chunks(
1929 groupedData.items(), desc="Bulk-inserting datasets by type"
1930 ):
1931 refs = self.registry.insertDatasets(
1932 datasetType,
1933 dataIds=groupForType.keys(),
1934 run=run,
1935 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1936 idGenerationMode=idGenerationMode,
1937 )
1938 # Append those resolved DatasetRefs to the new lists we set up for
1939 # them.
1940 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1941 resolvedRefs.append(ref)
1943 # Go back to the original FileDatasets to replace their refs with the
1944 # new resolved ones.
1945 for groupForType in progress.iter_chunks(
1946 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1947 ):
1948 for dataset, resolvedRefs in groupForType.values():
1949 dataset.refs = resolvedRefs
1951 # Bulk-insert everything into Datastore.
1952 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1954 @contextlib.contextmanager
1955 def export(
1956 self,
1957 *,
1958 directory: Optional[str] = None,
1959 filename: Optional[str] = None,
1960 format: Optional[str] = None,
1961 transfer: Optional[str] = None,
1962 ) -> Iterator[RepoExportContext]:
1963 """Export datasets from the repository represented by this `Butler`.
1965 This method is a context manager that returns a helper object
1966 (`RepoExportContext`) that is used to indicate what information from
1967 the repository should be exported.
1969 Parameters
1970 ----------
1971 directory : `str`, optional
1972 Directory dataset files should be written to if ``transfer`` is not
1973 `None`.
1974 filename : `str`, optional
1975 Name for the file that will include database information associated
1976 with the exported datasets. If this is not an absolute path and
1977 ``directory`` is not `None`, it will be written to ``directory``
1978 instead of the current working directory. Defaults to
1979 "export.{format}".
1980 format : `str`, optional
1981 File format for the database information file. If `None`, the
1982 extension of ``filename`` will be used.
1983 transfer : `str`, optional
1984 Transfer mode passed to `Datastore.export`.
1986 Raises
1987 ------
1988 TypeError
1989 Raised if the set of arguments passed is inconsistent.
1991 Examples
1992 --------
1993 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1994 methods are used to provide the iterables over data IDs and/or datasets
1995 to be exported::
1997 with butler.export("exports.yaml") as export:
1998 # Export all flats, but none of the dimension element rows
1999 # (i.e. data ID information) associated with them.
2000 export.saveDatasets(butler.registry.queryDatasets("flat"),
2001 elements=())
2002 # Export all datasets that start with "deepCoadd_" and all of
2003 # their associated data ID information.
2004 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
2005 """
2006 if directory is None and transfer is not None:
2007 raise TypeError("Cannot transfer without providing a directory.")
2008 if transfer == "move":
2009 raise TypeError("Transfer may not be 'move': export is read-only")
2010 if format is None:
2011 if filename is None:
2012 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2013 else:
2014 _, format = os.path.splitext(filename)
2015 elif filename is None:
2016 filename = f"export.{format}"
2017 if directory is not None:
2018 filename = os.path.join(directory, filename)
2019 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
2020 with open(filename, "w") as stream:
2021 backend = BackendClass(stream, universe=self.registry.dimensions)
2022 try:
2023 helper = RepoExportContext(
2024 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2025 )
2026 yield helper
2027 except BaseException:
2028 raise
2029 else:
2030 helper._finish()
2032 def import_(
2033 self,
2034 *,
2035 directory: Optional[str] = None,
2036 filename: Union[str, TextIO, None] = None,
2037 format: Optional[str] = None,
2038 transfer: Optional[str] = None,
2039 skip_dimensions: Optional[Set] = None,
2040 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
2041 reuseIds: bool = False,
2042 ) -> None:
2043 """Import datasets into this repository that were exported from a
2044 different butler repository via `~lsst.daf.butler.Butler.export`.
2046 Parameters
2047 ----------
2048 directory : `str`, optional
2049 Directory containing dataset files to import from. If `None`,
2050 ``filename`` and all dataset file paths specified therein must
2051 be absolute.
2052 filename : `str` or `TextIO`, optional
2053 A stream or name of file that contains database information
2054 associated with the exported datasets, typically generated by
2055 `~lsst.daf.butler.Butler.export`. If this a string (name) and
2056 is not an absolute path, does not exist in the current working
2057 directory, and ``directory`` is not `None`, it is assumed to be in
2058 ``directory``. Defaults to "export.{format}".
2059 format : `str`, optional
2060 File format for ``filename``. If `None`, the extension of
2061 ``filename`` will be used.
2062 transfer : `str`, optional
2063 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2064 skip_dimensions : `set`, optional
2065 Names of dimensions that should be skipped and not imported.
2066 idGenerationMode : `DatasetIdGenEnum`, optional
2067 Specifies option for generating dataset IDs when IDs are not
2068 provided or their type does not match backend type. By default
2069 unique IDs are generated for each inserted dataset.
2070 reuseIds : `bool`, optional
2071 If `True` then forces re-use of imported dataset IDs for integer
2072 IDs which are normally generated as auto-incremented; exception
2073 will be raised if imported IDs clash with existing ones. This
2074 option has no effect on the use of globally-unique IDs which are
2075 always re-used (or generated if integer IDs are being imported).
2077 Raises
2078 ------
2079 TypeError
2080 Raised if the set of arguments passed is inconsistent, or if the
2081 butler is read-only.
2082 """
2083 if not self.isWriteable():
2084 raise TypeError("Butler is read-only.")
2085 if format is None:
2086 if filename is None:
2087 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2088 else:
2089 _, format = os.path.splitext(filename) # type: ignore
2090 elif filename is None:
2091 filename = f"export.{format}"
2092 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2093 filename = os.path.join(directory, filename)
2094 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2096 def doImport(importStream: TextIO) -> None:
2097 backend = BackendClass(importStream, self.registry)
2098 backend.register()
2099 with self.transaction():
2100 backend.load(
2101 self.datastore,
2102 directory=directory,
2103 transfer=transfer,
2104 skip_dimensions=skip_dimensions,
2105 idGenerationMode=idGenerationMode,
2106 reuseIds=reuseIds,
2107 )
2109 if isinstance(filename, str):
2110 with open(filename, "r") as stream:
2111 doImport(stream)
2112 else:
2113 doImport(filename)
2115 def transfer_from(
2116 self,
2117 source_butler: Butler,
2118 source_refs: Iterable[DatasetRef],
2119 transfer: str = "auto",
2120 id_gen_map: Dict[str, DatasetIdGenEnum] = None,
2121 skip_missing: bool = True,
2122 register_dataset_types: bool = False,
2123 transfer_dimensions: bool = False,
2124 ) -> List[DatasetRef]:
2125 """Transfer datasets to this Butler from a run in another Butler.
2127 Parameters
2128 ----------
2129 source_butler : `Butler`
2130 Butler from which the datasets are to be transferred.
2131 source_refs : iterable of `DatasetRef`
2132 Datasets defined in the source butler that should be transferred to
2133 this butler.
2134 transfer : `str`, optional
2135 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2136 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
2137 A mapping of dataset type to ID generation mode. Only used if
2138 the source butler is using integer IDs. Should not be used
2139 if this receiving butler uses integer IDs. Without this dataset
2140 import always uses unique.
2141 skip_missing : `bool`
2142 If `True`, datasets with no datastore artifact associated with
2143 them are not transferred. If `False` a registry entry will be
2144 created even if no datastore record is created (and so will
2145 look equivalent to the dataset being unstored).
2146 register_dataset_types : `bool`
2147 If `True` any missing dataset types are registered. Otherwise
2148 an exception is raised.
2149 transfer_dimensions : `bool`, optional
2150 If `True`, dimension record data associated with the new datasets
2151 will be transferred.
2153 Returns
2154 -------
2155 refs : `list` of `DatasetRef`
2156 The refs added to this Butler.
2158 Notes
2159 -----
2160 Requires that any dimension definitions are already present in the
2161 receiving Butler. The datastore artifact has to exist for a transfer
2162 to be made but non-existence is not an error.
2164 Datasets that already exist in this run will be skipped.
2166 The datasets are imported as part of a transaction, although
2167 dataset types are registered before the transaction is started.
2168 This means that it is possible for a dataset type to be registered
2169 even though transfer has failed.
2170 """
2171 if not self.isWriteable():
2172 raise TypeError("Butler is read-only.")
2173 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2175 # Will iterate through the refs multiple times so need to convert
2176 # to a list if this isn't a collection.
2177 if not isinstance(source_refs, collections.abc.Collection):
2178 source_refs = list(source_refs)
2180 original_count = len(source_refs)
2181 log.info("Transferring %d datasets into %s", original_count, str(self))
2183 if id_gen_map is None:
2184 id_gen_map = {}
2186 # In some situations the datastore artifact may be missing
2187 # and we do not want that registry entry to be imported.
2188 # Asking datastore is not sufficient, the records may have been
2189 # purged, we have to ask for the (predicted) URI and check
2190 # existence explicitly. Execution butler is set up exactly like
2191 # this with no datastore records.
2192 artifact_existence: Dict[ResourcePath, bool] = {}
2193 if skip_missing:
2194 dataset_existence = source_butler.datastore.mexists(
2195 source_refs, artifact_existence=artifact_existence
2196 )
2197 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2198 filtered_count = len(source_refs)
2199 log.verbose(
2200 "%d datasets removed because the artifact does not exist. Now have %d.",
2201 original_count - filtered_count,
2202 filtered_count,
2203 )
2205 # Importing requires that we group the refs by dataset type and run
2206 # before doing the import.
2207 source_dataset_types = set()
2208 grouped_refs = defaultdict(list)
2209 grouped_indices = defaultdict(list)
2210 for i, ref in enumerate(source_refs):
2211 grouped_refs[ref.datasetType, ref.run].append(ref)
2212 grouped_indices[ref.datasetType, ref.run].append(i)
2213 source_dataset_types.add(ref.datasetType)
2215 # Check to see if the dataset type in the source butler has
2216 # the same definition in the target butler and register missing
2217 # ones if requested. Registration must happen outside a transaction.
2218 newly_registered_dataset_types = set()
2219 for datasetType in source_dataset_types:
2220 if register_dataset_types:
2221 # Let this raise immediately if inconsistent. Continuing
2222 # on to find additional inconsistent dataset types
2223 # might result in additional unwanted dataset types being
2224 # registered.
2225 if self.registry.registerDatasetType(datasetType):
2226 newly_registered_dataset_types.add(datasetType)
2227 else:
2228 # If the dataset type is missing, let it fail immediately.
2229 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2230 if target_dataset_type != datasetType:
2231 raise ConflictingDefinitionError(
2232 "Source butler dataset type differs from definition"
2233 f" in target butler: {datasetType} !="
2234 f" {target_dataset_type}"
2235 )
2236 if newly_registered_dataset_types:
2237 # We may have registered some even if there were inconsistencies
2238 # but should let people know (or else remove them again).
2239 log.log(
2240 VERBOSE,
2241 "Registered the following dataset types in the target Butler: %s",
2242 ", ".join(d.name for d in newly_registered_dataset_types),
2243 )
2244 else:
2245 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2247 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2248 if transfer_dimensions:
2249 # Collect all the dimension records for these refs.
2250 # All dimensions are to be copied but the list of valid dimensions
2251 # come from this butler's universe.
2252 elements = frozenset(
2253 element
2254 for element in self.registry.dimensions.getStaticElements()
2255 if element.hasTable() and element.viewOf is None
2256 )
2257 dataIds = set(ref.dataId for ref in source_refs)
2258 # This logic comes from saveDataIds.
2259 for dataId in dataIds:
2260 # Should be a no-op if the ref has already been expanded.
2261 dataId = source_butler.registry.expandDataId(dataId)
2262 # If this butler doesn't know about a dimension in the source
2263 # butler things will break later.
2264 for record in dataId.records.values():
2265 if record is not None and record.definition in elements:
2266 dimension_records[record.definition].setdefault(record.dataId, record)
2268 # The returned refs should be identical for UUIDs.
2269 # For now must also support integers and so need to retain the
2270 # newly-created refs from this registry.
2271 # Pre-size it so we can assign refs into the correct slots
2272 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2273 default_id_gen = DatasetIdGenEnum.UNIQUE
2275 handled_collections: Set[str] = set()
2277 # Do all the importing in a single transaction.
2278 with self.transaction():
2279 if dimension_records:
2280 log.verbose("Ensuring that dimension records exist for transferred datasets.")
2281 for element, r in dimension_records.items():
2282 records = [r[dataId] for dataId in r]
2283 # Assume that if the record is already present that we can
2284 # use it without having to check that the record metadata
2285 # is consistent.
2286 self.registry.insertDimensionData(element, *records, skip_existing=True)
2288 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2289 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2290 ):
2291 if run not in handled_collections:
2292 run_doc = source_butler.registry.getCollectionDocumentation(run)
2293 registered = self.registry.registerRun(run, doc=run_doc)
2294 handled_collections.add(run)
2295 if registered:
2296 log.log(VERBOSE, "Creating output run %s", run)
2298 id_generation_mode = default_id_gen
2299 if isinstance(refs_to_import[0].id, int):
2300 # ID generation mode might need to be overridden when
2301 # targetting UUID
2302 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2304 n_refs = len(refs_to_import)
2305 log.verbose(
2306 "Importing %d ref%s of dataset type %s into run %s",
2307 n_refs,
2308 "" if n_refs == 1 else "s",
2309 datasetType.name,
2310 run,
2311 )
2313 # No way to know if this butler's registry uses UUID.
2314 # We have to trust the caller on this. If it fails they will
2315 # have to change their approach. We can't catch the exception
2316 # and retry with unique because that will mess up the
2317 # transaction handling. We aren't allowed to ask the registry
2318 # manager what type of ID it is using.
2319 imported_refs = self.registry._importDatasets(
2320 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2321 )
2323 # Map them into the correct slots to match the initial order
2324 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2325 transferred_refs_tmp[i] = ref
2327 # Mypy insists that we might have None in here so we have to make
2328 # that explicit by assigning to a new variable and filtering out
2329 # something that won't be there.
2330 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2332 # Check consistency
2333 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2335 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2337 # The transferred refs need to be reordered to match the original
2338 # ordering given by the caller. Without this the datastore transfer
2339 # will be broken.
2341 # Ask the datastore to transfer. The datastore has to check that
2342 # the source datastore is compatible with the target datastore.
2343 self.datastore.transfer_from(
2344 source_butler.datastore,
2345 source_refs,
2346 local_refs=transferred_refs,
2347 transfer=transfer,
2348 artifact_existence=artifact_existence,
2349 )
2351 return transferred_refs
2353 def validateConfiguration(
2354 self,
2355 logFailures: bool = False,
2356 datasetTypeNames: Optional[Iterable[str]] = None,
2357 ignore: Iterable[str] = None,
2358 ) -> None:
2359 """Validate butler configuration.
2361 Checks that each `DatasetType` can be stored in the `Datastore`.
2363 Parameters
2364 ----------
2365 logFailures : `bool`, optional
2366 If `True`, output a log message for every validation error
2367 detected.
2368 datasetTypeNames : iterable of `str`, optional
2369 The `DatasetType` names that should be checked. This allows
2370 only a subset to be selected.
2371 ignore : iterable of `str`, optional
2372 Names of DatasetTypes to skip over. This can be used to skip
2373 known problems. If a named `DatasetType` corresponds to a
2374 composite, all components of that `DatasetType` will also be
2375 ignored.
2377 Raises
2378 ------
2379 ButlerValidationError
2380 Raised if there is some inconsistency with how this Butler
2381 is configured.
2382 """
2383 if datasetTypeNames:
2384 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2385 else:
2386 datasetTypes = list(self.registry.queryDatasetTypes())
2388 # filter out anything from the ignore list
2389 if ignore:
2390 ignore = set(ignore)
2391 datasetTypes = [
2392 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2393 ]
2394 else:
2395 ignore = set()
2397 # Find all the registered instruments
2398 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2400 # For each datasetType that has an instrument dimension, create
2401 # a DatasetRef for each defined instrument
2402 datasetRefs = []
2404 for datasetType in datasetTypes:
2405 if "instrument" in datasetType.dimensions:
2406 for instrument in instruments:
2407 datasetRef = DatasetRef(
2408 datasetType, {"instrument": instrument}, conform=False # type: ignore
2409 )
2410 datasetRefs.append(datasetRef)
2412 entities: List[Union[DatasetType, DatasetRef]] = []
2413 entities.extend(datasetTypes)
2414 entities.extend(datasetRefs)
2416 datastoreErrorStr = None
2417 try:
2418 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2419 except ValidationError as e:
2420 datastoreErrorStr = str(e)
2422 # Also check that the LookupKeys used by the datastores match
2423 # registry and storage class definitions
2424 keys = self.datastore.getLookupKeys()
2426 failedNames = set()
2427 failedDataId = set()
2428 for key in keys:
2429 if key.name is not None:
2430 if key.name in ignore:
2431 continue
2433 # skip if specific datasetType names were requested and this
2434 # name does not match
2435 if datasetTypeNames and key.name not in datasetTypeNames:
2436 continue
2438 # See if it is a StorageClass or a DatasetType
2439 if key.name in self.storageClasses:
2440 pass
2441 else:
2442 try:
2443 self.registry.getDatasetType(key.name)
2444 except KeyError:
2445 if logFailures:
2446 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2447 failedNames.add(key)
2448 else:
2449 # Dimensions are checked for consistency when the Butler
2450 # is created and rendezvoused with a universe.
2451 pass
2453 # Check that the instrument is a valid instrument
2454 # Currently only support instrument so check for that
2455 if key.dataId:
2456 dataIdKeys = set(key.dataId)
2457 if set(["instrument"]) != dataIdKeys:
2458 if logFailures:
2459 log.critical("Key '%s' has unsupported DataId override", key)
2460 failedDataId.add(key)
2461 elif key.dataId["instrument"] not in instruments:
2462 if logFailures:
2463 log.critical("Key '%s' has unknown instrument", key)
2464 failedDataId.add(key)
2466 messages = []
2468 if datastoreErrorStr:
2469 messages.append(datastoreErrorStr)
2471 for failed, msg in (
2472 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2473 (failedDataId, "Keys with bad DataId entries: "),
2474 ):
2475 if failed:
2476 msg += ", ".join(str(k) for k in failed)
2477 messages.append(msg)
2479 if messages:
2480 raise ValidationError(";\n".join(messages))
2482 @property
2483 def collections(self) -> CollectionSearch:
2484 """The collections to search by default, in order (`CollectionSearch`).
2486 This is an alias for ``self.registry.defaults.collections``. It cannot
2487 be set directly in isolation, but all defaults may be changed together
2488 by assigning a new `RegistryDefaults` instance to
2489 ``self.registry.defaults``.
2490 """
2491 return self.registry.defaults.collections
2493 @property
2494 def run(self) -> Optional[str]:
2495 """Name of the run this butler writes outputs to by default (`str` or
2496 `None`).
2498 This is an alias for ``self.registry.defaults.run``. It cannot be set
2499 directly in isolation, but all defaults may be changed together by
2500 assigning a new `RegistryDefaults` instance to
2501 ``self.registry.defaults``.
2502 """
2503 return self.registry.defaults.run
2505 @property
2506 def dimensions(self) -> DimensionUniverse:
2507 # Docstring inherited.
2508 return self.registry.dimensions
2510 registry: Registry
2511 """The object that manages dataset metadata and relationships (`Registry`).
2513 Most operations that don't involve reading or writing butler datasets are
2514 accessible only via `Registry` methods.
2515 """
2517 datastore: Datastore
2518 """The object that manages actual dataset storage (`Datastore`).
2520 Direct user access to the datastore should rarely be necessary; the primary
2521 exception is the case where a `Datastore` implementation provides extra
2522 functionality beyond what the base class defines.
2523 """
2525 storageClasses: StorageClassFactory
2526 """An object that maps known storage class names to objects that fully
2527 describe them (`StorageClassFactory`).
2528 """
2530 _allow_put_of_predefined_dataset: bool
2531 """Allow a put to succeed even if there is already a registry entry for it
2532 but not a datastore record. (`bool`)."""