Coverage for python/lsst/daf/butler/_butler.py: 8%
674 statements
coverage.py v6.5.0, created at 2023-06-06 09:38 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30)
32import collections.abc
33import contextlib
34import io
35import logging
36import numbers
37import os
38import warnings
39from collections import defaultdict
40from typing import (
41 TYPE_CHECKING,
42 Any,
43 ClassVar,
44 Counter,
45 Dict,
46 Iterable,
47 Iterator,
48 List,
49 MutableMapping,
50 Optional,
51 Sequence,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59from deprecated.sphinx import deprecated
60from lsst.resources import ResourcePath, ResourcePathExpression
61from lsst.utils import doImportType
62from lsst.utils.introspection import get_class_of
63from lsst.utils.logging import VERBOSE, getLogger
64from sqlalchemy.exc import IntegrityError
66from ._butlerConfig import ButlerConfig
67from ._butlerRepoIndex import ButlerRepoIndex
68from ._deferredDatasetHandle import DeferredDatasetHandle
69from ._limited_butler import LimitedButler
70from .core import (
71 Config,
72 ConfigSubset,
73 DataCoordinate,
74 DataId,
75 DataIdValue,
76 DatasetIdGenEnum,
77 DatasetRef,
78 DatasetRefURIs,
79 DatasetType,
80 Datastore,
81 Dimension,
82 DimensionConfig,
83 DimensionElement,
84 DimensionRecord,
85 DimensionUniverse,
86 FileDataset,
87 Progress,
88 StorageClass,
89 StorageClassFactory,
90 Timespan,
91 ValidationError,
92)
93from .core.repoRelocation import BUTLER_ROOT_TAG
94from .core.utils import transactional
95from .registry import (
96 CollectionType,
97 ConflictingDefinitionError,
98 DataIdError,
99 MissingDatasetTypeError,
100 Registry,
101 RegistryConfig,
102 RegistryDefaults,
103)
104from .transfers import RepoExportContext
106if TYPE_CHECKING:
107 from lsst.resources import ResourceHandleProtocol
109 from .transfers import RepoImportBackend
111log = getLogger(__name__)
114class ButlerValidationError(ValidationError):
115 """There is a problem with the Butler configuration."""
117 pass
120class Butler(LimitedButler):
121 """Main entry point for the data access system.
123 Parameters
124 ----------
125 config : `ButlerConfig`, `Config` or `str`, optional
126 Configuration. Anything acceptable to the
127 `ButlerConfig` constructor. If a directory path
128 is given the configuration will be read from a ``butler.yaml`` file in
129 that location. If `None` is given default values will be used.
130 butler : `Butler`, optional
131 If provided, construct a new Butler that uses the same registry and
132 datastore as the given one, but with the given collection and run.
133 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
134 arguments.
135 collections : `str` or `Iterable` [ `str` ], optional
136 An expression specifying the collections to be searched (in order) when
137 reading datasets.
138 This may be a `str` collection name or an iterable thereof.
139 See :ref:`daf_butler_collection_expressions` for more information.
140 These collections are not registered automatically and must be
141 manually registered before they are used by any method, but they may be
142 manually registered after the `Butler` is initialized.
143 run : `str`, optional
144 Name of the `~CollectionType.RUN` collection new datasets should be
145 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
146 ``collections`` will be set to ``[run]``. If not `None`, this
147 collection will automatically be registered. If this is not set (and
148 ``writeable`` is not set either), a read-only butler will be created.
149 searchPaths : `list` of `str`, optional
150 Directory paths to search when calculating the full Butler
151 configuration. Not used if the supplied config is already a
152 `ButlerConfig`.
153 writeable : `bool`, optional
154 Explicitly sets whether the butler supports write operations. If not
155 provided, a read-write butler is created if any of ``run``, ``tags``,
156 or ``chains`` is non-empty.
157 inferDefaults : `bool`, optional
158 If `True` (default) infer default data ID values from the values
159 present in the datasets in ``collections``: if all collections have the
160 same value (or no value) for a governor dimension, that value will be
161 the default for that dimension. Nonexistent collections are ignored.
162 If a default value is provided explicitly for a governor dimension via
163 ``**kwargs``, no default will be inferred for that dimension.
164 **kwargs : `str`
165 Default data ID key-value pairs. These may only identify "governor"
166 dimensions like ``instrument`` and ``skymap``.
168 Examples
169 --------
170 While there are many ways to control exactly how a `Butler` interacts with
171 the collections in its `Registry`, the most common cases are still simple.
173 For a read-only `Butler` that searches one collection, do::
175 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
177 For a read-write `Butler` that writes to and reads from a
178 `~CollectionType.RUN` collection::
180 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
182 The `Butler` passed to a ``PipelineTask`` is often much more complex,
183 because we want to write to one `~CollectionType.RUN` collection but read
184 from several others (as well)::
186 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
187 collections=["u/alice/DM-50000/a",
188 "u/bob/DM-49998",
189 "HSC/defaults"])
191 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
192 Datasets will be read first from that run (since it appears first in the
193 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
195 Finally, one can always create a `Butler` with no collections::
197 butler = Butler("/path/to/repo", writeable=True)
199 This can be extremely useful when you just want to use ``butler.registry``,
200 e.g. for inserting dimension data or managing collections, or when the
201 collections you want to use with the butler are not consistent.
202 Passing ``writeable`` explicitly here is only necessary if you want to be
203 able to make changes to the repo - usually the value for ``writeable`` can
204 be guessed from the collection arguments provided, but it defaults to
205 `False` when there are no collection arguments.
206 """
208 def __init__(
209 self,
210 config: Union[Config, ResourcePathExpression, None] = None,
211 *,
212 butler: Optional[Butler] = None,
213 collections: Any = None,
214 run: Optional[str] = None,
215 searchPaths: Optional[Sequence[ResourcePathExpression]] = None,
216 writeable: Optional[bool] = None,
217 inferDefaults: bool = True,
218 **kwargs: str,
219 ):
220 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
221 # Load registry, datastore, etc. from config or existing butler.
222 if butler is not None:
223 if config is not None or searchPaths is not None or writeable is not None:
224 raise TypeError(
225 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
226 )
227 self.registry = butler.registry.copy(defaults)
228 self.datastore = butler.datastore
229 self.storageClasses = butler.storageClasses
230 self._config: ButlerConfig = butler._config
231 else:
232 # Can only look for strings in the known repos list.
233 if isinstance(config, str):
234 # Somehow ButlerConfig fails in some cases if config is a
235 # ResourcePath, force it back to string here.
236 config = str(self.get_repo_uri(config, True))
237 try:
238 self._config = ButlerConfig(config, searchPaths=searchPaths)
239 except FileNotFoundError as e:
240 if known := self.get_known_repos():
241 aliases = f"(known aliases: {', '.join(known)})"
242 else:
243 aliases = "(no known aliases)"
244 raise FileNotFoundError(f"{e} {aliases}") from e
245 try:
246 if "root" in self._config:
247 butlerRoot = self._config["root"]
248 else:
249 butlerRoot = self._config.configDir
250 if writeable is None:
251 writeable = run is not None
252 self.registry = Registry.fromConfig(
253 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
254 )
255 self.datastore = Datastore.fromConfig(
256 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
257 )
258 self.storageClasses = StorageClassFactory()
259 self.storageClasses.addFromConfig(self._config)
260 except Exception:
261 # Failures here usually mean that configuration is incomplete,
262 # just issue an error message which includes config file URI.
263 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
264 raise
266 # For execution butler the datastore needs a special
267 # dependency-inversion trick. This is not used by regular butler,
268 # but we do not have a way to distinguish regular butler from execution
269 # butler.
270 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
272 if "run" in self._config or "collection" in self._config:
273 raise ValueError("Passing a run or collection via configuration is no longer supported.")
275 GENERATION: ClassVar[int] = 3
276 """This is a Generation 3 Butler.
278 This attribute may be removed in the future, once the Generation 2 Butler
279 interface has been fully retired; it should only be used in transitional
280 code.
281 """
283 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
284 """Return DatasetType defined in registry given dataset type name."""
285 try:
286 return self.registry.getDatasetType(name)
287 except MissingDatasetTypeError:
288 return None
290 @classmethod
291 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
292 """Look up the label in a butler repository index.
294 Parameters
295 ----------
296 label : `str`
297 Label of the Butler repository to look up.
298 return_label : `bool`, optional
299 If ``label`` cannot be found in the repository index (either
300 because the index is not defined or ``label`` is not in the index) and
301 ``return_label`` is `True` then return ``ResourcePath(label)``.
302 If ``return_label`` is `False` (default) then an exception will be
303 raised instead.
305 Returns
306 -------
307 uri : `lsst.resources.ResourcePath`
308 URI to the Butler repository associated with the given label or
309 default value if it is provided.
311 Raises
312 ------
313 KeyError
314 Raised if the label is not found in the index, or if an index
315 is not defined, and ``return_label`` is `False`.
317 Notes
318 -----
319 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
320 information is discovered.
321 """
322 return ButlerRepoIndex.get_repo_uri(label, return_label)
324 @classmethod
325 def get_known_repos(cls) -> Set[str]:
326 """Retrieve the list of known repository labels.
328 Returns
329 -------
330 repos : `set` of `str`
331 All the known labels. Can be empty if no index can be found.
333 Notes
334 -----
335 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
336 information is discovered.
337 """
338 return ButlerRepoIndex.get_known_repos()
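# --- Usage sketch (editor's illustration, not part of _butler.py) ---
# Resolving repository labels before constructing a Butler. This assumes a
# repository index has been configured; the label "main" is hypothetical.
#
#     from lsst.daf.butler import Butler
#
#     print(Butler.get_known_repos())               # set of labels, possibly empty
#     uri = Butler.get_repo_uri("main")             # raises KeyError if unknown
#     uri = Butler.get_repo_uri("/path/to/repo", return_label=True)  # falls back to the path
#     butler = Butler(str(uri))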
340 @staticmethod
341 def makeRepo(
342 root: ResourcePathExpression,
343 config: Union[Config, str, None] = None,
344 dimensionConfig: Union[Config, str, None] = None,
345 standalone: bool = False,
346 searchPaths: Optional[List[str]] = None,
347 forceConfigRoot: bool = True,
348 outfile: Optional[ResourcePathExpression] = None,
349 overwrite: bool = False,
350 ) -> Config:
351 """Create an empty data repository by adding a butler.yaml config
352 to a repository root directory.
354 Parameters
355 ----------
356 root : `lsst.resources.ResourcePathExpression`
357 Path or URI to the root location of the new repository. Will be
358 created if it does not exist.
359 config : `Config` or `str`, optional
360 Configuration to write to the repository, after setting any
361 root-dependent Registry or Datastore config options. Cannot
362 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
363 configuration will be used. Root-dependent config options
364 specified in this config are overwritten if ``forceConfigRoot``
365 is `True`.
366 dimensionConfig : `Config` or `str`, optional
367 Configuration for dimensions, will be used to initialize registry
368 database.
369 standalone : `bool`
370 If True, write all expanded defaults, not just customized or
371 repository-specific settings.
372 This (mostly) decouples the repository from the default
373 configuration, insulating it from changes to the defaults (which
374 may be good or bad, depending on the nature of the changes).
375 Future *additions* to the defaults will still be picked up when
376 initializing `Butlers` to repos created with ``standalone=True``.
377 searchPaths : `list` of `str`, optional
378 Directory paths to search when calculating the full butler
379 configuration.
380 forceConfigRoot : `bool`, optional
381 If `False`, any values present in the supplied ``config`` that
382 would normally be reset are not overridden and will appear
383 directly in the output config. This allows non-standard overrides
384 of the root directory for a datastore or registry to be given.
385 If this parameter is `True` the values for ``root`` will be
386 forced into the resulting config if appropriate.
387 outfile : `lsst.resources.ResourcePathExpression`, optional
388 If not-`None`, the output configuration will be written to this
389 location rather than into the repository itself. Can be a URI
390 string. Can refer to a directory that will be used to write
391 ``butler.yaml``.
392 overwrite : `bool`, optional
393 Create a new configuration file even if one already exists
394 in the specified output location. Default is to raise
395 an exception.
397 Returns
398 -------
399 config : `Config`
400 The updated `Config` instance written to the repo.
402 Raises
403 ------
404 ValueError
405 Raised if a ButlerConfig or ConfigSubset is passed instead of a
406 regular Config (as these subclasses would make it impossible to
407 support ``standalone=False``).
408 FileExistsError
409 Raised if the output config file already exists.
410 os.error
411 Raised if the directory does not exist, exists but is not a
412 directory, or cannot be created.
414 Notes
415 -----
416 Note that when ``standalone=False`` (the default), the configuration
417 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
418 construct the repository should also be used to construct any Butlers
419 to avoid configuration inconsistencies.
420 """
421 if isinstance(config, (ButlerConfig, ConfigSubset)):
422 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
424 # Ensure that the root of the repository exists or can be made
425 root_uri = ResourcePath(root, forceDirectory=True)
426 root_uri.mkdir()
428 config = Config(config)
430 # If we are creating a new repo from scratch with relative roots,
431 # do not propagate an explicit root from the config file
432 if "root" in config:
433 del config["root"]
435 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
436 imported_class = doImportType(full["datastore", "cls"])
437 if not issubclass(imported_class, Datastore):
438 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
439 datastoreClass: Type[Datastore] = imported_class
440 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
442 # if key exists in given config, parse it, otherwise parse the defaults
443 # in the expanded config
444 if config.get(("registry", "db")):
445 registryConfig = RegistryConfig(config)
446 else:
447 registryConfig = RegistryConfig(full)
448 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
449 if defaultDatabaseUri is not None:
450 Config.updateParameters(
451 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
452 )
453 else:
454 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
456 if standalone:
457 config.merge(full)
458 else:
459 # Always expand the registry.managers section into the per-repo
460 # config, because after the database schema is created, it's not
461 # allowed to change anymore. Note that in the standalone=True
462 # branch, _everything_ in the config is expanded, so there's no
463 # need to special case this.
464 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
465 configURI: ResourcePathExpression
466 if outfile is not None:
467 # When writing to a separate location we must include
468 # the root of the butler repo in the config else it won't know
469 # where to look.
470 config["root"] = root_uri.geturl()
471 configURI = outfile
472 else:
473 configURI = root_uri
474 # Strip obscore configuration, if it is present, before writing config
475 # to a file, obscore config will be stored in registry.
476 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
477 config_to_write = config.copy()
478 del config_to_write[obscore_config_key]
479 config_to_write.dumpToUri(configURI, overwrite=overwrite)
480 # The configFile attribute is updated; copy it back to the original.
481 config.configFile = config_to_write.configFile
482 else:
483 config.dumpToUri(configURI, overwrite=overwrite)
485 # Create Registry and populate tables
486 registryConfig = RegistryConfig(config.get("registry"))
487 dimensionConfig = DimensionConfig(dimensionConfig)
488 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri)
490 log.verbose("Wrote new Butler configuration file to %s", configURI)
492 return config
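# --- Usage sketch (editor's illustration, not part of _butler.py) ---
# Creating an empty repository and opening it. The path is hypothetical;
# makeRepo writes butler.yaml under the root and initializes the registry.
#
#     from lsst.daf.butler import Butler
#
#     Butler.makeRepo("/path/to/repo")
#     butler = Butler("/path/to/repo", writeable=True)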
494 @classmethod
495 def _unpickle(
496 cls,
497 config: ButlerConfig,
498 collections: Optional[tuple[str, ...]],
499 run: Optional[str],
500 defaultDataId: Dict[str, str],
501 writeable: bool,
502 ) -> Butler:
503 """Callable used to unpickle a Butler.
505 We prefer not to use ``Butler.__init__`` directly so we can force some
506 of its many arguments to be keyword-only (note that ``__reduce__``
507 can only invoke callables with positional arguments).
509 Parameters
510 ----------
511 config : `ButlerConfig`
512 Butler configuration, already coerced into a true `ButlerConfig`
513 instance (and hence after any search paths for overrides have been
514 utilized).
515 collections : `tuple` [ `str` ]
516 Names of the default collections to read from.
517 run : `str`, optional
518 Name of the default `~CollectionType.RUN` collection to write to.
519 defaultDataId : `dict` [ `str`, `str` ]
520 Default data ID values.
521 writeable : `bool`
522 Whether the Butler should support write operations.
524 Returns
525 -------
526 butler : `Butler`
527 A new `Butler` instance.
528 """
529 # MyPy doesn't recognize that the kwargs below are totally valid; it
530 # seems to think ``**defaultDataId`` is a _positional_ argument!
531 return cls(
532 config=config,
533 collections=collections,
534 run=run,
535 writeable=writeable,
536 **defaultDataId, # type: ignore
537 )
539 def __reduce__(self) -> tuple:
540 """Support pickling."""
541 return (
542 Butler._unpickle,
543 (
544 self._config,
545 self.collections,
546 self.run,
547 self.registry.defaults.dataId.byName(),
548 self.registry.isWriteable(),
549 ),
550 )
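# --- Usage sketch (editor's illustration, not part of _butler.py) ---
# Because __reduce__ is defined, a Butler can be pickled and recreated with
# the same configuration, default collections/run, and writeability.
#
#     import pickle
#
#     clone = pickle.loads(pickle.dumps(butler))
#     assert clone.collections == butler.collections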
552 def __str__(self) -> str:
553 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
554 self.collections, self.run, self.datastore, self.registry
555 )
557 def isWriteable(self) -> bool:
558 """Return `True` if this `Butler` supports write operations."""
559 return self.registry.isWriteable()
561 @contextlib.contextmanager
562 def transaction(self) -> Iterator[None]:
563 """Context manager supporting `Butler` transactions.
565 Transactions can be nested.
566 """
567 with self.registry.transaction():
568 with self.datastore.transaction():
569 yield
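# --- Usage sketch (editor's illustration, not part of _butler.py) ---
# Grouping registry and datastore operations so a failure rolls both back.
# The dataset type names, data ID, and run are hypothetical.
#
#     with butler.transaction():
#         butler.put(catalog, "srcCatalog", dataId, run="u/alice/DM-50000/a")
#         butler.put(summary, "srcSummary", dataId, run="u/alice/DM-50000/a")
#     # If the second put raises, neither dataset is registered or stored.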
571 def _standardizeArgs(
572 self,
573 datasetRefOrType: Union[DatasetRef, DatasetType, str],
574 dataId: Optional[DataId] = None,
575 for_put: bool = True,
576 **kwargs: Any,
577 ) -> Tuple[DatasetType, Optional[DataId]]:
578 """Standardize the arguments passed to several Butler APIs.
580 Parameters
581 ----------
582 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
583 When `DatasetRef` the `dataId` should be `None`.
584 Otherwise the `DatasetType` or name thereof.
585 dataId : `dict` or `DataCoordinate`
586 A `dict` of `Dimension` link name, value pairs that label the
587 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
588 should be provided as the second argument.
589 for_put : `bool`, optional
590 If `True` this call is invoked as part of a `Butler.put()`.
591 Otherwise it is assumed to be part of a `Butler.get()`. This
592 parameter is only relevant if there is dataset type
593 inconsistency.
594 **kwargs
595 Additional keyword arguments used to augment or construct a
596 `DataCoordinate`. See `DataCoordinate.standardize`
597 parameters.
599 Returns
600 -------
601 datasetType : `DatasetType`
602 A `DatasetType` instance extracted from ``datasetRefOrType``.
603 dataId : `dict` or `DataId`, optional
604 Argument that can be used (along with ``kwargs``) to construct a
605 `DataId`.
607 Notes
608 -----
609 Butler APIs that conceptually need a DatasetRef also allow passing a
610 `DatasetType` (or the name of one) and a `DataId` (or a dict and
611 keyword arguments that can be used to construct one) separately. This
612 method accepts those arguments and always returns a true `DatasetType`
613 and a `DataId` or `dict`.
615 Standardization of `dict` vs `DataId` is best handled by passing the
616 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
617 generally similarly flexible.
618 """
619 externalDatasetType: Optional[DatasetType] = None
620 internalDatasetType: Optional[DatasetType] = None
621 if isinstance(datasetRefOrType, DatasetRef):
622 if dataId is not None or kwargs:
623 raise ValueError("DatasetRef given, cannot use dataId as well")
624 externalDatasetType = datasetRefOrType.datasetType
625 dataId = datasetRefOrType.dataId
626 else:
627 # Don't check whether DataId is provided, because Registry APIs
628 # can usually construct a better error message when it wasn't.
629 if isinstance(datasetRefOrType, DatasetType):
630 externalDatasetType = datasetRefOrType
631 else:
632 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
634 # Check that they are self-consistent
635 if externalDatasetType is not None:
636 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
637 if externalDatasetType != internalDatasetType:
638 # We can allow differences if they are compatible, depending
639 # on whether this is a get or a put. A get requires that
640 # the python type associated with the datastore can be
641 # converted to the user type. A put requires that the user
642 # supplied python type can be converted to the internal
643 # type expected by registry.
644 relevantDatasetType = internalDatasetType
645 if for_put:
646 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
647 else:
648 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
649 relevantDatasetType = externalDatasetType
650 if not is_compatible:
651 raise ValueError(
652 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
653 f"registry definition ({internalDatasetType})"
654 )
655 # Override the internal definition.
656 internalDatasetType = relevantDatasetType
658 assert internalDatasetType is not None
659 return internalDatasetType, dataId
661 def _rewrite_data_id(
662 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
663 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
664 """Rewrite a data ID taking into account dimension records.
666 Take a Data ID and keyword args and rewrite it if necessary to
667 allow the user to specify dimension records rather than dimension
668 primary values.
670 This allows a user to include a dataId dict with keys of
671 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
672 the integer exposure ID. It also allows a string to be given
673 for a dimension value rather than the integer ID if that is more
674 convenient. For example, rather than having to specify the
675 detector with ``detector.full_name``, a string given for ``detector``
676 will be interpreted as the full name and converted to the integer
677 value.
679 Keyword arguments can also use strings for dimensions like detector
680 and exposure, but Python does not allow them to include ``.``, so
681 the ``exposure.day_obs`` syntax cannot be used in a keyword
682 argument.
684 Parameters
685 ----------
686 dataId : `dict` or `DataCoordinate`
687 A `dict` of `Dimension` link name, value pairs that will label the
688 `DatasetRef` within a Collection.
689 datasetType : `DatasetType`
690 The dataset type associated with this dataId. Required to
691 determine the relevant dimensions.
692 **kwargs
693 Additional keyword arguments used to augment or construct a
694 `DataId`. See `DataId` parameters.
696 Returns
697 -------
698 dataId : `dict` or `DataCoordinate`
699 The possibly rewritten dataId. If given a `DataCoordinate` and
700 no keyword arguments, the original dataId will be returned
701 unchanged.
702 **kwargs : `dict`
703 Any unused keyword arguments (normally an empty dict).
704 """
705 # Do nothing if we have a standalone DataCoordinate.
706 if isinstance(dataId, DataCoordinate) and not kwargs:
707 return dataId, kwargs
709 # Process dimension records that are using record information
710 # rather than ids
711 newDataId: Dict[str, DataIdValue] = {}
712 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
714 # if all the dataId comes from keyword parameters we do not need
715 # to do anything here because they can't be of the form
716 # exposure.obs_id because a "." is not allowed in a keyword parameter.
717 if dataId:
718 for k, v in dataId.items():
719 # If we have a Dimension we do not need to do anything
720 # because it cannot be a compound key.
721 if isinstance(k, str) and "." in k:
722 # Someone is using a more human-readable dataId
723 dimensionName, record = k.split(".", 1)
724 byRecord[dimensionName][record] = v
725 elif isinstance(k, Dimension):
726 newDataId[k.name] = v
727 else:
728 newDataId[k] = v
730 # Go through the updated dataId and check the type in case someone is
731 # using an alternate key. We have already filtered out the compound
732 # keys dimensions.record format.
733 not_dimensions = {}
735 # Will need to look in the dataId and the keyword arguments
736 # and will remove them if they need to be fixed or are unrecognized.
737 for dataIdDict in (newDataId, kwargs):
738 # Use a list so we can adjust the dict safely in the loop
739 for dimensionName in list(dataIdDict):
740 value = dataIdDict[dimensionName]
741 try:
742 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
743 except KeyError:
744 # This is not a real dimension
745 not_dimensions[dimensionName] = value
746 del dataIdDict[dimensionName]
747 continue
749 # Convert an integral type to an explicit int to simplify
750 # comparisons here
751 if isinstance(value, numbers.Integral):
752 value = int(value)
754 if not isinstance(value, dimension.primaryKey.getPythonType()):
755 for alternate in dimension.alternateKeys:
756 if isinstance(value, alternate.getPythonType()):
757 byRecord[dimensionName][alternate.name] = value
758 del dataIdDict[dimensionName]
759 log.debug(
760 "Converting dimension %s to %s.%s=%s",
761 dimensionName,
762 dimensionName,
763 alternate.name,
764 value,
765 )
766 break
767 else:
768 log.warning(
769 "Type mismatch found for value '%r' provided for dimension %s. "
770 "Could not find matching alternative (primary key has type %s) "
771 "so attempting to use as-is.",
772 value,
773 dimensionName,
774 dimension.primaryKey.getPythonType(),
775 )
777 # By this point kwargs and newDataId should only include valid
778 # dimensions. Merge kwargs in to the new dataId and log if there
779 # are dimensions in both (rather than calling update).
780 for k, v in kwargs.items():
781 if k in newDataId and newDataId[k] != v:
782 log.debug(
783 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
784 )
785 newDataId[k] = v
786 # No need to retain any values in kwargs now.
787 kwargs = {}
789 # If we have some unrecognized dimensions we have to try to connect
790 # them to records in other dimensions. This is made more complicated
791 # by some dimensions having records with clashing names. A mitigation
792 # is that we can tell by this point which dimensions are missing
793 # for the DatasetType but this does not work for calibrations
794 # where additional dimensions can be used to constrain the temporal
795 # axis.
796 if not_dimensions:
797 # Search for all dimensions even if we have been given a value
798 # explicitly. In some cases records are given as well as the
799 # actual dimension, and this should not be an error if they
800 # match.
801 mandatoryDimensions = datasetType.dimensions.names # - provided
803 candidateDimensions: Set[str] = set()
804 candidateDimensions.update(mandatoryDimensions)
806 # For calibrations we may well need temporal dimensions,
807 # so rather than always including all dimensions in the scan,
808 # restrict things a little. It is still possible for there
809 # to be confusion over day_obs in visit vs exposure for example.
810 # If we are not searching calibration collections things may
811 # fail but they are going to fail anyway because of the
812 # ambiguity of the dataId...
813 if datasetType.isCalibration():
814 for dim in self.registry.dimensions.getStaticDimensions():
815 if dim.temporal:
816 candidateDimensions.add(str(dim))
818 # Look up table for the first association with a dimension
819 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
821 # Keep track of whether an item is associated with multiple
822 # dimensions.
823 counter: Counter[str] = Counter()
824 assigned: Dict[str, Set[str]] = defaultdict(set)
826 # Go through the missing dimensions and associate the
827 # given names with records within those dimensions
828 matched_dims = set()
829 for dimensionName in candidateDimensions:
830 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
831 fields = dimension.metadata.names | dimension.uniqueKeys.names
832 for field in not_dimensions:
833 if field in fields:
834 guessedAssociation[dimensionName][field] = not_dimensions[field]
835 counter[dimensionName] += 1
836 assigned[field].add(dimensionName)
837 matched_dims.add(field)
839 # Calculate the fields that matched nothing.
840 never_found = set(not_dimensions) - matched_dims
842 if never_found:
843 raise ValueError(f"Unrecognized keyword args given: {never_found}")
845 # There is a chance we have allocated a single dataId item
846 # to multiple dimensions. Need to decide which should be retained.
847 # For now assume that the most popular alternative wins.
848 # This means that day_obs with seq_num will result in
849 # exposure.day_obs and not visit.day_obs
850 # Also prefer an explicitly missing dimension over an inferred
851 # temporal dimension.
852 for fieldName, assignedDimensions in assigned.items():
853 if len(assignedDimensions) > 1:
854 # Pick the most popular (preferring mandatory dimensions)
855 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
856 if requiredButMissing:
857 candidateDimensions = requiredButMissing
858 else:
859 candidateDimensions = assignedDimensions
861 # If this is a choice between visit and exposure and
862 # neither was a required part of the dataset type,
863 # (hence in this branch) always prefer exposure over
864 # visit since exposures are always defined and visits
865 # are defined from exposures.
866 if candidateDimensions == {"exposure", "visit"}:
867 candidateDimensions = {"exposure"}
869 # Select the relevant items and get a new restricted
870 # counter.
871 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
872 duplicatesCounter: Counter[str] = Counter()
873 duplicatesCounter.update(theseCounts)
875 # Choose the most common. If they are equally common
876 # we will pick the one that was found first.
877 # (most_common() returns a list of (name, count) tuples).
878 selected = duplicatesCounter.most_common(1)[0][0]
880 log.debug(
881 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
882 " Removed ambiguity by choosing dimension %s.",
883 fieldName,
884 ", ".join(assignedDimensions),
885 selected,
886 )
888 for candidateDimension in assignedDimensions:
889 if candidateDimension != selected:
890 del guessedAssociation[candidateDimension][fieldName]
892 # Update the record look up dict with the new associations
893 for dimensionName, values in guessedAssociation.items():
894 if values: # A dict might now be empty
895 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
896 byRecord[dimensionName].update(values)
898 if byRecord:
899 # Some record specifiers were found so we need to convert
900 # them to the Id form
901 for dimensionName, values in byRecord.items():
902 if dimensionName in newDataId:
903 log.debug(
904 "DataId specified explicit %s dimension value of %s in addition to"
905 " general record specifiers for it of %s. Ignoring record information.",
906 dimensionName,
907 newDataId[dimensionName],
908 str(values),
909 )
910 # Get the actual record and compare with these values.
911 try:
912 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
913 except DataIdError:
914 raise ValueError(
915 f"Could not find dimension '{dimensionName}'"
916 f" with dataId {newDataId} as part of comparing with"
917 f" record values {byRecord[dimensionName]}"
918 ) from None
919 if len(recs) == 1:
920 errmsg: List[str] = []
921 for k, v in values.items():
922 if (recval := getattr(recs[0], k)) != v:
923 errmsg.append(f"{k}({recval} != {v})")
924 if errmsg:
925 raise ValueError(
926 f"Dimension {dimensionName} in dataId has explicit value"
927 " inconsistent with records: " + ", ".join(errmsg)
928 )
929 else:
930 # Multiple matches for an explicit dimension
931 # should never happen but let downstream complain.
932 pass
933 continue
935 # Build up a WHERE expression
936 bind = {k: v for k, v in values.items()}
937 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
939 # Hopefully we get a single record that matches
940 records = set(
941 self.registry.queryDimensionRecords(
942 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
943 )
944 )
946 if len(records) != 1:
947 if len(records) > 1:
948 # visit can have an ambiguous answer without involving
949 # visit_system. The default visit_system is defined
950 # by the instrument.
951 if (
952 dimensionName == "visit"
953 and "visit_system_membership" in self.registry.dimensions
954 and "visit_system" in self.registry.dimensions["instrument"].metadata
955 ):
956 instrument_records = list(
957 self.registry.queryDimensionRecords(
958 "instrument",
959 dataId=newDataId,
960 **kwargs,
961 )
962 )
963 if len(instrument_records) == 1:
964 visit_system = instrument_records[0].visit_system
965 if visit_system is None:
966 # Set to a value that will never match.
967 visit_system = -1
969 # Look up each visit in the
970 # visit_system_membership records.
971 for rec in records:
972 membership = list(
973 self.registry.queryDimensionRecords(
974 # Use bind to allow zero results.
975 # This is a fully-specified query.
976 "visit_system_membership",
977 where="instrument = inst AND visit_system = system AND visit = v",
978 bind=dict(
979 inst=instrument_records[0].name, system=visit_system, v=rec.id
980 ),
981 )
982 )
983 if membership:
984 # This record is the right answer.
985 records = set([rec])
986 break
988 # The ambiguity may have been resolved so check again.
989 if len(records) > 1:
990 log.debug("Received %d records from constraints of %s", len(records), str(values))
991 for r in records:
992 log.debug("- %s", str(r))
993 raise ValueError(
994 f"DataId specification for dimension {dimensionName} is not"
995 f" uniquely constrained to a single dataset by {values}."
996 f" Got {len(records)} results."
997 )
998 else:
999 raise ValueError(
1000 f"DataId specification for dimension {dimensionName} matched no"
1001 f" records when constrained by {values}"
1002 )
1004 # Get the primary key from the real dimension object
1005 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
1006 if not isinstance(dimension, Dimension):
1007 raise RuntimeError(
1008 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
1009 )
1010 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
1012 return newDataId, kwargs
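# --- Usage sketch (editor's illustration, not part of _butler.py) ---
# The data-ID rewriting above is what lets callers use dimension-record
# fields instead of primary-key values. The instrument, dataset type name,
# and record values are hypothetical.
#
#     raw = butler.get(
#         "raw",
#         dataId={
#             "instrument": "HSC",
#             "detector": "1_53",            # full_name string instead of the integer ID
#             "exposure.day_obs": 20230601,  # record fields instead of the exposure ID
#             "exposure.seq_num": 42,
#         },
#     )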
1014 def _findDatasetRef(
1015 self,
1016 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1017 dataId: Optional[DataId] = None,
1018 *,
1019 collections: Any = None,
1020 predict: bool = False,
1021 run: str | None = None,
1022 **kwargs: Any,
1023 ) -> DatasetRef:
1024 """Shared logic for methods that start with a search for a dataset in
1025 the registry.
1027 Parameters
1028 ----------
1029 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1030 When `DatasetRef` the `dataId` should be `None`.
1031 Otherwise the `DatasetType` or name thereof.
1032 dataId : `dict` or `DataCoordinate`, optional
1033 A `dict` of `Dimension` link name, value pairs that label the
1034 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1035 should be provided as the first argument.
1036 collections : Any, optional
1037 Collections to be searched, overriding ``self.collections``.
1038 Can be any of the types supported by the ``collections`` argument
1039 to butler construction.
1040 predict : `bool`, optional
1041 If `True`, return a newly created `DatasetRef` with a unique
1042 dataset ID if finding a reference in the `Registry` fails.
1043 Defaults to `False`.
1044 run : `str`, optional
1045 Run collection name to use for creating `DatasetRef` for predicted
1046 datasets. Only used if ``predict`` is `True`.
1047 **kwargs
1048 Additional keyword arguments used to augment or construct a
1049 `DataId`. See `DataId` parameters.
1051 Returns
1052 -------
1053 ref : `DatasetRef`
1054 A reference to the dataset identified by the given arguments.
1055 This can be the same dataset reference as given if it was
1056 resolved.
1058 Raises
1059 ------
1060 LookupError
1061 Raised if no matching dataset exists in the `Registry` (and
1062 ``predict`` is `False`).
1063 ValueError
1064 Raised if a resolved `DatasetRef` was passed as an input, but it
1065 differs from the one found in the registry.
1066 TypeError
1067 Raised if no collections were provided.
1068 """
1069 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
1070 if isinstance(datasetRefOrType, DatasetRef):
1071 if collections is not None:
1072 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1073 return datasetRefOrType
1074 timespan: Optional[Timespan] = None
1076 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1078 if datasetType.isCalibration():
1079 # Because this is a calibration dataset, first try to
1080 # standardize the data ID without restricting the dimensions to
1081 # those of the dataset type requested, because there may be extra
1082 # dimensions that provide temporal information for a validity-range
1083 # lookup.
1084 dataId = DataCoordinate.standardize(
1085 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1086 )
1087 if dataId.graph.temporal:
1088 dataId = self.registry.expandDataId(dataId)
1089 timespan = dataId.timespan
1090 else:
1091 # Standardize the data ID to just the dimensions of the dataset
1092 # type instead of letting registry.findDataset do it, so we get the
1093 # result even if no dataset is found.
1094 dataId = DataCoordinate.standardize(
1095 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1096 )
1097 # Always lookup the DatasetRef, even if one is given, to ensure it is
1098 # present in the current collection.
1099 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1100 if ref is None:
1101 if predict:
1102 if run is None:
1103 run = self.run
1104 if run is None:
1105 raise TypeError("Cannot predict dataset ID/location with run=None.")
1106 return DatasetRef(datasetType, dataId, run=run)
1107 else:
1108 if collections is None:
1109 collections = self.registry.defaults.collections
1110 raise LookupError(
1111 f"Dataset {datasetType.name} with data ID {dataId} "
1112 f"could not be found in collections {collections}."
1113 )
1114 if datasetType != ref.datasetType:
1115 # If they differ it is because the user explicitly specified
1116 # a compatible dataset type to this call rather than using the
1117 # registry definition. The DatasetRef must therefore be recreated
1118 # using the user definition such that the expected type is
1119 # returned.
1120 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)
1122 return ref
1124 @transactional
1125 @deprecated(
1126 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
1127 " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
1128 " were relying on the run parameter to determine the run."
1129 " Will be removed after v27.0.",
1130 version="v26.0",
1131 category=FutureWarning,
1132 )
1133 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
1134 # Docstring inherited.
1135 return self.put(obj, ref)
1137 @transactional
1138 def put(
1139 self,
1140 obj: Any,
1141 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1142 /,
1143 dataId: Optional[DataId] = None,
1144 *,
1145 run: Optional[str] = None,
1146 **kwargs: Any,
1147 ) -> DatasetRef:
1148 """Store and register a dataset.
1150 Parameters
1151 ----------
1152 obj : `object`
1153 The dataset.
1154 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1155 When `DatasetRef` is provided, ``dataId`` should be `None`.
1156 Otherwise the `DatasetType` or name thereof. If a fully resolved
1157 `DatasetRef` is given the run and ID are used directly.
1158 dataId : `dict` or `DataCoordinate`
1159 A `dict` of `Dimension` link name, value pairs that label the
1160 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1161 should be provided as the second argument.
1162 run : `str`, optional
1163 The name of the run the dataset should be added to, overriding
1164 ``self.run``. Not used if a resolved `DatasetRef` is provided.
1165 **kwargs
1166 Additional keyword arguments used to augment or construct a
1167 `DataCoordinate`. See `DataCoordinate.standardize`
1168 parameters. Not used if a resolved `DatasetRef` is provided.
1170 Returns
1171 -------
1172 ref : `DatasetRef`
1173 A reference to the stored dataset, updated with the correct id if
1174 given.
1176 Raises
1177 ------
1178 TypeError
1179 Raised if the butler is read-only or if no run has been provided.
1180 """
1181 if isinstance(datasetRefOrType, DatasetRef):
1182 # This is a direct put of predefined DatasetRef.
1183 log.debug("Butler put direct: %s", datasetRefOrType)
1184 if run is not None:
1185 warnings.warn("Run collection is not used for DatasetRef")
1186 # If registry already has a dataset with the same dataset ID,
1187 # dataset type and DataId, then _importDatasets will do nothing and
1188 # just return the original ref. We have to raise in this case;
1189 # the datastore check below handles that.
1190 self.registry._importDatasets([datasetRefOrType], expand=True)
1191 # Before trying to write to the datastore check that it does not
1192 # know this dataset. This is prone to races, of course.
1193 if self.datastore.knows(datasetRefOrType):
1194 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
1195 # Try to write dataset to the datastore, if it fails due to a race
1196 # with another write, the content of stored data may be
1197 # unpredictable.
1198 try:
1199 self.datastore.put(obj, datasetRefOrType)
1200 except IntegrityError as e:
1201 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}")
1202 return datasetRefOrType
1204 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1205 if not self.isWriteable():
1206 raise TypeError("Butler is read-only.")
1207 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1209 # Handle dimension records in dataId
1210 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1212 # Add Registry Dataset entry.
1213 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1214 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1215 self.datastore.put(obj, ref)
1217 return ref
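# --- Usage sketch (editor's illustration, not part of _butler.py) ---
# The two put() call styles handled above. Dataset type name and data-ID
# values are hypothetical.
#
#     # Dataset type name plus data ID; the run defaults to self.run.
#     ref = butler.put(exposure, "calexp", instrument="HSC", visit=1228, detector=42)
#
#     # A fully resolved DatasetRef (for example one obtained from another
#     # repository) carries its own run and dataset ID, so no run is passed.
#     other_butler.put(exposure, ref)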
1219 @deprecated(
1220 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
1221 " Please use Butler.get(). Will be removed after v27.0.",
1222 version="v26.0",
1223 category=FutureWarning,
1224 )
1225 def getDirect(
1226 self,
1227 ref: DatasetRef,
1228 *,
1229 parameters: Optional[Dict[str, Any]] = None,
1230 storageClass: Optional[Union[StorageClass, str]] = None,
1231 ) -> Any:
1232 """Retrieve a stored dataset.
1234 Parameters
1235 ----------
1236 ref : `DatasetRef`
1237 Resolved reference to an already stored dataset.
1238 parameters : `dict`
1239 Additional StorageClass-defined options to control reading,
1240 typically used to efficiently read only a subset of the dataset.
1241 storageClass : `StorageClass` or `str`, optional
1242 The storage class to be used to override the Python type
1243 returned by this method. By default the returned type matches
1244 the dataset type definition for this dataset. Specifying a
1245 read `StorageClass` can force a different type to be returned.
1246 This type must be compatible with the original type.
1248 Returns
1249 -------
1250 obj : `object`
1251 The dataset.
1252 """
1253 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
1255 @deprecated(
1256 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
1257 "Please use Butler.getDeferred(). Will be removed after v27.0.",
1258 version="v26.0",
1259 category=FutureWarning,
1260 )
1261 def getDirectDeferred(
1262 self,
1263 ref: DatasetRef,
1264 *,
1265 parameters: Union[dict, None] = None,
1266 storageClass: str | StorageClass | None = None,
1267 ) -> DeferredDatasetHandle:
1268 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1269 from a resolved `DatasetRef`.
1271 Parameters
1272 ----------
1273 ref : `DatasetRef`
1274 Resolved reference to an already stored dataset.
1275 parameters : `dict`
1276 Additional StorageClass-defined options to control reading,
1277 typically used to efficiently read only a subset of the dataset.
1278 storageClass : `StorageClass` or `str`, optional
1279 The storage class to be used to override the Python type
1280 returned by this method. By default the returned type matches
1281 the dataset type definition for this dataset. Specifying a
1282 read `StorageClass` can force a different type to be returned.
1283 This type must be compatible with the original type.
1285 Returns
1286 -------
1287 obj : `DeferredDatasetHandle`
1288 A handle which can be used to retrieve a dataset at a later time.
1290 Raises
1291 ------
1292 LookupError
1293 Raised if no matching dataset exists in the `Registry`.
1294 """
1295 # Check that the dataset actually exists.
1296 if not self.datastore.exists(ref):
1297 raise LookupError(f"Dataset reference {ref} does not exist.")
1298 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1300 def getDeferred(
1301 self,
1302 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1303 /,
1304 dataId: Optional[DataId] = None,
1305 *,
1306 parameters: Union[dict, None] = None,
1307 collections: Any = None,
1308 storageClass: str | StorageClass | None = None,
1309 **kwargs: Any,
1310 ) -> DeferredDatasetHandle:
1311 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1312 after an immediate registry lookup.
1314 Parameters
1315 ----------
1316 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1317 When `DatasetRef` the `dataId` should be `None`.
1318 Otherwise the `DatasetType` or name thereof.
1319 dataId : `dict` or `DataCoordinate`, optional
1320 A `dict` of `Dimension` link name, value pairs that label the
1321 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1322 should be provided as the first argument.
1323 parameters : `dict`
1324 Additional StorageClass-defined options to control reading,
1325 typically used to efficiently read only a subset of the dataset.
1326 collections : Any, optional
1327 Collections to be searched, overriding ``self.collections``.
1328 Can be any of the types supported by the ``collections`` argument
1329 to butler construction.
1330 storageClass : `StorageClass` or `str`, optional
1331 The storage class to be used to override the Python type
1332 returned by this method. By default the returned type matches
1333 the dataset type definition for this dataset. Specifying a
1334 read `StorageClass` can force a different type to be returned.
1335 This type must be compatible with the original type.
1336 **kwargs
1337 Additional keyword arguments used to augment or construct a
1338 `DataId`. See `DataId` parameters.
1340 Returns
1341 -------
1342 obj : `DeferredDatasetHandle`
1343 A handle which can be used to retrieve a dataset at a later time.
1345 Raises
1346 ------
1347 LookupError
1348 Raised if no matching dataset exists in the `Registry`.
1349 ValueError
1350 Raised if a resolved `DatasetRef` was passed as an input, but it
1351 differs from the one found in the registry.
1352 TypeError
1353 Raised if no collections were provided.
1354 """
1355 if isinstance(datasetRefOrType, DatasetRef) and not self.datastore.exists(datasetRefOrType):
1356 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1357 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1358 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
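# --- Usage sketch (editor's illustration, not part of _butler.py) ---
# Deferring the datastore read until the handle is used; the registry
# lookup happens immediately. Dataset type and data ID are hypothetical
# (and assume suitable default dimensions are configured).
#
#     handle = butler.getDeferred("deepCoadd", tract=9813, patch=42, band="i")
#     ...  # do other work; nothing has been read from the datastore yet
#     coadd = handle.get()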
1360 def get(
1361 self,
1362 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1363 /,
1364 dataId: Optional[DataId] = None,
1365 *,
1366 parameters: Optional[Dict[str, Any]] = None,
1367 collections: Any = None,
1368 storageClass: Optional[Union[StorageClass, str]] = None,
1369 **kwargs: Any,
1370 ) -> Any:
1371 """Retrieve a stored dataset.
1373 Parameters
1374 ----------
1375 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1376 When `DatasetRef` the `dataId` should be `None`.
1377 Otherwise the `DatasetType` or name thereof.
1378 If a resolved `DatasetRef`, the associated dataset
1379 is returned directly without additional querying.
1380 dataId : `dict` or `DataCoordinate`
1381 A `dict` of `Dimension` link name, value pairs that label the
1382 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1383 should be provided as the first argument.
1384 parameters : `dict`
1385 Additional StorageClass-defined options to control reading,
1386 typically used to efficiently read only a subset of the dataset.
1387 collections : Any, optional
1388 Collections to be searched, overriding ``self.collections``.
1389 Can be any of the types supported by the ``collections`` argument
1390 to butler construction.
1391 storageClass : `StorageClass` or `str`, optional
1392 The storage class to be used to override the Python type
1393 returned by this method. By default the returned type matches
1394 the dataset type definition for this dataset. Specifying a
1395 read `StorageClass` can force a different type to be returned.
1396 This type must be compatible with the original type.
1397 **kwargs
1398 Additional keyword arguments used to augment or construct a
1399 `DataCoordinate`. See `DataCoordinate.standardize`
1400 parameters.
1402 Returns
1403 -------
1404 obj : `object`
1405 The dataset.
1407 Raises
1408 ------
1409 LookupError
1410 Raised if no matching dataset exists in the `Registry`.
1411 TypeError
1412 Raised if no collections were provided.
1414 Notes
1415 -----
1416 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1417 this method requires that the given data ID include temporal dimensions
1418 beyond the dimensions of the dataset type itself, in order to find the
1419 dataset with the appropriate validity range. For example, a "bias"
1420 dataset with native dimensions ``{instrument, detector}`` could be
1421 fetched with a ``{instrument, detector, exposure}`` data ID, because
1422 ``exposure`` is a temporal dimension.
1423 """
1424 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1425 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1426 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass)
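# --- Usage sketch (editor's illustration, not part of _butler.py) ---
# The calibration lookup described in the Notes above: a "bias" with
# dimensions {instrument, detector} is found by also supplying the temporal
# "exposure" dimension so the right validity range is selected. The values
# and calibration collection name are hypothetical.
#
#     bias = butler.get(
#         "bias",
#         instrument="HSC",
#         detector=42,
#         exposure=903334,
#         collections="HSC/calib",
#     )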
1428 def getURIs(
1429 self,
1430 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1431 /,
1432 dataId: Optional[DataId] = None,
1433 *,
1434 predict: bool = False,
1435 collections: Any = None,
1436 run: Optional[str] = None,
1437 **kwargs: Any,
1438 ) -> DatasetRefURIs:
1439 """Returns the URIs associated with the dataset.
1441 Parameters
1442 ----------
1443 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1444 When `DatasetRef` the `dataId` should be `None`.
1445 Otherwise the `DatasetType` or name thereof.
1446 dataId : `dict` or `DataCoordinate`
1447 A `dict` of `Dimension` link name, value pairs that label the
1448 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1449 should be provided as the first argument.
1450 predict : `bool`
1451 If `True`, allow URIs to be returned of datasets that have not
1452 been written.
1453 collections : Any, optional
1454 Collections to be searched, overriding ``self.collections``.
1455 Can be any of the types supported by the ``collections`` argument
1456 to butler construction.
1457 run : `str`, optional
1458 Run to use for predictions, overriding ``self.run``.
1459 **kwargs
1460 Additional keyword arguments used to augment or construct a
1461 `DataCoordinate`. See `DataCoordinate.standardize`
1462 parameters.
1464 Returns
1465 -------
1466 uris : `DatasetRefURIs`
1467 The URI to the primary artifact associated with this dataset (if
1468 the dataset was disassembled within the datastore this may be
1469 `None`), and the URIs to any components associated with the dataset
1470 artifact (can be empty if there are no components).
1471 """
1472 ref = self._findDatasetRef(
1473 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1474 )
1475 return self.datastore.getURIs(ref, predict)
1477 def getURI(
1478 self,
1479 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1480 /,
1481 dataId: Optional[DataId] = None,
1482 *,
1483 predict: bool = False,
1484 collections: Any = None,
1485 run: Optional[str] = None,
1486 **kwargs: Any,
1487 ) -> ResourcePath:
1488 """Return the URI to the Dataset.
1490 Parameters
1491 ----------
1492 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1493 When `DatasetRef` the `dataId` should be `None`.
1494 Otherwise the `DatasetType` or name thereof.
1495 dataId : `dict` or `DataCoordinate`
1496 A `dict` of `Dimension` link name, value pairs that label the
1497 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1498 should be provided as the first argument.
1499 predict : `bool`
1500 If `True`, allow URIs to be returned for datasets that have not
1501 yet been written.
1502 collections : Any, optional
1503 Collections to be searched, overriding ``self.collections``.
1504 Can be any of the types supported by the ``collections`` argument
1505 to butler construction.
1506 run : `str`, optional
1507 Run to use for predictions, overriding ``self.run``.
1508 **kwargs
1509 Additional keyword arguments used to augment or construct a
1510 `DataCoordinate`. See `DataCoordinate.standardize`
1511 parameters.
1513 Returns
1514 -------
1515 uri : `lsst.resources.ResourcePath`
1516 URI pointing to the Dataset within the datastore. If the
1517 Dataset does not exist in the datastore, and if ``predict`` is
1518 `True`, the URI will be a prediction and will include a URI
1519 fragment "#predicted".
1520 If the datastore does not have entities that relate well
1521 to the concept of a URI, the returned URI will be
1522 descriptive. The returned URI is not guaranteed to be obtainable.
1524 Raises
1525 ------
1526 LookupError
1527 Raised if a URI has been requested for a dataset that does not
1528 exist and guessing is not allowed.
1529 ValueError
1530 Raised if a resolved `DatasetRef` was passed as an input, but it
1531 differs from the one found in the registry.
1532 TypeError
1533 Raised if no collections were provided.
1534 RuntimeError
1535 Raised if a URI is requested for a dataset that consists of
1536 multiple artifacts.
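Examples
--------
A sketch of predicting the URI of a dataset that has not been written
yet; the dataset type, run, and data ID values are hypothetical::
    uri = butler.getURI(
        "calexp", predict=True, run="u/someone/processing-run",
        instrument="HSC", detector=50, visit=1228
    )
    # The returned URI carries a "#predicted" fragment.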
1537 """
1538 primary, components = self.getURIs(
1539 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1540 )
1542 if primary is None or components:
1543 raise RuntimeError(
1544 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1545 "Use Butler.getURIs() instead."
1546 )
1547 return primary
1549 def retrieveArtifacts(
1550 self,
1551 refs: Iterable[DatasetRef],
1552 destination: ResourcePathExpression,
1553 transfer: str = "auto",
1554 preserve_path: bool = True,
1555 overwrite: bool = False,
1556 ) -> List[ResourcePath]:
1557 """Retrieve the artifacts associated with the supplied refs.
1559 Parameters
1560 ----------
1561 refs : iterable of `DatasetRef`
1562 The datasets for which artifacts are to be retrieved.
1563 A single ref can result in multiple artifacts. The refs must
1564 be resolved.
1565 destination : `lsst.resources.ResourcePath` or `str`
1566 Location to write the artifacts.
1567 transfer : `str`, optional
1568 Method to use to transfer the artifacts. Must be one of the options
1569 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1570 "move" is not allowed.
1571 preserve_path : `bool`, optional
1572 If `True` the full path of the artifact within the datastore
1573 is preserved. If `False` the final file component of the path
1574 is used.
1575 overwrite : `bool`, optional
1576 If `True` allow transfers to overwrite existing files at the
1577 destination.
1579 Returns
1580 -------
1581 targets : `list` of `lsst.resources.ResourcePath`
1582 URIs of file artifacts in destination location. Order is not
1583 preserved.
1585 Notes
1586 -----
1587 For non-file datastores the artifacts written to the destination
1588 may not match the representation inside the datastore. For example
1589 a hierarchical data structure in a NoSQL database may well be stored
1590 as a JSON file.
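Examples
--------
A sketch of copying the file artifacts for some datasets to a local
directory; the query, collection, and destination are hypothetical::
    refs = butler.registry.queryDatasets(
        "raw", collections="LATISS/raw/all",
        where="exposure = 2023060600001"
    )
    paths = butler.retrieveArtifacts(refs, "/tmp/export", transfer="copy")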
1591 """
1592 return self.datastore.retrieveArtifacts(
1593 refs,
1594 ResourcePath(destination),
1595 transfer=transfer,
1596 preserve_path=preserve_path,
1597 overwrite=overwrite,
1598 )
1600 def datasetExists(
1601 self,
1602 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1603 dataId: Optional[DataId] = None,
1604 *,
1605 collections: Any = None,
1606 **kwargs: Any,
1607 ) -> bool:
1608 """Return True if the Dataset is actually present in the Datastore.
1610 Parameters
1611 ----------
1612 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1613 When `DatasetRef` the `dataId` should be `None`.
1614 Otherwise the `DatasetType` or name thereof.
1615 dataId : `dict` or `DataCoordinate`
1616 A `dict` of `Dimension` link name, value pairs that label the
1617 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1618 should be provided as the first argument.
1619 collections : Any, optional
1620 Collections to be searched, overriding ``self.collections``.
1621 Can be any of the types supported by the ``collections`` argument
1622 to butler construction.
1623 **kwargs
1624 Additional keyword arguments used to augment or construct a
1625 `DataCoordinate`. See `DataCoordinate.standardize`
1626 parameters.
1628 Raises
1629 ------
1630 LookupError
1631 Raised if the dataset is not even present in the Registry.
1632 ValueError
1633 Raised if a resolved `DatasetRef` was passed as an input, but it
1634 differs from the one found in the registry.
1635 TypeError
1636 Raised if no collections were provided.
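Examples
--------
A sketch of checking that an artifact is present; the dataset type,
collection, and data ID values are hypothetical::
    exists = butler.datasetExists(
        "raw", collections="LATISS/raw/all",
        instrument="LATISS", detector=0, exposure=2023060600001
    )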
1637 """
1638 # A resolved ref may be given that is not known to this butler.
1639 if isinstance(datasetRefOrType, DatasetRef):
1640 ref = self.registry.getDataset(datasetRefOrType.id)
1641 if ref is None:
1642 raise LookupError(
1643 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
1644 )
1645 else:
1646 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1647 return self.datastore.exists(ref)
1649 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1650 """Remove one or more `~CollectionType.RUN` collections and the
1651 datasets within them.
1653 Parameters
1654 ----------
1655 names : `Iterable` [ `str` ]
1656 The names of the collections to remove.
1657 unstore : `bool`, optional
1658 If `True` (default), delete datasets from all datastores in which
1659 they are present, and attempt to roll back the registry deletions if
1660 datastore deletions fail (which may not always be possible). If
1661 `False`, datastore records for these datasets are still removed,
1662 but any artifacts (e.g. files) will not be.
1664 Raises
1665 ------
1666 TypeError
1667 Raised if one or more collections are not of type
1668 `~CollectionType.RUN`.
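Examples
--------
A sketch of deleting a scratch RUN collection and its file artifacts;
the repository path and run name are hypothetical::
    butler = Butler("/repo/main", writeable=True)
    butler.removeRuns(["u/someone/scratch-run"], unstore=True)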
1669 """
1670 if not self.isWriteable():
1671 raise TypeError("Butler is read-only.")
1672 names = list(names)
1673 refs: List[DatasetRef] = []
1674 for name in names:
1675 collectionType = self.registry.getCollectionType(name)
1676 if collectionType is not CollectionType.RUN:
1677 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1678 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1679 with self.datastore.transaction():
1680 with self.registry.transaction():
1681 if unstore:
1682 self.datastore.trash(refs)
1683 else:
1684 self.datastore.forget(refs)
1685 for name in names:
1686 self.registry.removeCollection(name)
1687 if unstore:
1688 # Point of no return for removing artifacts
1689 self.datastore.emptyTrash()
1691 def pruneDatasets(
1692 self,
1693 refs: Iterable[DatasetRef],
1694 *,
1695 disassociate: bool = True,
1696 unstore: bool = False,
1697 tags: Iterable[str] = (),
1698 purge: bool = False,
1699 ) -> None:
1700 # docstring inherited from LimitedButler
1702 if not self.isWriteable():
1703 raise TypeError("Butler is read-only.")
1704 if purge:
1705 if not disassociate:
1706 raise TypeError("Cannot pass purge=True without disassociate=True.")
1707 if not unstore:
1708 raise TypeError("Cannot pass purge=True without unstore=True.")
1709 elif disassociate:
1710 tags = tuple(tags)
1711 if not tags:
1712 raise TypeError("No tags provided but disassociate=True.")
1713 for tag in tags:
1714 collectionType = self.registry.getCollectionType(tag)
1715 if collectionType is not CollectionType.TAGGED:
1716 raise TypeError(
1717 f"Cannot disassociate from collection '{tag}' "
1718 f"of non-TAGGED type {collectionType.name}."
1719 )
1720 # Transform possibly-single-pass iterable into something we can iterate
1721 # over multiple times.
1722 refs = list(refs)
1723 # Pruning a component of a DatasetRef makes no sense since the registry
1724 # doesn't know about components and the datastore might not store
1725 # components in a separate file.
1726 for ref in refs:
1727 if ref.datasetType.component():
1728 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1729 # We don't need an unreliable Datastore transaction for this, because
1730 # we've been extra careful to ensure that Datastore.trash only involves
1731 # mutating the Registry (it can _look_ at Datastore-specific things,
1732 # but shouldn't change them), and hence all operations here are
1733 # Registry operations.
1734 with self.datastore.transaction():
1735 with self.registry.transaction():
1736 if unstore:
1737 self.datastore.trash(refs)
1738 if purge:
1739 self.registry.removeDatasets(refs)
1740 elif disassociate:
1741 assert tags, "Guaranteed by earlier logic in this function."
1742 for tag in tags:
1743 self.registry.disassociate(tag, refs)
1744 # We've exited the Registry transaction, and apparently committed.
1745 # (if there was an exception, everything rolled back, and it's as if
1746 # nothing happened - and we never get here).
1747 # Datastore artifacts are not yet gone, but they're clearly marked
1748 # as trash, so if we fail to delete now because of (e.g.) filesystem
1749 # problems we can try again later, and if manual administrative
1750 # intervention is required, it's pretty clear what that should entail:
1751 # deleting everything on disk and in private Datastore tables that is
1752 # in the dataset_location_trash table.
1753 if unstore:
1754 # Point of no return for removing artifacts
1755 self.datastore.emptyTrash()
1757 @transactional
1758 def ingest(
1759 self,
1760 *datasets: FileDataset,
1761 transfer: Optional[str] = "auto",
1762 run: Optional[str] = None,
1763 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1764 record_validation_info: bool = True,
1765 ) -> None:
1766 """Store and register one or more datasets that already exist on disk.
1768 Parameters
1769 ----------
1770 datasets : `FileDataset`
1771 Each positional argument is a struct containing information about
1772 a file to be ingested, including its URI (either absolute or
1773 relative to the datastore root, if applicable), a resolved
1774 `DatasetRef`, and optionally a formatter class or its
1775 fully-qualified string name. If a formatter is not provided, the
1776 formatter that would be used for `put` is assumed. On successful
1777 ingest all `FileDataset.formatter` attributes will be set to the
1778 formatter class used. `FileDataset.path` attributes may be modified
1779 to put paths in whatever the datastore considers a standardized
1780 form.
1781 transfer : `str`, optional
1782 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1783 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1784 transfer the file.
1785 run : `str`, optional
1786 The name of the run ingested datasets should be added to,
1787 overriding ``self.run``. This parameter is now deprecated since
1788 the run is encoded in the ``FileDataset``.
1789 idGenerationMode : `DatasetIdGenEnum`, optional
1790 Specifies option for generating dataset IDs. By default unique IDs
1791 are generated for each inserted dataset.
1792 record_validation_info : `bool`, optional
1793 If `True`, the default, the datastore can record validation
1794 information associated with the file. If `False` the datastore
1795 will not attempt to track any information such as checksums
1796 or file sizes. This can be useful if such information is tracked
1797 in an external system or if the file is to be compressed in place.
1798 It is up to the datastore whether this parameter is relevant.
1800 Raises
1801 ------
1802 TypeError
1803 Raised if the butler is read-only or if no run was provided.
1804 NotImplementedError
1805 Raised if the `Datastore` does not support the given transfer mode.
1806 DatasetTypeNotSupportedError
1807 Raised if one or more files to be ingested have a dataset type that
1808 is not supported by the `Datastore`.
1809 FileNotFoundError
1810 Raised if one of the given files does not exist.
1811 FileExistsError
1812 Raised if transfer is not `None` but the (internal) location the
1813 file would be moved to is already occupied.
1815 Notes
1816 -----
1817 This operation is not fully exception safe: if a database operation
1818 fails, the given `FileDataset` instances may be only partially updated.
1820 It is atomic in terms of database operations (they will either all
1821 succeed or all fail), provided the database engine implements
1822 transactions correctly. It will attempt to be atomic in terms of
1823 filesystem operations as well, but this cannot be implemented
1824 rigorously for most datastores.
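Examples
--------
A sketch of ingesting a single on-disk file, assuming ``ref`` is
already a resolved `DatasetRef` for it; the file path is hypothetical::
    butler.ingest(
        FileDataset(path="/data/raw/2023-06-06/file.fits", refs=[ref]),
        transfer="copy",
    )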
1825 """
1826 if not self.isWriteable():
1827 raise TypeError("Butler is read-only.")
1829 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1830 if not datasets:
1831 return
1833 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1835 # We need to reorganize all the inputs so that they are grouped
1836 # by dataset type and run. Multiple refs in a single FileDataset
1837 # are required to share the run and dataset type.
1838 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]]
1839 groupedData: GroupedData = defaultdict(list)
1841 # Track DataIDs that are being ingested so we can spot issues early
1842 # with duplication. Retain previous FileDataset so we can report it.
1843 groupedDataIds: MutableMapping[
1844 tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
1845 ] = defaultdict(dict)
1847 used_run = False
1849 # And the nested loop that populates it:
1850 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1851 # Somewhere to store pre-existing refs if we have an
1852 # execution butler.
1853 existingRefs: List[DatasetRef] = []
1855 for ref in dataset.refs:
1856 assert ref.run is not None # For mypy
1857 group_key = (ref.datasetType, ref.run)
1859 if ref.dataId in groupedDataIds[group_key]:
1860 raise ConflictingDefinitionError(
1861 f"Ingest conflict. Dataset {dataset.path} has same"
1862 " DataId as other ingest dataset"
1863 f" {groupedDataIds[group_key][ref.dataId].path} "
1864 f" ({ref.dataId})"
1865 )
1867 groupedDataIds[group_key][ref.dataId] = dataset
1869 if existingRefs:
1870 if len(dataset.refs) != len(existingRefs):
1871 # Keeping track of partially pre-existing datasets is hard
1872 # and should generally never happen. For now don't allow
1873 # it.
1874 raise ConflictingDefinitionError(
1875 f"For dataset {dataset.path} some dataIds already exist"
1876 " in registry but others do not. This is not supported."
1877 )
1879 # Store expanded form in the original FileDataset.
1880 dataset.refs = existingRefs
1881 else:
1882 groupedData[group_key].append(dataset)
1884 if not used_run and run is not None:
1885 warnings.warn(
1886 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the "
1887 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.",
1888 category=FutureWarning,
1889 stacklevel=3, # Take into account the @transactional decorator.
1890 )
1892 # Now we can bulk-insert into Registry for each DatasetType.
1893 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1894 groupedData.items(), desc="Bulk-inserting datasets by type"
1895 ):
1896 refs_to_import = []
1897 for dataset in grouped_datasets:
1898 refs_to_import.extend(dataset.refs)
1900 n_refs = len(refs_to_import)
1901 log.verbose(
1902 "Importing %d ref%s of dataset type %r into run %r",
1903 n_refs,
1904 "" if n_refs == 1 else "s",
1905 datasetType.name,
1906 this_run,
1907 )
1909 # Import the refs and expand the DataCoordinates since we can't
1910 # guarantee that they are expanded and Datastore will need
1911 # the records.
1912 imported_refs = self.registry._importDatasets(refs_to_import, expand=True)
1913 assert set(imported_refs) == set(refs_to_import)
1915 # Replace all the refs in the FileDataset with expanded versions.
1916 # Pull them off in the order we put them on the list.
1917 for dataset in grouped_datasets:
1918 n_dataset_refs = len(dataset.refs)
1919 dataset.refs = imported_refs[:n_dataset_refs]
1920 del imported_refs[:n_dataset_refs]
1922 # Bulk-insert everything into Datastore.
1923 # We do not know if any of the registry entries already existed
1924 # (_importDatasets only complains if they exist but differ) so
1925 # we have to catch IntegrityError explicitly.
1926 try:
1927 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info)
1928 except IntegrityError as e:
1929 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
1931 @contextlib.contextmanager
1932 def export(
1933 self,
1934 *,
1935 directory: Optional[str] = None,
1936 filename: Optional[str] = None,
1937 format: Optional[str] = None,
1938 transfer: Optional[str] = None,
1939 ) -> Iterator[RepoExportContext]:
1940 """Export datasets from the repository represented by this `Butler`.
1942 This method is a context manager that returns a helper object
1943 (`RepoExportContext`) that is used to indicate what information from
1944 the repository should be exported.
1946 Parameters
1947 ----------
1948 directory : `str`, optional
1949 Directory dataset files should be written to if ``transfer`` is not
1950 `None`.
1951 filename : `str`, optional
1952 Name for the file that will include database information associated
1953 with the exported datasets. If this is not an absolute path and
1954 ``directory`` is not `None`, it will be written to ``directory``
1955 instead of the current working directory. Defaults to
1956 "export.{format}".
1957 format : `str`, optional
1958 File format for the database information file. If `None`, the
1959 extension of ``filename`` will be used.
1960 transfer : `str`, optional
1961 Transfer mode passed to `Datastore.export`.
1963 Raises
1964 ------
1965 TypeError
1966 Raised if the set of arguments passed is inconsistent.
1968 Examples
1969 --------
1970 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1971 methods are used to provide the iterables over data IDs and/or datasets
1972 to be exported::
1974 with butler.export(filename="exports.yaml") as export:
1975 # Export all flats, but none of the dimension element rows
1976 # (i.e. data ID information) associated with them.
1977 export.saveDatasets(butler.registry.queryDatasets("flat"),
1978 elements=())
1979 # Export all datasets that start with "deepCoadd_" and all of
1980 # their associated data ID information.
1981 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1982 """
1983 if directory is None and transfer is not None:
1984 raise TypeError("Cannot transfer without providing a directory.")
1985 if transfer == "move":
1986 raise TypeError("Transfer may not be 'move': export is read-only")
1987 if format is None:
1988 if filename is None:
1989 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1990 else:
1991 _, format = os.path.splitext(filename)
1992 if not format:
1993 raise ValueError("Please specify a file extension to determine export format.")
1994 format = format[1:] # Strip leading "."
1995 elif filename is None:
1996 filename = f"export.{format}"
1997 if directory is not None:
1998 filename = os.path.join(directory, filename)
1999 formats = self._config["repo_transfer_formats"]
2000 if format not in formats:
2001 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
2002 BackendClass = get_class_of(formats[format, "export"])
2003 with open(filename, "w") as stream:
2004 backend = BackendClass(stream, universe=self.registry.dimensions)
2005 try:
2006 helper = RepoExportContext(
2007 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
2008 )
2009 yield helper
2010 except BaseException:
2011 raise
2012 else:
2013 helper._finish()
2015 def import_(
2016 self,
2017 *,
2018 directory: Optional[ResourcePathExpression] = None,
2019 filename: Union[ResourcePathExpression, TextIO, None] = None,
2020 format: Optional[str] = None,
2021 transfer: Optional[str] = None,
2022 skip_dimensions: Optional[Set] = None,
2023 ) -> None:
2024 """Import datasets into this repository that were exported from a
2025 different butler repository via `~lsst.daf.butler.Butler.export`.
2027 Parameters
2028 ----------
2029 directory : `~lsst.resources.ResourcePathExpression`, optional
2030 Directory containing dataset files to import from. If `None`,
2031 ``filename`` and all dataset file paths specified therein must
2032 be absolute.
2033 filename : `~lsst.resources.ResourcePathExpression` or `TextIO`
2034 A stream or name of file that contains database information
2035 associated with the exported datasets, typically generated by
2036 `~lsst.daf.butler.Butler.export`. If this is a string (name) or
2037 `~lsst.resources.ResourcePath` and is not an absolute path,
2038 it will first be looked for relative to ``directory`` and if not
2039 found there it will be looked for in the current working
2040 directory. Defaults to "export.{format}".
2041 format : `str`, optional
2042 File format for ``filename``. If `None`, the extension of
2043 ``filename`` will be used.
2044 transfer : `str`, optional
2045 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
2046 skip_dimensions : `set`, optional
2047 Names of dimensions that should be skipped and not imported.
2049 Raises
2050 ------
2051 TypeError
2052 Raised if the set of arguments passed is inconsistent, or if the
2053 butler is read-only.
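Examples
--------
A sketch of importing a previously exported repository subset; the
repository and export paths are hypothetical::
    butler = Butler("/repo/dest", writeable=True)
    butler.import_(directory="/tmp/export", filename="export.yaml",
                   transfer="copy")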
2054 """
2055 if not self.isWriteable():
2056 raise TypeError("Butler is read-only.")
2057 if format is None:
2058 if filename is None:
2059 raise TypeError("At least one of 'filename' or 'format' must be provided.")
2060 else:
2061 _, format = os.path.splitext(filename) # type: ignore
2062 elif filename is None:
2063 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
2064 if directory is not None:
2065 directory = ResourcePath(directory, forceDirectory=True)
2066 # mypy doesn't think this will work but it does in python >= 3.10.
2067 if isinstance(filename, ResourcePathExpression): # type: ignore
2068 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
2069 if not filename.isabs() and directory is not None:
2070 potential = directory.join(filename)
2071 exists_in_cwd = filename.exists()
2072 exists_in_dir = potential.exists()
2073 if exists_in_cwd and exists_in_dir:
2074 log.warning(
2075 "A relative path for filename was specified (%s) which exists relative to cwd. "
2076 "Additionally, the file exists relative to the given search directory (%s). "
2077 "Using the export file in the given directory.",
2078 filename,
2079 potential,
2080 )
2081 # Given they specified an explicit directory and that
2082 # directory has the export file in it, assume that that
2083 # is what was meant despite the file in cwd.
2084 filename = potential
2085 elif exists_in_dir:
2086 filename = potential
2087 elif not exists_in_cwd and not exists_in_dir:
2088 # Raise early.
2089 raise FileNotFoundError(
2090 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
2091 )
2092 BackendClass: type[RepoImportBackend] = get_class_of(
2093 self._config["repo_transfer_formats"][format]["import"]
2094 )
2096 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
2097 backend = BackendClass(importStream, self.registry) # type: ignore[call-arg]
2098 backend.register()
2099 with self.transaction():
2100 backend.load(
2101 self.datastore,
2102 directory=directory,
2103 transfer=transfer,
2104 skip_dimensions=skip_dimensions,
2105 )
2107 if isinstance(filename, ResourcePath):
2108 # We cannot use open() here at the moment because of
2109 # DM-38589, since yaml does stream.read(8192) in a loop.
2110 stream = io.StringIO(filename.read().decode())
2111 doImport(stream)
2112 else:
2113 doImport(filename) # type: ignore
2115 def transfer_from(
2116 self,
2117 source_butler: LimitedButler,
2118 source_refs: Iterable[DatasetRef],
2119 transfer: str = "auto",
2120 skip_missing: bool = True,
2121 register_dataset_types: bool = False,
2122 transfer_dimensions: bool = False,
2123 ) -> collections.abc.Collection[DatasetRef]:
2124 """Transfer datasets to this Butler from a run in another Butler.
2126 Parameters
2127 ----------
2128 source_butler : `LimitedButler`
2129 Butler from which the datasets are to be transferred. If data IDs
2130 in ``source_refs`` are not expanded then this has to be a full
2131 `Butler` whose registry will be used to expand data IDs.
2132 source_refs : iterable of `DatasetRef`
2133 Datasets defined in the source butler that should be transferred to
2134 this butler.
2135 transfer : `str`, optional
2136 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2137 skip_missing : `bool`
2138 If `True`, datasets with no datastore artifact associated with
2139 them are not transferred. If `False` a registry entry will be
2140 created even if no datastore record is created (and so will
2141 look equivalent to the dataset being unstored).
2142 register_dataset_types : `bool`
2143 If `True` any missing dataset types are registered. Otherwise
2144 an exception is raised.
2145 transfer_dimensions : `bool`, optional
2146 If `True`, dimension record data associated with the new datasets
2147 will be transferred.
2149 Returns
2150 -------
2151 refs : `list` of `DatasetRef`
2152 The refs added to this Butler.
2154 Notes
2155 -----
2156 The datastore artifact has to exist for a transfer
2157 to be made, but non-existence is not an error.
2159 Datasets that already exist in this run will be skipped.
2161 The datasets are imported as part of a transaction, although
2162 dataset types are registered before the transaction is started.
2163 This means that it is possible for a dataset type to be registered
2164 even though transfer has failed.
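Examples
--------
A sketch of copying selected datasets between two repositories; the
repository paths, collection, and dataset type are hypothetical::
    source = Butler("/repo/main")
    dest = Butler("/repo/scratch", writeable=True)
    refs = source.registry.queryDatasets(
        "calexp", collections="HSC/runs/RC2"
    )
    dest.transfer_from(source, refs, transfer="copy",
                       register_dataset_types=True,
                       transfer_dimensions=True)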
2165 """
2166 if not self.isWriteable():
2167 raise TypeError("Butler is read-only.")
2168 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2170 # Will iterate through the refs multiple times so need to convert
2171 # to a list if this isn't a collection.
2172 if not isinstance(source_refs, collections.abc.Collection):
2173 source_refs = list(source_refs)
2175 original_count = len(source_refs)
2176 log.info("Transferring %d datasets into %s", original_count, str(self))
2178 # In some situations the datastore artifact may be missing
2179 # and we do not want that registry entry to be imported.
2180 # Asking datastore is not sufficient, the records may have been
2181 # purged, we have to ask for the (predicted) URI and check
2182 # existence explicitly. Execution butler is set up exactly like
2183 # this with no datastore records.
2184 artifact_existence: Dict[ResourcePath, bool] = {}
2185 if skip_missing:
2186 dataset_existence = source_butler.datastore.mexists(
2187 source_refs, artifact_existence=artifact_existence
2188 )
2189 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2190 filtered_count = len(source_refs)
2191 n_missing = original_count - filtered_count
2192 log.verbose(
2193 "%d dataset%s removed because the artifact does not exist. Now have %d.",
2194 n_missing,
2195 "" if n_missing == 1 else "s",
2196 filtered_count,
2197 )
2199 # Importing requires that we group the refs by dataset type and run
2200 # before doing the import.
2201 source_dataset_types = set()
2202 grouped_refs = defaultdict(list)
2203 for ref in source_refs:
2204 grouped_refs[ref.datasetType, ref.run].append(ref)
2205 source_dataset_types.add(ref.datasetType)
2207 # Check to see if the dataset type in the source butler has
2208 # the same definition in the target butler and register missing
2209 # ones if requested. Registration must happen outside a transaction.
2210 newly_registered_dataset_types = set()
2211 for datasetType in source_dataset_types:
2212 if register_dataset_types:
2213 # Let this raise immediately if inconsistent. Continuing
2214 # on to find additional inconsistent dataset types
2215 # might result in additional unwanted dataset types being
2216 # registered.
2217 if self.registry.registerDatasetType(datasetType):
2218 newly_registered_dataset_types.add(datasetType)
2219 else:
2220 # If the dataset type is missing, let it fail immediately.
2221 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2222 if target_dataset_type != datasetType:
2223 raise ConflictingDefinitionError(
2224 "Source butler dataset type differs from definition"
2225 f" in target butler: {datasetType} !="
2226 f" {target_dataset_type}"
2227 )
2228 if newly_registered_dataset_types:
2229 # We may have registered some even if there were inconsistencies
2230 # but should let people know (or else remove them again).
2231 log.log(
2232 VERBOSE,
2233 "Registered the following dataset types in the target Butler: %s",
2234 ", ".join(d.name for d in newly_registered_dataset_types),
2235 )
2236 else:
2237 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2239 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2240 if transfer_dimensions:
2241 # Collect all the dimension records for these refs.
2242 # All dimensions are to be copied but the list of valid dimensions
2243 # comes from this butler's universe.
2244 elements = frozenset(
2245 element
2246 for element in self.registry.dimensions.getStaticElements()
2247 if element.hasTable() and element.viewOf is None
2248 )
2249 dataIds = set(ref.dataId for ref in source_refs)
2250 # This logic comes from saveDataIds.
2251 for dataId in dataIds:
2252 # We need an expanded record; if this one is not expanded we need
2253 # a full butler with a registry (allow mocks with registry too).
2254 if not dataId.hasRecords():
2255 if registry := getattr(source_butler, "registry", None):
2256 dataId = registry.expandDataId(dataId)
2257 else:
2258 raise TypeError("Input butler needs to be a full butler to expand DataId.")
2259 # If this butler doesn't know about a dimension in the source
2260 # butler things will break later.
2261 for record in dataId.records.values():
2262 if record is not None and record.definition in elements:
2263 dimension_records[record.definition].setdefault(record.dataId, record)
2265 handled_collections: Set[str] = set()
2267 # Do all the importing in a single transaction.
2268 with self.transaction():
2269 if dimension_records:
2270 log.verbose("Ensuring that dimension records exist for transferred datasets.")
2271 for element, r in dimension_records.items():
2272 records = [r[dataId] for dataId in r]
2273 # Assume that if the record is already present that we can
2274 # use it without having to check that the record metadata
2275 # is consistent.
2276 self.registry.insertDimensionData(element, *records, skip_existing=True)
2278 n_imported = 0
2279 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2280 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2281 ):
2282 if run not in handled_collections:
2283 # May need to create output collection. If source butler
2284 # has a registry, ask for documentation string.
2285 run_doc = None
2286 if registry := getattr(source_butler, "registry", None):
2287 run_doc = registry.getCollectionDocumentation(run)
2288 registered = self.registry.registerRun(run, doc=run_doc)
2289 handled_collections.add(run)
2290 if registered:
2291 log.log(VERBOSE, "Creating output run %s", run)
2293 n_refs = len(refs_to_import)
2294 log.verbose(
2295 "Importing %d ref%s of dataset type %s into run %s",
2296 n_refs,
2297 "" if n_refs == 1 else "s",
2298 datasetType.name,
2299 run,
2300 )
2302 # Assume we are using UUIDs and the source refs will match
2303 # those imported.
2304 imported_refs = self.registry._importDatasets(refs_to_import, expand=False)
2305 assert set(imported_refs) == set(refs_to_import)
2306 n_imported += len(imported_refs)
2308 assert len(source_refs) == n_imported
2309 log.verbose("Imported %d datasets into destination butler", n_imported)
2311 # Ask the datastore to transfer. The datastore has to check that
2312 # the source datastore is compatible with the target datastore.
2313 accepted, rejected = self.datastore.transfer_from(
2314 source_butler.datastore,
2315 source_refs,
2316 transfer=transfer,
2317 artifact_existence=artifact_existence,
2318 )
2319 if rejected:
2320 # For now, accept the registry entries but not the files.
2321 log.warning(
2322 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
2323 len(rejected),
2324 len(accepted),
2325 datasetType,
2326 run,
2327 )
2329 return source_refs
2331 def validateConfiguration(
2332 self,
2333 logFailures: bool = False,
2334 datasetTypeNames: Optional[Iterable[str]] = None,
2335 ignore: Iterable[str] | None = None,
2336 ) -> None:
2337 """Validate butler configuration.
2339 Checks that each `DatasetType` can be stored in the `Datastore`.
2341 Parameters
2342 ----------
2343 logFailures : `bool`, optional
2344 If `True`, output a log message for every validation error
2345 detected.
2346 datasetTypeNames : iterable of `str`, optional
2347 The `DatasetType` names that should be checked. This allows
2348 only a subset to be selected.
2349 ignore : iterable of `str`, optional
2350 Names of DatasetTypes to skip over. This can be used to skip
2351 known problems. If a named `DatasetType` corresponds to a
2352 composite, all components of that `DatasetType` will also be
2353 ignored.
2355 Raises
2356 ------
2357 ButlerValidationError
2358 Raised if there is some inconsistency with how this Butler
2359 is configured.
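Examples
--------
A sketch of validating a repository while skipping a dataset type with
a known problem; the ignored name is hypothetical::
    butler.validateConfiguration(logFailures=True,
                                 ignore=["bad_dataset_type"])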
2360 """
2361 if datasetTypeNames:
2362 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2363 else:
2364 datasetTypes = list(self.registry.queryDatasetTypes())
2366 # filter out anything from the ignore list
2367 if ignore:
2368 ignore = set(ignore)
2369 datasetTypes = [
2370 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2371 ]
2372 else:
2373 ignore = set()
2375 # Find all the registered instruments
2376 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2378 # For each datasetType that has an instrument dimension, create
2379 # a DatasetRef for each defined instrument
2380 datasetRefs = []
2382 for datasetType in datasetTypes:
2383 if "instrument" in datasetType.dimensions:
2384 for instrument in instruments:
2385 datasetRef = DatasetRef(
2386 datasetType,
2387 {"instrument": instrument}, # type: ignore
2388 conform=False,
2389 run="validate",
2390 )
2391 datasetRefs.append(datasetRef)
2393 entities: List[Union[DatasetType, DatasetRef]] = []
2394 entities.extend(datasetTypes)
2395 entities.extend(datasetRefs)
2397 datastoreErrorStr = None
2398 try:
2399 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2400 except ValidationError as e:
2401 datastoreErrorStr = str(e)
2403 # Also check that the LookupKeys used by the datastores match
2404 # registry and storage class definitions
2405 keys = self.datastore.getLookupKeys()
2407 failedNames = set()
2408 failedDataId = set()
2409 for key in keys:
2410 if key.name is not None:
2411 if key.name in ignore:
2412 continue
2414 # skip if specific datasetType names were requested and this
2415 # name does not match
2416 if datasetTypeNames and key.name not in datasetTypeNames:
2417 continue
2419 # See if it is a StorageClass or a DatasetType
2420 if key.name in self.storageClasses:
2421 pass
2422 else:
2423 try:
2424 self.registry.getDatasetType(key.name)
2425 except KeyError:
2426 if logFailures:
2427 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2428 failedNames.add(key)
2429 else:
2430 # Dimensions are checked for consistency when the Butler
2431 # is created and rendezvoused with a universe.
2432 pass
2434 # Check that the instrument is a valid instrument
2435 # Currently only support instrument so check for that
2436 if key.dataId:
2437 dataIdKeys = set(key.dataId)
2438 if set(["instrument"]) != dataIdKeys:
2439 if logFailures:
2440 log.critical("Key '%s' has unsupported DataId override", key)
2441 failedDataId.add(key)
2442 elif key.dataId["instrument"] not in instruments:
2443 if logFailures:
2444 log.critical("Key '%s' has unknown instrument", key)
2445 failedDataId.add(key)
2447 messages = []
2449 if datastoreErrorStr:
2450 messages.append(datastoreErrorStr)
2452 for failed, msg in (
2453 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2454 (failedDataId, "Keys with bad DataId entries: "),
2455 ):
2456 if failed:
2457 msg += ", ".join(str(k) for k in failed)
2458 messages.append(msg)
2460 if messages:
2461 raise ValidationError(";\n".join(messages))
2463 @property
2464 def collections(self) -> Sequence[str]:
2465 """The collections to search by default, in order
2466 (`Sequence` [ `str` ]).
2468 This is an alias for ``self.registry.defaults.collections``. It cannot
2469 be set directly in isolation, but all defaults may be changed together
2470 by assigning a new `RegistryDefaults` instance to
2471 ``self.registry.defaults``.
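For example, to change the default collections and run together
(the names here are hypothetical)::
    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/someone/processing-run"
    )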
2472 """
2473 return self.registry.defaults.collections
2475 @property
2476 def run(self) -> Optional[str]:
2477 """Name of the run this butler writes outputs to by default (`str` or
2478 `None`).
2480 This is an alias for ``self.registry.defaults.run``. It cannot be set
2481 directly in isolation, but all defaults may be changed together by
2482 assigning a new `RegistryDefaults` instance to
2483 ``self.registry.defaults``.
2484 """
2485 return self.registry.defaults.run
2487 @property
2488 def dimensions(self) -> DimensionUniverse:
2489 # Docstring inherited.
2490 return self.registry.dimensions
2492 registry: Registry
2493 """The object that manages dataset metadata and relationships (`Registry`).
2495 Most operations that don't involve reading or writing butler datasets are
2496 accessible only via `Registry` methods.
2497 """
2499 datastore: Datastore
2500 """The object that manages actual dataset storage (`Datastore`).
2502 Direct user access to the datastore should rarely be necessary; the primary
2503 exception is the case where a `Datastore` implementation provides extra
2504 functionality beyond what the base class defines.
2505 """
2507 storageClasses: StorageClassFactory
2508 """An object that maps known storage class names to objects that fully
2509 describe them (`StorageClassFactory`).
2510 """