Coverage for python/lsst/daf/butler/_butler.py: 10%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37import contextlib
38import logging
39import numbers
40import os
41from collections import defaultdict
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59from lsst.resources import ResourcePath, ResourcePathExpression
60from lsst.utils import doImportType
61from lsst.utils.introspection import get_class_of
62from lsst.utils.logging import VERBOSE, getLogger
64from ._butlerConfig import ButlerConfig
65from ._butlerRepoIndex import ButlerRepoIndex
66from ._deferredDatasetHandle import DeferredDatasetHandle
67from .core import (
68 AmbiguousDatasetError,
69 Config,
70 ConfigSubset,
71 DataCoordinate,
72 DataId,
73 DataIdValue,
74 DatasetRef,
75 DatasetType,
76 Datastore,
77 Dimension,
78 DimensionConfig,
79 FileDataset,
80 Progress,
81 StorageClassFactory,
82 Timespan,
83 ValidationError,
84)
85from .core.repoRelocation import BUTLER_ROOT_TAG
86from .core.utils import transactional
87from .registry import (
88 CollectionSearch,
89 CollectionType,
90 ConflictingDefinitionError,
91 DatasetIdGenEnum,
92 Registry,
93 RegistryConfig,
94 RegistryDefaults,
95)
96from .transfers import RepoExportContext
98log = getLogger(__name__)
101class ButlerValidationError(ValidationError):
102 """There is a problem with the Butler configuration."""
104 pass
107class PruneCollectionsArgsError(TypeError):
108 """Base class for errors relating to Butler.pruneCollections input
109 arguments.
110 """
112 pass
115class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
116 """Raised when purge and unstore are both required to be True, and
117 purge is True but unstore is False.
118 """
120 def __init__(self) -> None:
121 super().__init__("Cannot pass purge=True without unstore=True.")
124class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
125 """Raised when pruning a RUN collection but purge is False."""
127 def __init__(self, collectionType: CollectionType):
128 self.collectionType = collectionType
129 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
132class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
133 """Raised when purge is True but is not supported for the given
134 collection."""
136 def __init__(self, collectionType: CollectionType):
137 self.collectionType = collectionType
138 super().__init__(
139 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True."
140 )
143class Butler:
144 """Main entry point for the data access system.
146 Parameters
147 ----------
148 config : `ButlerConfig`, `Config`, or `str`, optional
149 Configuration. Anything acceptable to the
150 `ButlerConfig` constructor. If a directory path
151 is given the configuration will be read from a ``butler.yaml`` file in
152 that location. If `None` is given default values will be used.
153 butler : `Butler`, optional
154 If provided, construct a new Butler that uses the same registry and
155 datastore as the given one, but with the given collection and run.
156 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
157 arguments.
158 collections : `str` or `Iterable` [ `str` ], optional
159 An expression specifying the collections to be searched (in order) when
160 reading datasets.
161 This may be a `str` collection name or an iterable thereof.
162 See :ref:`daf_butler_collection_expressions` for more information.
163 These collections are not registered automatically and must be
164 manually registered before they are used by any method, but they may be
165 manually registered after the `Butler` is initialized.
166 run : `str`, optional
167 Name of the `~CollectionType.RUN` collection new datasets should be
168 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
169 ``collections`` will be set to ``[run]``. If not `None`, this
170 collection will automatically be registered. If this is not set (and
171 ``writeable`` is not set either), a read-only butler will be created.
172 searchPaths : `list` of `str`, optional
173 Directory paths to search when calculating the full Butler
174 configuration. Not used if the supplied config is already a
175 `ButlerConfig`.
176 writeable : `bool`, optional
177 Explicitly sets whether the butler supports write operations. If not
178 provided, a read-write butler is created if ``run`` is not `None`;
179 otherwise a read-only butler is created.
180 inferDefaults : `bool`, optional
181 If `True` (default) infer default data ID values from the values
182 present in the datasets in ``collections``: if all collections have the
183 same value (or no value) for a governor dimension, that value will be
184 the default for that dimension. Nonexistent collections are ignored.
185 If a default value is provided explicitly for a governor dimension via
186 ``**kwargs``, no default will be inferred for that dimension.
187 **kwargs : `str`
188 Default data ID key-value pairs. These may only identify "governor"
189 dimensions like ``instrument`` and ``skymap``.
191 Examples
192 --------
193 While there are many ways to control exactly how a `Butler` interacts with
194 the collections in its `Registry`, the most common cases are still simple.
196 For a read-only `Butler` that searches one collection, do::
198 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
200 For a read-write `Butler` that writes to and reads from a
201 `~CollectionType.RUN` collection::
203 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
205 The `Butler` passed to a ``PipelineTask`` is often much more complex,
206 because we want to write to one `~CollectionType.RUN` collection but read
207 from several others (as well)::
209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
210 collections=["u/alice/DM-50000/a",
211 "u/bob/DM-49998",
212 "HSC/defaults"])
214 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
215 Datasets will be read first from that run (since it appears first in the
216 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
218 Finally, one can always create a `Butler` with no collections::
220 butler = Butler("/path/to/repo", writeable=True)
222 This can be extremely useful when you just want to use ``butler.registry``,
223 e.g. for inserting dimension data or managing collections, or when the
224 collections you want to use with the butler are not consistent.
225 Passing ``writeable`` explicitly here is only necessary if you want to be
226 able to make changes to the repo - usually the value for ``writeable`` can
227 be guessed from the collection arguments provided, but it defaults to
228 `False` when there are no collection arguments.
229 """
231 def __init__(
232 self,
233 config: Union[Config, str, None] = None,
234 *,
235 butler: Optional[Butler] = None,
236 collections: Any = None,
237 run: Optional[str] = None,
238 searchPaths: Optional[List[str]] = None,
239 writeable: Optional[bool] = None,
240 inferDefaults: bool = True,
241 **kwargs: str,
242 ):
243 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
244 # Load registry, datastore, etc. from config or existing butler.
245 if butler is not None:
246 if config is not None or searchPaths is not None or writeable is not None:
247 raise TypeError(
248 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
249 )
250 self.registry = butler.registry.copy(defaults)
251 self.datastore = butler.datastore
252 self.storageClasses = butler.storageClasses
253 self._config: ButlerConfig = butler._config
254 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
255 else:
256 self._config = ButlerConfig(config, searchPaths=searchPaths)
257 try:
258 if "root" in self._config:
259 butlerRoot = self._config["root"]
260 else:
261 butlerRoot = self._config.configDir
262 if writeable is None:
263 writeable = run is not None
264 self.registry = Registry.fromConfig(
265 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
266 )
267 self.datastore = Datastore.fromConfig(
268 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
269 )
270 self.storageClasses = StorageClassFactory()
271 self.storageClasses.addFromConfig(self._config)
272 self._allow_put_of_predefined_dataset = self._config.get(
273 "allow_put_of_predefined_dataset", False
274 )
275 except Exception:
276 # Failures here usually mean that configuration is incomplete,
277 # just issue an error message which includes config file URI.
278 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
279 raise
281 if "run" in self._config or "collection" in self._config:
282 raise ValueError("Passing a run or collection via configuration is no longer supported.")
284 GENERATION: ClassVar[int] = 3
285 """This is a Generation 3 Butler.
287 This attribute may be removed in the future, once the Generation 2 Butler
288 interface has been fully retired; it should only be used in transitional
289 code.
290 """
292 @classmethod
293 def get_repo_uri(cls, label: str) -> ResourcePath:
294 """Look up the label in a butler repository index.
296 Parameters
297 ----------
298 label : `str`
299 Label of the Butler repository to look up.
301 Returns
302 -------
303 uri : `lsst.resources.ResourcePath`
304 URI to the Butler repository associated with the given label.
306 Raises
307 ------
308 KeyError
309 Raised if the label is not found in the index, or if an index
310 can not be found at all.
312 Notes
313 -----
314 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
315 information is discovered.
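        Examples
        --------
        An illustrative sketch (the ``"main"`` label is a placeholder; it
        must exist in the repository index for the call to succeed)::

            uri = Butler.get_repo_uri("main")
            butler = Butler(str(uri))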
316 """
317 return ButlerRepoIndex.get_repo_uri(label)
319 @classmethod
320 def get_known_repos(cls) -> Set[str]:
321 """Retrieve the list of known repository labels.
323 Returns
324 -------
325 repos : `set` of `str`
326 All the known labels. Can be empty if no index can be found.
328 Notes
329 -----
330 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
331 information is discovered.
332 """
333 return ButlerRepoIndex.get_known_repos()
335 @staticmethod
336 def makeRepo(
337 root: ResourcePathExpression,
338 config: Union[Config, str, None] = None,
339 dimensionConfig: Union[Config, str, None] = None,
340 standalone: bool = False,
341 searchPaths: Optional[List[str]] = None,
342 forceConfigRoot: bool = True,
343 outfile: Optional[ResourcePathExpression] = None,
344 overwrite: bool = False,
345 ) -> Config:
346 """Create an empty data repository by adding a butler.yaml config
347 to a repository root directory.
349 Parameters
350 ----------
351 root : `lsst.resources.ResourcePathExpression`
352 Path or URI to the root location of the new repository. Will be
353 created if it does not exist.
354 config : `Config` or `str`, optional
355 Configuration to write to the repository, after setting any
356 root-dependent Registry or Datastore config options. Can not
357 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
358 configuration will be used. Root-dependent config options
359 specified in this config are overwritten if ``forceConfigRoot``
360 is `True`.
361 dimensionConfig : `Config` or `str`, optional
362 Configuration for dimensions, will be used to initialize registry
363 database.
364 standalone : `bool`
365 If True, write all expanded defaults, not just customized or
366 repository-specific settings.
367 This (mostly) decouples the repository from the default
368 configuration, insulating it from changes to the defaults (which
369 may be good or bad, depending on the nature of the changes).
370 Future *additions* to the defaults will still be picked up when
371 initializing `Butlers` to repos created with ``standalone=True``.
372 searchPaths : `list` of `str`, optional
373 Directory paths to search when calculating the full butler
374 configuration.
375 forceConfigRoot : `bool`, optional
376 If `False`, any values present in the supplied ``config`` that
377 would normally be reset are not overridden and will appear
378 directly in the output config. This allows non-standard overrides
379 of the root directory for a datastore or registry to be given.
380 If this parameter is `True` the values for ``root`` will be
381 forced into the resulting config if appropriate.
382 outfile : `lsst.resources.ResourcePathExpression`, optional
383 If not-`None`, the output configuration will be written to this
384 location rather than into the repository itself. Can be a URI
385 string. Can refer to a directory that will be used to write
386 ``butler.yaml``.
387 overwrite : `bool`, optional
388 Create a new configuration file even if one already exists
389 in the specified output location. Default is to raise
390 an exception.
392 Returns
393 -------
394 config : `Config`
395 The updated `Config` instance written to the repo.
397 Raises
398 ------
399 ValueError
400 Raised if a ButlerConfig or ConfigSubset is passed instead of a
401 regular Config (as these subclasses would make it impossible to
402 support ``standalone=False``).
403 FileExistsError
404 Raised if the output config file already exists.
405 os.error
406 Raised if the directory does not exist, exists but is not a
407 directory, or cannot be created.
409 Notes
410 -----
411 Note that when ``standalone=False`` (the default), the configuration
412 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
413 construct the repository should also be used to construct any Butlers
414 to avoid configuration inconsistencies.
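        Examples
        --------
        A minimal sketch of creating a repository with default configuration
        and then constructing a `Butler` for it (the path is a placeholder)::

            Butler.makeRepo("/path/to/repo")
            butler = Butler("/path/to/repo", writeable=True)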
415 """
416 if isinstance(config, (ButlerConfig, ConfigSubset)):
417 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
419 # Ensure that the root of the repository exists or can be made
420 uri = ResourcePath(root, forceDirectory=True)
421 uri.mkdir()
423 config = Config(config)
425 # If we are creating a new repo from scratch with relative roots,
426 # do not propagate an explicit root from the config file
427 if "root" in config:
428 del config["root"]
430 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
431 imported_class = doImportType(full["datastore", "cls"])
432 if not issubclass(imported_class, Datastore):
433 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
434 datastoreClass: Type[Datastore] = imported_class
435 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
437 # if key exists in given config, parse it, otherwise parse the defaults
438 # in the expanded config
439 if config.get(("registry", "db")):
440 registryConfig = RegistryConfig(config)
441 else:
442 registryConfig = RegistryConfig(full)
443 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
444 if defaultDatabaseUri is not None:
445 Config.updateParameters(
446 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
447 )
448 else:
449 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
451 if standalone:
452 config.merge(full)
453 else:
454 # Always expand the registry.managers section into the per-repo
455 # config, because after the database schema is created, it's not
456 # allowed to change anymore. Note that in the standalone=True
457 # branch, _everything_ in the config is expanded, so there's no
458 # need to special case this.
459 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
460 configURI: Union[str, ResourcePathExpression]
461 if outfile is not None:
462 # When writing to a separate location we must include
463 # the root of the butler repo in the config else it won't know
464 # where to look.
465 config["root"] = uri.geturl()
466 configURI = outfile
467 else:
468 configURI = uri
469 config.dumpToUri(configURI, overwrite=overwrite)
471 # Create Registry and populate tables
472 registryConfig = RegistryConfig(config.get("registry"))
473 dimensionConfig = DimensionConfig(dimensionConfig)
474 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
476 log.verbose("Wrote new Butler configuration file to %s", configURI)
478 return config
480 @classmethod
481 def _unpickle(
482 cls,
483 config: ButlerConfig,
484 collections: Optional[CollectionSearch],
485 run: Optional[str],
486 defaultDataId: Dict[str, str],
487 writeable: bool,
488 ) -> Butler:
489 """Callable used to unpickle a Butler.
491 We prefer not to use ``Butler.__init__`` directly so we can force some
492 of its many arguments to be keyword-only (note that ``__reduce__``
493 can only invoke callables with positional arguments).
495 Parameters
496 ----------
497 config : `ButlerConfig`
498 Butler configuration, already coerced into a true `ButlerConfig`
499 instance (and hence after any search paths for overrides have been
500 utilized).
501 collections : `CollectionSearch`
502 Names of the default collections to read from.
503 run : `str`, optional
504 Name of the default `~CollectionType.RUN` collection to write to.
505 defaultDataId : `dict` [ `str`, `str` ]
506 Default data ID values.
507 writeable : `bool`
508 Whether the Butler should support write operations.
510 Returns
511 -------
512 butler : `Butler`
513 A new `Butler` instance.
514 """
515 # MyPy doesn't recognize that the kwargs below are totally valid; it
516 # seems to think '**defaultDataId' is a _positional_ argument!
517 return cls(
518 config=config,
519 collections=collections,
520 run=run,
521 writeable=writeable,
522 **defaultDataId, # type: ignore
523 )
525 def __reduce__(self) -> tuple:
526 """Support pickling."""
527 return (
528 Butler._unpickle,
529 (
530 self._config,
531 self.collections,
532 self.run,
533 self.registry.defaults.dataId.byName(),
534 self.registry.isWriteable(),
535 ),
536 )
538 def __str__(self) -> str:
539 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
540 self.collections, self.run, self.datastore, self.registry
541 )
543 def isWriteable(self) -> bool:
544 """Return `True` if this `Butler` supports write operations."""
545 return self.registry.isWriteable()
547 @contextlib.contextmanager
548 def transaction(self) -> Iterator[None]:
549 """Context manager supporting `Butler` transactions.
551 Transactions can be nested.
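        For example (an illustrative sketch; the dataset type, data ID, and
        run are placeholders), registry and datastore changes made inside the
        block are rolled back together if an exception escapes it::

            with butler.transaction():
                butler.put(obj, "src", instrument="HSC", visit=1, detector=0,
                           run="u/alice/DM-50000/a")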
552 """
553 with self.registry.transaction():
554 with self.datastore.transaction():
555 yield
557 def _standardizeArgs(
558 self,
559 datasetRefOrType: Union[DatasetRef, DatasetType, str],
560 dataId: Optional[DataId] = None,
561 **kwargs: Any,
562 ) -> Tuple[DatasetType, Optional[DataId]]:
563 """Standardize the arguments passed to several Butler APIs.
565 Parameters
566 ----------
567 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
568 When `DatasetRef` the `dataId` should be `None`.
569 Otherwise the `DatasetType` or name thereof.
570 dataId : `dict` or `DataCoordinate`
571 A `dict` of `Dimension` link name, value pairs that label the
572 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
573 should be provided as the second argument.
574 **kwargs
575 Additional keyword arguments used to augment or construct a
576 `DataCoordinate`. See `DataCoordinate.standardize`
577 parameters.
579 Returns
580 -------
581 datasetType : `DatasetType`
582 A `DatasetType` instance extracted from ``datasetRefOrType``.
583 dataId : `dict` or `DataId`, optional
584 Argument that can be used (along with ``kwargs``) to construct a
585 `DataId`.
587 Notes
588 -----
589 Butler APIs that conceptually need a DatasetRef also allow passing a
590 `DatasetType` (or the name of one) and a `DataId` (or a dict and
591 keyword arguments that can be used to construct one) separately. This
592 method accepts those arguments and always returns a true `DatasetType`
593 and a `DataId` or `dict`.
595 Standardization of `dict` vs `DataId` is best handled by passing the
596 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
597 generally similarly flexible.
598 """
599 externalDatasetType: Optional[DatasetType] = None
600 internalDatasetType: Optional[DatasetType] = None
601 if isinstance(datasetRefOrType, DatasetRef):
602 if dataId is not None or kwargs:
603 raise ValueError("DatasetRef given, cannot use dataId as well")
604 externalDatasetType = datasetRefOrType.datasetType
605 dataId = datasetRefOrType.dataId
606 else:
607 # Don't check whether DataId is provided, because Registry APIs
608 # can usually construct a better error message when it wasn't.
609 if isinstance(datasetRefOrType, DatasetType):
610 externalDatasetType = datasetRefOrType
611 else:
612 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
614 # Check that they are self-consistent
615 if externalDatasetType is not None:
616 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
617 if externalDatasetType != internalDatasetType:
618 raise ValueError(
619 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
620 f"registry definition ({internalDatasetType})"
621 )
623 assert internalDatasetType is not None
624 return internalDatasetType, dataId
626 def _rewrite_data_id(
627 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any
628 ) -> Tuple[Optional[DataId], Dict[str, Any]]:
629 """Rewrite a data ID taking into account dimension records.
631 Take a Data ID and keyword args and rewrite it if necessary to
632 allow the user to specify dimension records rather than dimension
633 primary values.
635 This allows a user to include a dataId dict with keys of
636 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
637 the integer exposure ID. It also allows a string to be given
638 for a dimension value rather than the integer ID if that is more
639 convenient. For example, rather than having to specify the
640 detector with ``detector.full_name``, a string given for ``detector``
641 will be interpreted as the full name and converted to the integer
642 value.
644 Keyword arguments can also use strings for dimensions like detector
645 and exposure, but Python does not allow them to include ``.``, so
646 the ``exposure.day_obs`` syntax cannot be used in a keyword
647 argument.
649 Parameters
650 ----------
651 dataId : `dict` or `DataCoordinate`
652 A `dict` of `Dimension` link name, value pairs that will label the
653 `DatasetRef` within a Collection.
654 datasetType : `DatasetType`
655 The dataset type associated with this dataId. Required to
656 determine the relevant dimensions.
657 **kwargs
658 Additional keyword arguments used to augment or construct a
659 `DataId`. See `DataId` parameters.
661 Returns
662 -------
663 dataId : `dict` or `DataCoordinate`
664 The possibly rewritten dataId. If given a `DataCoordinate` and
665 no keyword arguments, the original dataId will be returned
666 unchanged.
667 **kwargs : `dict`
668 Any unused keyword arguments (would normally be empty dict).
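        Examples
        --------
        An illustrative sketch (the values are placeholders): a data ID
        given as::

            {"exposure.day_obs": 20200101, "exposure.seq_num": 5,
             "detector": "S00", "instrument": "MyCam"}

        would be rewritten so that ``exposure`` and ``detector`` are replaced
        by their integer primary key values, looked up from the matching
        dimension records, before being passed on to `Registry` APIs.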
669 """
670 # Do nothing if we have a standalone DataCoordinate.
671 if isinstance(dataId, DataCoordinate) and not kwargs:
672 return dataId, kwargs
674 # Process dimension records that are using record information
675 # rather than ids
676 newDataId: Dict[str, DataIdValue] = {}
677 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
679 # if all the dataId comes from keyword parameters we do not need
680 # to do anything here because they can't be of the form
681 # exposure.obs_id because a "." is not allowed in a keyword parameter.
682 if dataId:
683 for k, v in dataId.items():
684 # If we have a Dimension we do not need to do anything
685 # because it cannot be a compound key.
686 if isinstance(k, str) and "." in k:
687 # Someone is using a more human-readable dataId
688 dimensionName, record = k.split(".", 1)
689 byRecord[dimensionName][record] = v
690 elif isinstance(k, Dimension):
691 newDataId[k.name] = v
692 else:
693 newDataId[k] = v
695 # Go through the updated dataId and check the type in case someone is
696 # using an alternate key. We have already filtered out the compound
697 # keys in dimension.record format.
698 not_dimensions = {}
700 # Will need to look in the dataId and the keyword arguments
701 # and will remove them if they need to be fixed or are unrecognized.
702 for dataIdDict in (newDataId, kwargs):
703 # Use a list so we can adjust the dict safely in the loop
704 for dimensionName in list(dataIdDict):
705 value = dataIdDict[dimensionName]
706 try:
707 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
708 except KeyError:
709 # This is not a real dimension
710 not_dimensions[dimensionName] = value
711 del dataIdDict[dimensionName]
712 continue
714 # Convert an integral type to an explicit int to simplify
715 # comparisons here
716 if isinstance(value, numbers.Integral):
717 value = int(value)
719 if not isinstance(value, dimension.primaryKey.getPythonType()):
720 for alternate in dimension.alternateKeys:
721 if isinstance(value, alternate.getPythonType()):
722 byRecord[dimensionName][alternate.name] = value
723 del dataIdDict[dimensionName]
724 log.debug(
725 "Converting dimension %s to %s.%s=%s",
726 dimensionName,
727 dimensionName,
728 alternate.name,
729 value,
730 )
731 break
732 else:
733 log.warning(
734 "Type mismatch found for value '%r' provided for dimension %s. "
735 "Could not find matching alternative (primary key has type %s) "
736 "so attempting to use as-is.",
737 value,
738 dimensionName,
739 dimension.primaryKey.getPythonType(),
740 )
742 # By this point kwargs and newDataId should only include valid
743 # dimensions. Merge kwargs in to the new dataId and log if there
744 # are dimensions in both (rather than calling update).
745 for k, v in kwargs.items():
746 if k in newDataId and newDataId[k] != v:
747 log.debug(
748 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
749 )
750 newDataId[k] = v
751 # No need to retain any values in kwargs now.
752 kwargs = {}
754 # If we have some unrecognized dimensions we have to try to connect
755 # them to records in other dimensions. This is made more complicated
756 # by some dimensions having records with clashing names. A mitigation
757 # is that we can tell by this point which dimensions are missing
758 # for the DatasetType but this does not work for calibrations
759 # where additional dimensions can be used to constrain the temporal
760 # axis.
761 if not_dimensions:
762 # Search for all dimensions even if we have been given a value
763 # explicitly. In some cases records are given as well as the
764 # actual dimension and this should not be an error if they
765 # match.
766 mandatoryDimensions = datasetType.dimensions.names # - provided
768 candidateDimensions: Set[str] = set()
769 candidateDimensions.update(mandatoryDimensions)
771 # For calibrations we may well be needing temporal dimensions
772 # so rather than always including all dimensions in the scan
773 # restrict things a little. It is still possible for there
774 # to be confusion over day_obs in visit vs exposure for example.
775 # If we are not searching calibration collections things may
776 # fail but they are going to fail anyway because of the
777 # ambiguity of the dataId...
778 if datasetType.isCalibration():
779 for dim in self.registry.dimensions.getStaticDimensions():
780 if dim.temporal:
781 candidateDimensions.add(str(dim))
783 # Look up table for the first association with a dimension
784 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
786 # Keep track of whether an item is associated with multiple
787 # dimensions.
788 counter: Counter[str] = Counter()
789 assigned: Dict[str, Set[str]] = defaultdict(set)
791 # Go through the missing dimensions and associate the
792 # given names with records within those dimensions
793 matched_dims = set()
794 for dimensionName in candidateDimensions:
795 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
796 fields = dimension.metadata.names | dimension.uniqueKeys.names
797 for field in not_dimensions:
798 if field in fields:
799 guessedAssociation[dimensionName][field] = not_dimensions[field]
800 counter[dimensionName] += 1
801 assigned[field].add(dimensionName)
802 matched_dims.add(field)
804 # Calculate the fields that matched nothing.
805 never_found = set(not_dimensions) - matched_dims
807 if never_found:
808 raise ValueError(f"Unrecognized keyword args given: {never_found}")
810 # There is a chance we have allocated a single dataId item
811 # to multiple dimensions. Need to decide which should be retained.
812 # For now assume that the most popular alternative wins.
813 # This means that day_obs with seq_num will result in
814 # exposure.day_obs and not visit.day_obs
815 # Also prefer an explicitly missing dimension over an inferred
816 # temporal dimension.
817 for fieldName, assignedDimensions in assigned.items():
818 if len(assignedDimensions) > 1:
819 # Pick the most popular (preferring mandatory dimensions)
820 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
821 if requiredButMissing:
822 candidateDimensions = requiredButMissing
823 else:
824 candidateDimensions = assignedDimensions
826 # Select the relevant items and get a new restricted
827 # counter.
828 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
829 duplicatesCounter: Counter[str] = Counter()
830 duplicatesCounter.update(theseCounts)
832 # Choose the most common. If they are equally common
833 # we will pick the one that was found first.
834 # Returns a list of tuples
835 selected = duplicatesCounter.most_common(1)[0][0]
837 log.debug(
838 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
839 " Removed ambiguity by choosing dimension %s.",
840 fieldName,
841 ", ".join(assignedDimensions),
842 selected,
843 )
845 for candidateDimension in assignedDimensions:
846 if candidateDimension != selected:
847 del guessedAssociation[candidateDimension][fieldName]
849 # Update the record look up dict with the new associations
850 for dimensionName, values in guessedAssociation.items():
851 if values: # A dict might now be empty
852 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
853 byRecord[dimensionName].update(values)
855 if byRecord:
856 # Some record specifiers were found so we need to convert
857 # them to the Id form
858 for dimensionName, values in byRecord.items():
859 if dimensionName in newDataId:
860 log.warning(
861 "DataId specified explicit %s dimension value of %s in addition to"
862 " general record specifiers for it of %s. Ignoring record information.",
863 dimensionName,
864 newDataId[dimensionName],
865 str(values),
866 )
867 # Get the actual record and compare with these values.
868 try:
869 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId))
870 except LookupError:
871 raise ValueError(
872 f"Could not find dimension '{dimensionName}'"
873 f" with dataId {newDataId} as part of comparing with"
874 f" record values {byRecord[dimensionName]}"
875 ) from None
876 if len(recs) == 1:
877 errmsg: List[str] = []
878 for k, v in values.items():
879 if (recval := getattr(recs[0], k)) != v:
880 errmsg.append(f"{k}({recval} != {v})")
881 if errmsg:
882 raise ValueError(
883 f"Dimension {dimensionName} in dataId has explicit value"
884 " inconsistent with records: " + ", ".join(errmsg)
885 )
886 else:
887 # Multiple matches for an explicit dimension
888 # should never happen but let downstream complain.
889 pass
890 continue
892 # Build up a WHERE expression
893 bind = {k: v for k, v in values.items()}
894 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
896 # Hopefully we get a single record that matches
897 records = set(
898 self.registry.queryDimensionRecords(
899 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
900 )
901 )
903 if len(records) != 1:
904 if len(records) > 1:
905 log.debug("Received %d records from constraints of %s", len(records), str(values))
906 for r in records:
907 log.debug("- %s", str(r))
908 raise ValueError(
909 f"DataId specification for dimension {dimensionName} is not"
910 f" uniquely constrained to a single dataset by {values}."
911 f" Got {len(records)} results."
912 )
913 raise ValueError(
914 f"DataId specification for dimension {dimensionName} matched no"
915 f" records when constrained by {values}"
916 )
918 # Get the primary key from the real dimension object
919 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
920 if not isinstance(dimension, Dimension):
921 raise RuntimeError(
922 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
923 )
924 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
926 return newDataId, kwargs
928 def _findDatasetRef(
929 self,
930 datasetRefOrType: Union[DatasetRef, DatasetType, str],
931 dataId: Optional[DataId] = None,
932 *,
933 collections: Any = None,
934 allowUnresolved: bool = False,
935 **kwargs: Any,
936 ) -> DatasetRef:
937 """Shared logic for methods that start with a search for a dataset in
938 the registry.
940 Parameters
941 ----------
942 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
943 When `DatasetRef` the `dataId` should be `None`.
944 Otherwise the `DatasetType` or name thereof.
945 dataId : `dict` or `DataCoordinate`, optional
946 A `dict` of `Dimension` link name, value pairs that label the
947 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
948 should be provided as the first argument.
949 collections : Any, optional
950 Collections to be searched, overriding ``self.collections``.
951 Can be any of the types supported by the ``collections`` argument
952 to butler construction.
953 allowUnresolved : `bool`, optional
954 If `True`, return an unresolved `DatasetRef` if finding a resolved
955 one in the `Registry` fails. Defaults to `False`.
956 **kwargs
957 Additional keyword arguments used to augment or construct a
958 `DataId`. See `DataId` parameters.
960 Returns
961 -------
962 ref : `DatasetRef`
963 A reference to the dataset identified by the given arguments.
965 Raises
966 ------
967 LookupError
968 Raised if no matching dataset exists in the `Registry` (and
969 ``allowUnresolved is False``).
970 ValueError
971 Raised if a resolved `DatasetRef` was passed as an input, but it
972 differs from the one found in the registry.
973 TypeError
974 Raised if no collections were provided.
975 """
976 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
977 if isinstance(datasetRefOrType, DatasetRef):
978 idNumber = datasetRefOrType.id
979 else:
980 idNumber = None
981 timespan: Optional[Timespan] = None
983 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
985 if datasetType.isCalibration():
986 # Because this is a calibration dataset, first try to
987 # standardize the data ID without restricting the dimensions to
988 # those of the dataset type requested, because there may be extra
989 # dimensions that provide temporal information for a validity-range
990 # lookup.
991 dataId = DataCoordinate.standardize(
992 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs
993 )
994 if dataId.graph.temporal:
995 dataId = self.registry.expandDataId(dataId)
996 timespan = dataId.timespan
997 else:
998 # Standardize the data ID to just the dimensions of the dataset
999 # type instead of letting registry.findDataset do it, so we get the
1000 # result even if no dataset is found.
1001 dataId = DataCoordinate.standardize(
1002 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs
1003 )
1004 # Always lookup the DatasetRef, even if one is given, to ensure it is
1005 # present in the current collection.
1006 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
1007 if ref is None:
1008 if allowUnresolved:
1009 return DatasetRef(datasetType, dataId)
1010 else:
1011 if collections is None:
1012 collections = self.registry.defaults.collections
1013 raise LookupError(
1014 f"Dataset {datasetType.name} with data ID {dataId} "
1015 f"could not be found in collections {collections}."
1016 )
1017 if idNumber is not None and idNumber != ref.id:
1018 if collections is None:
1019 collections = self.registry.defaults.collections
1020 raise ValueError(
1021 f"DatasetRef.id provided ({idNumber}) does not match "
1022 f"id ({ref.id}) in registry in collections {collections}."
1023 )
1024 return ref
1026 @transactional
1027 def put(
1028 self,
1029 obj: Any,
1030 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1031 dataId: Optional[DataId] = None,
1032 *,
1033 run: Optional[str] = None,
1034 **kwargs: Any,
1035 ) -> DatasetRef:
1036 """Store and register a dataset.
1038 Parameters
1039 ----------
1040 obj : `object`
1041 The dataset.
1042 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1043 When `DatasetRef` is provided, ``dataId`` should be `None`.
1044 Otherwise the `DatasetType` or name thereof.
1045 dataId : `dict` or `DataCoordinate`
1046 A `dict` of `Dimension` link name, value pairs that label the
1047 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1048 should be provided as the second argument.
1049 run : `str`, optional
1050 The name of the run the dataset should be added to, overriding
1051 ``self.run``.
1052 **kwargs
1053 Additional keyword arguments used to augment or construct a
1054 `DataCoordinate`. See `DataCoordinate.standardize`
1055 parameters.
1057 Returns
1058 -------
1059 ref : `DatasetRef`
1060 A reference to the stored dataset, updated with the correct id if
1061 given.
1063 Raises
1064 ------
1065 TypeError
1066 Raised if the butler is read-only or if no run has been provided.
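        Examples
        --------
        An illustrative sketch (the dataset type, data ID values, and run
        are placeholders)::

            dataId = {"instrument": "HSC", "visit": 1, "detector": 0}
            ref = butler.put(catalog, "src", dataId, run="u/alice/DM-50000/a")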
1067 """
1068 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
1069 if not self.isWriteable():
1070 raise TypeError("Butler is read-only.")
1071 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
1072 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
1073 raise ValueError("DatasetRef must not be in registry, must have None id")
1075 # Handle dimension records in dataId
1076 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
1078 # Add Registry Dataset entry.
1079 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
1081 # For an execution butler the datasets will be pre-defined.
1082 # If the butler is configured that way datasets should only be inserted
1083 # if they do not already exist in registry. Trying and catching
1084 # ConflictingDefinitionError will not work because the transaction
1085 # will be corrupted. Instead, in this mode always check first.
1086 ref = None
1087 ref_is_predefined = False
1088 if self._allow_put_of_predefined_dataset:
1089 # Get the matching ref for this run.
1090 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId)
1092 if ref:
1093 # Must be expanded form for datastore templating
1094 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
1095 ref = ref.expanded(dataId)
1096 ref_is_predefined = True
1098 if not ref:
1099 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
1101 # If the ref is predefined it is possible that the datastore also
1102 # has the record. Asking datastore to put it again will result in
1103 # the artifact being recreated, overwriting the previous one; the
1104 # subsequent failure to write the record would then cause the artifact
1105 # to be removed. Much safer to ask first before attempting to
1106 # overwrite. Race conditions should not be an issue for the
1107 # execution butler environment.
1108 if ref_is_predefined:
1109 if self.datastore.knows(ref):
1110 raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.")
1112 self.datastore.put(obj, ref)
1114 return ref
1116 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
1117 """Retrieve a stored dataset.
1119 Unlike `Butler.get`, this method allows datasets outside the Butler's
1120 collection to be read as long as the `DatasetRef` that identifies them
1121 can be obtained separately.
1123 Parameters
1124 ----------
1125 ref : `DatasetRef`
1126 Resolved reference to an already stored dataset.
1127 parameters : `dict`
1128 Additional StorageClass-defined options to control reading,
1129 typically used to efficiently read only a subset of the dataset.
1131 Returns
1132 -------
1133 obj : `object`
1134 The dataset.
1135 """
1136 return self.datastore.get(ref, parameters=parameters)
1138 def getDirectDeferred(
1139 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
1140 ) -> DeferredDatasetHandle:
1141 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1142 from a resolved `DatasetRef`.
1144 Parameters
1145 ----------
1146 ref : `DatasetRef`
1147 Resolved reference to an already stored dataset.
1148 parameters : `dict`
1149 Additional StorageClass-defined options to control reading,
1150 typically used to efficiently read only a subset of the dataset.
1152 Returns
1153 -------
1154 obj : `DeferredDatasetHandle`
1155 A handle which can be used to retrieve a dataset at a later time.
1157 Raises
1158 ------
1159 AmbiguousDatasetError
1160 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1161 """
1162 if ref.id is None:
1163 raise AmbiguousDatasetError(
1164 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1165 )
1166 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1168 def getDeferred(
1169 self,
1170 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1171 dataId: Optional[DataId] = None,
1172 *,
1173 parameters: Union[dict, None] = None,
1174 collections: Any = None,
1175 **kwargs: Any,
1176 ) -> DeferredDatasetHandle:
1177 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1178 after an immediate registry lookup.
1180 Parameters
1181 ----------
1182 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1183 When `DatasetRef` the `dataId` should be `None`.
1184 Otherwise the `DatasetType` or name thereof.
1185 dataId : `dict` or `DataCoordinate`, optional
1186 A `dict` of `Dimension` link name, value pairs that label the
1187 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1188 should be provided as the first argument.
1189 parameters : `dict`
1190 Additional StorageClass-defined options to control reading,
1191 typically used to efficiently read only a subset of the dataset.
1192 collections : Any, optional
1193 Collections to be searched, overriding ``self.collections``.
1194 Can be any of the types supported by the ``collections`` argument
1195 to butler construction.
1196 **kwargs
1197 Additional keyword arguments used to augment or construct a
1198 `DataId`. See `DataId` parameters.
1200 Returns
1201 -------
1202 obj : `DeferredDatasetHandle`
1203 A handle which can be used to retrieve a dataset at a later time.
1205 Raises
1206 ------
1207 LookupError
1208 Raised if no matching dataset exists in the `Registry` (and
1209 ``allowUnresolved is False``).
1210 ValueError
1211 Raised if a resolved `DatasetRef` was passed as an input, but it
1212 differs from the one found in the registry.
1213 TypeError
1214 Raised if no collections were provided.
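        Examples
        --------
        An illustrative sketch (names are placeholders); the registry lookup
        happens immediately, but the datastore read is deferred until the
        handle is used::

            handle = butler.getDeferred("src", instrument="HSC", visit=1, detector=0)
            catalog = handle.get()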
1215 """
1216 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1217 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1219 def get(
1220 self,
1221 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1222 dataId: Optional[DataId] = None,
1223 *,
1224 parameters: Optional[Dict[str, Any]] = None,
1225 collections: Any = None,
1226 **kwargs: Any,
1227 ) -> Any:
1228 """Retrieve a stored dataset.
1230 Parameters
1231 ----------
1232 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1233 When `DatasetRef` the `dataId` should be `None`.
1234 Otherwise the `DatasetType` or name thereof.
1235 dataId : `dict` or `DataCoordinate`
1236 A `dict` of `Dimension` link name, value pairs that label the
1237 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1238 should be provided as the first argument.
1239 parameters : `dict`
1240 Additional StorageClass-defined options to control reading,
1241 typically used to efficiently read only a subset of the dataset.
1242 collections : Any, optional
1243 Collections to be searched, overriding ``self.collections``.
1244 Can be any of the types supported by the ``collections`` argument
1245 to butler construction.
1246 **kwargs
1247 Additional keyword arguments used to augment or construct a
1248 `DataCoordinate`. See `DataCoordinate.standardize`
1249 parameters.
1251 Returns
1252 -------
1253 obj : `object`
1254 The dataset.
1256 Raises
1257 ------
1258 ValueError
1259 Raised if a resolved `DatasetRef` was passed as an input, but it
1260 differs from the one found in the registry.
1261 LookupError
1262 Raised if no matching dataset exists in the `Registry`.
1263 TypeError
1264 Raised if no collections were provided.
1266 Notes
1267 -----
1268 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1269 this method requires that the given data ID include temporal dimensions
1270 beyond the dimensions of the dataset type itself, in order to find the
1271 dataset with the appropriate validity range. For example, a "bias"
1272 dataset with native dimensions ``{instrument, detector}`` could be
1273 fetched with a ``{instrument, detector, exposure}`` data ID, because
1274 ``exposure`` is a temporal dimension.
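        Examples
        --------
        Continuing the calibration example above (an illustrative sketch;
        the data ID values and collection name are placeholders)::

            bias = butler.get("bias", instrument="HSC", detector=0, exposure=1,
                              collections="HSC/calib")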
1275 """
1276 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1277 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1278 return self.getDirect(ref, parameters=parameters)
1280 def getURIs(
1281 self,
1282 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1283 dataId: Optional[DataId] = None,
1284 *,
1285 predict: bool = False,
1286 collections: Any = None,
1287 run: Optional[str] = None,
1288 **kwargs: Any,
1289 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]:
1290 """Return the URIs associated with the dataset.
1292 Parameters
1293 ----------
1294 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1295 When `DatasetRef` the `dataId` should be `None`.
1296 Otherwise the `DatasetType` or name thereof.
1297 dataId : `dict` or `DataCoordinate`
1298 A `dict` of `Dimension` link name, value pairs that label the
1299 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1300 should be provided as the first argument.
1301 predict : `bool`
1302 If `True`, allow URIs to be returned of datasets that have not
1303 been written.
1304 collections : Any, optional
1305 Collections to be searched, overriding ``self.collections``.
1306 Can be any of the types supported by the ``collections`` argument
1307 to butler construction.
1308 run : `str`, optional
1309 Run to use for predictions, overriding ``self.run``.
1310 **kwargs
1311 Additional keyword arguments used to augment or construct a
1312 `DataCoordinate`. See `DataCoordinate.standardize`
1313 parameters.
1315 Returns
1316 -------
1317 primary : `lsst.resources.ResourcePath`
1318 The URI to the primary artifact associated with this dataset.
1319 If the dataset was disassembled within the datastore this
1320 may be `None`.
1321 components : `dict`
1322 URIs to any components associated with the dataset artifact.
1323 Can be empty if there are no components.
1324 """
1325 ref = self._findDatasetRef(
1326 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs
1327 )
1328 if ref.id is None: # only possible if predict is True
1329 if run is None:
1330 run = self.run
1331 if run is None:
1332 raise TypeError("Cannot predict location with run=None.")
1333 # Lie about ID, because we can't guess it, and only
1334 # Datastore.getURIs() will ever see it (and it doesn't use it).
1335 ref = ref.resolved(id=0, run=run)
1336 return self.datastore.getURIs(ref, predict)
1338 def getURI(
1339 self,
1340 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1341 dataId: Optional[DataId] = None,
1342 *,
1343 predict: bool = False,
1344 collections: Any = None,
1345 run: Optional[str] = None,
1346 **kwargs: Any,
1347 ) -> ResourcePath:
1348 """Return the URI to the Dataset.
1350 Parameters
1351 ----------
1352 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1353 When `DatasetRef` the `dataId` should be `None`.
1354 Otherwise the `DatasetType` or name thereof.
1355 dataId : `dict` or `DataCoordinate`
1356 A `dict` of `Dimension` link name, value pairs that label the
1357 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1358 should be provided as the first argument.
1359 predict : `bool`
1360 If `True`, allow URIs to be returned of datasets that have not
1361 been written.
1362 collections : Any, optional
1363 Collections to be searched, overriding ``self.collections``.
1364 Can be any of the types supported by the ``collections`` argument
1365 to butler construction.
1366 run : `str`, optional
1367 Run to use for predictions, overriding ``self.run``.
1368 **kwargs
1369 Additional keyword arguments used to augment or construct a
1370 `DataCoordinate`. See `DataCoordinate.standardize`
1371 parameters.
1373 Returns
1374 -------
1375 uri : `lsst.resources.ResourcePath`
1376 URI pointing to the Dataset within the datastore. If the
1377 Dataset does not exist in the datastore, and if ``predict`` is
1378 `True`, the URI will be a prediction and will include a URI
1379 fragment "#predicted".
1380 If the datastore does not have entities that relate well
1381 to the concept of a URI the returned URI string will be
1382 descriptive. The returned URI is not guaranteed to be obtainable.
1384 Raises
1385 ------
1386 LookupError
1387 Raised if a URI has been requested for a dataset that does not
1388 exist and guessing is not allowed.
1389 ValueError
1390 Raised if a resolved `DatasetRef` was passed as an input, but it
1391 differs from the one found in the registry.
1392 TypeError
1393 Raised if no collections were provided.
1394 RuntimeError
1395 Raised if a URI is requested for a dataset that consists of
1396 multiple artifacts.
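        Examples
        --------
        An illustrative sketch (names are placeholders), including
        predicting the URI of a dataset that has not been written yet::

            uri = butler.getURI("src", instrument="HSC", visit=1, detector=0)
            future_uri = butler.getURI("src", instrument="HSC", visit=2, detector=0,
                                       predict=True, run="u/alice/DM-50000/a")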
1397 """
1398 primary, components = self.getURIs(
1399 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1400 )
1402 if primary is None or components:
1403 raise RuntimeError(
1404 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1405 "Use Butler.getURIs() instead."
1406 )
1407 return primary
1409 def retrieveArtifacts(
1410 self,
1411 refs: Iterable[DatasetRef],
1412 destination: ResourcePathExpression,
1413 transfer: str = "auto",
1414 preserve_path: bool = True,
1415 overwrite: bool = False,
1416 ) -> List[ResourcePath]:
1417 """Retrieve the artifacts associated with the supplied refs.
1419 Parameters
1420 ----------
1421 refs : iterable of `DatasetRef`
1422 The datasets for which artifacts are to be retrieved.
1423 A single ref can result in multiple artifacts. The refs must
1424 be resolved.
1425 destination : `lsst.resources.ResourcePath` or `str`
1426 Location to write the artifacts.
1427 transfer : `str`, optional
1428 Method to use to transfer the artifacts. Must be one of the options
1429 supported by `~lsst.resources.ResourcePath.transfer_from()`.
1430 "move" is not allowed.
1431 preserve_path : `bool`, optional
1432 If `True` the full path of the artifact within the datastore
1433 is preserved. If `False` the final file component of the path
1434 is used.
1435 overwrite : `bool`, optional
1436 If `True` allow transfers to overwrite existing files at the
1437 destination.
1439 Returns
1440 -------
1441 targets : `list` of `lsst.resources.ResourcePath`
1442 URIs of file artifacts in destination location. Order is not
1443 preserved.
1445 Notes
1446 -----
1447 For non-file datastores the artifacts written to the destination
1448 may not match the representation inside the datastore. For example
1449 a hierarchical data structure in a NoSQL database may well be stored
1450 as a JSON file.
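        Examples
        --------
        An illustrative sketch (the dataset type, collection, and
        destination are placeholders), copying the file artifacts for a
        query result to a local directory::

            refs = butler.registry.queryDatasets("src", collections="u/alice/DM-50000/a")
            butler.retrieveArtifacts(refs, "/tmp/export", transfer="copy")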
1451 """
1452 return self.datastore.retrieveArtifacts(
1453 refs,
1454 ResourcePath(destination),
1455 transfer=transfer,
1456 preserve_path=preserve_path,
1457 overwrite=overwrite,
1458 )
1460 def datasetExists(
1461 self,
1462 datasetRefOrType: Union[DatasetRef, DatasetType, str],
1463 dataId: Optional[DataId] = None,
1464 *,
1465 collections: Any = None,
1466 **kwargs: Any,
1467 ) -> bool:
1468 """Return True if the Dataset is actually present in the Datastore.
1470 Parameters
1471 ----------
1472 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1473 When `DatasetRef` the `dataId` should be `None`.
1474 Otherwise the `DatasetType` or name thereof.
1475 dataId : `dict` or `DataCoordinate`
1476 A `dict` of `Dimension` link name, value pairs that label the
1477 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1478 should be provided as the first argument.
1479 collections : Any, optional
1480 Collections to be searched, overriding ``self.collections``.
1481 Can be any of the types supported by the ``collections`` argument
1482 to butler construction.
1483 **kwargs
1484 Additional keyword arguments used to augment or construct a
1485 `DataCoordinate`. See `DataCoordinate.standardize`
1486 parameters.
1488 Raises
1489 ------
1490 LookupError
1491 Raised if the dataset is not even present in the Registry.
1492 ValueError
1493 Raised if a resolved `DatasetRef` was passed as an input, but it
1494 differs from the one found in the registry.
1495 TypeError
1496 Raised if no collections were provided.
1497 """
1498 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1499 return self.datastore.exists(ref)
1501 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1502 """Remove one or more `~CollectionType.RUN` collections and the
1503 datasets within them.
1505 Parameters
1506 ----------
1507 names : `Iterable` [ `str` ]
1508 The names of the collections to remove.
1509 unstore : `bool`, optional
1510 If `True` (default), delete datasets from all datastores in which
1511 they are present, and attempt to rollback the registry deletions if
1512 datastore deletions fail (which may not always be possible). If
1513 `False`, datastore records for these datasets are still removed,
1514 but any artifacts (e.g. files) will not be.
1516 Raises
1517 ------
1518 TypeError
1519 Raised if one or more collections are not of type
1520 `~CollectionType.RUN`.
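        Examples
        --------
        An illustrative sketch (the run name is a placeholder); this removes
        the collection, its datasets, and their artifacts::

            butler.removeRuns(["u/alice/DM-50000/a"])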
1521 """
1522 if not self.isWriteable():
1523 raise TypeError("Butler is read-only.")
1524 names = list(names)
1525 refs: List[DatasetRef] = []
1526 for name in names:
1527 collectionType = self.registry.getCollectionType(name)
1528 if collectionType is not CollectionType.RUN:
1529 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1530 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1531 with self.registry.transaction():
1532 if unstore:
1533 self.datastore.trash(refs)
1534 else:
1535 self.datastore.forget(refs)
1536 for name in names:
1537 self.registry.removeCollection(name)
1538 if unstore:
1539 # Point of no return for removing artifacts
1540 self.datastore.emptyTrash()
1542 def pruneCollection(
1543 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None
1544 ) -> None:
1545 """Remove a collection and possibly prune datasets within it.
1547 Parameters
1548 ----------
1549 name : `str`
1550 Name of the collection to remove. If this is a
1551 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1552 datasets within the collection are not modified unless ``unstore``
1553 is `True`. If this is a `~CollectionType.RUN` collection,
1554 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1555 are fully removed from the data repository.
1556 purge : `bool`, optional
1557 If `True`, permit `~CollectionType.RUN` collections to be removed,
1558 fully removing datasets within them. Requires ``unstore=True`` as
1559 well as an added precaution against accidental deletion. Must be
1560 `False` (default) if the collection is not a ``RUN``.
1561 unstore : `bool`, optional
1562 If `True`, remove all datasets in the collection from all
1563 datastores in which they appear.
1564 unlink : `list` [ `str` ], optional
1565 Before removing the given collection, unlink it from these
1566 parent collections.
1568 Raises
1569 ------
1570 TypeError
1571 Raised if the butler is read-only or arguments are mutually
1572 inconsistent.
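Examples
--------
Minimal usage sketches; the collection names are illustrative only::

    # Fully remove a RUN collection and the datasets within it.
    butler.pruneCollection("u/someone/old-run", purge=True, unstore=True)

    # Remove a TAGGED collection without touching its datasets.
    butler.pruneCollection("u/someone/tagged-selection")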
1573 """
1574 # See pruneDatasets comments for more information about the logic here;
1575 # the cases are almost the same, but here we can rely on Registry to
1576 take care of everything but Datastore deletion when we remove the
1577 # collection.
1578 if not self.isWriteable():
1579 raise TypeError("Butler is read-only.")
1580 collectionType = self.registry.getCollectionType(name)
1581 if purge and not unstore:
1582 raise PurgeWithoutUnstorePruneCollectionsError()
1583 if collectionType is CollectionType.RUN and not purge:
1584 raise RunWithoutPurgePruneCollectionsError(collectionType)
1585 if collectionType is not CollectionType.RUN and purge:
1586 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1588 def remove(child: str, parent: str) -> None:
1589 """Remove a child collection from a parent collection."""
1590 # Remove child from parent.
1591 chain = list(self.registry.getCollectionChain(parent))
1592 try:
1593 chain.remove(child)
1594 except ValueError as e:
1595 raise RuntimeError(f"{child} is not a child of {parent}") from e
1596 self.registry.setCollectionChain(parent, chain)
1598 with self.registry.transaction():
1599 if unlink:
1600 for parent in unlink:
1601 remove(name, parent)
1602 if unstore:
1603 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1604 self.datastore.trash(refs)
1605 self.registry.removeCollection(name)
1607 if unstore:
1608 # Point of no return for removing artifacts
1609 self.datastore.emptyTrash()
1611 def pruneDatasets(
1612 self,
1613 refs: Iterable[DatasetRef],
1614 *,
1615 disassociate: bool = True,
1616 unstore: bool = False,
1617 tags: Iterable[str] = (),
1618 purge: bool = False,
1619 run: Optional[str] = None,
1620 ) -> None:
1621 """Remove one or more datasets from a collection and/or storage.
1623 Parameters
1624 ----------
1625 refs : `~collections.abc.Iterable` of `DatasetRef`
1626 Datasets to prune. These must be "resolved" references (not just
1627 a `DatasetType` and data ID).
1628 disassociate : `bool`, optional
1629 Disassociate pruned datasets from ``tags``, or from all collections
1630 if ``purge=True``.
1631 unstore : `bool`, optional
1632 If `True` (`False` is default) remove these datasets from all
1633 datastores known to this butler. Note that this will make it
1634 impossible to retrieve these datasets even via other collections.
1635 Datasets that are already not stored are ignored by this option.
1636 tags : `Iterable` [ `str` ], optional
1637 `~CollectionType.TAGGED` collections to disassociate the datasets
1638 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1639 `True`.
1640 purge : `bool`, optional
1641 If `True` (`False` is default), completely remove the dataset from
1642 the `Registry`. To prevent accidental deletions, ``purge`` may
1643 only be `True` if all of the following conditions are met:
1645 - All given datasets are in the given run;
1646 - ``disassociate`` is `True`;
1647 - ``unstore`` is `True`.
1649 This mode may remove provenance information from datasets other
1650 than those provided, and should be used with extreme care.
1652 Raises
1653 ------
1654 TypeError
1655 Raised if the butler is read-only, if no collection was provided,
1656 or the conditions for ``purge=True`` were not met.
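Examples
--------
A minimal usage sketch; the dataset type and collection names are
illustrative only::

    refs = list(butler.registry.queryDatasets("calexp",
                                              collections="u/someone/tagged"))
    # Remove the datasets from the TAGGED collection only; nothing is
    # deleted from any datastore.
    butler.pruneDatasets(refs, disassociate=True, tags=["u/someone/tagged"])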
1657 """
1658 if not self.isWriteable():
1659 raise TypeError("Butler is read-only.")
1660 if purge:
1661 if not disassociate:
1662 raise TypeError("Cannot pass purge=True without disassociate=True.")
1663 if not unstore:
1664 raise TypeError("Cannot pass purge=True without unstore=True.")
1665 elif disassociate:
1666 tags = tuple(tags)
1667 if not tags:
1668 raise TypeError("No tags provided but disassociate=True.")
1669 for tag in tags:
1670 collectionType = self.registry.getCollectionType(tag)
1671 if collectionType is not CollectionType.TAGGED:
1672 raise TypeError(
1673 f"Cannot disassociate from collection '{tag}' "
1674 f"of non-TAGGED type {collectionType.name}."
1675 )
1676 # Transform possibly-single-pass iterable into something we can iterate
1677 # over multiple times.
1678 refs = list(refs)
1679 # Pruning a component of a DatasetRef makes no sense since registry
1680 # doesn't know about components and datastore might not store
1681 # components in a separate file
1682 for ref in refs:
1683 if ref.datasetType.component():
1684 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1685 # We don't need an unreliable Datastore transaction for this, because
1686 # we've been extra careful to ensure that Datastore.trash only involves
1687 # mutating the Registry (it can _look_ at Datastore-specific things,
1688 # but shouldn't change them), and hence all operations here are
1689 # Registry operations.
1690 with self.registry.transaction():
1691 if unstore:
1692 self.datastore.trash(refs)
1693 if purge:
1694 self.registry.removeDatasets(refs)
1695 elif disassociate:
1696 assert tags, "Guaranteed by earlier logic in this function."
1697 for tag in tags:
1698 self.registry.disassociate(tag, refs)
1699 # We've exited the Registry transaction, and apparently committed.
1700 # (if there was an exception, everything rolled back, and it's as if
1701 # nothing happened - and we never get here).
1702 # Datastore artifacts are not yet gone, but they're clearly marked
1703 # as trash, so if we fail to delete now because of (e.g.) filesystem
1704 # problems we can try again later, and if manual administrative
1705 # intervention is required, it's pretty clear what that should entail:
1706 # deleting everything on disk and in private Datastore tables that is
1707 # in the dataset_location_trash table.
1708 if unstore:
1709 # Point of no return for removing artifacts
1710 self.datastore.emptyTrash()
1712 @transactional
1713 def ingest(
1714 self,
1715 *datasets: FileDataset,
1716 transfer: Optional[str] = "auto",
1717 run: Optional[str] = None,
1718 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1719 ) -> None:
1720 """Store and register one or more datasets that already exist on disk.
1722 Parameters
1723 ----------
1724 datasets : `FileDataset`
1725 Each positional argument is a struct containing information about
1726 a file to be ingested, including its URI (either absolute or
1727 relative to the datastore root, if applicable), a `DatasetRef`,
1728 and optionally a formatter class or its fully-qualified string
1729 name. If a formatter is not provided, the formatter that would be
1730 used for `put` is assumed. On successful return, all
1731 `FileDataset.refs` attributes will have their `DatasetRef.id`
1732 attribute populated and all `FileDataset.formatter` attributes will
1733 be set to the formatter class used. `FileDataset.path` attributes
1734 may be modified to put paths in whatever the datastore considers a
1735 standardized form.
1736 transfer : `str`, optional
1737 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1738 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1739 transfer the file.
1740 run : `str`, optional
1741 The name of the run ingested datasets should be added to,
1742 overriding ``self.run``.
1743 idGenerationMode : `DatasetIdGenEnum`, optional
1744 Specifies option for generating dataset IDs. By default unique IDs
1745 are generated for each inserted dataset.
1747 Raises
1748 ------
1749 TypeError
1750 Raised if the butler is read-only or if no run was provided.
1751 NotImplementedError
1752 Raised if the `Datastore` does not support the given transfer mode.
1753 DatasetTypeNotSupportedError
1754 Raised if one or more files to be ingested have a dataset type that
1755 is not supported by the `Datastore`.
1756 FileNotFoundError
1757 Raised if one of the given files does not exist.
1758 FileExistsError
1759 Raised if transfer is not `None` but the (internal) location the
1760 file would be moved to is already occupied.
1762 Notes
1763 -----
1764 This operation is not fully exception safe: if a database operation
1765 fails, the given `FileDataset` instances may be only partially updated.
1767 It is atomic in terms of database operations (they will either all
1768 succeed or all fail) providing the database engine implements
1769 transactions correctly. It will attempt to be atomic in terms of
1770 filesystem operations as well, but this cannot be implemented
1771 rigorously for most datastores.
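Examples
--------
A minimal usage sketch; it assumes ``ref`` is an unresolved `DatasetRef`
describing the file, and the file path and run name below are
illustrative only::

    butler.ingest(FileDataset(path="/data/HSC-903334-016.fits", refs=[ref]),
                  transfer="copy", run="HSC/raw/all")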
1772 """
1773 if not self.isWriteable():
1774 raise TypeError("Butler is read-only.")
1775 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1776 # Reorganize the inputs so they're grouped by DatasetType and then
1777 # data ID. We also include a list of DatasetRefs for each FileDataset
1778 # to hold the resolved DatasetRefs returned by the Registry, before
1779 # it's safe to swap them into FileDataset.refs.
1780 # Some type annotation aliases to make that clearer:
1781 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1782 GroupedData = MutableMapping[DatasetType, GroupForType]
1783 # The actual data structure:
1784 groupedData: GroupedData = defaultdict(dict)
1785 # And the nested loop that populates it:
1786 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1787 # This list intentionally shared across the inner loop, since it's
1788 # associated with `dataset`.
1789 resolvedRefs: List[DatasetRef] = []
1791 # Somewhere to store pre-existing refs if we have an
1792 # execution butler.
1793 existingRefs: List[DatasetRef] = []
1795 for ref in dataset.refs:
1796 if ref.dataId in groupedData[ref.datasetType]:
1797 raise ConflictingDefinitionError(
1798 f"Ingest conflict. Dataset {dataset.path} has same"
1799 " DataId as other ingest dataset"
1800 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1801 f" ({ref.dataId})"
1802 )
1803 if self._allow_put_of_predefined_dataset:
1804 existing_ref = self.registry.findDataset(
1805 ref.datasetType, dataId=ref.dataId, collections=run
1806 )
1807 if existing_ref:
1808 if self.datastore.knows(existing_ref):
1809 raise ConflictingDefinitionError(
1810 f"Dataset associated with path {dataset.path}"
1811 f" already exists as {existing_ref}."
1812 )
1813 # Store this ref elsewhere since it already exists
1814 # and we do not want to remake it but we do want
1815 # to store it in the datastore.
1816 existingRefs.append(existing_ref)
1818 # Nothing else to do until we have finished
1819 # iterating.
1820 continue
1822 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1824 if existingRefs:
1826 if len(dataset.refs) != len(existingRefs):
1827 # Keeping track of partially pre-existing datasets is hard
1828 # and should generally never happen. For now don't allow
1829 # it.
1830 raise ConflictingDefinitionError(
1831 f"For dataset {dataset.path} some dataIds already exist"
1832 " in registry but others do not. This is not supported."
1833 )
1835 # Attach the resolved refs if we found them.
1836 dataset.refs = existingRefs
1838 # Now we can bulk-insert into Registry for each DatasetType.
1839 for datasetType, groupForType in progress.iter_item_chunks(
1840 groupedData.items(), desc="Bulk-inserting datasets by type"
1841 ):
1842 refs = self.registry.insertDatasets(
1843 datasetType,
1844 dataIds=groupForType.keys(),
1845 run=run,
1846 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1847 idGenerationMode=idGenerationMode,
1848 )
1849 # Append those resolved DatasetRefs to the new lists we set up for
1850 # them.
1851 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1852 resolvedRefs.append(ref)
1854 # Go back to the original FileDatasets to replace their refs with the
1855 # new resolved ones.
1856 for groupForType in progress.iter_chunks(
1857 groupedData.values(), desc="Reassociating resolved dataset refs with files"
1858 ):
1859 for dataset, resolvedRefs in groupForType.values():
1860 dataset.refs = resolvedRefs
1862 # Bulk-insert everything into Datastore.
1863 self.datastore.ingest(*datasets, transfer=transfer)
1865 @contextlib.contextmanager
1866 def export(
1867 self,
1868 *,
1869 directory: Optional[str] = None,
1870 filename: Optional[str] = None,
1871 format: Optional[str] = None,
1872 transfer: Optional[str] = None,
1873 ) -> Iterator[RepoExportContext]:
1874 """Export datasets from the repository represented by this `Butler`.
1876 This method is a context manager that returns a helper object
1877 (`RepoExportContext`) that is used to indicate what information from
1878 the repository should be exported.
1880 Parameters
1881 ----------
1882 directory : `str`, optional
1883 Directory dataset files should be written to if ``transfer`` is not
1884 `None`.
1885 filename : `str`, optional
1886 Name for the file that will include database information associated
1887 with the exported datasets. If this is not an absolute path and
1888 ``directory`` is not `None`, it will be written to ``directory``
1889 instead of the current working directory. Defaults to
1890 "export.{format}".
1891 format : `str`, optional
1892 File format for the database information file. If `None`, the
1893 extension of ``filename`` will be used.
1894 transfer : `str`, optional
1895 Transfer mode passed to `Datastore.export`.
1897 Raises
1898 ------
1899 TypeError
1900 Raised if the set of arguments passed is inconsistent.
1902 Examples
1903 --------
1904 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1905 methods are used to provide the iterables over data IDs and/or datasets
1906 to be exported::
1908 with butler.export(filename="exports.yaml") as export:
1909 # Export all flats, but none of the dimension element rows
1910 # (i.e. data ID information) associated with them.
1911 export.saveDatasets(butler.registry.queryDatasets("flat"),
1912 elements=())
1913 # Export all datasets that start with "deepCoadd_" and all of
1914 # their associated data ID information.
1915 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1916 """
1917 if directory is None and transfer is not None:
1918 raise TypeError("Cannot transfer without providing a directory.")
1919 if transfer == "move":
1920 raise TypeError("Transfer may not be 'move': export is read-only")
1921 if format is None:
1922 if filename is None:
1923 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1924 else:
1925 _, format = os.path.splitext(filename)
1926 elif filename is None:
1927 filename = f"export.{format}"
1928 if directory is not None:
1929 filename = os.path.join(directory, filename)
1930 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
1931 with open(filename, "w") as stream:
1932 backend = BackendClass(stream)
1933 try:
1934 helper = RepoExportContext(
1935 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer
1936 )
1937 yield helper
1938 except BaseException:
1939 raise
1940 else:
1941 helper._finish()
1943 def import_(
1944 self,
1945 *,
1946 directory: Optional[str] = None,
1947 filename: Union[str, TextIO, None] = None,
1948 format: Optional[str] = None,
1949 transfer: Optional[str] = None,
1950 skip_dimensions: Optional[Set] = None,
1951 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1952 reuseIds: bool = False,
1953 ) -> None:
1954 """Import datasets into this repository that were exported from a
1955 different butler repository via `~lsst.daf.butler.Butler.export`.
1957 Parameters
1958 ----------
1959 directory : `str`, optional
1960 Directory containing dataset files to import from. If `None`,
1961 ``filename`` and all dataset file paths specified therein must
1962 be absolute.
1963 filename : `str` or `TextIO`, optional
1964 A stream or name of file that contains database information
1965 associated with the exported datasets, typically generated by
1966 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1967 is not an absolute path, does not exist in the current working
1968 directory, and ``directory`` is not `None`, it is assumed to be in
1969 ``directory``. Defaults to "export.{format}".
1970 format : `str`, optional
1971 File format for ``filename``. If `None`, the extension of
1972 ``filename`` will be used.
1973 transfer : `str`, optional
1974 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1975 skip_dimensions : `set`, optional
1976 Names of dimensions that should be skipped and not imported.
1977 idGenerationMode : `DatasetIdGenEnum`, optional
1978 Specifies option for generating dataset IDs when IDs are not
1979 provided or their type does not match backend type. By default
1980 unique IDs are generated for each inserted dataset.
1981 reuseIds : `bool`, optional
1982 If `True`, force re-use of imported dataset IDs for integer
1983 IDs, which are normally generated as auto-incremented; an exception
1984 will be raised if imported IDs clash with existing ones. This
1985 option has no effect on the use of globally-unique IDs which are
1986 always re-used (or generated if integer IDs are being imported).
1988 Raises
1989 ------
1990 TypeError
1991 Raised if the set of arguments passed is inconsistent, or if the
1992 butler is read-only.
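Examples
--------
A minimal usage sketch; the directory and file names are illustrative
only::

    butler.import_(directory="/data/exports",
                   filename="export.yaml",
                   transfer="symlink")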
1993 """
1994 if not self.isWriteable():
1995 raise TypeError("Butler is read-only.")
1996 if format is None:
1997 if filename is None:
1998 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1999 else:
2000 _, format = os.path.splitext(filename) # type: ignore
2001 elif filename is None:
2002 filename = f"export.{format}"
2003 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
2004 filename = os.path.join(directory, filename)
2005 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
2007 def doImport(importStream: TextIO) -> None:
2008 backend = BackendClass(importStream, self.registry)
2009 backend.register()
2010 with self.transaction():
2011 backend.load(
2012 self.datastore,
2013 directory=directory,
2014 transfer=transfer,
2015 skip_dimensions=skip_dimensions,
2016 idGenerationMode=idGenerationMode,
2017 reuseIds=reuseIds,
2018 )
2020 if isinstance(filename, str):
2021 with open(filename, "r") as stream:
2022 doImport(stream)
2023 else:
2024 doImport(filename)
2026 def transfer_from(
2027 self,
2028 source_butler: Butler,
2029 source_refs: Iterable[DatasetRef],
2030 transfer: str = "auto",
2031 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
2032 skip_missing: bool = True,
2033 register_dataset_types: bool = False,
2034 ) -> List[DatasetRef]:
2035 """Transfer datasets to this Butler from a run in another Butler.
2037 Parameters
2038 ----------
2039 source_butler : `Butler`
2040 Butler from which the datasets are to be transferred.
2041 source_refs : iterable of `DatasetRef`
2042 Datasets defined in the source butler that should be transferred to
2043 this butler.
2044 transfer : `str`, optional
2045 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
2046 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
2047 A mapping of dataset type to ID generation mode. Only used if
2048 the source butler is using integer IDs. Should not be used
2049 if this receiving butler uses integer IDs. Without this mapping,
2050 dataset import always uses unique ID generation.
2051 skip_missing : `bool`
2052 If `True`, datasets with no datastore artifact associated with
2053 them are not transferred. If `False` a registry entry will be
2054 created even if no datastore record is created (and so will
2055 look equivalent to the dataset being unstored).
2056 register_dataset_types : `bool`
2057 If `True` any missing dataset types are registered. Otherwise
2058 an exception is raised.
2060 Returns
2061 -------
2062 refs : `list` of `DatasetRef`
2063 The refs added to this Butler.
2065 Notes
2066 -----
2067 Requires that any dimension definitions are already present in the
2068 receiving Butler. The datastore artifact has to exist for a transfer
2069 to be made but non-existence is not an error.
2071 Datasets that already exist in this run will be skipped.
2073 The datasets are imported as part of a transaction, although
2074 dataset types are registered before the transaction is started.
2075 This means that it is possible for a dataset type to be registered
2076 even though transfer has failed.
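Examples
--------
A minimal usage sketch; the source repository path, dataset type, and
collection name are illustrative only::

    source_butler = Butler("/repo/source")
    refs = source_butler.registry.queryDatasets("calexp",
                                                collections="HSC/runs/RC2")
    butler.transfer_from(source_butler, refs, transfer="copy",
                         register_dataset_types=True)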
2077 """
2078 if not self.isWriteable():
2079 raise TypeError("Butler is read-only.")
2080 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2082 # Will iterate through the refs multiple times so need to convert
2083 # to a list if this isn't a collection.
2084 if not isinstance(source_refs, collections.abc.Collection):
2085 source_refs = list(source_refs)
2087 original_count = len(source_refs)
2088 log.info("Transferring %d datasets into %s", original_count, str(self))
2090 if id_gen_map is None:
2091 id_gen_map = {}
2093 # In some situations the datastore artifact may be missing
2094 # and we do not want that registry entry to be imported.
2095 # Asking the datastore is not sufficient: the records may have been
2096 # purged, so we have to ask for the (predicted) URI and check
2097 # existence explicitly. Execution butler is set up exactly like
2098 # this with no datastore records.
2099 artifact_existence: Dict[ResourcePath, bool] = {}
2100 if skip_missing:
2101 dataset_existence = source_butler.datastore.mexists(
2102 source_refs, artifact_existence=artifact_existence
2103 )
2104 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2105 filtered_count = len(source_refs)
2106 log.verbose(
2107 "%d datasets removed because the artifact does not exist. Now have %d.",
2108 original_count - filtered_count,
2109 filtered_count,
2110 )
2112 # Importing requires that we group the refs by dataset type and run
2113 # before doing the import.
2114 source_dataset_types = set()
2115 grouped_refs = defaultdict(list)
2116 grouped_indices = defaultdict(list)
2117 for i, ref in enumerate(source_refs):
2118 grouped_refs[ref.datasetType, ref.run].append(ref)
2119 grouped_indices[ref.datasetType, ref.run].append(i)
2120 source_dataset_types.add(ref.datasetType)
2122 # Check to see if the dataset type in the source butler has
2123 # the same definition in the target butler and register missing
2124 # ones if requested. Registration must happen outside a transaction.
2125 newly_registered_dataset_types = set()
2126 for datasetType in source_dataset_types:
2127 if register_dataset_types:
2128 # Let this raise immediately if inconsistent. Continuing
2129 # on to find additional inconsistent dataset types
2130 # might result in additional unwanted dataset types being
2131 # registered.
2132 if self.registry.registerDatasetType(datasetType):
2133 newly_registered_dataset_types.add(datasetType)
2134 else:
2135 # If the dataset type is missing, let it fail immediately.
2136 target_dataset_type = self.registry.getDatasetType(datasetType.name)
2137 if target_dataset_type != datasetType:
2138 raise ConflictingDefinitionError(
2139 "Source butler dataset type differs from definition"
2140 f" in target butler: {datasetType} !="
2141 f" {target_dataset_type}"
2142 )
2143 if newly_registered_dataset_types:
2144 # We may have registered some even if there were inconsistencies
2145 # but should let people know (or else remove them again).
2146 log.log(
2147 VERBOSE,
2148 "Registered the following dataset types in the target Butler: %s",
2149 ", ".join(d.name for d in newly_registered_dataset_types),
2150 )
2151 else:
2152 log.log(VERBOSE, "All required dataset types are known to the target Butler")
2154 # The returned refs should be identical for UUIDs.
2155 # For now we must also support integers and so need to retain the
2156 # newly-created refs from this registry.
2157 # Pre-size it so we can assign refs into the correct slots
2158 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
2159 default_id_gen = DatasetIdGenEnum.UNIQUE
2161 handled_collections: Set[str] = set()
2163 # Do all the importing in a single transaction.
2164 with self.transaction():
2165 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2166 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2167 ):
2168 if run not in handled_collections:
2169 run_doc = source_butler.registry.getCollectionDocumentation(run)
2170 registered = self.registry.registerRun(run, doc=run_doc)
2171 handled_collections.add(run)
2172 if registered:
2173 log.log(VERBOSE, "Creating output run %s", run)
2175 id_generation_mode = default_id_gen
2176 if isinstance(refs_to_import[0].id, int):
2177 # ID generation mode might need to be overridden when
2178 # targeting UUIDs.
2179 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
2181 n_refs = len(refs_to_import)
2182 log.verbose(
2183 "Importing %d ref%s of dataset type %s into run %s",
2184 n_refs,
2185 "" if n_refs == 1 else "s",
2186 datasetType.name,
2187 run,
2188 )
2190 # No way to know if this butler's registry uses UUID.
2191 # We have to trust the caller on this. If it fails they will
2192 # have to change their approach. We can't catch the exception
2193 # and retry with unique because that will mess up the
2194 # transaction handling. We aren't allowed to ask the registry
2195 # manager what type of ID it is using.
2196 imported_refs = self.registry._importDatasets(
2197 refs_to_import, idGenerationMode=id_generation_mode, expand=False
2198 )
2200 # Map them into the correct slots to match the initial order
2201 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
2202 transferred_refs_tmp[i] = ref
2204 # Mypy insists that we might have None in here so we have to make
2205 # that explicit by assigning to a new variable and filtering out
2206 # something that won't be there.
2207 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
2209 # Check consistency
2210 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
2212 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
2214 # The transferred refs need to be reordered to match the original
2215 # ordering given by the caller. Without this the datastore transfer
2216 # will be broken.
2218 # Ask the datastore to transfer. The datastore has to check that
2219 # the source datastore is compatible with the target datastore.
2220 self.datastore.transfer_from(
2221 source_butler.datastore,
2222 source_refs,
2223 local_refs=transferred_refs,
2224 transfer=transfer,
2225 artifact_existence=artifact_existence,
2226 )
2228 return transferred_refs
2230 def validateConfiguration(
2231 self,
2232 logFailures: bool = False,
2233 datasetTypeNames: Optional[Iterable[str]] = None,
2234 ignore: Optional[Iterable[str]] = None,
2235 ) -> None:
2236 """Validate butler configuration.
2238 Checks that each `DatasetType` can be stored in the `Datastore`.
2240 Parameters
2241 ----------
2242 logFailures : `bool`, optional
2243 If `True`, output a log message for every validation error
2244 detected.
2245 datasetTypeNames : iterable of `str`, optional
2246 The `DatasetType` names that should be checked. This allows
2247 only a subset to be selected.
2248 ignore : iterable of `str`, optional
2249 Names of DatasetTypes to skip over. This can be used to skip
2250 known problems. If a named `DatasetType` corresponds to a
2251 composite, all components of that `DatasetType` will also be
2252 ignored.
2254 Raises
2255 ------
2256 ButlerValidationError
2257 Raised if there is some inconsistency with how this Butler
2258 is configured.
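Examples
--------
A minimal usage sketch; the ignored dataset type name is illustrative
only::

    butler.validateConfiguration(logFailures=True, ignore=["packages"])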
2259 """
2260 if datasetTypeNames:
2261 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2262 else:
2263 datasetTypes = list(self.registry.queryDatasetTypes())
2265 # filter out anything from the ignore list
2266 if ignore:
2267 ignore = set(ignore)
2268 datasetTypes = [
2269 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2270 ]
2271 else:
2272 ignore = set()
2274 # Find all the registered instruments
2275 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument"))
2277 # For each datasetType that has an instrument dimension, create
2278 # a DatasetRef for each defined instrument
2279 datasetRefs = []
2281 for datasetType in datasetTypes:
2282 if "instrument" in datasetType.dimensions:
2283 for instrument in instruments:
2284 datasetRef = DatasetRef(
2285 datasetType, {"instrument": instrument}, conform=False # type: ignore
2286 )
2287 datasetRefs.append(datasetRef)
2289 entities: List[Union[DatasetType, DatasetRef]] = []
2290 entities.extend(datasetTypes)
2291 entities.extend(datasetRefs)
2293 datastoreErrorStr = None
2294 try:
2295 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2296 except ValidationError as e:
2297 datastoreErrorStr = str(e)
2299 # Also check that the LookupKeys used by the datastores match
2300 # registry and storage class definitions
2301 keys = self.datastore.getLookupKeys()
2303 failedNames = set()
2304 failedDataId = set()
2305 for key in keys:
2306 if key.name is not None:
2307 if key.name in ignore:
2308 continue
2310 # skip if specific datasetType names were requested and this
2311 # name does not match
2312 if datasetTypeNames and key.name not in datasetTypeNames:
2313 continue
2315 # See if it is a StorageClass or a DatasetType
2316 if key.name in self.storageClasses:
2317 pass
2318 else:
2319 try:
2320 self.registry.getDatasetType(key.name)
2321 except KeyError:
2322 if logFailures:
2323 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2324 failedNames.add(key)
2325 else:
2326 # Dimensions are checked for consistency when the Butler
2327 # is created and rendezvoused with a universe.
2328 pass
2330 # Check that the instrument is a valid instrument
2331 # Currently only support instrument so check for that
2332 if key.dataId:
2333 dataIdKeys = set(key.dataId)
2334 if dataIdKeys != {"instrument"}:
2335 if logFailures:
2336 log.critical("Key '%s' has unsupported DataId override", key)
2337 failedDataId.add(key)
2338 elif key.dataId["instrument"] not in instruments:
2339 if logFailures:
2340 log.critical("Key '%s' has unknown instrument", key)
2341 failedDataId.add(key)
2343 messages = []
2345 if datastoreErrorStr:
2346 messages.append(datastoreErrorStr)
2348 for failed, msg in (
2349 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2350 (failedDataId, "Keys with bad DataId entries: "),
2351 ):
2352 if failed:
2353 msg += ", ".join(str(k) for k in failed)
2354 messages.append(msg)
2356 if messages:
2357 raise ValidationError(";\n".join(messages))
2359 @property
2360 def collections(self) -> CollectionSearch:
2361 """The collections to search by default, in order (`CollectionSearch`).
2363 This is an alias for ``self.registry.defaults.collections``. It cannot
2364 be set directly in isolation, but all defaults may be changed together
2365 by assigning a new `RegistryDefaults` instance to
2366 ``self.registry.defaults``.
2367 """
2368 return self.registry.defaults.collections
2370 @property
2371 def run(self) -> Optional[str]:
2372 """Name of the run this butler writes outputs to by default (`str` or
2373 `None`).
2375 This is an alias for ``self.registry.defaults.run``. It cannot be set
2376 directly in isolation, but all defaults may be changed together by
2377 assigning a new `RegistryDefaults` instance to
2378 ``self.registry.defaults``.
2379 """
2380 return self.registry.defaults.run
2382 registry: Registry
2383 """The object that manages dataset metadata and relationships (`Registry`).
2385 Most operations that don't involve reading or writing butler datasets are
2386 accessible only via `Registry` methods.
2387 """
2389 datastore: Datastore
2390 """The object that manages actual dataset storage (`Datastore`).
2392 Direct user access to the datastore should rarely be necessary; the primary
2393 exception is the case where a `Datastore` implementation provides extra
2394 functionality beyond what the base class defines.
2395 """
2397 storageClasses: StorageClassFactory
2398 """An object that maps known storage class names to objects that fully
2399 describe them (`StorageClassFactory`).
2400 """
2402 _allow_put_of_predefined_dataset: bool
2403 """Allow a put to succeed even if there is already a registry entry for it
2404 but not a datastore record. (`bool`)."""