Coverage for python/lsst/daf/butler/direct_butler.py: 10%
753 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-05 10:00 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-05 10:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Butler top level classes.
29"""
30from __future__ import annotations
32__all__ = (
33 "DirectButler",
34 "ButlerValidationError",
35)
37import collections.abc
38import contextlib
39import io
40import itertools
41import logging
42import numbers
43import os
44import warnings
45from collections import Counter, defaultdict
46from collections.abc import Iterable, Iterator, MutableMapping, Sequence
47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast
49from lsst.resources import ResourcePath, ResourcePathExpression
50from lsst.utils.introspection import get_class_of
51from lsst.utils.iteration import ensure_iterable
52from lsst.utils.logging import VERBOSE, getLogger
53from sqlalchemy.exc import IntegrityError
55from ._butler import Butler
56from ._butler_config import ButlerConfig
57from ._butler_instance_options import ButlerInstanceOptions
58from ._dataset_existence import DatasetExistence
59from ._dataset_ref import DatasetRef
60from ._dataset_type import DatasetType
61from ._deferredDatasetHandle import DeferredDatasetHandle
62from ._exceptions import DatasetNotFoundError, DimensionValueError, ValidationError
63from ._limited_butler import LimitedButler
64from ._registry_shim import RegistryShim
65from ._storage_class import StorageClass, StorageClassFactory
66from ._timespan import Timespan
67from .datastore import Datastore, NullDatastore
68from .dimensions import DataCoordinate, Dimension
69from .progress import Progress
70from .queries import Query
71from .registry import (
72 CollectionType,
73 ConflictingDefinitionError,
74 DataIdError,
75 MissingDatasetTypeError,
76 RegistryDefaults,
77 _RegistryFactory,
78)
79from .registry.sql_registry import SqlRegistry
80from .transfers import RepoExportContext
81from .utils import transactional
83if TYPE_CHECKING:
84 from lsst.resources import ResourceHandleProtocol
86 from ._dataset_ref import DatasetId
87 from ._file_dataset import FileDataset
88 from .datastore import DatasetRefURIs
89 from .dimensions import DataId, DataIdValue, DimensionElement, DimensionRecord, DimensionUniverse
90 from .registry import Registry
91 from .transfers import RepoImportBackend
93_LOG = getLogger(__name__)
96class ButlerValidationError(ValidationError):
97 """There is a problem with the Butler configuration."""
99 pass
102class DirectButler(Butler): # numpydoc ignore=PR02
103 """Main entry point for the data access system.
105 Parameters
106 ----------
107 config : `ButlerConfig`
108 The configuration for this Butler instance.
109 registry : `SqlRegistry`
110 The object that manages dataset metadata and relationships.
111 datastore : Datastore
112 The object that manages actual dataset storage.
113 storageClasses : StorageClassFactory
114 An object that maps known storage class names to objects that fully
115 describe them.
117 Notes
118 -----
119 Most users should call the top-level `Butler`.``from_config`` instead of
120 using this constructor directly.
121 """
123 # This is __new__ instead of __init__ because we have to support
124 # instantiation via the legacy constructor Butler.__new__(), which
125 # reads the configuration and selects which subclass to instantiate. The
126 # interaction between __new__ and __init__ is kind of wacky in Python. If
127 # we were using __init__ here, __init__ would be called twice (once when
128 # the DirectButler instance is constructed inside Butler.from_config(), and
129 # a second time with the original arguments to Butler() when the instance
130 # is returned from Butler.__new__()
131 def __new__(
132 cls,
133 *,
134 config: ButlerConfig,
135 registry: SqlRegistry,
136 datastore: Datastore,
137 storageClasses: StorageClassFactory,
138 ) -> DirectButler:
139 self = cast(DirectButler, super().__new__(cls))
140 self._config = config
141 self._registry = registry
142 self._datastore = datastore
143 self.storageClasses = storageClasses
145 # For execution butler the datastore needs a special
146 # dependency-inversion trick. This is not used by regular butler,
147 # but we do not have a way to distinguish regular butler from execution
148 # butler.
149 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
151 self._registry_shim = RegistryShim(self)
153 return self
155 @classmethod
156 def create_from_config(
157 cls,
158 config: ButlerConfig,
159 *,
160 options: ButlerInstanceOptions,
161 without_datastore: bool = False,
162 ) -> DirectButler:
163 """Construct a Butler instance from a configuration file.
165 Parameters
166 ----------
167 config : `ButlerConfig`
168 The configuration for this Butler instance.
169 options : `ButlerInstanceOptions`
170 Default values and other settings for the Butler instance.
171 without_datastore : `bool`, optional
172 If `True` do not attach a datastore to this butler. Any attempts
173 to use a datastore will fail.
175 Notes
176 -----
177 Most users should call the top-level `Butler`.``from_config``
178 instead of using this function directly.
179 """
180 if "run" in config or "collection" in config:
181 raise ValueError("Passing a run or collection via configuration is no longer supported.")
183 defaults = RegistryDefaults(
184 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs
185 )
186 try:
187 butlerRoot = config.get("root", config.configDir)
188 writeable = options.writeable
189 if writeable is None:
190 writeable = options.run is not None
191 registry = _RegistryFactory(config).from_config(
192 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
193 )
194 if without_datastore:
195 datastore: Datastore = NullDatastore(None, None)
196 else:
197 datastore = Datastore.fromConfig(
198 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
199 )
200 # TODO: Once datastore drops dependency on registry we can
201 # construct datastore first and pass opaque tables to registry
202 # constructor.
203 registry.make_datastore_tables(datastore.get_opaque_table_definitions())
204 storageClasses = StorageClassFactory()
205 storageClasses.addFromConfig(config)
207 return DirectButler(
208 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses
209 )
210 except Exception:
211 # Failures here usually mean that configuration is incomplete,
212 # just issue an error message which includes config file URI.
213 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.")
214 raise
216 def _clone(
217 self,
218 *,
219 collections: Any = None,
220 run: str | None = None,
221 inferDefaults: bool = True,
222 **kwargs: Any,
223 ) -> DirectButler:
224 # Docstring inherited
225 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
226 registry = self._registry.copy(defaults)
228 return DirectButler(
229 registry=registry,
230 config=self._config,
231 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()),
232 storageClasses=self.storageClasses,
233 )
235 GENERATION: ClassVar[int] = 3
236 """This is a Generation 3 Butler.
238 This attribute may be removed in the future, once the Generation 2 Butler
239 interface has been fully retired; it should only be used in transitional
240 code.
241 """
243 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
244 """Return DatasetType defined in registry given dataset type name."""
245 try:
246 return self.get_dataset_type(name)
247 except MissingDatasetTypeError:
248 return None
250 @classmethod
251 def _unpickle(
252 cls,
253 config: ButlerConfig,
254 collections: tuple[str, ...] | None,
255 run: str | None,
256 defaultDataId: dict[str, str],
257 writeable: bool,
258 ) -> DirectButler:
259 """Callable used to unpickle a Butler.
261 We prefer not to use ``Butler.__init__`` directly so we can force some
262 of its many arguments to be keyword-only (note that ``__reduce__``
263 can only invoke callables with positional arguments).
265 Parameters
266 ----------
267 config : `ButlerConfig`
268 Butler configuration, already coerced into a true `ButlerConfig`
269 instance (and hence after any search paths for overrides have been
270 utilized).
271 collections : `tuple` [ `str` ]
272 Names of the default collections to read from.
273 run : `str`, optional
274 Name of the default `~CollectionType.RUN` collection to write to.
275 defaultDataId : `dict` [ `str`, `str` ]
276 Default data ID values.
277 writeable : `bool`
278 Whether the Butler should support write operations.
280 Returns
281 -------
282 butler : `Butler`
283 A new `Butler` instance.
284 """
285 return cls.create_from_config(
286 config=config,
287 options=ButlerInstanceOptions(
288 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId
289 ),
290 )
292 def __reduce__(self) -> tuple:
293 """Support pickling."""
294 return (
295 DirectButler._unpickle,
296 (
297 self._config,
298 self.collections,
299 self.run,
300 dict(self._registry.defaults.dataId.required),
301 self._registry.isWriteable(),
302 ),
303 )
305 def __str__(self) -> str:
306 return (
307 f"Butler(collections={self.collections}, run={self.run}, "
308 f"datastore='{self._datastore}', registry='{self._registry}')"
309 )
311 def isWriteable(self) -> bool:
312 # Docstring inherited.
313 return self._registry.isWriteable()
315 def _caching_context(self) -> contextlib.AbstractContextManager[None]:
316 """Context manager that enables caching."""
317 return self._registry.caching_context()
319 @contextlib.contextmanager
320 def transaction(self) -> Iterator[None]:
321 """Context manager supporting `Butler` transactions.
323 Transactions can be nested.
324 """
325 with self._registry.transaction(), self._datastore.transaction():
326 yield
328 def _standardizeArgs(
329 self,
330 datasetRefOrType: DatasetRef | DatasetType | str,
331 dataId: DataId | None = None,
332 for_put: bool = True,
333 **kwargs: Any,
334 ) -> tuple[DatasetType, DataId | None]:
335 """Standardize the arguments passed to several Butler APIs.
337 Parameters
338 ----------
339 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
340 When `DatasetRef` the `dataId` should be `None`.
341 Otherwise the `DatasetType` or name thereof.
342 dataId : `dict` or `DataCoordinate`
343 A `dict` of `Dimension` link name, value pairs that label the
344 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
345 should be provided as the second argument.
346 for_put : `bool`, optional
347 If `True` this call is invoked as part of a `Butler.put()`.
348 Otherwise it is assumed to be part of a `Butler.get()`. This
349 parameter is only relevant if there is dataset type
350 inconsistency.
351 **kwargs
352 Additional keyword arguments used to augment or construct a
353 `DataCoordinate`. See `DataCoordinate.standardize`
354 parameters.
356 Returns
357 -------
358 datasetType : `DatasetType`
359 A `DatasetType` instance extracted from ``datasetRefOrType``.
360 dataId : `dict` or `DataId`, optional
361 Argument that can be used (along with ``kwargs``) to construct a
362 `DataId`.
364 Notes
365 -----
366 Butler APIs that conceptually need a DatasetRef also allow passing a
367 `DatasetType` (or the name of one) and a `DataId` (or a dict and
368 keyword arguments that can be used to construct one) separately. This
369 method accepts those arguments and always returns a true `DatasetType`
370 and a `DataId` or `dict`.
372 Standardization of `dict` vs `DataId` is best handled by passing the
373 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
374 generally similarly flexible.
375 """
376 externalDatasetType: DatasetType | None = None
377 internalDatasetType: DatasetType | None = None
378 if isinstance(datasetRefOrType, DatasetRef):
379 if dataId is not None or kwargs:
380 raise ValueError("DatasetRef given, cannot use dataId as well")
381 externalDatasetType = datasetRefOrType.datasetType
382 dataId = datasetRefOrType.dataId
383 else:
384 # Don't check whether DataId is provided, because Registry APIs
385 # can usually construct a better error message when it wasn't.
386 if isinstance(datasetRefOrType, DatasetType):
387 externalDatasetType = datasetRefOrType
388 else:
389 internalDatasetType = self.get_dataset_type(datasetRefOrType)
391 # Check that they are self-consistent
392 if externalDatasetType is not None:
393 internalDatasetType = self.get_dataset_type(externalDatasetType.name)
394 if externalDatasetType != internalDatasetType:
395 # We can allow differences if they are compatible, depending
396 # on whether this is a get or a put. A get requires that
397 # the python type associated with the datastore can be
398 # converted to the user type. A put requires that the user
399 # supplied python type can be converted to the internal
400 # type expected by registry.
401 relevantDatasetType = internalDatasetType
402 if for_put:
403 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
404 else:
405 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
406 relevantDatasetType = externalDatasetType
407 if not is_compatible:
408 raise ValueError(
409 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
410 f"registry definition ({internalDatasetType})"
411 )
412 # Override the internal definition.
413 internalDatasetType = relevantDatasetType
415 assert internalDatasetType is not None
416 return internalDatasetType, dataId
418 def _rewrite_data_id(
419 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
420 ) -> tuple[DataId | None, dict[str, Any]]:
421 """Rewrite a data ID taking into account dimension records.
423 Take a Data ID and keyword args and rewrite it if necessary to
424 allow the user to specify dimension records rather than dimension
425 primary values.
427 This allows a user to include a dataId dict with keys of
428 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
429 the integer exposure ID. It also allows a string to be given
430 for a dimension value rather than the integer ID if that is more
431 convenient. For example, rather than having to specifying the
432 detector with ``detector.full_name``, a string given for ``detector``
433 will be interpreted as the full name and converted to the integer
434 value.
436 Keyword arguments can also use strings for dimensions like detector
437 and exposure but python does not allow them to include ``.`` and
438 so the ``exposure.day_obs`` syntax can not be used in a keyword
439 argument.
441 Parameters
442 ----------
443 dataId : `dict` or `DataCoordinate`
444 A `dict` of `Dimension` link name, value pairs that will label the
445 `DatasetRef` within a Collection.
446 datasetType : `DatasetType`
447 The dataset type associated with this dataId. Required to
448 determine the relevant dimensions.
449 **kwargs
450 Additional keyword arguments used to augment or construct a
451 `DataId`. See `DataId` parameters.
453 Returns
454 -------
455 dataId : `dict` or `DataCoordinate`
456 The, possibly rewritten, dataId. If given a `DataCoordinate` and
457 no keyword arguments, the original dataId will be returned
458 unchanged.
459 **kwargs : `dict`
460 Any unused keyword arguments (would normally be empty dict).
461 """
462 # Do nothing if we have a standalone DataCoordinate.
463 if isinstance(dataId, DataCoordinate) and not kwargs:
464 return dataId, kwargs
466 # Process dimension records that are using record information
467 # rather than ids
468 newDataId: dict[str, DataIdValue] = {}
469 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
471 # if all the dataId comes from keyword parameters we do not need
472 # to do anything here because they can't be of the form
473 # exposure.obs_id because a "." is not allowed in a keyword parameter.
474 if dataId:
475 for k, v in dataId.items():
476 # If we have a Dimension we do not need to do anything
477 # because it cannot be a compound key.
478 if isinstance(k, str) and "." in k:
479 # Someone is using a more human-readable dataId
480 dimensionName, record = k.split(".", 1)
481 byRecord[dimensionName][record] = v
482 elif isinstance(k, Dimension):
483 newDataId[k.name] = v
484 else:
485 newDataId[k] = v
487 # Go through the updated dataId and check the type in case someone is
488 # using an alternate key. We have already filtered out the compound
489 # keys dimensions.record format.
490 not_dimensions = {}
492 # Will need to look in the dataId and the keyword arguments
493 # and will remove them if they need to be fixed or are unrecognized.
494 for dataIdDict in (newDataId, kwargs):
495 # Use a list so we can adjust the dict safely in the loop
496 for dimensionName in list(dataIdDict):
497 value = dataIdDict[dimensionName]
498 try:
499 dimension = self.dimensions.dimensions[dimensionName]
500 except KeyError:
501 # This is not a real dimension
502 not_dimensions[dimensionName] = value
503 del dataIdDict[dimensionName]
504 continue
506 # Convert an integral type to an explicit int to simplify
507 # comparisons here
508 if isinstance(value, numbers.Integral):
509 value = int(value)
511 if not isinstance(value, dimension.primaryKey.getPythonType()):
512 for alternate in dimension.alternateKeys:
513 if isinstance(value, alternate.getPythonType()):
514 byRecord[dimensionName][alternate.name] = value
515 del dataIdDict[dimensionName]
516 _LOG.debug(
517 "Converting dimension %s to %s.%s=%s",
518 dimensionName,
519 dimensionName,
520 alternate.name,
521 value,
522 )
523 break
524 else:
525 _LOG.warning(
526 "Type mismatch found for value '%r' provided for dimension %s. "
527 "Could not find matching alternative (primary key has type %s) "
528 "so attempting to use as-is.",
529 value,
530 dimensionName,
531 dimension.primaryKey.getPythonType(),
532 )
534 # By this point kwargs and newDataId should only include valid
535 # dimensions. Merge kwargs in to the new dataId and log if there
536 # are dimensions in both (rather than calling update).
537 for k, v in kwargs.items():
538 if k in newDataId and newDataId[k] != v:
539 _LOG.debug(
540 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
541 )
542 newDataId[k] = v
543 # No need to retain any values in kwargs now.
544 kwargs = {}
546 # If we have some unrecognized dimensions we have to try to connect
547 # them to records in other dimensions. This is made more complicated
548 # by some dimensions having records with clashing names. A mitigation
549 # is that we can tell by this point which dimensions are missing
550 # for the DatasetType but this does not work for calibrations
551 # where additional dimensions can be used to constrain the temporal
552 # axis.
553 if not_dimensions:
554 # Search for all dimensions even if we have been given a value
555 # explicitly. In some cases records are given as well as the
556 # actually dimension and this should not be an error if they
557 # match.
558 mandatoryDimensions = datasetType.dimensions.names # - provided
560 candidateDimensions: set[str] = set()
561 candidateDimensions.update(mandatoryDimensions)
563 # For calibrations we may well be needing temporal dimensions
564 # so rather than always including all dimensions in the scan
565 # restrict things a little. It is still possible for there
566 # to be confusion over day_obs in visit vs exposure for example.
567 # If we are not searching calibration collections things may
568 # fail but they are going to fail anyway because of the
569 # ambiguousness of the dataId...
570 if datasetType.isCalibration():
571 for dim in self.dimensions.dimensions:
572 if dim.temporal:
573 candidateDimensions.add(str(dim))
575 # Look up table for the first association with a dimension
576 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
578 # Keep track of whether an item is associated with multiple
579 # dimensions.
580 counter: Counter[str] = Counter()
581 assigned: dict[str, set[str]] = defaultdict(set)
583 # Go through the missing dimensions and associate the
584 # given names with records within those dimensions
585 matched_dims = set()
586 for dimensionName in candidateDimensions:
587 dimension = self.dimensions.dimensions[dimensionName]
588 fields = dimension.metadata.names | dimension.uniqueKeys.names
589 for field in not_dimensions:
590 if field in fields:
591 guessedAssociation[dimensionName][field] = not_dimensions[field]
592 counter[dimensionName] += 1
593 assigned[field].add(dimensionName)
594 matched_dims.add(field)
596 # Calculate the fields that matched nothing.
597 never_found = set(not_dimensions) - matched_dims
599 if never_found:
600 raise DimensionValueError(f"Unrecognized keyword args given: {never_found}")
602 # There is a chance we have allocated a single dataId item
603 # to multiple dimensions. Need to decide which should be retained.
604 # For now assume that the most popular alternative wins.
605 # This means that day_obs with seq_num will result in
606 # exposure.day_obs and not visit.day_obs
607 # Also prefer an explicitly missing dimension over an inferred
608 # temporal dimension.
609 for fieldName, assignedDimensions in assigned.items():
610 if len(assignedDimensions) > 1:
611 # Pick the most popular (preferring mandatory dimensions)
612 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
613 if requiredButMissing:
614 candidateDimensions = requiredButMissing
615 else:
616 candidateDimensions = assignedDimensions
618 # If this is a choice between visit and exposure and
619 # neither was a required part of the dataset type,
620 # (hence in this branch) always prefer exposure over
621 # visit since exposures are always defined and visits
622 # are defined from exposures.
623 if candidateDimensions == {"exposure", "visit"}:
624 candidateDimensions = {"exposure"}
626 # Select the relevant items and get a new restricted
627 # counter.
628 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
629 duplicatesCounter: Counter[str] = Counter()
630 duplicatesCounter.update(theseCounts)
632 # Choose the most common. If they are equally common
633 # we will pick the one that was found first.
634 # Returns a list of tuples
635 selected = duplicatesCounter.most_common(1)[0][0]
637 _LOG.debug(
638 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
639 " Removed ambiguity by choosing dimension %s.",
640 fieldName,
641 ", ".join(assignedDimensions),
642 selected,
643 )
645 for candidateDimension in assignedDimensions:
646 if candidateDimension != selected:
647 del guessedAssociation[candidateDimension][fieldName]
649 # Update the record look up dict with the new associations
650 for dimensionName, values in guessedAssociation.items():
651 if values: # A dict might now be empty
652 _LOG.debug(
653 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
654 )
655 byRecord[dimensionName].update(values)
657 if byRecord:
658 # Some record specifiers were found so we need to convert
659 # them to the Id form
660 for dimensionName, values in byRecord.items():
661 if dimensionName in newDataId:
662 _LOG.debug(
663 "DataId specified explicit %s dimension value of %s in addition to"
664 " general record specifiers for it of %s. Ignoring record information.",
665 dimensionName,
666 newDataId[dimensionName],
667 str(values),
668 )
669 # Get the actual record and compare with these values.
670 try:
671 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
672 except DataIdError:
673 raise DimensionValueError(
674 f"Could not find dimension '{dimensionName}'"
675 f" with dataId {newDataId} as part of comparing with"
676 f" record values {byRecord[dimensionName]}"
677 ) from None
678 if len(recs) == 1:
679 errmsg: list[str] = []
680 for k, v in values.items():
681 if (recval := getattr(recs[0], k)) != v:
682 errmsg.append(f"{k}({recval} != {v})")
683 if errmsg:
684 raise DimensionValueError(
685 f"Dimension {dimensionName} in dataId has explicit value"
686 " inconsistent with records: " + ", ".join(errmsg)
687 )
688 else:
689 # Multiple matches for an explicit dimension
690 # should never happen but let downstream complain.
691 pass
692 continue
694 # Build up a WHERE expression
695 bind = dict(values.items())
696 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
698 # Hopefully we get a single record that matches
699 records = set(
700 self._registry.queryDimensionRecords(
701 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
702 )
703 )
705 if len(records) != 1:
706 if len(records) > 1:
707 # visit can have an ambiguous answer without involving
708 # visit_system. The default visit_system is defined
709 # by the instrument.
710 if (
711 dimensionName == "visit"
712 and "visit_system_membership" in self.dimensions
713 and "visit_system" in self.dimensions["instrument"].metadata
714 ):
715 instrument_records = list(
716 self._registry.queryDimensionRecords(
717 "instrument",
718 dataId=newDataId,
719 **kwargs,
720 )
721 )
722 if len(instrument_records) == 1:
723 visit_system = instrument_records[0].visit_system
724 if visit_system is None:
725 # Set to a value that will never match.
726 visit_system = -1
728 # Look up each visit in the
729 # visit_system_membership records.
730 for rec in records:
731 membership = list(
732 self._registry.queryDimensionRecords(
733 # Use bind to allow zero results.
734 # This is a fully-specified query.
735 "visit_system_membership",
736 where="instrument = inst AND visit_system = system AND visit = v",
737 bind=dict(
738 inst=instrument_records[0].name, system=visit_system, v=rec.id
739 ),
740 )
741 )
742 if membership:
743 # This record is the right answer.
744 records = {rec}
745 break
747 # The ambiguity may have been resolved so check again.
748 if len(records) > 1:
749 _LOG.debug(
750 "Received %d records from constraints of %s", len(records), str(values)
751 )
752 for r in records:
753 _LOG.debug("- %s", str(r))
754 raise DimensionValueError(
755 f"DataId specification for dimension {dimensionName} is not"
756 f" uniquely constrained to a single dataset by {values}."
757 f" Got {len(records)} results."
758 )
759 else:
760 raise DimensionValueError(
761 f"DataId specification for dimension {dimensionName} matched no"
762 f" records when constrained by {values}"
763 )
765 # Get the primary key from the real dimension object
766 dimension = self.dimensions.dimensions[dimensionName]
767 if not isinstance(dimension, Dimension):
768 raise RuntimeError(
769 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
770 )
771 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
773 return newDataId, kwargs
775 def _findDatasetRef(
776 self,
777 datasetRefOrType: DatasetRef | DatasetType | str,
778 dataId: DataId | None = None,
779 *,
780 collections: Any = None,
781 predict: bool = False,
782 run: str | None = None,
783 datastore_records: bool = False,
784 timespan: Timespan | None = None,
785 **kwargs: Any,
786 ) -> DatasetRef:
787 """Shared logic for methods that start with a search for a dataset in
788 the registry.
790 Parameters
791 ----------
792 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
793 When `DatasetRef` the `dataId` should be `None`.
794 Otherwise the `DatasetType` or name thereof.
795 dataId : `dict` or `DataCoordinate`, optional
796 A `dict` of `Dimension` link name, value pairs that label the
797 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
798 should be provided as the first argument.
799 collections : Any, optional
800 Collections to be searched, overriding ``self.collections``.
801 Can be any of the types supported by the ``collections`` argument
802 to butler construction.
803 predict : `bool`, optional
804 If `True`, return a newly created `DatasetRef` with a unique
805 dataset ID if finding a reference in the `Registry` fails.
806 Defaults to `False`.
807 run : `str`, optional
808 Run collection name to use for creating `DatasetRef` for predicted
809 datasets. Only used if ``predict`` is `True`.
810 datastore_records : `bool`, optional
811 If `True` add datastore records to returned `DatasetRef`.
812 timespan : `Timespan` or `None`, optional
813 A timespan that the validity range of the dataset must overlap.
814 If not provided and this is a calibration dataset type, an attempt
815 will be made to find the timespan from any temporal coordinate
816 in the data ID.
817 **kwargs
818 Additional keyword arguments used to augment or construct a
819 `DataId`. See `DataId` parameters.
821 Returns
822 -------
823 ref : `DatasetRef`
824 A reference to the dataset identified by the given arguments.
825 This can be the same dataset reference as given if it was
826 resolved.
828 Raises
829 ------
830 LookupError
831 Raised if no matching dataset exists in the `Registry` (and
832 ``predict`` is `False`).
833 ValueError
834 Raised if a resolved `DatasetRef` was passed as an input, but it
835 differs from the one found in the registry.
836 TypeError
837 Raised if no collections were provided.
838 """
839 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
840 if isinstance(datasetRefOrType, DatasetRef):
841 if collections is not None:
842 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
843 # May need to retrieve datastore records if requested.
844 if datastore_records and datasetRefOrType._datastore_records is None:
845 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
846 return datasetRefOrType
848 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
850 if datasetType.isCalibration():
851 # Because this is a calibration dataset, first try to make a
852 # standardize the data ID without restricting the dimensions to
853 # those of the dataset type requested, because there may be extra
854 # dimensions that provide temporal information for a validity-range
855 # lookup.
856 dataId = DataCoordinate.standardize(
857 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
858 )
859 if timespan is None:
860 if dataId.dimensions.temporal:
861 dataId = self._registry.expandDataId(dataId)
862 # Use the timespan from the data ID to constrain the
863 # calibration lookup, but only if the caller has not
864 # specified an explicit timespan.
865 timespan = dataId.timespan
866 else:
867 # Try an arbitrary timespan. Downstream will fail if this
868 # results in more than one matching dataset.
869 timespan = Timespan(None, None)
870 else:
871 # Standardize the data ID to just the dimensions of the dataset
872 # type instead of letting registry.findDataset do it, so we get the
873 # result even if no dataset is found.
874 dataId = DataCoordinate.standardize(
875 dataId,
876 dimensions=datasetType.dimensions,
877 defaults=self._registry.defaults.dataId,
878 **kwargs,
879 )
880 # Always lookup the DatasetRef, even if one is given, to ensure it is
881 # present in the current collection.
882 ref = self.find_dataset(
883 datasetType,
884 dataId,
885 collections=collections,
886 timespan=timespan,
887 datastore_records=datastore_records,
888 )
889 if ref is None:
890 if predict:
891 if run is None:
892 run = self.run
893 if run is None:
894 raise TypeError("Cannot predict dataset ID/location with run=None.")
895 return DatasetRef(datasetType, dataId, run=run)
896 else:
897 if collections is None:
898 collections = self._registry.defaults.collections
899 raise DatasetNotFoundError(
900 f"Dataset {datasetType.name} with data ID {dataId} "
901 f"could not be found in collections {collections}."
902 )
903 if datasetType != ref.datasetType:
904 # If they differ it is because the user explicitly specified
905 # a compatible dataset type to this call rather than using the
906 # registry definition. The DatasetRef must therefore be recreated
907 # using the user definition such that the expected type is
908 # returned.
909 ref = DatasetRef(
910 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
911 )
913 return ref
915 @transactional
916 def put(
917 self,
918 obj: Any,
919 datasetRefOrType: DatasetRef | DatasetType | str,
920 /,
921 dataId: DataId | None = None,
922 *,
923 run: str | None = None,
924 **kwargs: Any,
925 ) -> DatasetRef:
926 """Store and register a dataset.
928 Parameters
929 ----------
930 obj : `object`
931 The dataset.
932 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
933 When `DatasetRef` is provided, ``dataId`` should be `None`.
934 Otherwise the `DatasetType` or name thereof. If a fully resolved
935 `DatasetRef` is given the run and ID are used directly.
936 dataId : `dict` or `DataCoordinate`
937 A `dict` of `Dimension` link name, value pairs that label the
938 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
939 should be provided as the second argument.
940 run : `str`, optional
941 The name of the run the dataset should be added to, overriding
942 ``self.run``. Not used if a resolved `DatasetRef` is provided.
943 **kwargs
944 Additional keyword arguments used to augment or construct a
945 `DataCoordinate`. See `DataCoordinate.standardize`
946 parameters. Not used if a resolve `DatasetRef` is provided.
948 Returns
949 -------
950 ref : `DatasetRef`
951 A reference to the stored dataset, updated with the correct id if
952 given.
954 Raises
955 ------
956 TypeError
957 Raised if the butler is read-only or if no run has been provided.
958 """
959 if isinstance(datasetRefOrType, DatasetRef):
960 # This is a direct put of predefined DatasetRef.
961 _LOG.debug("Butler put direct: %s", datasetRefOrType)
962 if run is not None:
963 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
964 # If registry already has a dataset with the same dataset ID,
965 # dataset type and DataId, then _importDatasets will do nothing and
966 # just return an original ref. We have to raise in this case, there
967 # is a datastore check below for that.
968 self._registry._importDatasets([datasetRefOrType], expand=True)
969 # Before trying to write to the datastore check that it does not
970 # know this dataset. This is prone to races, of course.
971 if self._datastore.knows(datasetRefOrType):
972 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
973 # Try to write dataset to the datastore, if it fails due to a race
974 # with another write, the content of stored data may be
975 # unpredictable.
976 try:
977 self._datastore.put(obj, datasetRefOrType)
978 except IntegrityError as e:
979 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
980 return datasetRefOrType
982 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
983 if not self.isWriteable():
984 raise TypeError("Butler is read-only.")
985 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
987 # Handle dimension records in dataId
988 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
990 # Add Registry Dataset entry.
991 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs)
992 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
993 self._datastore.put(obj, ref)
995 return ref
997 def getDeferred(
998 self,
999 datasetRefOrType: DatasetRef | DatasetType | str,
1000 /,
1001 dataId: DataId | None = None,
1002 *,
1003 parameters: dict | None = None,
1004 collections: Any = None,
1005 storageClass: str | StorageClass | None = None,
1006 timespan: Timespan | None = None,
1007 **kwargs: Any,
1008 ) -> DeferredDatasetHandle:
1009 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1010 after an immediate registry lookup.
1012 Parameters
1013 ----------
1014 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1015 When `DatasetRef` the `dataId` should be `None`.
1016 Otherwise the `DatasetType` or name thereof.
1017 dataId : `dict` or `DataCoordinate`, optional
1018 A `dict` of `Dimension` link name, value pairs that label the
1019 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1020 should be provided as the first argument.
1021 parameters : `dict`
1022 Additional StorageClass-defined options to control reading,
1023 typically used to efficiently read only a subset of the dataset.
1024 collections : Any, optional
1025 Collections to be searched, overriding ``self.collections``.
1026 Can be any of the types supported by the ``collections`` argument
1027 to butler construction.
1028 storageClass : `StorageClass` or `str`, optional
1029 The storage class to be used to override the Python type
1030 returned by this method. By default the returned type matches
1031 the dataset type definition for this dataset. Specifying a
1032 read `StorageClass` can force a different type to be returned.
1033 This type must be compatible with the original type.
1034 timespan : `Timespan` or `None`, optional
1035 A timespan that the validity range of the dataset must overlap.
1036 If not provided and this is a calibration dataset type, an attempt
1037 will be made to find the timespan from any temporal coordinate
1038 in the data ID.
1039 **kwargs
1040 Additional keyword arguments used to augment or construct a
1041 `DataId`. See `DataId` parameters.
1043 Returns
1044 -------
1045 obj : `DeferredDatasetHandle`
1046 A handle which can be used to retrieve a dataset at a later time.
1048 Raises
1049 ------
1050 LookupError
1051 Raised if no matching dataset exists in the `Registry` or
1052 datastore.
1053 ValueError
1054 Raised if a resolved `DatasetRef` was passed as an input, but it
1055 differs from the one found in the registry.
1056 TypeError
1057 Raised if no collections were provided.
1058 """
1059 if isinstance(datasetRefOrType, DatasetRef):
1060 # Do the quick check first and if that fails, check for artifact
1061 # existence. This is necessary for datastores that are configured
1062 # in trust mode where there won't be a record but there will be
1063 # a file.
1064 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
1065 ref = datasetRefOrType
1066 else:
1067 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1068 else:
1069 ref = self._findDatasetRef(
1070 datasetRefOrType, dataId, collections=collections, timespan=timespan, **kwargs
1071 )
1072 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1074 def get(
1075 self,
1076 datasetRefOrType: DatasetRef | DatasetType | str,
1077 /,
1078 dataId: DataId | None = None,
1079 *,
1080 parameters: dict[str, Any] | None = None,
1081 collections: Any = None,
1082 storageClass: StorageClass | str | None = None,
1083 timespan: Timespan | None = None,
1084 **kwargs: Any,
1085 ) -> Any:
1086 """Retrieve a stored dataset.
1088 Parameters
1089 ----------
1090 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1091 When `DatasetRef` the `dataId` should be `None`.
1092 Otherwise the `DatasetType` or name thereof.
1093 If a resolved `DatasetRef`, the associated dataset
1094 is returned directly without additional querying.
1095 dataId : `dict` or `DataCoordinate`
1096 A `dict` of `Dimension` link name, value pairs that label the
1097 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1098 should be provided as the first argument.
1099 parameters : `dict`
1100 Additional StorageClass-defined options to control reading,
1101 typically used to efficiently read only a subset of the dataset.
1102 collections : Any, optional
1103 Collections to be searched, overriding ``self.collections``.
1104 Can be any of the types supported by the ``collections`` argument
1105 to butler construction.
1106 storageClass : `StorageClass` or `str`, optional
1107 The storage class to be used to override the Python type
1108 returned by this method. By default the returned type matches
1109 the dataset type definition for this dataset. Specifying a
1110 read `StorageClass` can force a different type to be returned.
1111 This type must be compatible with the original type.
1112 timespan : `Timespan` or `None`, optional
1113 A timespan that the validity range of the dataset must overlap.
1114 If not provided and this is a calibration dataset type, an attempt
1115 will be made to find the timespan from any temporal coordinate
1116 in the data ID.
1117 **kwargs
1118 Additional keyword arguments used to augment or construct a
1119 `DataCoordinate`. See `DataCoordinate.standardize`
1120 parameters.
1122 Returns
1123 -------
1124 obj : `object`
1125 The dataset.
1127 Raises
1128 ------
1129 LookupError
1130 Raised if no matching dataset exists in the `Registry`.
1131 TypeError
1132 Raised if no collections were provided.
1134 Notes
1135 -----
1136 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1137 this method requires that the given data ID include temporal dimensions
1138 beyond the dimensions of the dataset type itself, in order to find the
1139 dataset with the appropriate validity range. For example, a "bias"
1140 dataset with native dimensions ``{instrument, detector}`` could be
1141 fetched with a ``{instrument, detector, exposure}`` data ID, because
1142 ``exposure`` is a temporal dimension.
1143 """
1144 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1145 ref = self._findDatasetRef(
1146 datasetRefOrType,
1147 dataId,
1148 collections=collections,
1149 datastore_records=True,
1150 timespan=timespan,
1151 **kwargs,
1152 )
1153 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
1155 def getURIs(
1156 self,
1157 datasetRefOrType: DatasetRef | DatasetType | str,
1158 /,
1159 dataId: DataId | None = None,
1160 *,
1161 predict: bool = False,
1162 collections: Any = None,
1163 run: str | None = None,
1164 **kwargs: Any,
1165 ) -> DatasetRefURIs:
1166 """Return the URIs associated with the dataset.
1168 Parameters
1169 ----------
1170 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1171 When `DatasetRef` the `dataId` should be `None`.
1172 Otherwise the `DatasetType` or name thereof.
1173 dataId : `dict` or `DataCoordinate`
1174 A `dict` of `Dimension` link name, value pairs that label the
1175 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1176 should be provided as the first argument.
1177 predict : `bool`
1178 If `True`, allow URIs to be returned of datasets that have not
1179 been written.
1180 collections : Any, optional
1181 Collections to be searched, overriding ``self.collections``.
1182 Can be any of the types supported by the ``collections`` argument
1183 to butler construction.
1184 run : `str`, optional
1185 Run to use for predictions, overriding ``self.run``.
1186 **kwargs
1187 Additional keyword arguments used to augment or construct a
1188 `DataCoordinate`. See `DataCoordinate.standardize`
1189 parameters.
1191 Returns
1192 -------
1193 uris : `DatasetRefURIs`
1194 The URI to the primary artifact associated with this dataset (if
1195 the dataset was disassembled within the datastore this may be
1196 `None`), and the URIs to any components associated with the dataset
1197 artifact. (can be empty if there are no components).
1198 """
1199 ref = self._findDatasetRef(
1200 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1201 )
1202 return self._datastore.getURIs(ref, predict)
1204 def get_dataset_type(self, name: str) -> DatasetType:
1205 return self._registry.getDatasetType(name)
1207 def get_dataset(
1208 self,
1209 id: DatasetId,
1210 *,
1211 storage_class: str | StorageClass | None = None,
1212 dimension_records: bool = False,
1213 datastore_records: bool = False,
1214 ) -> DatasetRef | None:
1215 ref = self._registry.getDataset(id)
1216 if ref is not None:
1217 if dimension_records:
1218 ref = ref.expanded(
1219 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)
1220 )
1221 if storage_class:
1222 ref = ref.overrideStorageClass(storage_class)
1223 if datastore_records:
1224 ref = self._registry.get_datastore_records(ref)
1225 return ref
1227 def find_dataset(
1228 self,
1229 dataset_type: DatasetType | str,
1230 data_id: DataId | None = None,
1231 *,
1232 collections: str | Sequence[str] | None = None,
1233 timespan: Timespan | None = None,
1234 storage_class: str | StorageClass | None = None,
1235 dimension_records: bool = False,
1236 datastore_records: bool = False,
1237 **kwargs: Any,
1238 ) -> DatasetRef | None:
1239 # Handle any parts of the dataID that are not using primary dimension
1240 # keys.
1241 if isinstance(dataset_type, str):
1242 actual_type = self.get_dataset_type(dataset_type)
1243 else:
1244 actual_type = dataset_type
1246 # Store the component for later.
1247 component_name = actual_type.component()
1248 if actual_type.isComponent():
1249 parent_type = actual_type.makeCompositeDatasetType()
1250 else:
1251 parent_type = actual_type
1253 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)
1255 ref = self._registry.findDataset(
1256 parent_type,
1257 data_id,
1258 collections=collections,
1259 timespan=timespan,
1260 datastore_records=datastore_records,
1261 **kwargs,
1262 )
1263 if ref is not None and dimension_records:
1264 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions))
1265 if ref is not None and component_name:
1266 ref = ref.makeComponentRef(component_name)
1267 if ref is not None and storage_class is not None:
1268 ref = ref.overrideStorageClass(storage_class)
1270 return ref
1272 def retrieveArtifacts(
1273 self,
1274 refs: Iterable[DatasetRef],
1275 destination: ResourcePathExpression,
1276 transfer: str = "auto",
1277 preserve_path: bool = True,
1278 overwrite: bool = False,
1279 ) -> list[ResourcePath]:
1280 # Docstring inherited.
1281 return self._datastore.retrieveArtifacts(
1282 refs,
1283 ResourcePath(destination),
1284 transfer=transfer,
1285 preserve_path=preserve_path,
1286 overwrite=overwrite,
1287 )
1289 def exists(
1290 self,
1291 dataset_ref_or_type: DatasetRef | DatasetType | str,
1292 /,
1293 data_id: DataId | None = None,
1294 *,
1295 full_check: bool = True,
1296 collections: Any = None,
1297 **kwargs: Any,
1298 ) -> DatasetExistence:
1299 # Docstring inherited.
1300 existence = DatasetExistence.UNRECOGNIZED
1302 if isinstance(dataset_ref_or_type, DatasetRef):
1303 if collections is not None:
1304 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1305 if data_id is not None:
1306 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1307 ref = dataset_ref_or_type
1308 registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
1309 if registry_ref is not None:
1310 existence |= DatasetExistence.RECORDED
1312 if dataset_ref_or_type != registry_ref:
1313 # This could mean that storage classes differ, so we should
1314 # check for that but use the registry ref for the rest of
1315 # the method.
1316 if registry_ref.is_compatible_with(dataset_ref_or_type):
1317 # Use the registry version from now on.
1318 ref = registry_ref
1319 else:
1320 raise ValueError(
1321 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1322 f"in registry but has different incompatible values ({registry_ref})."
1323 )
1324 else:
1325 try:
1326 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1327 except (LookupError, TypeError):
1328 return existence
1329 existence |= DatasetExistence.RECORDED
1331 if self._datastore.knows(ref):
1332 existence |= DatasetExistence.DATASTORE
1334 if full_check:
1335 if self._datastore.exists(ref):
1336 existence |= DatasetExistence._ARTIFACT
1337 elif existence.value != DatasetExistence.UNRECOGNIZED.value:
1338 # Do not add this flag if we have no other idea about a dataset.
1339 existence |= DatasetExistence(DatasetExistence._ASSUMED)
1341 return existence
1343 def _exists_many(
1344 self,
1345 refs: Iterable[DatasetRef],
1346 /,
1347 *,
1348 full_check: bool = True,
1349 ) -> dict[DatasetRef, DatasetExistence]:
1350 # Docstring inherited.
1351 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1353 # Registry does not have a bulk API to check for a ref.
1354 for ref in refs:
1355 registry_ref = self._registry.getDataset(ref.id)
1356 if registry_ref is not None:
1357 # It is possible, albeit unlikely, that the given ref does
1358 # not match the one in registry even though the UUID matches.
1359 # When checking a single ref we raise, but it's impolite to
1360 # do that when potentially hundreds of refs are being checked.
1361 # We could change the API to only accept UUIDs and that would
1362 # remove the ability to even check and remove the worry
1363 # about differing storage classes. Given the ongoing discussion
1364 # on refs vs UUIDs and whether to raise or have a new
1365 # private flag, treat this as a private API for now.
1366 existence[ref] |= DatasetExistence.RECORDED
1368 # Ask datastore if it knows about these refs.
1369 knows = self._datastore.knows_these(refs)
1370 for ref, known in knows.items():
1371 if known:
1372 existence[ref] |= DatasetExistence.DATASTORE
1374 if full_check:
1375 mexists = self._datastore.mexists(refs)
1376 for ref, exists in mexists.items():
1377 if exists:
1378 existence[ref] |= DatasetExistence._ARTIFACT
1379 else:
1380 # Do not set this flag if nothing is known about the dataset.
1381 for ref in existence:
1382 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1383 existence[ref] |= DatasetExistence._ASSUMED
1385 return existence
1387 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1388 # Docstring inherited.
1389 if not self.isWriteable():
1390 raise TypeError("Butler is read-only.")
1391 names = list(names)
1392 refs: list[DatasetRef] = []
1393 for name in names:
1394 collectionType = self._registry.getCollectionType(name)
1395 if collectionType is not CollectionType.RUN:
1396 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1397 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
1398 with self._datastore.transaction(), self._registry.transaction():
1399 if unstore:
1400 self._datastore.trash(refs)
1401 else:
1402 self._datastore.forget(refs)
1403 for name in names:
1404 self._registry.removeCollection(name)
1405 if unstore:
1406 # Point of no return for removing artifacts
1407 self._datastore.emptyTrash()
1409 def pruneDatasets(
1410 self,
1411 refs: Iterable[DatasetRef],
1412 *,
1413 disassociate: bool = True,
1414 unstore: bool = False,
1415 tags: Iterable[str] = (),
1416 purge: bool = False,
1417 ) -> None:
1418 # docstring inherited from LimitedButler
1420 if not self.isWriteable():
1421 raise TypeError("Butler is read-only.")
1422 if purge:
1423 if not disassociate:
1424 raise TypeError("Cannot pass purge=True without disassociate=True.")
1425 if not unstore:
1426 raise TypeError("Cannot pass purge=True without unstore=True.")
1427 elif disassociate:
1428 tags = tuple(tags)
1429 if not tags:
1430 raise TypeError("No tags provided but disassociate=True.")
1431 for tag in tags:
1432 collectionType = self._registry.getCollectionType(tag)
1433 if collectionType is not CollectionType.TAGGED:
1434 raise TypeError(
1435 f"Cannot disassociate from collection '{tag}' "
1436 f"of non-TAGGED type {collectionType.name}."
1437 )
1438 # Transform possibly-single-pass iterable into something we can iterate
1439 # over multiple times.
1440 refs = list(refs)
1441 # Pruning a component of a DatasetRef makes no sense since registry
1442 # doesn't know about components and datastore might not store
1443 # components in a separate file
1444 for ref in refs:
1445 if ref.datasetType.component():
1446 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1447 # We don't need an unreliable Datastore transaction for this, because
1448 # we've been extra careful to ensure that Datastore.trash only involves
1449 # mutating the Registry (it can _look_ at Datastore-specific things,
1450 # but shouldn't change them), and hence all operations here are
1451 # Registry operations.
1452 with self._datastore.transaction(), self._registry.transaction():
1453 if unstore:
1454 self._datastore.trash(refs)
1455 if purge:
1456 self._registry.removeDatasets(refs)
1457 elif disassociate:
1458 assert tags, "Guaranteed by earlier logic in this function."
1459 for tag in tags:
1460 self._registry.disassociate(tag, refs)
1461 # We've exited the Registry transaction, and apparently committed.
1462 # (if there was an exception, everything rolled back, and it's as if
1463 # nothing happened - and we never get here).
1464 # Datastore artifacts are not yet gone, but they're clearly marked
1465 # as trash, so if we fail to delete now because of (e.g.) filesystem
1466 # problems we can try again later, and if manual administrative
1467 # intervention is required, it's pretty clear what that should entail:
1468 # deleting everything on disk and in private Datastore tables that is
1469 # in the dataset_location_trash table.
1470 if unstore:
1471 # Point of no return for removing artifacts
1472 self._datastore.emptyTrash()
1474 @transactional
1475 def ingest(
1476 self,
1477 *datasets: FileDataset,
1478 transfer: str | None = "auto",
1479 record_validation_info: bool = True,
1480 ) -> None:
1481 # Docstring inherited.
1482 if not self.isWriteable():
1483 raise TypeError("Butler is read-only.")
1485 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1486 if not datasets:
1487 return
1489 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1491 # We need to reorganize all the inputs so that they are grouped
1492 # by dataset type and run. Multiple refs in a single FileDataset
1493 # are required to share the run and dataset type.
1494 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list)
1496 # Track DataIDs that are being ingested so we can spot issues early
1497 # with duplication. Retain previous FileDataset so we can report it.
1498 groupedDataIds: MutableMapping[tuple[DatasetType, str], dict[DataCoordinate, FileDataset]] = (
1499 defaultdict(dict)
1500 )
1502 # And the nested loop that populates it:
1503 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1504 # Somewhere to store pre-existing refs if we have an
1505 # execution butler.
1506 existingRefs: list[DatasetRef] = []
1508 for ref in dataset.refs:
1509 group_key = (ref.datasetType, ref.run)
1511 if ref.dataId in groupedDataIds[group_key]:
1512 raise ConflictingDefinitionError(
1513 f"Ingest conflict. Dataset {dataset.path} has same"
1514 " DataId as other ingest dataset"
1515 f" {groupedDataIds[group_key][ref.dataId].path} "
1516 f" ({ref.dataId})"
1517 )
1519 groupedDataIds[group_key][ref.dataId] = dataset
1521 if existingRefs:
1522 if len(dataset.refs) != len(existingRefs):
1523 # Keeping track of partially pre-existing datasets is hard
1524 # and should generally never happen. For now don't allow
1525 # it.
1526 raise ConflictingDefinitionError(
1527 f"For dataset {dataset.path} some dataIds already exist"
1528 " in registry but others do not. This is not supported."
1529 )
1531 # Store expanded form in the original FileDataset.
1532 dataset.refs = existingRefs
1533 else:
1534 groupedData[group_key].append(dataset)
1536 # Now we can bulk-insert into Registry for each DatasetType.
1537 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1538 groupedData.items(), desc="Bulk-inserting datasets by type"
1539 ):
1540 refs_to_import = []
1541 for dataset in grouped_datasets:
1542 refs_to_import.extend(dataset.refs)
1544 n_refs = len(refs_to_import)
1545 _LOG.verbose(
1546 "Importing %d ref%s of dataset type %r into run %r",
1547 n_refs,
1548 "" if n_refs == 1 else "s",
1549 datasetType.name,
1550 this_run,
1551 )
1553 # Import the refs and expand the DataCoordinates since we can't
1554 # guarantee that they are expanded and Datastore will need
1555 # the records.
1556 imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
1557 assert set(imported_refs) == set(refs_to_import)
1559 # Replace all the refs in the FileDataset with expanded versions.
1560 # Pull them off in the order we put them on the list.
1561 for dataset in grouped_datasets:
1562 n_dataset_refs = len(dataset.refs)
1563 dataset.refs = imported_refs[:n_dataset_refs]
1564 del imported_refs[:n_dataset_refs]
1566 # Bulk-insert everything into Datastore.
1567 # We do not know if any of the registry entries already existed
1568 # (_importDatasets only complains if they exist but differ) so
1569 # we have to catch IntegrityError explicitly.
1570 try:
1571 self._datastore.ingest(
1572 *datasets, transfer=transfer, record_validation_info=record_validation_info
1573 )
1574 except IntegrityError as e:
1575 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
1577 @contextlib.contextmanager
1578 def export(
1579 self,
1580 *,
1581 directory: str | None = None,
1582 filename: str | None = None,
1583 format: str | None = None,
1584 transfer: str | None = None,
1585 ) -> Iterator[RepoExportContext]:
1586 # Docstring inherited.
1587 if directory is None and transfer is not None:
1588 raise TypeError("Cannot transfer without providing a directory.")
1589 if transfer == "move":
1590 raise TypeError("Transfer may not be 'move': export is read-only")
1591 if format is None:
1592 if filename is None:
1593 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1594 else:
1595 _, format = os.path.splitext(filename)
1596 if not format:
1597 raise ValueError("Please specify a file extension to determine export format.")
1598 format = format[1:] # Strip leading ".""
1599 elif filename is None:
1600 filename = f"export.{format}"
1601 if directory is not None:
1602 filename = os.path.join(directory, filename)
1603 formats = self._config["repo_transfer_formats"]
1604 if format not in formats:
1605 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
1606 BackendClass = get_class_of(formats[format, "export"])
1607 with open(filename, "w") as stream:
1608 backend = BackendClass(stream, universe=self.dimensions)
1609 try:
1610 helper = RepoExportContext(
1611 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
1612 )
1613 with self._caching_context():
1614 yield helper
1615 except BaseException:
1616 raise
1617 else:
1618 helper._finish()
1620 def import_(
1621 self,
1622 *,
1623 directory: ResourcePathExpression | None = None,
1624 filename: ResourcePathExpression | TextIO | None = None,
1625 format: str | None = None,
1626 transfer: str | None = None,
1627 skip_dimensions: set | None = None,
1628 ) -> None:
1629 # Docstring inherited.
1630 if not self.isWriteable():
1631 raise TypeError("Butler is read-only.")
1632 if format is None:
1633 if filename is None:
1634 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1635 else:
1636 _, format = os.path.splitext(filename) # type: ignore
1637 elif filename is None:
1638 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
1639 if directory is not None:
1640 directory = ResourcePath(directory, forceDirectory=True)
1641 # mypy doesn't think this will work but it does in python >= 3.10.
1642 if isinstance(filename, ResourcePathExpression): # type: ignore
1643 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
1644 if not filename.isabs() and directory is not None:
1645 potential = directory.join(filename)
1646 exists_in_cwd = filename.exists()
1647 exists_in_dir = potential.exists()
1648 if exists_in_cwd and exists_in_dir:
1649 _LOG.warning(
1650 "A relative path for filename was specified (%s) which exists relative to cwd. "
1651 "Additionally, the file exists relative to the given search directory (%s). "
1652 "Using the export file in the given directory.",
1653 filename,
1654 potential,
1655 )
1656 # Given they specified an explicit directory and that
1657 # directory has the export file in it, assume that that
1658 # is what was meant despite the file in cwd.
1659 filename = potential
1660 elif exists_in_dir:
1661 filename = potential
1662 elif not exists_in_cwd and not exists_in_dir:
1663 # Raise early.
1664 raise FileNotFoundError(
1665 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
1666 )
1667 BackendClass: type[RepoImportBackend] = get_class_of(
1668 self._config["repo_transfer_formats"][format]["import"]
1669 )
1671 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
1672 with self._caching_context():
1673 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
1674 backend.register()
1675 with self.transaction():
1676 backend.load(
1677 self._datastore,
1678 directory=directory,
1679 transfer=transfer,
1680 skip_dimensions=skip_dimensions,
1681 )
1683 if isinstance(filename, ResourcePath):
1684 # We can not use open() here at the moment because of
1685 # DM-38589 since yaml does stream.read(8192) in a loop.
1686 stream = io.StringIO(filename.read().decode())
1687 doImport(stream)
1688 else:
1689 doImport(filename) # type: ignore
1691 def transfer_dimension_records_from(
1692 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
1693 ) -> None:
1694 # Allowed dimensions in the target butler.
1695 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1697 data_ids = {ref.dataId for ref in source_refs}
1699 dimension_records = self._extract_all_dimension_records_from_data_ids(
1700 source_butler, data_ids, elements
1701 )
1703 # Insert order is important.
1704 for element in self.dimensions.sorted(dimension_records.keys()):
1705 records = [r for r in dimension_records[element].values()]
1706 # Assume that if the record is already present that we can
1707 # use it without having to check that the record metadata
1708 # is consistent.
1709 self._registry.insertDimensionData(element, *records, skip_existing=True)
1710 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records))
1712 def _extract_all_dimension_records_from_data_ids(
1713 self,
1714 source_butler: LimitedButler | Butler,
1715 data_ids: set[DataCoordinate],
1716 allowed_elements: frozenset[DimensionElement],
1717 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1718 primary_records = self._extract_dimension_records_from_data_ids(
1719 source_butler, data_ids, allowed_elements
1720 )
1722 can_query = True if isinstance(source_butler, Butler) else False
1724 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1725 for original_element, record_mapping in primary_records.items():
1726 # Get dimensions that depend on this dimension.
1727 populated_by = self.dimensions.get_elements_populated_by(
1728 self.dimensions[original_element.name] # type: ignore
1729 )
1731 for data_id in record_mapping.keys():
1732 for element in populated_by:
1733 if element not in allowed_elements:
1734 continue
1735 if element.name == original_element.name:
1736 continue
1738 if element.name in primary_records:
1739 # If this element has already been stored avoid
1740 # re-finding records since that may lead to additional
1741 # spurious records. e.g. visit is populated_by
1742 # visit_detector_region but querying
1743 # visit_detector_region by visit will return all the
1744 # detectors for this visit -- the visit dataId does not
1745 # constrain this.
1746 # To constrain the query the original dataIds would
1747 # have to be scanned.
1748 continue
1750 if not can_query:
1751 raise RuntimeError(
1752 f"Transferring populated_by records like {element.name} requires a full Butler."
1753 )
1755 records = source_butler.registry.queryDimensionRecords( # type: ignore
1756 element.name,
1757 **data_id.mapping, # type: ignore
1758 )
1759 for record in records:
1760 additional_records[record.definition].setdefault(record.dataId, record)
1762 # The next step is to walk back through the additional records to
1763 # pick up any missing content (such as visit_definition needing to
1764 # know the exposure). Want to ensure we do not request records we
1765 # already have.
1766 missing_data_ids = set()
1767 for name, record_mapping in additional_records.items():
1768 for data_id in record_mapping.keys():
1769 if data_id not in primary_records[name]:
1770 missing_data_ids.add(data_id)
1772 # Fill out the new records. Assume that these new records do not
1773 # also need to carry over additional populated_by records.
1774 secondary_records = self._extract_dimension_records_from_data_ids(
1775 source_butler, missing_data_ids, allowed_elements
1776 )
1778 # Merge the extra sets of records in with the original.
1779 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()):
1780 primary_records[name].update(record_mapping)
1782 return primary_records
1784 def _extract_dimension_records_from_data_ids(
1785 self,
1786 source_butler: LimitedButler | Butler,
1787 data_ids: set[DataCoordinate],
1788 allowed_elements: frozenset[DimensionElement],
1789 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1790 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1792 for data_id in data_ids:
1793 # Need an expanded record, if not expanded that we need a full
1794 # butler with registry (allow mocks with registry too).
1795 if not data_id.hasRecords():
1796 if registry := getattr(source_butler, "registry", None):
1797 data_id = registry.expandDataId(data_id)
1798 else:
1799 raise TypeError("Input butler needs to be a full butler to expand DataId.")
1800 # If this butler doesn't know about a dimension in the source
1801 # butler things will break later.
1802 for element_name in data_id.dimensions.elements:
1803 record = data_id.records[element_name]
1804 if record is not None and record.definition in allowed_elements:
1805 dimension_records[record.definition].setdefault(record.dataId, record)
1807 return dimension_records
1809 def transfer_from(
1810 self,
1811 source_butler: LimitedButler,
1812 source_refs: Iterable[DatasetRef],
1813 transfer: str = "auto",
1814 skip_missing: bool = True,
1815 register_dataset_types: bool = False,
1816 transfer_dimensions: bool = False,
1817 dry_run: bool = False,
1818 ) -> collections.abc.Collection[DatasetRef]:
1819 # Docstring inherited.
1820 if not self.isWriteable():
1821 raise TypeError("Butler is read-only.")
1822 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1824 # Will iterate through the refs multiple times so need to convert
1825 # to a list if this isn't a collection.
1826 if not isinstance(source_refs, collections.abc.Collection):
1827 source_refs = list(source_refs)
1829 original_count = len(source_refs)
1830 _LOG.info("Transferring %d datasets into %s", original_count, str(self))
1832 # In some situations the datastore artifact may be missing
1833 # and we do not want that registry entry to be imported.
1834 # Asking datastore is not sufficient, the records may have been
1835 # purged, we have to ask for the (predicted) URI and check
1836 # existence explicitly. Execution butler is set up exactly like
1837 # this with no datastore records.
1838 artifact_existence: dict[ResourcePath, bool] = {}
1839 if skip_missing:
1840 dataset_existence = source_butler._datastore.mexists(
1841 source_refs, artifact_existence=artifact_existence
1842 )
1843 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1844 filtered_count = len(source_refs)
1845 n_missing = original_count - filtered_count
1846 _LOG.verbose(
1847 "%d dataset%s removed because the artifact does not exist. Now have %d.",
1848 n_missing,
1849 "" if n_missing == 1 else "s",
1850 filtered_count,
1851 )
1853 # Importing requires that we group the refs by dataset type and run
1854 # before doing the import.
1855 source_dataset_types = set()
1856 grouped_refs = defaultdict(list)
1857 for ref in source_refs:
1858 grouped_refs[ref.datasetType, ref.run].append(ref)
1859 source_dataset_types.add(ref.datasetType)
1861 # Check to see if the dataset type in the source butler has
1862 # the same definition in the target butler and register missing
1863 # ones if requested. Registration must happen outside a transaction.
1864 newly_registered_dataset_types = set()
1865 for datasetType in source_dataset_types:
1866 if register_dataset_types:
1867 # Let this raise immediately if inconsistent. Continuing
1868 # on to find additional inconsistent dataset types
1869 # might result in additional unwanted dataset types being
1870 # registered.
1871 if self._registry.registerDatasetType(datasetType):
1872 newly_registered_dataset_types.add(datasetType)
1873 else:
1874 # If the dataset type is missing, let it fail immediately.
1875 target_dataset_type = self.get_dataset_type(datasetType.name)
1876 if target_dataset_type != datasetType:
1877 raise ConflictingDefinitionError(
1878 "Source butler dataset type differs from definition"
1879 f" in target butler: {datasetType} !="
1880 f" {target_dataset_type}"
1881 )
1882 if newly_registered_dataset_types:
1883 # We may have registered some even if there were inconsistencies
1884 # but should let people know (or else remove them again).
1885 _LOG.verbose(
1886 "Registered the following dataset types in the target Butler: %s",
1887 ", ".join(d.name for d in newly_registered_dataset_types),
1888 )
1889 else:
1890 _LOG.verbose("All required dataset types are known to the target Butler")
1892 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1893 if transfer_dimensions:
1894 # Collect all the dimension records for these refs.
1895 # All dimensions are to be copied but the list of valid dimensions
1896 # come from this butler's universe.
1897 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1898 dataIds = {ref.dataId for ref in source_refs}
1899 dimension_records = self._extract_all_dimension_records_from_data_ids(
1900 source_butler, dataIds, elements
1901 )
1903 handled_collections: set[str] = set()
1905 # Do all the importing in a single transaction.
1906 with self.transaction():
1907 if dimension_records and not dry_run:
1908 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
1909 # Order matters.
1910 for element in self.dimensions.sorted(dimension_records.keys()):
1911 records = [r for r in dimension_records[element].values()]
1912 # Assume that if the record is already present that we can
1913 # use it without having to check that the record metadata
1914 # is consistent.
1915 self._registry.insertDimensionData(element, *records, skip_existing=True)
1917 n_imported = 0
1918 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
1919 grouped_refs.items(), desc="Importing to registry by run and dataset type"
1920 ):
1921 if run not in handled_collections:
1922 # May need to create output collection. If source butler
1923 # has a registry, ask for documentation string.
1924 run_doc = None
1925 if registry := getattr(source_butler, "registry", None):
1926 run_doc = registry.getCollectionDocumentation(run)
1927 if not dry_run:
1928 registered = self._registry.registerRun(run, doc=run_doc)
1929 else:
1930 registered = True
1931 handled_collections.add(run)
1932 if registered:
1933 _LOG.verbose("Creating output run %s", run)
1935 n_refs = len(refs_to_import)
1936 _LOG.verbose(
1937 "Importing %d ref%s of dataset type %s into run %s",
1938 n_refs,
1939 "" if n_refs == 1 else "s",
1940 datasetType.name,
1941 run,
1942 )
1944 # Assume we are using UUIDs and the source refs will match
1945 # those imported.
1946 if not dry_run:
1947 imported_refs = self._registry._importDatasets(refs_to_import)
1948 else:
1949 imported_refs = refs_to_import
1950 assert set(imported_refs) == set(refs_to_import)
1951 n_imported += len(imported_refs)
1953 assert len(source_refs) == n_imported
1954 _LOG.verbose("Imported %d datasets into destination butler", n_imported)
1956 # Ask the datastore to transfer. The datastore has to check that
1957 # the source datastore is compatible with the target datastore.
1958 accepted, rejected = self._datastore.transfer_from(
1959 source_butler._datastore,
1960 source_refs,
1961 transfer=transfer,
1962 artifact_existence=artifact_existence,
1963 dry_run=dry_run,
1964 )
1965 if rejected:
1966 # For now, accept the registry entries but not the files.
1967 _LOG.warning(
1968 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
1969 len(rejected),
1970 len(accepted),
1971 datasetType,
1972 run,
1973 )
1975 return source_refs
1977 def validateConfiguration(
1978 self,
1979 logFailures: bool = False,
1980 datasetTypeNames: Iterable[str] | None = None,
1981 ignore: Iterable[str] | None = None,
1982 ) -> None:
1983 # Docstring inherited.
1984 if datasetTypeNames:
1985 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
1986 else:
1987 datasetTypes = list(self._registry.queryDatasetTypes())
1989 # filter out anything from the ignore list
1990 if ignore:
1991 ignore = set(ignore)
1992 datasetTypes = [
1993 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
1994 ]
1995 else:
1996 ignore = set()
1998 # For each datasetType that has an instrument dimension, create
1999 # a DatasetRef for each defined instrument
2000 datasetRefs = []
2002 # Find all the registered instruments (if "instrument" is in the
2003 # universe).
2004 if "instrument" in self.dimensions:
2005 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
2007 for datasetType in datasetTypes:
2008 if "instrument" in datasetType.dimensions:
2009 # In order to create a conforming dataset ref, create
2010 # fake DataCoordinate values for the non-instrument
2011 # dimensions. The type of the value does not matter here.
2012 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"}
2014 for instrument in instruments:
2015 datasetRef = DatasetRef(
2016 datasetType,
2017 DataCoordinate.standardize(
2018 dataId, instrument=instrument, dimensions=datasetType.dimensions
2019 ),
2020 run="validate",
2021 )
2022 datasetRefs.append(datasetRef)
2024 entities: list[DatasetType | DatasetRef] = []
2025 entities.extend(datasetTypes)
2026 entities.extend(datasetRefs)
2028 datastoreErrorStr = None
2029 try:
2030 self._datastore.validateConfiguration(entities, logFailures=logFailures)
2031 except ValidationError as e:
2032 datastoreErrorStr = str(e)
2034 # Also check that the LookupKeys used by the datastores match
2035 # registry and storage class definitions
2036 keys = self._datastore.getLookupKeys()
2038 failedNames = set()
2039 failedDataId = set()
2040 for key in keys:
2041 if key.name is not None:
2042 if key.name in ignore:
2043 continue
2045 # skip if specific datasetType names were requested and this
2046 # name does not match
2047 if datasetTypeNames and key.name not in datasetTypeNames:
2048 continue
2050 # See if it is a StorageClass or a DatasetType
2051 if key.name in self.storageClasses:
2052 pass
2053 else:
2054 try:
2055 self.get_dataset_type(key.name)
2056 except KeyError:
2057 if logFailures:
2058 _LOG.critical(
2059 "Key '%s' does not correspond to a DatasetType or StorageClass", key
2060 )
2061 failedNames.add(key)
2062 else:
2063 # Dimensions are checked for consistency when the Butler
2064 # is created and rendezvoused with a universe.
2065 pass
2067 # Check that the instrument is a valid instrument
2068 # Currently only support instrument so check for that
2069 if key.dataId:
2070 dataIdKeys = set(key.dataId)
2071 if {"instrument"} != dataIdKeys:
2072 if logFailures:
2073 _LOG.critical("Key '%s' has unsupported DataId override", key)
2074 failedDataId.add(key)
2075 elif key.dataId["instrument"] not in instruments:
2076 if logFailures:
2077 _LOG.critical("Key '%s' has unknown instrument", key)
2078 failedDataId.add(key)
2080 messages = []
2082 if datastoreErrorStr:
2083 messages.append(datastoreErrorStr)
2085 for failed, msg in (
2086 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2087 (failedDataId, "Keys with bad DataId entries: "),
2088 ):
2089 if failed:
2090 msg += ", ".join(str(k) for k in failed)
2091 messages.append(msg)
2093 if messages:
2094 raise ValidationError(";\n".join(messages))
2096 @property
2097 def collections(self) -> Sequence[str]:
2098 """The collections to search by default, in order
2099 (`~collections.abc.Sequence` [ `str` ]).
2101 This is an alias for ``self.registry.defaults.collections``. It cannot
2102 be set directly in isolation, but all defaults may be changed together
2103 by assigning a new `RegistryDefaults` instance to
2104 ``self.registry.defaults``.
2105 """
2106 return self._registry.defaults.collections
2108 @property
2109 def run(self) -> str | None:
2110 """Name of the run this butler writes outputs to by default (`str` or
2111 `None`).
2113 This is an alias for ``self.registry.defaults.run``. It cannot be set
2114 directly in isolation, but all defaults may be changed together by
2115 assigning a new `RegistryDefaults` instance to
2116 ``self.registry.defaults``.
2117 """
2118 return self._registry.defaults.run
2120 @property
2121 def registry(self) -> Registry:
2122 """The object that manages dataset metadata and relationships
2123 (`Registry`).
2125 Many operations that don't involve reading or writing butler datasets
2126 are accessible only via `Registry` methods. Eventually these methods
2127 will be replaced by equivalent `Butler` methods.
2128 """
2129 return self._registry_shim
2131 @property
2132 def dimensions(self) -> DimensionUniverse:
2133 # Docstring inherited.
2134 return self._registry.dimensions
2136 @contextlib.contextmanager
2137 def _query(self) -> Iterator[Query]:
2138 # Docstring inherited.
2139 raise NotImplementedError("TODO DM-41159")
2141 def _preload_cache(self) -> None:
2142 """Immediately load caches that are used for common operations."""
2143 self._registry.preload_cache()
2145 def prepend_collection_chain(
2146 self, parent_collection_name: str, child_collection_names: str | Iterable[str]
2147 ) -> None:
2148 return self._registry._managers.collections.prepend_collection_chain(
2149 parent_collection_name, list(ensure_iterable(child_collection_names))
2150 )
2152 _config: ButlerConfig
2153 """Configuration for this Butler instance."""
2155 _registry: SqlRegistry
2156 """The object that manages dataset metadata and relationships
2157 (`SqlRegistry`).
2159 Most operations that don't involve reading or writing butler datasets are
2160 accessible only via `SqlRegistry` methods.
2161 """
2163 datastore: Datastore
2164 """The object that manages actual dataset storage (`Datastore`).
2166 Direct user access to the datastore should rarely be necessary; the primary
2167 exception is the case where a `Datastore` implementation provides extra
2168 functionality beyond what the base class defines.
2169 """
2171 storageClasses: StorageClassFactory
2172 """An object that maps known storage class names to objects that fully
2173 describe them (`StorageClassFactory`).
2174 """
2176 _registry_shim: RegistryShim
2177 """Shim object to provide a legacy public interface for querying via the
2178 the ``registry`` property.
2179 """