Coverage for python/lsst/daf/butler/direct_butler.py: 10%
750 statements
coverage.py v7.4.4, created at 2024-04-04 02:55 -0700
1 # This file is part of daf_butler.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This software is dual licensed under the GNU General Public License and also
10 # under a 3-clause BSD license. Recipients may choose which of these licenses
11 # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12 # respectively. If you choose the GPL option then the following text applies
13 # (but note that there is still no warranty even if you opt for BSD instead):
14 #
15 # This program is free software: you can redistribute it and/or modify
16 # it under the terms of the GNU General Public License as published by
17 # the Free Software Foundation, either version 3 of the License, or
18 # (at your option) any later version.
19 #
20 # This program is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 # GNU General Public License for more details.
24 #
25 # You should have received a copy of the GNU General Public License
26 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Butler top level classes.
29"""
30from __future__ import annotations
32__all__ = (
33 "DirectButler",
34 "ButlerValidationError",
35)
37import collections.abc
38import contextlib
39import io
40import itertools
41import logging
42import numbers
43import os
44import warnings
45from collections import Counter, defaultdict
46from collections.abc import Iterable, Iterator, MutableMapping, Sequence
47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast
49from lsst.resources import ResourcePath, ResourcePathExpression
50from lsst.utils.introspection import get_class_of
51from lsst.utils.logging import VERBOSE, getLogger
52from sqlalchemy.exc import IntegrityError
54from ._butler import Butler
55from ._butler_config import ButlerConfig
56from ._butler_instance_options import ButlerInstanceOptions
57from ._dataset_existence import DatasetExistence
58from ._dataset_ref import DatasetRef
59from ._dataset_type import DatasetType
60from ._deferredDatasetHandle import DeferredDatasetHandle
61from ._exceptions import DatasetNotFoundError, DimensionValueError, ValidationError
62from ._limited_butler import LimitedButler
63from ._registry_shim import RegistryShim
64from ._storage_class import StorageClass, StorageClassFactory
65from ._timespan import Timespan
66from .datastore import Datastore, NullDatastore
67from .dimensions import DataCoordinate, Dimension
68from .progress import Progress
69from .queries import Query
70from .registry import (
71 CollectionType,
72 ConflictingDefinitionError,
73 DataIdError,
74 MissingDatasetTypeError,
75 RegistryDefaults,
76 _RegistryFactory,
77)
78from .registry.sql_registry import SqlRegistry
79from .transfers import RepoExportContext
80from .utils import transactional
82if TYPE_CHECKING:
83 from lsst.resources import ResourceHandleProtocol
85 from ._dataset_ref import DatasetId
86 from ._file_dataset import FileDataset
87 from .datastore import DatasetRefURIs
88 from .dimensions import DataId, DataIdValue, DimensionElement, DimensionRecord, DimensionUniverse
89 from .registry import Registry
90 from .transfers import RepoImportBackend
92_LOG = getLogger(__name__)
95class ButlerValidationError(ValidationError):
96 """There is a problem with the Butler configuration."""
98 pass
101class DirectButler(Butler): # numpydoc ignore=PR02
102 """Main entry point for the data access system.
104 Parameters
105 ----------
106 config : `ButlerConfig`
107 The configuration for this Butler instance.
108 registry : `SqlRegistry`
109 The object that manages dataset metadata and relationships.
110 datastore : `Datastore`
111 The object that manages actual dataset storage.
112 storageClasses : `StorageClassFactory`
113 An object that maps known storage class names to objects that fully
114 describe them.
116 Notes
117 -----
118 Most users should call the top-level `Butler`.``from_config`` instead of
119 using this constructor directly.
120 """
122 # This is __new__ instead of __init__ because we have to support
123 # instantiation via the legacy constructor Butler.__new__(), which
124 # reads the configuration and selects which subclass to instantiate. The
125 # interaction between __new__ and __init__ is kind of wacky in Python. If
126 # we were using __init__ here, __init__ would be called twice (once when
127 # the DirectButler instance is constructed inside Butler.from_config(), and
128 # a second time with the original arguments to Butler() when the instance
129 # is returned from Butler.__new__()).
130 def __new__(
131 cls,
132 *,
133 config: ButlerConfig,
134 registry: SqlRegistry,
135 datastore: Datastore,
136 storageClasses: StorageClassFactory,
137 ) -> DirectButler:
138 self = cast(DirectButler, super().__new__(cls))
139 self._config = config
140 self._registry = registry
141 self._datastore = datastore
142 self.storageClasses = storageClasses
144 # For an execution butler the datastore needs a special
145 # dependency-inversion trick. This is not used by a regular butler,
146 # but we do not have a way to distinguish a regular butler from an
147 # execution butler.
148 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
150 self._registry_shim = RegistryShim(self)
152 return self
154 @classmethod
155 def create_from_config(
156 cls,
157 config: ButlerConfig,
158 *,
159 options: ButlerInstanceOptions,
160 without_datastore: bool = False,
161 ) -> DirectButler:
162 """Construct a Butler instance from a configuration file.
164 Parameters
165 ----------
166 config : `ButlerConfig`
167 The configuration for this Butler instance.
168 options : `ButlerInstanceOptions`
169 Default values and other settings for the Butler instance.
170 without_datastore : `bool`, optional
171 If `True` do not attach a datastore to this butler. Any attempts
172 to use a datastore will fail.
174 Notes
175 -----
176 Most users should call the top-level `Butler`.``from_config``
177 instead of using this function directly.
178 """
179 if "run" in config or "collection" in config:
180 raise ValueError("Passing a run or collection via configuration is no longer supported.")
182 defaults = RegistryDefaults(
183 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs
184 )
185 try:
186 butlerRoot = config.get("root", config.configDir)
187 writeable = options.writeable
188 if writeable is None:
189 writeable = options.run is not None
190 registry = _RegistryFactory(config).from_config(
191 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
192 )
193 if without_datastore:
194 datastore: Datastore = NullDatastore(None, None)
195 else:
196 datastore = Datastore.fromConfig(
197 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
198 )
199 # TODO: Once datastore drops dependency on registry we can
200 # construct datastore first and pass opaque tables to registry
201 # constructor.
202 registry.make_datastore_tables(datastore.get_opaque_table_definitions())
203 storageClasses = StorageClassFactory()
204 storageClasses.addFromConfig(config)
206 return DirectButler(
207 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses
208 )
209 except Exception:
210 # Failures here usually mean that the configuration is incomplete;
211 # just issue an error message that includes the config file URI.
212 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.")
213 raise
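# Example (illustrative sketch): ``create_from_config`` is normally reached via
# ``Butler.from_config``, but it can be called directly with an explicit
# ``ButlerConfig``. The path and run name are hypothetical; the option fields
# mirror those used by ``_unpickle`` below.
#
#     >>> config = ButlerConfig("/path/to/repo/butler.yaml")
#     >>> butler = DirectButler.create_from_config(
#     ...     config,
#     ...     options=ButlerInstanceOptions(
#     ...         collections=None, run="u/someone/run", writeable=True, kwargs={}
#     ...     ),
#     ... )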
215 def _clone(
216 self,
217 *,
218 collections: Any = None,
219 run: str | None = None,
220 inferDefaults: bool = True,
221 **kwargs: Any,
222 ) -> DirectButler:
223 # Docstring inherited
224 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
225 registry = self._registry.copy(defaults)
227 return DirectButler(
228 registry=registry,
229 config=self._config,
230 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()),
231 storageClasses=self.storageClasses,
232 )
234 GENERATION: ClassVar[int] = 3
235 """This is a Generation 3 Butler.
237 This attribute may be removed in the future, once the Generation 2 Butler
238 interface has been fully retired; it should only be used in transitional
239 code.
240 """
242 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
243 """Return DatasetType defined in registry given dataset type name."""
244 try:
245 return self.get_dataset_type(name)
246 except MissingDatasetTypeError:
247 return None
249 @classmethod
250 def _unpickle(
251 cls,
252 config: ButlerConfig,
253 collections: tuple[str, ...] | None,
254 run: str | None,
255 defaultDataId: dict[str, str],
256 writeable: bool,
257 ) -> DirectButler:
258 """Callable used to unpickle a Butler.
260 We prefer not to use ``Butler.__init__`` directly so we can force some
261 of its many arguments to be keyword-only (note that ``__reduce__``
262 can only invoke callables with positional arguments).
264 Parameters
265 ----------
266 config : `ButlerConfig`
267 Butler configuration, already coerced into a true `ButlerConfig`
268 instance (and hence after any search paths for overrides have been
269 utilized).
270 collections : `tuple` [ `str` ]
271 Names of the default collections to read from.
272 run : `str`, optional
273 Name of the default `~CollectionType.RUN` collection to write to.
274 defaultDataId : `dict` [ `str`, `str` ]
275 Default data ID values.
276 writeable : `bool`
277 Whether the Butler should support write operations.
279 Returns
280 -------
281 butler : `Butler`
282 A new `Butler` instance.
283 """
284 return cls.create_from_config(
285 config=config,
286 options=ButlerInstanceOptions(
287 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId
288 ),
289 )
291 def __reduce__(self) -> tuple:
292 """Support pickling."""
293 return (
294 DirectButler._unpickle,
295 (
296 self._config,
297 self.collections,
298 self.run,
299 dict(self._registry.defaults.dataId.required),
300 self._registry.isWriteable(),
301 ),
302 )
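# Example (illustrative sketch): because ``__reduce__`` captures the config,
# default collections, run and writeability, a butler can be round-tripped
# through pickle (e.g. for multiprocessing workers); the unpickled copy
# reconnects to the same repository. ``butler`` is assumed to already exist.
#
#     >>> import pickle
#     >>> clone = pickle.loads(pickle.dumps(butler))
#     >>> clone.run == butler.run
#     True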
304 def __str__(self) -> str:
305 return (
306 f"Butler(collections={self.collections}, run={self.run}, "
307 f"datastore='{self._datastore}', registry='{self._registry}')"
308 )
310 def isWriteable(self) -> bool:
311 # Docstring inherited.
312 return self._registry.isWriteable()
314 def _caching_context(self) -> contextlib.AbstractContextManager[None]:
315 """Context manager that enables caching."""
316 return self._registry.caching_context()
318 @contextlib.contextmanager
319 def transaction(self) -> Iterator[None]:
320 """Context manager supporting `Butler` transactions.
322 Transactions can be nested.
323 """
324 with self._registry.transaction(), self._datastore.transaction():
325 yield
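# Example (illustrative sketch): grouping several writes in a single
# transaction so that either all of them are committed or none are. The
# dataset types, objects and data ID values are hypothetical, and the butler
# is assumed to have a default run.
#
#     >>> with butler.transaction():
#     ...     butler.put(catalog, "src", visit=903334, detector=20, instrument="HSC")
#     ...     butler.put(table, "src_schema", instrument="HSC")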
327 def _standardizeArgs(
328 self,
329 datasetRefOrType: DatasetRef | DatasetType | str,
330 dataId: DataId | None = None,
331 for_put: bool = True,
332 **kwargs: Any,
333 ) -> tuple[DatasetType, DataId | None]:
334 """Standardize the arguments passed to several Butler APIs.
336 Parameters
337 ----------
338 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
339 When `DatasetRef` the `dataId` should be `None`.
340 Otherwise the `DatasetType` or name thereof.
341 dataId : `dict` or `DataCoordinate`
342 A `dict` of `Dimension` link name, value pairs that label the
343 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
344 should be provided as the second argument.
345 for_put : `bool`, optional
346 If `True` this call is invoked as part of a `Butler.put()`.
347 Otherwise it is assumed to be part of a `Butler.get()`. This
348 parameter is only relevant if there is dataset type
349 inconsistency.
350 **kwargs
351 Additional keyword arguments used to augment or construct a
352 `DataCoordinate`. See `DataCoordinate.standardize`
353 parameters.
355 Returns
356 -------
357 datasetType : `DatasetType`
358 A `DatasetType` instance extracted from ``datasetRefOrType``.
359 dataId : `dict` or `DataId`, optional
360 Argument that can be used (along with ``kwargs``) to construct a
361 `DataId`.
363 Notes
364 -----
365 Butler APIs that conceptually need a DatasetRef also allow passing a
366 `DatasetType` (or the name of one) and a `DataId` (or a dict and
367 keyword arguments that can be used to construct one) separately. This
368 method accepts those arguments and always returns a true `DatasetType`
369 and a `DataId` or `dict`.
371 Standardization of `dict` vs `DataId` is best handled by passing the
372 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
373 generally similarly flexible.
374 """
375 externalDatasetType: DatasetType | None = None
376 internalDatasetType: DatasetType | None = None
377 if isinstance(datasetRefOrType, DatasetRef):
378 if dataId is not None or kwargs:
379 raise ValueError("DatasetRef given, cannot use dataId as well")
380 externalDatasetType = datasetRefOrType.datasetType
381 dataId = datasetRefOrType.dataId
382 else:
383 # Don't check whether DataId is provided, because Registry APIs
384 # can usually construct a better error message when it wasn't.
385 if isinstance(datasetRefOrType, DatasetType):
386 externalDatasetType = datasetRefOrType
387 else:
388 internalDatasetType = self.get_dataset_type(datasetRefOrType)
390 # Check that they are self-consistent
391 if externalDatasetType is not None:
392 internalDatasetType = self.get_dataset_type(externalDatasetType.name)
393 if externalDatasetType != internalDatasetType:
394 # We can allow differences if they are compatible, depending
395 # on whether this is a get or a put. A get requires that
396 # the python type associated with the datastore can be
397 # converted to the user type. A put requires that the user
398 # supplied python type can be converted to the internal
399 # type expected by registry.
400 relevantDatasetType = internalDatasetType
401 if for_put:
402 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
403 else:
404 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
405 relevantDatasetType = externalDatasetType
406 if not is_compatible:
407 raise ValueError(
408 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
409 f"registry definition ({internalDatasetType})"
410 )
411 # Override the internal definition.
412 internalDatasetType = relevantDatasetType
414 assert internalDatasetType is not None
415 return internalDatasetType, dataId
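# Example (illustrative sketch): the APIs that funnel through
# ``_standardizeArgs`` accept either a resolved ``DatasetRef`` on its own or a
# dataset type (or its name) plus a data ID, so the two ``get`` calls below
# address the same dataset. All names and values are hypothetical.
#
#     >>> calexp = butler.get(ref)  # resolved DatasetRef, no dataId allowed
#     >>> calexp = butler.get("calexp", visit=903334, detector=20, instrument="HSC")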
417 def _rewrite_data_id(
418 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
419 ) -> tuple[DataId | None, dict[str, Any]]:
420 """Rewrite a data ID taking into account dimension records.
422 Take a Data ID and keyword args and rewrite it if necessary to
423 allow the user to specify dimension records rather than dimension
424 primary values.
426 This allows a user to include a dataId dict with keys of
427 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
428 the integer exposure ID. It also allows a string to be given
429 for a dimension value rather than the integer ID if that is more
430 convenient. For example, rather than having to specify the
431 detector with ``detector.full_name``, a string given for ``detector``
432 will be interpreted as the full name and converted to the integer
433 value.
435 Keyword arguments can also use strings for dimensions like detector
436 and exposure, but Python does not allow ``.`` in keyword names, and
437 so the ``exposure.day_obs`` syntax cannot be used in a keyword
438 argument.
440 Parameters
441 ----------
442 dataId : `dict` or `DataCoordinate`
443 A `dict` of `Dimension` link name, value pairs that will label the
444 `DatasetRef` within a Collection.
445 datasetType : `DatasetType`
446 The dataset type associated with this dataId. Required to
447 determine the relevant dimensions.
448 **kwargs
449 Additional keyword arguments used to augment or construct a
450 `DataId`. See `DataId` parameters.
452 Returns
453 -------
454 dataId : `dict` or `DataCoordinate`
455 The possibly rewritten dataId. If given a `DataCoordinate` and
456 no keyword arguments, the original dataId will be returned
457 unchanged.
458 **kwargs : `dict`
459 Any unused keyword arguments (normally an empty dict).
460 """
461 # Do nothing if we have a standalone DataCoordinate.
462 if isinstance(dataId, DataCoordinate) and not kwargs:
463 return dataId, kwargs
465 # Process dimension records that are using record information
466 # rather than ids
467 newDataId: dict[str, DataIdValue] = {}
468 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
470 # If all of the dataId comes from keyword parameters we do not need
471 # to do anything here, because they cannot be of the form
472 # exposure.obs_id since a "." is not allowed in a keyword parameter.
473 if dataId:
474 for k, v in dataId.items():
475 # If we have a Dimension we do not need to do anything
476 # because it cannot be a compound key.
477 if isinstance(k, str) and "." in k:
478 # Someone is using a more human-readable dataId
479 dimensionName, record = k.split(".", 1)
480 byRecord[dimensionName][record] = v
481 elif isinstance(k, Dimension):
482 newDataId[k.name] = v
483 else:
484 newDataId[k] = v
486 # Go through the updated dataId and check the type in case someone is
487 # using an alternate key. We have already filtered out the compound
488 # keys in dimension.record format.
489 not_dimensions = {}
491 # Will need to look in the dataId and the keyword arguments
492 # and will remove them if they need to be fixed or are unrecognized.
493 for dataIdDict in (newDataId, kwargs):
494 # Use a list so we can adjust the dict safely in the loop
495 for dimensionName in list(dataIdDict):
496 value = dataIdDict[dimensionName]
497 try:
498 dimension = self.dimensions.dimensions[dimensionName]
499 except KeyError:
500 # This is not a real dimension
501 not_dimensions[dimensionName] = value
502 del dataIdDict[dimensionName]
503 continue
505 # Convert an integral type to an explicit int to simplify
506 # comparisons here
507 if isinstance(value, numbers.Integral):
508 value = int(value)
510 if not isinstance(value, dimension.primaryKey.getPythonType()):
511 for alternate in dimension.alternateKeys:
512 if isinstance(value, alternate.getPythonType()):
513 byRecord[dimensionName][alternate.name] = value
514 del dataIdDict[dimensionName]
515 _LOG.debug(
516 "Converting dimension %s to %s.%s=%s",
517 dimensionName,
518 dimensionName,
519 alternate.name,
520 value,
521 )
522 break
523 else:
524 _LOG.warning(
525 "Type mismatch found for value '%r' provided for dimension %s. "
526 "Could not find matching alternative (primary key has type %s) "
527 "so attempting to use as-is.",
528 value,
529 dimensionName,
530 dimension.primaryKey.getPythonType(),
531 )
533 # By this point kwargs and newDataId should only include valid
534 # dimensions. Merge kwargs in to the new dataId and log if there
535 # are dimensions in both (rather than calling update).
536 for k, v in kwargs.items():
537 if k in newDataId and newDataId[k] != v:
538 _LOG.debug(
539 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
540 )
541 newDataId[k] = v
542 # No need to retain any values in kwargs now.
543 kwargs = {}
545 # If we have some unrecognized dimensions we have to try to connect
546 # them to records in other dimensions. This is made more complicated
547 # by some dimensions having records with clashing names. A mitigation
548 # is that we can tell by this point which dimensions are missing
549 # for the DatasetType but this does not work for calibrations
550 # where additional dimensions can be used to constrain the temporal
551 # axis.
552 if not_dimensions:
553 # Search for all dimensions even if we have been given a value
554 # explicitly. In some cases records are given as well as the
555 # actual dimension and this should not be an error if they
556 # match.
557 mandatoryDimensions = datasetType.dimensions.names # - provided
559 candidateDimensions: set[str] = set()
560 candidateDimensions.update(mandatoryDimensions)
562 # For calibrations we may well need temporal dimensions,
563 # so rather than always including all dimensions in the scan,
564 # restrict things a little. It is still possible for there
565 # to be confusion over day_obs in visit vs exposure for example.
566 # If we are not searching calibration collections things may
567 # fail but they are going to fail anyway because of the
568 # ambiguity of the dataId...
569 if datasetType.isCalibration():
570 for dim in self.dimensions.dimensions:
571 if dim.temporal:
572 candidateDimensions.add(str(dim))
574 # Look up table for the first association with a dimension
575 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
577 # Keep track of whether an item is associated with multiple
578 # dimensions.
579 counter: Counter[str] = Counter()
580 assigned: dict[str, set[str]] = defaultdict(set)
582 # Go through the missing dimensions and associate the
583 # given names with records within those dimensions
584 matched_dims = set()
585 for dimensionName in candidateDimensions:
586 dimension = self.dimensions.dimensions[dimensionName]
587 fields = dimension.metadata.names | dimension.uniqueKeys.names
588 for field in not_dimensions:
589 if field in fields:
590 guessedAssociation[dimensionName][field] = not_dimensions[field]
591 counter[dimensionName] += 1
592 assigned[field].add(dimensionName)
593 matched_dims.add(field)
595 # Calculate the fields that matched nothing.
596 never_found = set(not_dimensions) - matched_dims
598 if never_found:
599 raise DimensionValueError(f"Unrecognized keyword args given: {never_found}")
601 # There is a chance we have allocated a single dataId item
602 # to multiple dimensions. Need to decide which should be retained.
603 # For now assume that the most popular alternative wins.
604 # This means that day_obs with seq_num will result in
605 # exposure.day_obs and not visit.day_obs
606 # Also prefer an explicitly missing dimension over an inferred
607 # temporal dimension.
608 for fieldName, assignedDimensions in assigned.items():
609 if len(assignedDimensions) > 1:
610 # Pick the most popular (preferring mandatory dimensions)
611 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
612 if requiredButMissing:
613 candidateDimensions = requiredButMissing
614 else:
615 candidateDimensions = assignedDimensions
617 # If this is a choice between visit and exposure and
618 # neither was a required part of the dataset type,
619 # (hence in this branch) always prefer exposure over
620 # visit since exposures are always defined and visits
621 # are defined from exposures.
622 if candidateDimensions == {"exposure", "visit"}:
623 candidateDimensions = {"exposure"}
625 # Select the relevant items and get a new restricted
626 # counter.
627 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
628 duplicatesCounter: Counter[str] = Counter()
629 duplicatesCounter.update(theseCounts)
631 # Choose the most common. If they are equally common
632 # we will pick the one that was found first.
633 # Returns a list of tuples
634 selected = duplicatesCounter.most_common(1)[0][0]
636 _LOG.debug(
637 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
638 " Removed ambiguity by choosing dimension %s.",
639 fieldName,
640 ", ".join(assignedDimensions),
641 selected,
642 )
644 for candidateDimension in assignedDimensions:
645 if candidateDimension != selected:
646 del guessedAssociation[candidateDimension][fieldName]
648 # Update the record look up dict with the new associations
649 for dimensionName, values in guessedAssociation.items():
650 if values: # A dict might now be empty
651 _LOG.debug(
652 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
653 )
654 byRecord[dimensionName].update(values)
656 if byRecord:
657 # Some record specifiers were found so we need to convert
658 # them to the Id form
659 for dimensionName, values in byRecord.items():
660 if dimensionName in newDataId:
661 _LOG.debug(
662 "DataId specified explicit %s dimension value of %s in addition to"
663 " general record specifiers for it of %s. Ignoring record information.",
664 dimensionName,
665 newDataId[dimensionName],
666 str(values),
667 )
668 # Get the actual record and compare with these values.
669 try:
670 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
671 except DataIdError:
672 raise DimensionValueError(
673 f"Could not find dimension '{dimensionName}'"
674 f" with dataId {newDataId} as part of comparing with"
675 f" record values {byRecord[dimensionName]}"
676 ) from None
677 if len(recs) == 1:
678 errmsg: list[str] = []
679 for k, v in values.items():
680 if (recval := getattr(recs[0], k)) != v:
681 errmsg.append(f"{k}({recval} != {v})")
682 if errmsg:
683 raise DimensionValueError(
684 f"Dimension {dimensionName} in dataId has explicit value"
685 " inconsistent with records: " + ", ".join(errmsg)
686 )
687 else:
688 # Multiple matches for an explicit dimension
689 # should never happen but let downstream complain.
690 pass
691 continue
693 # Build up a WHERE expression
694 bind = dict(values.items())
695 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
697 # Hopefully we get a single record that matches
698 records = set(
699 self._registry.queryDimensionRecords(
700 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
701 )
702 )
704 if len(records) != 1:
705 if len(records) > 1:
706 # visit can have an ambiguous answer without involving
707 # visit_system. The default visit_system is defined
708 # by the instrument.
709 if (
710 dimensionName == "visit"
711 and "visit_system_membership" in self.dimensions
712 and "visit_system" in self.dimensions["instrument"].metadata
713 ):
714 instrument_records = list(
715 self._registry.queryDimensionRecords(
716 "instrument",
717 dataId=newDataId,
718 **kwargs,
719 )
720 )
721 if len(instrument_records) == 1:
722 visit_system = instrument_records[0].visit_system
723 if visit_system is None:
724 # Set to a value that will never match.
725 visit_system = -1
727 # Look up each visit in the
728 # visit_system_membership records.
729 for rec in records:
730 membership = list(
731 self._registry.queryDimensionRecords(
732 # Use bind to allow zero results.
733 # This is a fully-specified query.
734 "visit_system_membership",
735 where="instrument = inst AND visit_system = system AND visit = v",
736 bind=dict(
737 inst=instrument_records[0].name, system=visit_system, v=rec.id
738 ),
739 )
740 )
741 if membership:
742 # This record is the right answer.
743 records = {rec}
744 break
746 # The ambiguity may have been resolved so check again.
747 if len(records) > 1:
748 _LOG.debug(
749 "Received %d records from constraints of %s", len(records), str(values)
750 )
751 for r in records:
752 _LOG.debug("- %s", str(r))
753 raise DimensionValueError(
754 f"DataId specification for dimension {dimensionName} is not"
755 f" uniquely constrained to a single dataset by {values}."
756 f" Got {len(records)} results."
757 )
758 else:
759 raise DimensionValueError(
760 f"DataId specification for dimension {dimensionName} matched no"
761 f" records when constrained by {values}"
762 )
764 # Get the primary key from the real dimension object
765 dimension = self.dimensions.dimensions[dimensionName]
766 if not isinstance(dimension, Dimension):
767 raise RuntimeError(
768 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
769 )
770 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
772 return newDataId, kwargs
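# Example (illustrative sketch): because of ``_rewrite_data_id`` a caller can
# use dimension-record values instead of primary keys, e.g.
# ``exposure.day_obs``/``exposure.seq_num`` instead of the exposure ID, or a
# detector name instead of its integer ID. All values are hypothetical.
#
#     >>> raw = butler.get(
#     ...     "raw",
#     ...     {"exposure.day_obs": 20240101, "exposure.seq_num": 42},
#     ...     instrument="LSSTCam",
#     ...     detector="R22_S11",
#     ... )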
774 def _findDatasetRef(
775 self,
776 datasetRefOrType: DatasetRef | DatasetType | str,
777 dataId: DataId | None = None,
778 *,
779 collections: Any = None,
780 predict: bool = False,
781 run: str | None = None,
782 datastore_records: bool = False,
783 timespan: Timespan | None = None,
784 **kwargs: Any,
785 ) -> DatasetRef:
786 """Shared logic for methods that start with a search for a dataset in
787 the registry.
789 Parameters
790 ----------
791 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
792 When `DatasetRef` the `dataId` should be `None`.
793 Otherwise the `DatasetType` or name thereof.
794 dataId : `dict` or `DataCoordinate`, optional
795 A `dict` of `Dimension` link name, value pairs that label the
796 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
797 should be provided as the first argument.
798 collections : Any, optional
799 Collections to be searched, overriding ``self.collections``.
800 Can be any of the types supported by the ``collections`` argument
801 to butler construction.
802 predict : `bool`, optional
803 If `True`, return a newly created `DatasetRef` with a unique
804 dataset ID if finding a reference in the `Registry` fails.
805 Defaults to `False`.
806 run : `str`, optional
807 Run collection name to use for creating `DatasetRef` for predicted
808 datasets. Only used if ``predict`` is `True`.
809 datastore_records : `bool`, optional
810 If `True` add datastore records to returned `DatasetRef`.
811 timespan : `Timespan` or `None`, optional
812 A timespan that the validity range of the dataset must overlap.
813 If not provided and this is a calibration dataset type, an attempt
814 will be made to find the timespan from any temporal coordinate
815 in the data ID.
816 **kwargs
817 Additional keyword arguments used to augment or construct a
818 `DataId`. See `DataId` parameters.
820 Returns
821 -------
822 ref : `DatasetRef`
823 A reference to the dataset identified by the given arguments.
824 This can be the same dataset reference as given if it was
825 resolved.
827 Raises
828 ------
829 LookupError
830 Raised if no matching dataset exists in the `Registry` (and
831 ``predict`` is `False`).
832 ValueError
833 Raised if a resolved `DatasetRef` was passed as an input, but it
834 differs from the one found in the registry.
835 TypeError
836 Raised if no collections were provided.
837 """
838 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
839 if isinstance(datasetRefOrType, DatasetRef):
840 if collections is not None:
841 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
842 # May need to retrieve datastore records if requested.
843 if datastore_records and datasetRefOrType._datastore_records is None:
844 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
845 return datasetRefOrType
847 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
849 if datasetType.isCalibration():
850 # Because this is a calibration dataset, first try to
851 # standardize the data ID without restricting the dimensions to
852 # those of the dataset type requested, because there may be extra
853 # dimensions that provide temporal information for a validity-range
854 # lookup.
855 dataId = DataCoordinate.standardize(
856 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
857 )
858 if timespan is None:
859 if dataId.dimensions.temporal:
860 dataId = self._registry.expandDataId(dataId)
861 # Use the timespan from the data ID to constrain the
862 # calibration lookup, but only if the caller has not
863 # specified an explicit timespan.
864 timespan = dataId.timespan
865 else:
866 # Try an arbitrary timespan. Downstream will fail if this
867 # results in more than one matching dataset.
868 timespan = Timespan(None, None)
869 else:
870 # Standardize the data ID to just the dimensions of the dataset
871 # type instead of letting registry.findDataset do it, so we get the
872 # result even if no dataset is found.
873 dataId = DataCoordinate.standardize(
874 dataId,
875 dimensions=datasetType.dimensions,
876 defaults=self._registry.defaults.dataId,
877 **kwargs,
878 )
879 # Always look up the DatasetRef, even if one is given, to ensure it is
880 # present in the current collection.
881 ref = self.find_dataset(
882 datasetType,
883 dataId,
884 collections=collections,
885 timespan=timespan,
886 datastore_records=datastore_records,
887 )
888 if ref is None:
889 if predict:
890 if run is None:
891 run = self.run
892 if run is None:
893 raise TypeError("Cannot predict dataset ID/location with run=None.")
894 return DatasetRef(datasetType, dataId, run=run)
895 else:
896 if collections is None:
897 collections = self._registry.defaults.collections
898 raise DatasetNotFoundError(
899 f"Dataset {datasetType.name} with data ID {dataId} "
900 f"could not be found in collections {collections}."
901 )
902 if datasetType != ref.datasetType:
903 # If they differ it is because the user explicitly specified
904 # a compatible dataset type to this call rather than using the
905 # registry definition. The DatasetRef must therefore be recreated
906 # using the user definition such that the expected type is
907 # returned.
908 ref = DatasetRef(
909 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
910 )
912 return ref
914 @transactional
915 def put(
916 self,
917 obj: Any,
918 datasetRefOrType: DatasetRef | DatasetType | str,
919 /,
920 dataId: DataId | None = None,
921 *,
922 run: str | None = None,
923 **kwargs: Any,
924 ) -> DatasetRef:
925 """Store and register a dataset.
927 Parameters
928 ----------
929 obj : `object`
930 The dataset.
931 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
932 When `DatasetRef` is provided, ``dataId`` should be `None`.
933 Otherwise the `DatasetType` or name thereof. If a fully resolved
934 `DatasetRef` is given the run and ID are used directly.
935 dataId : `dict` or `DataCoordinate`
936 A `dict` of `Dimension` link name, value pairs that label the
937 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
938 should be provided as the second argument.
939 run : `str`, optional
940 The name of the run the dataset should be added to, overriding
941 ``self.run``. Not used if a resolved `DatasetRef` is provided.
942 **kwargs
943 Additional keyword arguments used to augment or construct a
944 `DataCoordinate`. See `DataCoordinate.standardize`
945 parameters. Not used if a resolved `DatasetRef` is provided.
947 Returns
948 -------
949 ref : `DatasetRef`
950 A reference to the stored dataset, updated with the correct id if
951 given.
953 Raises
954 ------
955 TypeError
956 Raised if the butler is read-only or if no run has been provided.
957 """
958 if isinstance(datasetRefOrType, DatasetRef):
959 # This is a direct put of predefined DatasetRef.
960 _LOG.debug("Butler put direct: %s", datasetRefOrType)
961 if run is not None:
962 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
963 # If registry already has a dataset with the same dataset ID,
964 # dataset type and DataId, then _importDatasets will do nothing and
965 # just return the original ref. We still need to raise in that case;
966 # the datastore check below handles it.
967 self._registry._importDatasets([datasetRefOrType], expand=True)
968 # Before trying to write to the datastore check that it does not
969 # know this dataset. This is prone to races, of course.
970 if self._datastore.knows(datasetRefOrType):
971 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
972 # Try to write the dataset to the datastore; if it fails due to a
973 # race with another write, the content of the stored data may be
974 # unpredictable.
975 try:
976 self._datastore.put(obj, datasetRefOrType)
977 except IntegrityError as e:
978 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
979 return datasetRefOrType
981 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
982 if not self.isWriteable():
983 raise TypeError("Butler is read-only.")
984 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
986 # Handle dimension records in dataId
987 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
989 # Add Registry Dataset entry.
990 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs)
991 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
992 self._datastore.put(obj, ref)
994 return ref
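# Example (illustrative sketch): storing a dataset either by dataset type name
# with a data ID and run, or with an already-resolved ``DatasetRef`` whose run
# and ID are used directly. Names and values are hypothetical.
#
#     >>> ref = butler.put(exposure, "calexp", visit=903334, detector=20,
#     ...                  instrument="HSC", run="u/someone/run")
#     >>> butler.put(exposure, resolved_ref)  # run/ID taken from the ref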
996 def getDeferred(
997 self,
998 datasetRefOrType: DatasetRef | DatasetType | str,
999 /,
1000 dataId: DataId | None = None,
1001 *,
1002 parameters: dict | None = None,
1003 collections: Any = None,
1004 storageClass: str | StorageClass | None = None,
1005 timespan: Timespan | None = None,
1006 **kwargs: Any,
1007 ) -> DeferredDatasetHandle:
1008 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1009 after an immediate registry lookup.
1011 Parameters
1012 ----------
1013 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1014 When `DatasetRef` the `dataId` should be `None`.
1015 Otherwise the `DatasetType` or name thereof.
1016 dataId : `dict` or `DataCoordinate`, optional
1017 A `dict` of `Dimension` link name, value pairs that label the
1018 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1019 should be provided as the first argument.
1020 parameters : `dict`
1021 Additional StorageClass-defined options to control reading,
1022 typically used to efficiently read only a subset of the dataset.
1023 collections : Any, optional
1024 Collections to be searched, overriding ``self.collections``.
1025 Can be any of the types supported by the ``collections`` argument
1026 to butler construction.
1027 storageClass : `StorageClass` or `str`, optional
1028 The storage class to be used to override the Python type
1029 returned by this method. By default the returned type matches
1030 the dataset type definition for this dataset. Specifying a
1031 read `StorageClass` can force a different type to be returned.
1032 This type must be compatible with the original type.
1033 timespan : `Timespan` or `None`, optional
1034 A timespan that the validity range of the dataset must overlap.
1035 If not provided and this is a calibration dataset type, an attempt
1036 will be made to find the timespan from any temporal coordinate
1037 in the data ID.
1038 **kwargs
1039 Additional keyword arguments used to augment or construct a
1040 `DataId`. See `DataId` parameters.
1042 Returns
1043 -------
1044 obj : `DeferredDatasetHandle`
1045 A handle which can be used to retrieve a dataset at a later time.
1047 Raises
1048 ------
1049 LookupError
1050 Raised if no matching dataset exists in the `Registry` or
1051 datastore.
1052 ValueError
1053 Raised if a resolved `DatasetRef` was passed as an input, but it
1054 differs from the one found in the registry.
1055 TypeError
1056 Raised if no collections were provided.
1057 """
1058 if isinstance(datasetRefOrType, DatasetRef):
1059 # Do the quick check first and if that fails, check for artifact
1060 # existence. This is necessary for datastores that are configured
1061 # in trust mode where there won't be a record but there will be
1062 # a file.
1063 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
1064 ref = datasetRefOrType
1065 else:
1066 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1067 else:
1068 ref = self._findDatasetRef(
1069 datasetRefOrType, dataId, collections=collections, timespan=timespan, **kwargs
1070 )
1071 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
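# Example (illustrative sketch): a deferred handle does the registry lookup
# immediately but postpones the datastore read until ``get`` is called on the
# handle, optionally with read parameters. Names and the ``bbox`` object are
# hypothetical.
#
#     >>> handle = butler.getDeferred("calexp", visit=903334, detector=20,
#     ...                             instrument="HSC")
#     >>> cutout = handle.get(parameters={"bbox": bbox})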
1073 def get(
1074 self,
1075 datasetRefOrType: DatasetRef | DatasetType | str,
1076 /,
1077 dataId: DataId | None = None,
1078 *,
1079 parameters: dict[str, Any] | None = None,
1080 collections: Any = None,
1081 storageClass: StorageClass | str | None = None,
1082 timespan: Timespan | None = None,
1083 **kwargs: Any,
1084 ) -> Any:
1085 """Retrieve a stored dataset.
1087 Parameters
1088 ----------
1089 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1090 When `DatasetRef` the `dataId` should be `None`.
1091 Otherwise the `DatasetType` or name thereof.
1092 If a resolved `DatasetRef`, the associated dataset
1093 is returned directly without additional querying.
1094 dataId : `dict` or `DataCoordinate`
1095 A `dict` of `Dimension` link name, value pairs that label the
1096 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1097 should be provided as the first argument.
1098 parameters : `dict`
1099 Additional StorageClass-defined options to control reading,
1100 typically used to efficiently read only a subset of the dataset.
1101 collections : Any, optional
1102 Collections to be searched, overriding ``self.collections``.
1103 Can be any of the types supported by the ``collections`` argument
1104 to butler construction.
1105 storageClass : `StorageClass` or `str`, optional
1106 The storage class to be used to override the Python type
1107 returned by this method. By default the returned type matches
1108 the dataset type definition for this dataset. Specifying a
1109 read `StorageClass` can force a different type to be returned.
1110 This type must be compatible with the original type.
1111 timespan : `Timespan` or `None`, optional
1112 A timespan that the validity range of the dataset must overlap.
1113 If not provided and this is a calibration dataset type, an attempt
1114 will be made to find the timespan from any temporal coordinate
1115 in the data ID.
1116 **kwargs
1117 Additional keyword arguments used to augment or construct a
1118 `DataCoordinate`. See `DataCoordinate.standardize`
1119 parameters.
1121 Returns
1122 -------
1123 obj : `object`
1124 The dataset.
1126 Raises
1127 ------
1128 LookupError
1129 Raised if no matching dataset exists in the `Registry`.
1130 TypeError
1131 Raised if no collections were provided.
1133 Notes
1134 -----
1135 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1136 this method requires that the given data ID include temporal dimensions
1137 beyond the dimensions of the dataset type itself, in order to find the
1138 dataset with the appropriate validity range. For example, a "bias"
1139 dataset with native dimensions ``{instrument, detector}`` could be
1140 fetched with a ``{instrument, detector, exposure}`` data ID, because
1141 ``exposure`` is a temporal dimension.
1142 """
1143 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1144 ref = self._findDatasetRef(
1145 datasetRefOrType,
1146 dataId,
1147 collections=collections,
1148 datastore_records=True,
1149 timespan=timespan,
1150 **kwargs,
1151 )
1152 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
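# Example (illustrative sketch): as the Notes above describe, fetching a
# calibration such as "bias" needs a temporal dimension (here ``exposure``) in
# the data ID so the validity range can be resolved. Values and the collection
# name are hypothetical.
#
#     >>> bias = butler.get("bias", instrument="HSC", detector=20,
#     ...                   exposure=903334, collections="HSC/calib")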
1154 def getURIs(
1155 self,
1156 datasetRefOrType: DatasetRef | DatasetType | str,
1157 /,
1158 dataId: DataId | None = None,
1159 *,
1160 predict: bool = False,
1161 collections: Any = None,
1162 run: str | None = None,
1163 **kwargs: Any,
1164 ) -> DatasetRefURIs:
1165 """Return the URIs associated with the dataset.
1167 Parameters
1168 ----------
1169 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1170 When `DatasetRef` the `dataId` should be `None`.
1171 Otherwise the `DatasetType` or name thereof.
1172 dataId : `dict` or `DataCoordinate`
1173 A `dict` of `Dimension` link name, value pairs that label the
1174 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1175 should be provided as the first argument.
1176 predict : `bool`
1177 If `True`, allow URIs to be returned of datasets that have not
1178 been written.
1179 collections : Any, optional
1180 Collections to be searched, overriding ``self.collections``.
1181 Can be any of the types supported by the ``collections`` argument
1182 to butler construction.
1183 run : `str`, optional
1184 Run to use for predictions, overriding ``self.run``.
1185 **kwargs
1186 Additional keyword arguments used to augment or construct a
1187 `DataCoordinate`. See `DataCoordinate.standardize`
1188 parameters.
1190 Returns
1191 -------
1192 uris : `DatasetRefURIs`
1193 The URI to the primary artifact associated with this dataset (if
1194 the dataset was disassembled within the datastore this may be
1195 `None`), and the URIs to any components associated with the dataset
1196 artifact (can be empty if there are no components).
1197 """
1198 ref = self._findDatasetRef(
1199 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1200 )
1201 return self._datastore.getURIs(ref, predict)
1203 def get_dataset_type(self, name: str) -> DatasetType:
1204 return self._registry.getDatasetType(name)
1206 def get_dataset(
1207 self,
1208 id: DatasetId,
1209 *,
1210 storage_class: str | StorageClass | None = None,
1211 dimension_records: bool = False,
1212 datastore_records: bool = False,
1213 ) -> DatasetRef | None:
1214 ref = self._registry.getDataset(id)
1215 if ref is not None:
1216 if dimension_records:
1217 ref = ref.expanded(
1218 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)
1219 )
1220 if storage_class:
1221 ref = ref.overrideStorageClass(storage_class)
1222 if datastore_records:
1223 ref = self._registry.get_datastore_records(ref)
1224 return ref
1226 def find_dataset(
1227 self,
1228 dataset_type: DatasetType | str,
1229 data_id: DataId | None = None,
1230 *,
1231 collections: str | Sequence[str] | None = None,
1232 timespan: Timespan | None = None,
1233 storage_class: str | StorageClass | None = None,
1234 dimension_records: bool = False,
1235 datastore_records: bool = False,
1236 **kwargs: Any,
1237 ) -> DatasetRef | None:
1238 # Handle any parts of the dataID that are not using primary dimension
1239 # keys.
1240 if isinstance(dataset_type, str):
1241 actual_type = self.get_dataset_type(dataset_type)
1242 else:
1243 actual_type = dataset_type
1245 # Store the component for later.
1246 component_name = actual_type.component()
1247 if actual_type.isComponent():
1248 parent_type = actual_type.makeCompositeDatasetType()
1249 else:
1250 parent_type = actual_type
1252 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)
1254 ref = self._registry.findDataset(
1255 parent_type,
1256 data_id,
1257 collections=collections,
1258 timespan=timespan,
1259 datastore_records=datastore_records,
1260 **kwargs,
1261 )
1262 if ref is not None and dimension_records:
1263 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions))
1264 if ref is not None and component_name:
1265 ref = ref.makeComponentRef(component_name)
1266 if ref is not None and storage_class is not None:
1267 ref = ref.overrideStorageClass(storage_class)
1269 return ref
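# Example (illustrative sketch): looking up references without reading any
# data, including a component lookup that is routed through the parent
# composite as implemented above. Names and values are hypothetical.
#
#     >>> ref = butler.find_dataset("calexp", instrument="HSC", visit=903334,
#     ...                           detector=20, collections="HSC/runs/RC2")
#     >>> wcs_ref = butler.find_dataset("calexp.wcs", instrument="HSC",
#     ...                               visit=903334, detector=20)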
1271 def retrieveArtifacts(
1272 self,
1273 refs: Iterable[DatasetRef],
1274 destination: ResourcePathExpression,
1275 transfer: str = "auto",
1276 preserve_path: bool = True,
1277 overwrite: bool = False,
1278 ) -> list[ResourcePath]:
1279 # Docstring inherited.
1280 return self._datastore.retrieveArtifacts(
1281 refs,
1282 ResourcePath(destination),
1283 transfer=transfer,
1284 preserve_path=preserve_path,
1285 overwrite=overwrite,
1286 )
1288 def exists(
1289 self,
1290 dataset_ref_or_type: DatasetRef | DatasetType | str,
1291 /,
1292 data_id: DataId | None = None,
1293 *,
1294 full_check: bool = True,
1295 collections: Any = None,
1296 **kwargs: Any,
1297 ) -> DatasetExistence:
1298 # Docstring inherited.
1299 existence = DatasetExistence.UNRECOGNIZED
1301 if isinstance(dataset_ref_or_type, DatasetRef):
1302 if collections is not None:
1303 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1304 if data_id is not None:
1305 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1306 ref = dataset_ref_or_type
1307 registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
1308 if registry_ref is not None:
1309 existence |= DatasetExistence.RECORDED
1311 if dataset_ref_or_type != registry_ref:
1312 # This could mean that storage classes differ, so we should
1313 # check for that but use the registry ref for the rest of
1314 # the method.
1315 if registry_ref.is_compatible_with(dataset_ref_or_type):
1316 # Use the registry version from now on.
1317 ref = registry_ref
1318 else:
1319 raise ValueError(
1320 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1321 f"in registry but has different incompatible values ({registry_ref})."
1322 )
1323 else:
1324 try:
1325 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1326 except (LookupError, TypeError):
1327 return existence
1328 existence |= DatasetExistence.RECORDED
1330 if self._datastore.knows(ref):
1331 existence |= DatasetExistence.DATASTORE
1333 if full_check:
1334 if self._datastore.exists(ref):
1335 existence |= DatasetExistence._ARTIFACT
1336 elif existence.value != DatasetExistence.UNRECOGNIZED.value:
1337 # Do not add this flag if we have no other idea about a dataset.
1338 existence |= DatasetExistence(DatasetExistence._ASSUMED)
1340 return existence
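# Example (illustrative sketch): ``exists`` returns a ``DatasetExistence`` flag
# whose individual bits can be inspected. Names and values are hypothetical.
#
#     >>> existence = butler.exists("calexp", visit=903334, detector=20,
#     ...                           instrument="HSC", full_check=True)
#     >>> if existence & DatasetExistence.DATASTORE:
#     ...     print("the datastore knows about this dataset")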
1342 def _exists_many(
1343 self,
1344 refs: Iterable[DatasetRef],
1345 /,
1346 *,
1347 full_check: bool = True,
1348 ) -> dict[DatasetRef, DatasetExistence]:
1349 # Docstring inherited.
1350 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1352 # Registry does not have a bulk API to check for a ref.
1353 for ref in refs:
1354 registry_ref = self._registry.getDataset(ref.id)
1355 if registry_ref is not None:
1356 # It is possible, albeit unlikely, that the given ref does
1357 # not match the one in registry even though the UUID matches.
1358 # When checking a single ref we raise, but it's impolite to
1359 # do that when potentially hundreds of refs are being checked.
1360 # We could change the API to only accept UUIDs and that would
1361 # remove the ability to even check and remove the worry
1362 # about differing storage classes. Given the ongoing discussion
1363 # on refs vs UUIDs and whether to raise or have a new
1364 # private flag, treat this as a private API for now.
1365 existence[ref] |= DatasetExistence.RECORDED
1367 # Ask datastore if it knows about these refs.
1368 knows = self._datastore.knows_these(refs)
1369 for ref, known in knows.items():
1370 if known:
1371 existence[ref] |= DatasetExistence.DATASTORE
1373 if full_check:
1374 mexists = self._datastore.mexists(refs)
1375 for ref, exists in mexists.items():
1376 if exists:
1377 existence[ref] |= DatasetExistence._ARTIFACT
1378 else:
1379 # Do not set this flag if nothing is known about the dataset.
1380 for ref in existence:
1381 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1382 existence[ref] |= DatasetExistence._ASSUMED
1384 return existence
1386 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1387 # Docstring inherited.
1388 if not self.isWriteable():
1389 raise TypeError("Butler is read-only.")
1390 names = list(names)
1391 refs: list[DatasetRef] = []
1392 for name in names:
1393 collectionType = self._registry.getCollectionType(name)
1394 if collectionType is not CollectionType.RUN:
1395 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1396 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
1397 with self._datastore.transaction(), self._registry.transaction():
1398 if unstore:
1399 self._datastore.trash(refs)
1400 else:
1401 self._datastore.forget(refs)
1402 for name in names:
1403 self._registry.removeCollection(name)
1404 if unstore:
1405 # Point of no return for removing artifacts
1406 self._datastore.emptyTrash()
1408 def pruneDatasets(
1409 self,
1410 refs: Iterable[DatasetRef],
1411 *,
1412 disassociate: bool = True,
1413 unstore: bool = False,
1414 tags: Iterable[str] = (),
1415 purge: bool = False,
1416 ) -> None:
1417 # docstring inherited from LimitedButler
1419 if not self.isWriteable():
1420 raise TypeError("Butler is read-only.")
1421 if purge:
1422 if not disassociate:
1423 raise TypeError("Cannot pass purge=True without disassociate=True.")
1424 if not unstore:
1425 raise TypeError("Cannot pass purge=True without unstore=True.")
1426 elif disassociate:
1427 tags = tuple(tags)
1428 if not tags:
1429 raise TypeError("No tags provided but disassociate=True.")
1430 for tag in tags:
1431 collectionType = self._registry.getCollectionType(tag)
1432 if collectionType is not CollectionType.TAGGED:
1433 raise TypeError(
1434 f"Cannot disassociate from collection '{tag}' "
1435 f"of non-TAGGED type {collectionType.name}."
1436 )
1437 # Transform possibly-single-pass iterable into something we can iterate
1438 # over multiple times.
1439 refs = list(refs)
1440 # Pruning a component of a DatasetRef makes no sense since registry
1441 # doesn't know about components and datastore might not store
1442 # components in a separate file.
1443 for ref in refs:
1444 if ref.datasetType.component():
1445 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")
1446 # We don't need an unreliable Datastore transaction for this, because
1447 # we've been extra careful to ensure that Datastore.trash only involves
1448 # mutating the Registry (it can _look_ at Datastore-specific things,
1449 # but shouldn't change them), and hence all operations here are
1450 # Registry operations.
1451 with self._datastore.transaction(), self._registry.transaction():
1452 if unstore:
1453 self._datastore.trash(refs)
1454 if purge:
1455 self._registry.removeDatasets(refs)
1456 elif disassociate:
1457 assert tags, "Guaranteed by earlier logic in this function."
1458 for tag in tags:
1459 self._registry.disassociate(tag, refs)
1460 # We've exited the Registry transaction, and apparently committed.
1461 # (if there was an exception, everything rolled back, and it's as if
1462 # nothing happened - and we never get here).
1463 # Datastore artifacts are not yet gone, but they're clearly marked
1464 # as trash, so if we fail to delete now because of (e.g.) filesystem
1465 # problems we can try again later, and if manual administrative
1466 # intervention is required, it's pretty clear what that should entail:
1467 # deleting everything on disk and in private Datastore tables that is
1468 # in the dataset_location_trash table.
1469 if unstore:
1470 # Point of no return for removing artifacts
1471 self._datastore.emptyTrash()
1473 @transactional
1474 def ingest(
1475 self,
1476 *datasets: FileDataset,
1477 transfer: str | None = "auto",
1478 record_validation_info: bool = True,
1479 ) -> None:
1480 # Docstring inherited.
1481 if not self.isWriteable():
1482 raise TypeError("Butler is read-only.")
1484 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1485 if not datasets:
1486 return
1488 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1490 # We need to reorganize all the inputs so that they are grouped
1491 # by dataset type and run. Multiple refs in a single FileDataset
1492 # are required to share the run and dataset type.
1493 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list)
1495 # Track DataIDs that are being ingested so we can spot issues early
1496 # with duplication. Retain previous FileDataset so we can report it.
1497 groupedDataIds: MutableMapping[tuple[DatasetType, str], dict[DataCoordinate, FileDataset]] = (
1498 defaultdict(dict)
1499 )
1501 # And the nested loop that populates it:
1502 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1503 # Somewhere to store pre-existing refs if we have an
1504 # execution butler.
1505 existingRefs: list[DatasetRef] = []
1507 for ref in dataset.refs:
1508 group_key = (ref.datasetType, ref.run)
1510 if ref.dataId in groupedDataIds[group_key]:
1511 raise ConflictingDefinitionError(
1512 f"Ingest conflict. Dataset {dataset.path} has same"
1513 " DataId as other ingest dataset"
1514 f" {groupedDataIds[group_key][ref.dataId].path} "
1515 f" ({ref.dataId})"
1516 )
1518 groupedDataIds[group_key][ref.dataId] = dataset
1520 if existingRefs:
1521 if len(dataset.refs) != len(existingRefs):
1522 # Keeping track of partially pre-existing datasets is hard
1523 # and should generally never happen. For now don't allow
1524 # it.
1525 raise ConflictingDefinitionError(
1526 f"For dataset {dataset.path} some dataIds already exist"
1527 " in registry but others do not. This is not supported."
1528 )
1530 # Store expanded form in the original FileDataset.
1531 dataset.refs = existingRefs
1532 else:
1533 groupedData[group_key].append(dataset)
1535 # Now we can bulk-insert into Registry for each DatasetType.
1536 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1537 groupedData.items(), desc="Bulk-inserting datasets by type"
1538 ):
1539 refs_to_import = []
1540 for dataset in grouped_datasets:
1541 refs_to_import.extend(dataset.refs)
1543 n_refs = len(refs_to_import)
1544 _LOG.verbose(
1545 "Importing %d ref%s of dataset type %r into run %r",
1546 n_refs,
1547 "" if n_refs == 1 else "s",
1548 datasetType.name,
1549 this_run,
1550 )
1552 # Import the refs and expand the DataCoordinates since we can't
1553 # guarantee that they are expanded and Datastore will need
1554 # the records.
1555 imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
1556 assert set(imported_refs) == set(refs_to_import)
1558 # Replace all the refs in the FileDataset with expanded versions.
1559 # Pull them off in the order we put them on the list.
1560 for dataset in grouped_datasets:
1561 n_dataset_refs = len(dataset.refs)
1562 dataset.refs = imported_refs[:n_dataset_refs]
1563 del imported_refs[:n_dataset_refs]
1565 # Bulk-insert everything into Datastore.
1566 # We do not know if any of the registry entries already existed
1567 # (_importDatasets only complains if they exist but differ) so
1568 # we have to catch IntegrityError explicitly.
1569 try:
1570 self._datastore.ingest(
1571 *datasets, transfer=transfer, record_validation_info=record_validation_info
1572 )
1573 except IntegrityError as e:
1574 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
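# Illustrative usage sketch for ingest() (not part of the original source). The
# repository path, run name, dataset type, data ID values, and file path are all
# hypothetical; the dataset type is assumed to be registered already.
#
#     >>> from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, FileDataset
#     >>> butler = Butler.from_config("/path/to/repo", writeable=True, run="u/demo/ingest")
#     >>> data_id = DataCoordinate.standardize(
#     ...     {"instrument": "DemoCam", "exposure": 1, "detector": 0},
#     ...     universe=butler.dimensions,
#     ... )
#     >>> ref = DatasetRef(butler.get_dataset_type("raw"), data_id, run="u/demo/ingest")
#     >>> butler.ingest(FileDataset(path="/data/raw_0001.fits", refs=[ref]), transfer="copy")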
1576 @contextlib.contextmanager
1577 def export(
1578 self,
1579 *,
1580 directory: str | None = None,
1581 filename: str | None = None,
1582 format: str | None = None,
1583 transfer: str | None = None,
1584 ) -> Iterator[RepoExportContext]:
1585 # Docstring inherited.
1586 if directory is None and transfer is not None:
1587 raise TypeError("Cannot transfer without providing a directory.")
1588 if transfer == "move":
1589 raise TypeError("Transfer may not be 'move': export is read-only")
1590 if format is None:
1591 if filename is None:
1592 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1593 else:
1594 _, format = os.path.splitext(filename)
1595 if not format:
1596 raise ValueError("Please specify a file extension to determine export format.")
1597 format = format[1:] # Strip leading "."
1598 elif filename is None:
1599 filename = f"export.{format}"
1600 if directory is not None:
1601 filename = os.path.join(directory, filename)
1602 formats = self._config["repo_transfer_formats"]
1603 if format not in formats:
1604 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
1605 BackendClass = get_class_of(formats[format, "export"])
1606 with open(filename, "w") as stream:
1607 backend = BackendClass(stream, universe=self.dimensions)
1608 try:
1609 helper = RepoExportContext(
1610 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
1611 )
1612 with self._caching_context():
1613 yield helper
1614 except BaseException:
1615 raise
1616 else:
1617 helper._finish()
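# Illustrative usage sketch for export() (not part of the original source). It
# assumes RepoExportContext.saveDatasets as the helper for adding datasets to the
# export; paths, dataset type, and collection names are hypothetical.
#
#     >>> butler = Butler.from_config("/path/to/repo")
#     >>> refs = butler.registry.queryDatasets("calexp", collections="u/demo/run")
#     >>> with butler.export(directory="/tmp/demo_export", filename="export.yaml",
#     ...                    transfer="copy") as export:
#     ...     export.saveDatasets(refs)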
1619 def import_(
1620 self,
1621 *,
1622 directory: ResourcePathExpression | None = None,
1623 filename: ResourcePathExpression | TextIO | None = None,
1624 format: str | None = None,
1625 transfer: str | None = None,
1626 skip_dimensions: set | None = None,
1627 ) -> None:
1628 # Docstring inherited.
1629 if not self.isWriteable():
1630 raise TypeError("Butler is read-only.")
1631 if format is None:
1632 if filename is None:
1633 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1634 else:
1635 _, format = os.path.splitext(filename) # type: ignore
1636 elif filename is None:
1637 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
1638 if directory is not None:
1639 directory = ResourcePath(directory, forceDirectory=True)
1640 # mypy doesn't think this will work but it does in python >= 3.10.
1641 if isinstance(filename, ResourcePathExpression): # type: ignore
1642 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
1643 if not filename.isabs() and directory is not None:
1644 potential = directory.join(filename)
1645 exists_in_cwd = filename.exists()
1646 exists_in_dir = potential.exists()
1647 if exists_in_cwd and exists_in_dir:
1648 _LOG.warning(
1649 "A relative path for filename was specified (%s) which exists relative to cwd. "
1650 "Additionally, the file exists relative to the given search directory (%s). "
1651 "Using the export file in the given directory.",
1652 filename,
1653 potential,
1654 )
1655 # Given they specified an explicit directory and that
1656 # directory has the export file in it, assume that that
1657 # is what was meant despite the file in cwd.
1658 filename = potential
1659 elif exists_in_dir:
1660 filename = potential
1661 elif not exists_in_cwd and not exists_in_dir:
1662 # Raise early.
1663 raise FileNotFoundError(
1664 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
1665 )
1666 BackendClass: type[RepoImportBackend] = get_class_of(
1667 self._config["repo_transfer_formats"][format]["import"]
1668 )
1670 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
1671 with self._caching_context():
1672 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
1673 backend.register()
1674 with self.transaction():
1675 backend.load(
1676 self._datastore,
1677 directory=directory,
1678 transfer=transfer,
1679 skip_dimensions=skip_dimensions,
1680 )
1682 if isinstance(filename, ResourcePath):
1683 # We cannot use open() here at the moment because of
1684 # DM-38589: yaml does stream.read(8192) in a loop.
1685 stream = io.StringIO(filename.read().decode())
1686 doImport(stream)
1687 else:
1688 doImport(filename) # type: ignore
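# Illustrative usage sketch for import_() (not part of the original source),
# consuming an export directory like the one produced above; the paths are
# hypothetical.
#
#     >>> target = Butler.from_config("/path/to/other_repo", writeable=True)
#     >>> target.import_(directory="/tmp/demo_export", filename="export.yaml",
#     ...                transfer="symlink")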
1690 def transfer_dimension_records_from(
1691 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
1692 ) -> None:
1693 # Allowed dimensions in the target butler.
1694 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1696 data_ids = {ref.dataId for ref in source_refs}
1698 dimension_records = self._extract_all_dimension_records_from_data_ids(
1699 source_butler, data_ids, elements
1700 )
1702 # Insert order is important.
1703 for element in self.dimensions.sorted(dimension_records.keys()):
1704 records = list(dimension_records[element].values())
1705 # Assume that if the record is already present we can use it
1706 # without having to check that the record metadata is
1707 # consistent.
1708 self._registry.insertDimensionData(element, *records, skip_existing=True)
1709 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records))
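# Illustrative usage sketch (not part of the original source): copy just the
# dimension records that back a set of refs, without transferring the datasets
# themselves. Repository paths, dataset type, and collection are hypothetical.
#
#     >>> source = Butler.from_config("/path/to/source_repo")
#     >>> target = Butler.from_config("/path/to/target_repo", writeable=True)
#     >>> refs = source.registry.queryDatasets("raw", collections="DemoCam/raw/all")
#     >>> target.transfer_dimension_records_from(source, refs)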
1711 def _extract_all_dimension_records_from_data_ids(
1712 self,
1713 source_butler: LimitedButler | Butler,
1714 data_ids: set[DataCoordinate],
1715 allowed_elements: frozenset[DimensionElement],
1716 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1717 primary_records = self._extract_dimension_records_from_data_ids(
1718 source_butler, data_ids, allowed_elements
1719 )
1721 can_query = isinstance(source_butler, Butler)
1723 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1724 for original_element, record_mapping in primary_records.items():
1725 # Get dimensions that depend on this dimension.
1726 populated_by = self.dimensions.get_elements_populated_by(
1727 self.dimensions[original_element.name] # type: ignore
1728 )
1730 for data_id in record_mapping.keys():
1731 for element in populated_by:
1732 if element not in allowed_elements:
1733 continue
1734 if element.name == original_element.name:
1735 continue
1737 if element.name in primary_records:
1738 # If this element has already been stored avoid
1739 # re-finding records since that may lead to additional
1740 # spurious records. e.g. visit is populated_by
1741 # visit_detector_region but querying
1742 # visit_detector_region by visit will return all the
1743 # detectors for this visit -- the visit dataId does not
1744 # constrain this.
1745 # To constrain the query the original dataIds would
1746 # have to be scanned.
1747 continue
1749 if not can_query:
1750 raise RuntimeError(
1751 f"Transferring populated_by records like {element.name} requires a full Butler."
1752 )
1754 records = source_butler.registry.queryDimensionRecords( # type: ignore
1755 element.name,
1756 **data_id.mapping, # type: ignore
1757 )
1758 for record in records:
1759 additional_records[record.definition].setdefault(record.dataId, record)
1761 # The next step is to walk back through the additional records to
1762 # pick up any missing content (such as visit_definition needing to
1763 # know the exposure). We want to ensure that we do not request
1764 # records we already have.
1765 missing_data_ids = set()
1766 for name, record_mapping in additional_records.items():
1767 for data_id in record_mapping.keys():
1768 if data_id not in primary_records[name]:
1769 missing_data_ids.add(data_id)
1771 # Fill out the new records. Assume that these new records do not
1772 # also need to carry over additional populated_by records.
1773 secondary_records = self._extract_dimension_records_from_data_ids(
1774 source_butler, missing_data_ids, allowed_elements
1775 )
1777 # Merge the extra sets of records in with the original.
1778 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()):
1779 primary_records[name].update(record_mapping)
1781 return primary_records
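# Illustrative sketch (not part of the original source): the populated_by walk
# above can be inspected directly on the dimension universe. Element names
# depend on the configured universe; "visit" and "visit_detector_region" are the
# examples used in the comments above.
#
#     >>> universe = butler.dimensions
#     >>> dependents = universe.get_elements_populated_by(universe["visit"])
#     >>> "visit_detector_region" in {element.name for element in dependents}
#     True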
1783 def _extract_dimension_records_from_data_ids(
1784 self,
1785 source_butler: LimitedButler | Butler,
1786 data_ids: set[DataCoordinate],
1787 allowed_elements: frozenset[DimensionElement],
1788 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1789 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1791 for data_id in data_ids:
1792 # We need an expanded record; if it is not expanded we need a full
1793 # butler with a registry (mocks with a registry are allowed too).
1794 if not data_id.hasRecords():
1795 if registry := getattr(source_butler, "registry", None):
1796 data_id = registry.expandDataId(data_id)
1797 else:
1798 raise TypeError("Input butler needs to be a full butler to expand DataId.")
1799 # If this butler doesn't know about a dimension in the source
1800 # butler things will break later.
1801 for element_name in data_id.dimensions.elements:
1802 record = data_id.records[element_name]
1803 if record is not None and record.definition in allowed_elements:
1804 dimension_records[record.definition].setdefault(record.dataId, record)
1806 return dimension_records
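# Illustrative sketch (not part of the original source): the helper above needs
# expanded data IDs, i.e. DataCoordinate.hasRecords() must be True. A full butler
# can expand a minimal data ID through its registry; the values are hypothetical.
#
#     >>> data_id = butler.registry.expandDataId(instrument="DemoCam", exposure=1, detector=0)
#     >>> data_id.hasRecords()
#     True
#     >>> record = data_id.records["exposure"]  # full DimensionRecord, not just a key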
1808 def transfer_from(
1809 self,
1810 source_butler: LimitedButler,
1811 source_refs: Iterable[DatasetRef],
1812 transfer: str = "auto",
1813 skip_missing: bool = True,
1814 register_dataset_types: bool = False,
1815 transfer_dimensions: bool = False,
1816 dry_run: bool = False,
1817 ) -> collections.abc.Collection[DatasetRef]:
1818 # Docstring inherited.
1819 if not self.isWriteable():
1820 raise TypeError("Butler is read-only.")
1821 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1823 # We will iterate through the refs multiple times, so convert
1824 # to a list if this isn't already a collection.
1825 if not isinstance(source_refs, collections.abc.Collection):
1826 source_refs = list(source_refs)
1828 original_count = len(source_refs)
1829 _LOG.info("Transferring %d datasets into %s", original_count, str(self))
1831 # In some situations the datastore artifact may be missing
1832 # and we do not want that registry entry to be imported.
1833 # Asking the datastore is not sufficient: the records may have been
1834 # purged, so we have to ask for the (predicted) URI and check
1835 # existence explicitly. An execution butler is set up exactly like
1836 # this, with no datastore records.
1837 artifact_existence: dict[ResourcePath, bool] = {}
1838 if skip_missing:
1839 dataset_existence = source_butler._datastore.mexists(
1840 source_refs, artifact_existence=artifact_existence
1841 )
1842 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1843 filtered_count = len(source_refs)
1844 n_missing = original_count - filtered_count
1845 _LOG.verbose(
1846 "%d dataset%s removed because the artifact does not exist. Now have %d.",
1847 n_missing,
1848 "" if n_missing == 1 else "s",
1849 filtered_count,
1850 )
1852 # Importing requires that we group the refs by dataset type and run
1853 # before doing the import.
1854 source_dataset_types = set()
1855 grouped_refs = defaultdict(list)
1856 for ref in source_refs:
1857 grouped_refs[ref.datasetType, ref.run].append(ref)
1858 source_dataset_types.add(ref.datasetType)
1860 # Check to see if the dataset type in the source butler has
1861 # the same definition in the target butler and register missing
1862 # ones if requested. Registration must happen outside a transaction.
1863 newly_registered_dataset_types = set()
1864 for datasetType in source_dataset_types:
1865 if register_dataset_types:
1866 # Let this raise immediately if inconsistent. Continuing
1867 # on to find additional inconsistent dataset types
1868 # might result in additional unwanted dataset types being
1869 # registered.
1870 if self._registry.registerDatasetType(datasetType):
1871 newly_registered_dataset_types.add(datasetType)
1872 else:
1873 # If the dataset type is missing, let it fail immediately.
1874 target_dataset_type = self.get_dataset_type(datasetType.name)
1875 if target_dataset_type != datasetType:
1876 raise ConflictingDefinitionError(
1877 "Source butler dataset type differs from definition"
1878 f" in target butler: {datasetType} !="
1879 f" {target_dataset_type}"
1880 )
1881 if newly_registered_dataset_types:
1882 # We may have registered some dataset types even if there were
1883 # inconsistencies, so let people know (or else remove them again).
1884 _LOG.verbose(
1885 "Registered the following dataset types in the target Butler: %s",
1886 ", ".join(d.name for d in newly_registered_dataset_types),
1887 )
1888 else:
1889 _LOG.verbose("All required dataset types are known to the target Butler")
1891 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1892 if transfer_dimensions:
1893 # Collect all the dimension records for these refs.
1894 # All dimensions are to be copied, but the list of valid dimensions
1895 # comes from this butler's universe.
1896 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1897 dataIds = {ref.dataId for ref in source_refs}
1898 dimension_records = self._extract_all_dimension_records_from_data_ids(
1899 source_butler, dataIds, elements
1900 )
1902 handled_collections: set[str] = set()
1904 # Do all the importing in a single transaction.
1905 with self.transaction():
1906 if dimension_records and not dry_run:
1907 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
1908 # Order matters.
1909 for element in self.dimensions.sorted(dimension_records.keys()):
1910 records = list(dimension_records[element].values())
1911 # Assume that if the record is already present we can use it
1912 # without having to check that the record metadata is
1913 # consistent.
1914 self._registry.insertDimensionData(element, *records, skip_existing=True)
1916 n_imported = 0
1917 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
1918 grouped_refs.items(), desc="Importing to registry by run and dataset type"
1919 ):
1920 if run not in handled_collections:
1921 # May need to create output collection. If source butler
1922 # has a registry, ask for documentation string.
1923 run_doc = None
1924 if registry := getattr(source_butler, "registry", None):
1925 run_doc = registry.getCollectionDocumentation(run)
1926 if not dry_run:
1927 registered = self._registry.registerRun(run, doc=run_doc)
1928 else:
1929 registered = True
1930 handled_collections.add(run)
1931 if registered:
1932 _LOG.verbose("Creating output run %s", run)
1934 n_refs = len(refs_to_import)
1935 _LOG.verbose(
1936 "Importing %d ref%s of dataset type %s into run %s",
1937 n_refs,
1938 "" if n_refs == 1 else "s",
1939 datasetType.name,
1940 run,
1941 )
1943 # Assume we are using UUIDs and the source refs will match
1944 # those imported.
1945 if not dry_run:
1946 imported_refs = self._registry._importDatasets(refs_to_import)
1947 else:
1948 imported_refs = refs_to_import
1949 assert set(imported_refs) == set(refs_to_import)
1950 n_imported += len(imported_refs)
1952 assert len(source_refs) == n_imported
1953 _LOG.verbose("Imported %d datasets into destination butler", n_imported)
1955 # Ask the datastore to transfer. The datastore has to check that
1956 # the source datastore is compatible with the target datastore.
1957 accepted, rejected = self._datastore.transfer_from(
1958 source_butler._datastore,
1959 source_refs,
1960 transfer=transfer,
1961 artifact_existence=artifact_existence,
1962 dry_run=dry_run,
1963 )
1964 if rejected:
1965 # For now, accept the registry entries but not the files.
1966 _LOG.warning(
1967 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
1968 len(rejected),
1969 len(accepted),
1970 datasetType,
1971 run,
1972 )
1974 return source_refs
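# Illustrative usage sketch for transfer_from() (not part of the original
# source). Repository paths, dataset type, and collection names are
# hypothetical; registering dataset types and transferring dimension records
# are enabled so a freshly created target repository can be populated.
#
#     >>> source = Butler.from_config("/path/to/source_repo")
#     >>> target = Butler.from_config("/path/to/target_repo", writeable=True)
#     >>> refs = list(source.registry.queryDatasets("calexp", collections="u/demo/run"))
#     >>> transferred = target.transfer_from(
#     ...     source, refs, transfer="copy",
#     ...     register_dataset_types=True, transfer_dimensions=True,
#     ... )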
1976 def validateConfiguration(
1977 self,
1978 logFailures: bool = False,
1979 datasetTypeNames: Iterable[str] | None = None,
1980 ignore: Iterable[str] | None = None,
1981 ) -> None:
1982 # Docstring inherited.
1983 if datasetTypeNames:
1984 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
1985 else:
1986 datasetTypes = list(self._registry.queryDatasetTypes())
1988 # filter out anything from the ignore list
1989 if ignore:
1990 ignore = set(ignore)
1991 datasetTypes = [
1992 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
1993 ]
1994 else:
1995 ignore = set()
1997 # For each datasetType that has an instrument dimension, create
1998 # a DatasetRef for each defined instrument
1999 datasetRefs = []
2001 # Find all the registered instruments (if "instrument" is in the
2002 # universe).
2003 if "instrument" in self.dimensions:
2004 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
2006 for datasetType in datasetTypes:
2007 if "instrument" in datasetType.dimensions:
2008 # In order to create a conforming dataset ref, create
2009 # fake DataCoordinate values for the non-instrument
2010 # dimensions. The type of the value does not matter here.
2011 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"}
2013 for instrument in instruments:
2014 datasetRef = DatasetRef(
2015 datasetType,
2016 DataCoordinate.standardize(
2017 dataId, instrument=instrument, dimensions=datasetType.dimensions
2018 ),
2019 run="validate",
2020 )
2021 datasetRefs.append(datasetRef)
2023 entities: list[DatasetType | DatasetRef] = []
2024 entities.extend(datasetTypes)
2025 entities.extend(datasetRefs)
2027 datastoreErrorStr = None
2028 try:
2029 self._datastore.validateConfiguration(entities, logFailures=logFailures)
2030 except ValidationError as e:
2031 datastoreErrorStr = str(e)
2033 # Also check that the LookupKeys used by the datastores match
2034 # registry and storage class definitions
2035 keys = self._datastore.getLookupKeys()
2037 failedNames = set()
2038 failedDataId = set()
2039 for key in keys:
2040 if key.name is not None:
2041 if key.name in ignore:
2042 continue
2044 # skip if specific datasetType names were requested and this
2045 # name does not match
2046 if datasetTypeNames and key.name not in datasetTypeNames:
2047 continue
2049 # See if it is a StorageClass or a DatasetType
2050 if key.name in self.storageClasses:
2051 pass
2052 else:
2053 try:
2054 self.get_dataset_type(key.name)
2055 except KeyError:
2056 if logFailures:
2057 _LOG.critical(
2058 "Key '%s' does not correspond to a DatasetType or StorageClass", key
2059 )
2060 failedNames.add(key)
2061 else:
2062 # Dimensions are checked for consistency when the Butler
2063 # is created and rendezvoused with a universe.
2064 pass
2066 # Check that the instrument is a valid instrument.
2067 # Currently only "instrument" DataId overrides are supported, so check for that.
2068 if key.dataId:
2069 dataIdKeys = set(key.dataId)
2070 if {"instrument"} != dataIdKeys:
2071 if logFailures:
2072 _LOG.critical("Key '%s' has unsupported DataId override", key)
2073 failedDataId.add(key)
2074 elif key.dataId["instrument"] not in instruments:
2075 if logFailures:
2076 _LOG.critical("Key '%s' has unknown instrument", key)
2077 failedDataId.add(key)
2079 messages = []
2081 if datastoreErrorStr:
2082 messages.append(datastoreErrorStr)
2084 for failed, msg in (
2085 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2086 (failedDataId, "Keys with bad DataId entries: "),
2087 ):
2088 if failed:
2089 msg += ", ".join(str(k) for k in failed)
2090 messages.append(msg)
2092 if messages:
2093 raise ValidationError(";\n".join(messages))
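# Illustrative usage sketch for validateConfiguration() (not part of the
# original source). The ignored dataset type name is hypothetical; a single
# ValidationError summarising all failed checks is raised at the end.
#
#     >>> from lsst.daf.butler import Butler, ValidationError
#     >>> butler = Butler.from_config("/path/to/repo")
#     >>> try:
#     ...     butler.validateConfiguration(logFailures=True, ignore=["raw"])
#     ... except ValidationError as err:
#     ...     print(f"Configuration problems:\n{err}")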
2095 @property
2096 def collections(self) -> Sequence[str]:
2097 """The collections to search by default, in order
2098 (`~collections.abc.Sequence` [ `str` ]).
2100 This is an alias for ``self.registry.defaults.collections``. It cannot
2101 be set directly in isolation, but all defaults may be changed together
2102 by assigning a new `RegistryDefaults` instance to
2103 ``self.registry.defaults``.
2104 """
2105 return self._registry.defaults.collections
2107 @property
2108 def run(self) -> str | None:
2109 """Name of the run this butler writes outputs to by default (`str` or
2110 `None`).
2112 This is an alias for ``self.registry.defaults.run``. It cannot be set
2113 directly in isolation, but all defaults may be changed together by
2114 assigning a new `RegistryDefaults` instance to
2115 ``self.registry.defaults``.
2116 """
2117 return self._registry.defaults.run
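# Illustrative sketch (not part of the original source): as the property
# docstrings above explain, the default collections and run are replaced
# together by assigning a new RegistryDefaults instance; the names used here
# are hypothetical.
#
#     >>> from lsst.daf.butler.registry import RegistryDefaults
#     >>> butler.registry.defaults = RegistryDefaults(
#     ...     collections=["DemoCam/defaults"], run="u/demo/run"
#     ... )
#     >>> butler.run
#     'u/demo/run'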
2119 @property
2120 def registry(self) -> Registry:
2121 """The object that manages dataset metadata and relationships
2122 (`Registry`).
2124 Many operations that don't involve reading or writing butler datasets
2125 are accessible only via `Registry` methods. Eventually these methods
2126 will be replaced by equivalent `Butler` methods.
2127 """
2128 return self._registry_shim
2130 @property
2131 def dimensions(self) -> DimensionUniverse:
2132 # Docstring inherited.
2133 return self._registry.dimensions
2135 @contextlib.contextmanager
2136 def _query(self) -> Iterator[Query]:
2137 # Docstring inherited.
2138 raise NotImplementedError("TODO DM-41159")
2140 def _preload_cache(self) -> None:
2141 """Immediately load caches that are used for common operations."""
2142 self._registry.preload_cache()
2144 _config: ButlerConfig
2145 """Configuration for this Butler instance."""
2147 _registry: SqlRegistry
2148 """The object that manages dataset metadata and relationships
2149 (`SqlRegistry`).
2151 Most operations that don't involve reading or writing butler datasets are
2152 accessible only via `SqlRegistry` methods.
2153 """
2155 datastore: Datastore
2156 """The object that manages actual dataset storage (`Datastore`).
2158 Direct user access to the datastore should rarely be necessary; the primary
2159 exception is the case where a `Datastore` implementation provides extra
2160 functionality beyond what the base class defines.
2161 """
2163 storageClasses: StorageClassFactory
2164 """An object that maps known storage class names to objects that fully
2165 describe them (`StorageClassFactory`).
2166 """
2168 _registry_shim: RegistryShim
2169 """Shim object to provide a legacy public interface for querying via the
2170 the ``registry`` property.
2171 """