Coverage for python/lsst/daf/butler/direct_butler.py: 10% (749 statements), coverage.py v7.4.4, created at 2024-03-26 02:48 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Butler top level classes.
29"""
30from __future__ import annotations
32__all__ = (
33 "DirectButler",
34 "ButlerValidationError",
35)
37import collections.abc
38import contextlib
39import io
40import itertools
41import logging
42import numbers
43import os
44import warnings
45from collections import Counter, defaultdict
46from collections.abc import Iterable, Iterator, MutableMapping, Sequence
47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast
49from lsst.resources import ResourcePath, ResourcePathExpression
50from lsst.utils.introspection import get_class_of
51from lsst.utils.logging import VERBOSE, getLogger
52from sqlalchemy.exc import IntegrityError
54from ._butler import Butler
55from ._butler_config import ButlerConfig
56from ._butler_instance_options import ButlerInstanceOptions
57from ._dataset_existence import DatasetExistence
58from ._dataset_ref import DatasetRef
59from ._dataset_type import DatasetType
60from ._deferredDatasetHandle import DeferredDatasetHandle
61from ._exceptions import DatasetNotFoundError, ValidationError
62from ._limited_butler import LimitedButler
63from ._registry_shim import RegistryShim
64from ._storage_class import StorageClass, StorageClassFactory
65from ._timespan import Timespan
66from .datastore import Datastore, NullDatastore
67from .dimensions import DataCoordinate, Dimension
68from .progress import Progress
69from .queries import Query
70from .registry import (
71 CollectionType,
72 ConflictingDefinitionError,
73 DataIdError,
74 MissingDatasetTypeError,
75 RegistryDefaults,
76 _RegistryFactory,
77)
78from .registry.sql_registry import SqlRegistry
79from .transfers import RepoExportContext
80from .utils import transactional
82if TYPE_CHECKING:
83 from lsst.resources import ResourceHandleProtocol
85 from ._dataset_ref import DatasetId
86 from ._file_dataset import FileDataset
87 from .datastore import DatasetRefURIs
88 from .dimensions import DataId, DataIdValue, DimensionElement, DimensionRecord, DimensionUniverse
89 from .registry import Registry
90 from .transfers import RepoImportBackend
92_LOG = getLogger(__name__)
95class ButlerValidationError(ValidationError):
96 """There is a problem with the Butler configuration."""
98 pass
101class DirectButler(Butler): # numpydoc ignore=PR02
102 """Main entry point for the data access system.
104 Parameters
105 ----------
106 config : `ButlerConfig`
107 The configuration for this Butler instance.
108 registry : `SqlRegistry`
109 The object that manages dataset metadata and relationships.
110 datastore : `Datastore`
111 The object that manages actual dataset storage.
112 storageClasses : `StorageClassFactory`
113 An object that maps known storage class names to objects that fully
114 describe them.
116 Notes
117 -----
118 Most users should call the top-level `Butler`.``from_config`` instead of
119 using this constructor directly.
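Examples
--------
A minimal sketch of the recommended construction path (the repository path
and collection name here are hypothetical)::

    from lsst.daf.butler import Butler

    butler = Butler.from_config("/path/to/repo", collections=["HSC/defaults"])
    # For an ordinary SQL-backed repository this returns a DirectButler.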
120 """
122 # This is __new__ instead of __init__ because we have to support
123 # instantiation via the legacy constructor Butler.__new__(), which
124 # reads the configuration and selects which subclass to instantiate. The
125 # interaction between __new__ and __init__ is kind of wacky in Python. If
126 # we were using __init__ here, __init__ would be called twice (once when
127 # the DirectButler instance is constructed inside Butler.from_config(), and
128 # a second time with the original arguments to Butler() when the instance
129 # is returned from Butler.__new__()).
130 def __new__(
131 cls,
132 *,
133 config: ButlerConfig,
134 registry: SqlRegistry,
135 datastore: Datastore,
136 storageClasses: StorageClassFactory,
137 ) -> DirectButler:
138 self = cast(DirectButler, super().__new__(cls))
139 self._config = config
140 self._registry = registry
141 self._datastore = datastore
142 self.storageClasses = storageClasses
144 # For execution butler the datastore needs a special
145 # dependency-inversion trick. This is not used by regular butler,
146 # but we do not have a way to distinguish regular butler from execution
147 # butler.
148 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
150 self._registry_shim = RegistryShim(self)
152 return self
154 @classmethod
155 def create_from_config(
156 cls,
157 config: ButlerConfig,
158 *,
159 options: ButlerInstanceOptions,
160 without_datastore: bool = False,
161 ) -> DirectButler:
162 """Construct a Butler instance from a configuration file.
164 Parameters
165 ----------
166 config : `ButlerConfig`
167 The configuration for this Butler instance.
168 options : `ButlerInstanceOptions`
169 Default values and other settings for the Butler instance.
170 without_datastore : `bool`, optional
171 If `True` do not attach a datastore to this butler. Any attempts
172 to use a datastore will fail.
174 Notes
175 -----
176 Most users should call the top-level `Butler`.``from_config``
177 instead of using this function directly.
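Examples
--------
A sketch of direct construction (the repository path and run name are
hypothetical; the ``ButlerInstanceOptions`` call mirrors the one used by
``_unpickle`` below)::

    config = ButlerConfig("/path/to/repo")
    butler = DirectButler.create_from_config(
        config,
        options=ButlerInstanceOptions(
            collections=None, run="u/user/run", writeable=True, kwargs={}
        ),
    )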
178 """
179 if "run" in config or "collection" in config:
180 raise ValueError("Passing a run or collection via configuration is no longer supported.")
182 defaults = RegistryDefaults(
183 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs
184 )
185 try:
186 butlerRoot = config.get("root", config.configDir)
187 writeable = options.writeable
188 if writeable is None:
189 writeable = options.run is not None
190 registry = _RegistryFactory(config).from_config(
191 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
192 )
193 if without_datastore:
194 datastore: Datastore = NullDatastore(None, None)
195 else:
196 datastore = Datastore.fromConfig(
197 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
198 )
199 # TODO: Once datastore drops dependency on registry we can
200 # construct datastore first and pass opaque tables to registry
201 # constructor.
202 registry.make_datastore_tables(datastore.get_opaque_table_definitions())
203 storageClasses = StorageClassFactory()
204 storageClasses.addFromConfig(config)
206 return DirectButler(
207 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses
208 )
209 except Exception:
210 # Failures here usually mean that the configuration is incomplete;
211 # just issue an error message that includes the config file URI.
212 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.")
213 raise
215 def _clone(
216 self,
217 *,
218 collections: Any = None,
219 run: str | None = None,
220 inferDefaults: bool = True,
221 **kwargs: Any,
222 ) -> DirectButler:
223 # Docstring inherited
224 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
225 registry = self._registry.copy(defaults)
227 return DirectButler(
228 registry=registry,
229 config=self._config,
230 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()),
231 storageClasses=self.storageClasses,
232 )
234 GENERATION: ClassVar[int] = 3
235 """This is a Generation 3 Butler.
237 This attribute may be removed in the future, once the Generation 2 Butler
238 interface has been fully retired; it should only be used in transitional
239 code.
240 """
242 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
243 """Return DatasetType defined in registry given dataset type name."""
244 try:
245 return self.get_dataset_type(name)
246 except MissingDatasetTypeError:
247 return None
249 @classmethod
250 def _unpickle(
251 cls,
252 config: ButlerConfig,
253 collections: tuple[str, ...] | None,
254 run: str | None,
255 defaultDataId: dict[str, str],
256 writeable: bool,
257 ) -> DirectButler:
258 """Callable used to unpickle a Butler.
260 We prefer not to use ``Butler.__init__`` directly so we can force some
261 of its many arguments to be keyword-only (note that ``__reduce__``
262 can only invoke callables with positional arguments).
264 Parameters
265 ----------
266 config : `ButlerConfig`
267 Butler configuration, already coerced into a true `ButlerConfig`
268 instance (and hence after any search paths for overrides have been
269 utilized).
270 collections : `tuple` [ `str` ]
271 Names of the default collections to read from.
272 run : `str`, optional
273 Name of the default `~CollectionType.RUN` collection to write to.
274 defaultDataId : `dict` [ `str`, `str` ]
275 Default data ID values.
276 writeable : `bool`
277 Whether the Butler should support write operations.
279 Returns
280 -------
281 butler : `Butler`
282 A new `Butler` instance.
283 """
284 return cls.create_from_config(
285 config=config,
286 options=ButlerInstanceOptions(
287 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId
288 ),
289 )
291 def __reduce__(self) -> tuple:
292 """Support pickling."""
293 return (
294 DirectButler._unpickle,
295 (
296 self._config,
297 self.collections,
298 self.run,
299 dict(self._registry.defaults.dataId.required),
300 self._registry.isWriteable(),
301 ),
302 )
304 def __str__(self) -> str:
305 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
306 self.collections, self.run, self._datastore, self._registry
307 )
309 def isWriteable(self) -> bool:
310 # Docstring inherited.
311 return self._registry.isWriteable()
313 def _caching_context(self) -> contextlib.AbstractContextManager[None]:
314 """Context manager that enables caching."""
315 return self._registry.caching_context()
317 @contextlib.contextmanager
318 def transaction(self) -> Iterator[None]:
319 """Context manager supporting `Butler` transactions.
321 Transactions can be nested.
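Examples
--------
A brief sketch (``obj`` and ``ref`` are assumed to exist already)::

    with butler.transaction():
        butler.put(obj, ref)
        # If anything later in the block raises, both the registry insert
        # and the datastore write above are rolled back.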
322 """
323 with self._registry.transaction(), self._datastore.transaction():
324 yield
326 def _standardizeArgs(
327 self,
328 datasetRefOrType: DatasetRef | DatasetType | str,
329 dataId: DataId | None = None,
330 for_put: bool = True,
331 **kwargs: Any,
332 ) -> tuple[DatasetType, DataId | None]:
333 """Standardize the arguments passed to several Butler APIs.
335 Parameters
336 ----------
337 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
338 When `DatasetRef` the `dataId` should be `None`.
339 Otherwise the `DatasetType` or name thereof.
340 dataId : `dict` or `DataCoordinate`
341 A `dict` of `Dimension` link name, value pairs that label the
342 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
343 should be provided as the first argument.
344 for_put : `bool`, optional
345 If `True` this call is invoked as part of a `Butler.put()`.
346 Otherwise it is assumed to be part of a `Butler.get()`. This
347 parameter is only relevant if there is dataset type
348 inconsistency.
349 **kwargs
350 Additional keyword arguments used to augment or construct a
351 `DataCoordinate`. See `DataCoordinate.standardize`
352 parameters.
354 Returns
355 -------
356 datasetType : `DatasetType`
357 A `DatasetType` instance extracted from ``datasetRefOrType``.
358 dataId : `dict` or `DataId`, optional
359 Argument that can be used (along with ``kwargs``) to construct a
360 `DataId`.
362 Notes
363 -----
364 Butler APIs that conceptually need a DatasetRef also allow passing a
365 `DatasetType` (or the name of one) and a `DataId` (or a dict and
366 keyword arguments that can be used to construct one) separately. This
367 method accepts those arguments and always returns a true `DatasetType`
368 and a `DataId` or `dict`.
370 Standardization of `dict` vs `DataId` is best handled by passing the
371 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
372 generally similarly flexible.
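Examples
--------
The call forms this method unifies, shown for `Butler.get` (dataset type
name and data ID values are illustrative)::

    butler.get("calexp", {"instrument": "HSC", "visit": 903334, "detector": 16})
    butler.get("calexp", instrument="HSC", visit=903334, detector=16)
    butler.get(ref)  # resolved DatasetRef; no separate dataId allowed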
373 """
374 externalDatasetType: DatasetType | None = None
375 internalDatasetType: DatasetType | None = None
376 if isinstance(datasetRefOrType, DatasetRef):
377 if dataId is not None or kwargs:
378 raise ValueError("DatasetRef given, cannot use dataId as well")
379 externalDatasetType = datasetRefOrType.datasetType
380 dataId = datasetRefOrType.dataId
381 else:
382 # Don't check whether DataId is provided, because Registry APIs
383 # can usually construct a better error message when it wasn't.
384 if isinstance(datasetRefOrType, DatasetType):
385 externalDatasetType = datasetRefOrType
386 else:
387 internalDatasetType = self.get_dataset_type(datasetRefOrType)
389 # Check that they are self-consistent
390 if externalDatasetType is not None:
391 internalDatasetType = self.get_dataset_type(externalDatasetType.name)
392 if externalDatasetType != internalDatasetType:
393 # We can allow differences if they are compatible, depending
394 # on whether this is a get or a put. A get requires that
395 # the python type associated with the datastore can be
396 # converted to the user type. A put requires that the user
397 # supplied python type can be converted to the internal
398 # type expected by registry.
399 relevantDatasetType = internalDatasetType
400 if for_put:
401 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
402 else:
403 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
404 relevantDatasetType = externalDatasetType
405 if not is_compatible:
406 raise ValueError(
407 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
408 f"registry definition ({internalDatasetType})"
409 )
410 # Override the internal definition.
411 internalDatasetType = relevantDatasetType
413 assert internalDatasetType is not None
414 return internalDatasetType, dataId
416 def _rewrite_data_id(
417 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
418 ) -> tuple[DataId | None, dict[str, Any]]:
419 """Rewrite a data ID taking into account dimension records.
421 Take a data ID and keyword args and rewrite them if necessary to
422 allow the user to specify dimension records rather than dimension
423 primary key values.
425 This allows a user to include a dataId dict with keys of
426 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
427 the integer exposure ID. It also allows a string to be given
428 for a dimension value rather than the integer ID if that is more
429 convenient. For example, rather than having to specify the
430 detector with ``detector.full_name``, a string given for ``detector``
431 will be interpreted as the full name and converted to the integer
432 value.
434 Keyword arguments can also use strings for dimensions like detector
435 and exposure, but Python does not allow keyword names to include
436 ``.`` and so the ``exposure.day_obs`` syntax cannot be used in a
437 keyword argument.
439 Parameters
440 ----------
441 dataId : `dict` or `DataCoordinate`
442 A `dict` of `Dimension` link name, value pairs that will label the
443 `DatasetRef` within a Collection.
444 datasetType : `DatasetType`
445 The dataset type associated with this dataId. Required to
446 determine the relevant dimensions.
447 **kwargs
448 Additional keyword arguments used to augment or construct a
449 `DataId`. See `DataId` parameters.
451 Returns
452 -------
453 dataId : `dict` or `DataCoordinate`
454 The possibly-rewritten dataId. If given a `DataCoordinate` and
455 no keyword arguments, the original dataId will be returned
456 unchanged.
457 **kwargs : `dict`
458 Any unused keyword arguments (normally an empty dict).
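Examples
--------
Illustrative data IDs that this method rewrites (all values are
hypothetical)::

    # Record-based keys in place of the integer exposure ID:
    {"instrument": "HSC", "exposure.day_obs": 20240101, "exposure.seq_num": 45}

    # A string for ``detector`` interpreted as the detector full_name:
    {"instrument": "HSC", "detector": "1_53"}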
459 """
460 # Do nothing if we have a standalone DataCoordinate.
461 if isinstance(dataId, DataCoordinate) and not kwargs:
462 return dataId, kwargs
464 # Process dimension records that are using record information
465 # rather than ids
466 newDataId: dict[str, DataIdValue] = {}
467 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
469 # If all of the dataId comes from keyword parameters we do not need
470 # to do anything here: keys of the form exposure.obs_id cannot appear
471 # because a "." is not allowed in a keyword parameter name.
472 if dataId:
473 for k, v in dataId.items():
474 # If we have a Dimension we do not need to do anything
475 # because it cannot be a compound key.
476 if isinstance(k, str) and "." in k:
477 # Someone is using a more human-readable dataId
478 dimensionName, record = k.split(".", 1)
479 byRecord[dimensionName][record] = v
480 elif isinstance(k, Dimension):
481 newDataId[k.name] = v
482 else:
483 newDataId[k] = v
485 # Go through the updated dataId and check the type in case someone is
486 # using an alternate key. We have already filtered out compound
487 # keys of the dimension.record form.
488 not_dimensions = {}
490 # Will need to look in the dataId and the keyword arguments
491 # and will remove them if they need to be fixed or are unrecognized.
492 for dataIdDict in (newDataId, kwargs):
493 # Use a list so we can adjust the dict safely in the loop
494 for dimensionName in list(dataIdDict):
495 value = dataIdDict[dimensionName]
496 try:
497 dimension = self.dimensions.dimensions[dimensionName]
498 except KeyError:
499 # This is not a real dimension
500 not_dimensions[dimensionName] = value
501 del dataIdDict[dimensionName]
502 continue
504 # Convert an integral type to an explicit int to simplify
505 # comparisons here
506 if isinstance(value, numbers.Integral):
507 value = int(value)
509 if not isinstance(value, dimension.primaryKey.getPythonType()):
510 for alternate in dimension.alternateKeys:
511 if isinstance(value, alternate.getPythonType()):
512 byRecord[dimensionName][alternate.name] = value
513 del dataIdDict[dimensionName]
514 _LOG.debug(
515 "Converting dimension %s to %s.%s=%s",
516 dimensionName,
517 dimensionName,
518 alternate.name,
519 value,
520 )
521 break
522 else:
523 _LOG.warning(
524 "Type mismatch found for value '%r' provided for dimension %s. "
525 "Could not find matching alternative (primary key has type %s) "
526 "so attempting to use as-is.",
527 value,
528 dimensionName,
529 dimension.primaryKey.getPythonType(),
530 )
532 # By this point kwargs and newDataId should only include valid
533 # dimensions. Merge kwargs in to the new dataId and log if there
534 # are dimensions in both (rather than calling update).
535 for k, v in kwargs.items():
536 if k in newDataId and newDataId[k] != v:
537 _LOG.debug(
538 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
539 )
540 newDataId[k] = v
541 # No need to retain any values in kwargs now.
542 kwargs = {}
544 # If we have some unrecognized dimensions we have to try to connect
545 # them to records in other dimensions. This is made more complicated
546 # by some dimensions having records with clashing names. A mitigation
547 # is that we can tell by this point which dimensions are missing
548 # for the DatasetType but this does not work for calibrations
549 # where additional dimensions can be used to constrain the temporal
550 # axis.
551 if not_dimensions:
552 # Search for all dimensions even if we have been given a value
553 # explicitly. In some cases records are given as well as the
554 # actual dimension and this should not be an error if they
555 # match.
556 mandatoryDimensions = datasetType.dimensions.names # - provided
558 candidateDimensions: set[str] = set()
559 candidateDimensions.update(mandatoryDimensions)
561 # For calibrations we may well be needing temporal dimensions
562 # so rather than always including all dimensions in the scan
563 # restrict things a little. It is still possible for there
564 # to be confusion over day_obs in visit vs exposure for example.
565 # If we are not searching calibration collections things may
566 # fail but they are going to fail anyway because of the
567 # ambiguity of the dataId...
568 if datasetType.isCalibration():
569 for dim in self.dimensions.dimensions:
570 if dim.temporal:
571 candidateDimensions.add(str(dim))
573 # Look up table for the first association with a dimension
574 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
576 # Keep track of whether an item is associated with multiple
577 # dimensions.
578 counter: Counter[str] = Counter()
579 assigned: dict[str, set[str]] = defaultdict(set)
581 # Go through the missing dimensions and associate the
582 # given names with records within those dimensions
583 matched_dims = set()
584 for dimensionName in candidateDimensions:
585 dimension = self.dimensions.dimensions[dimensionName]
586 fields = dimension.metadata.names | dimension.uniqueKeys.names
587 for field in not_dimensions:
588 if field in fields:
589 guessedAssociation[dimensionName][field] = not_dimensions[field]
590 counter[dimensionName] += 1
591 assigned[field].add(dimensionName)
592 matched_dims.add(field)
594 # Calculate the fields that matched nothing.
595 never_found = set(not_dimensions) - matched_dims
597 if never_found:
598 raise ValueError(f"Unrecognized keyword args given: {never_found}")
600 # There is a chance we have allocated a single dataId item
601 # to multiple dimensions. Need to decide which should be retained.
602 # For now assume that the most popular alternative wins.
603 # This means that day_obs with seq_num will result in
604 # exposure.day_obs and not visit.day_obs.
605 # Also prefer an explicitly missing dimension over an inferred
606 # temporal dimension.
607 for fieldName, assignedDimensions in assigned.items():
608 if len(assignedDimensions) > 1:
609 # Pick the most popular (preferring mandatory dimensions)
610 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
611 if requiredButMissing:
612 candidateDimensions = requiredButMissing
613 else:
614 candidateDimensions = assignedDimensions
616 # If this is a choice between visit and exposure and
617 # neither was a required part of the dataset type,
618 # (hence in this branch) always prefer exposure over
619 # visit since exposures are always defined and visits
620 # are defined from exposures.
621 if candidateDimensions == {"exposure", "visit"}:
622 candidateDimensions = {"exposure"}
624 # Select the relevant items and get a new restricted
625 # counter.
626 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
627 duplicatesCounter: Counter[str] = Counter()
628 duplicatesCounter.update(theseCounts)
630 # Choose the most common. If they are equally common
631 # we will pick the one that was found first.
632 # most_common() returns a list of (key, count) tuples.
633 selected = duplicatesCounter.most_common(1)[0][0]
635 _LOG.debug(
636 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
637 " Removed ambiguity by choosing dimension %s.",
638 fieldName,
639 ", ".join(assignedDimensions),
640 selected,
641 )
643 for candidateDimension in assignedDimensions:
644 if candidateDimension != selected:
645 del guessedAssociation[candidateDimension][fieldName]
647 # Update the record look up dict with the new associations
648 for dimensionName, values in guessedAssociation.items():
649 if values: # A dict might now be empty
650 _LOG.debug(
651 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
652 )
653 byRecord[dimensionName].update(values)
655 if byRecord:
656 # Some record specifiers were found so we need to convert
657 # them to the ID form.
658 for dimensionName, values in byRecord.items():
659 if dimensionName in newDataId:
660 _LOG.debug(
661 "DataId specified explicit %s dimension value of %s in addition to"
662 " general record specifiers for it of %s. Ignoring record information.",
663 dimensionName,
664 newDataId[dimensionName],
665 str(values),
666 )
667 # Get the actual record and compare with these values.
668 try:
669 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
670 except DataIdError:
671 raise ValueError(
672 f"Could not find dimension '{dimensionName}'"
673 f" with dataId {newDataId} as part of comparing with"
674 f" record values {byRecord[dimensionName]}"
675 ) from None
676 if len(recs) == 1:
677 errmsg: list[str] = []
678 for k, v in values.items():
679 if (recval := getattr(recs[0], k)) != v:
680 errmsg.append(f"{k}({recval} != {v})")
681 if errmsg:
682 raise ValueError(
683 f"Dimension {dimensionName} in dataId has explicit value"
684 " inconsistent with records: " + ", ".join(errmsg)
685 )
686 else:
687 # Multiple matches for an explicit dimension
688 # should never happen but let downstream complain.
689 pass
690 continue
692 # Build up a WHERE expression
693 bind = dict(values.items())
694 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
696 # Hopefully we get a single record that matches
697 records = set(
698 self._registry.queryDimensionRecords(
699 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
700 )
701 )
703 if len(records) != 1:
704 if len(records) > 1:
705 # visit can have an ambiguous answer without involving
706 # visit_system. The default visit_system is defined
707 # by the instrument.
708 if (
709 dimensionName == "visit"
710 and "visit_system_membership" in self.dimensions
711 and "visit_system" in self.dimensions["instrument"].metadata
712 ):
713 instrument_records = list(
714 self._registry.queryDimensionRecords(
715 "instrument",
716 dataId=newDataId,
717 **kwargs,
718 )
719 )
720 if len(instrument_records) == 1:
721 visit_system = instrument_records[0].visit_system
722 if visit_system is None:
723 # Set to a value that will never match.
724 visit_system = -1
726 # Look up each visit in the
727 # visit_system_membership records.
728 for rec in records:
729 membership = list(
730 self._registry.queryDimensionRecords(
731 # Use bind to allow zero results.
732 # This is a fully-specified query.
733 "visit_system_membership",
734 where="instrument = inst AND visit_system = system AND visit = v",
735 bind=dict(
736 inst=instrument_records[0].name, system=visit_system, v=rec.id
737 ),
738 )
739 )
740 if membership:
741 # This record is the right answer.
742 records = {rec}
743 break
745 # The ambiguity may have been resolved so check again.
746 if len(records) > 1:
747 _LOG.debug(
748 "Received %d records from constraints of %s", len(records), str(values)
749 )
750 for r in records:
751 _LOG.debug("- %s", str(r))
752 raise ValueError(
753 f"DataId specification for dimension {dimensionName} is not"
754 f" uniquely constrained to a single dataset by {values}."
755 f" Got {len(records)} results."
756 )
757 else:
758 raise ValueError(
759 f"DataId specification for dimension {dimensionName} matched no"
760 f" records when constrained by {values}"
761 )
763 # Get the primary key from the real dimension object
764 dimension = self.dimensions.dimensions[dimensionName]
765 if not isinstance(dimension, Dimension):
766 raise RuntimeError(
767 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
768 )
769 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
771 return newDataId, kwargs
773 def _findDatasetRef(
774 self,
775 datasetRefOrType: DatasetRef | DatasetType | str,
776 dataId: DataId | None = None,
777 *,
778 collections: Any = None,
779 predict: bool = False,
780 run: str | None = None,
781 datastore_records: bool = False,
782 **kwargs: Any,
783 ) -> DatasetRef:
784 """Shared logic for methods that start with a search for a dataset in
785 the registry.
787 Parameters
788 ----------
789 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
790 When `DatasetRef` the `dataId` should be `None`.
791 Otherwise the `DatasetType` or name thereof.
792 dataId : `dict` or `DataCoordinate`, optional
793 A `dict` of `Dimension` link name, value pairs that label the
794 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
795 should be provided as the first argument.
796 collections : Any, optional
797 Collections to be searched, overriding ``self.collections``.
798 Can be any of the types supported by the ``collections`` argument
799 to butler construction.
800 predict : `bool`, optional
801 If `True`, return a newly created `DatasetRef` with a unique
802 dataset ID if finding a reference in the `Registry` fails.
803 Defaults to `False`.
804 run : `str`, optional
805 Run collection name to use for creating `DatasetRef` for predicted
806 datasets. Only used if ``predict`` is `True`.
807 datastore_records : `bool`, optional
808 If `True` add datastore records to returned `DatasetRef`.
809 **kwargs
810 Additional keyword arguments used to augment or construct a
811 `DataId`. See `DataId` parameters.
813 Returns
814 -------
815 ref : `DatasetRef`
816 A reference to the dataset identified by the given arguments.
817 This can be the same dataset reference as given if it was
818 resolved.
820 Raises
821 ------
822 LookupError
823 Raised if no matching dataset exists in the `Registry` (and
824 ``predict`` is `False`).
825 ValueError
826 Raised if a resolved `DatasetRef` was passed as an input, but it
827 differs from the one found in the registry.
828 TypeError
829 Raised if no collections were provided.
830 """
831 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
832 if isinstance(datasetRefOrType, DatasetRef):
833 if collections is not None:
834 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
835 # May need to retrieve datastore records if requested.
836 if datastore_records and datasetRefOrType._datastore_records is None:
837 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
838 return datasetRefOrType
839 timespan: Timespan | None = None
841 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
843 if datasetType.isCalibration():
844 # Because this is a calibration dataset, first try to
845 # standardize the data ID without restricting the dimensions to
846 # those of the dataset type requested, because there may be extra
847 # dimensions that provide temporal information for a validity-range
848 # lookup.
849 dataId = DataCoordinate.standardize(
850 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
851 )
852 if dataId.dimensions.temporal:
853 dataId = self._registry.expandDataId(dataId)
854 timespan = dataId.timespan
855 else:
856 # Standardize the data ID to just the dimensions of the dataset
857 # type instead of letting registry.findDataset do it, so we get the
858 # result even if no dataset is found.
859 dataId = DataCoordinate.standardize(
860 dataId,
861 dimensions=datasetType.dimensions,
862 defaults=self._registry.defaults.dataId,
863 **kwargs,
864 )
865 # Always look up the DatasetRef, even if one is given, to ensure it is
866 # present in the current collection.
867 ref = self.find_dataset(
868 datasetType,
869 dataId,
870 collections=collections,
871 timespan=timespan,
872 datastore_records=datastore_records,
873 )
874 if ref is None:
875 if predict:
876 if run is None:
877 run = self.run
878 if run is None:
879 raise TypeError("Cannot predict dataset ID/location with run=None.")
880 return DatasetRef(datasetType, dataId, run=run)
881 else:
882 if collections is None:
883 collections = self._registry.defaults.collections
884 raise DatasetNotFoundError(
885 f"Dataset {datasetType.name} with data ID {dataId} "
886 f"could not be found in collections {collections}."
887 )
888 if datasetType != ref.datasetType:
889 # If they differ it is because the user explicitly specified
890 # a compatible dataset type to this call rather than using the
891 # registry definition. The DatasetRef must therefore be recreated
892 # using the user definition such that the expected type is
893 # returned.
894 ref = DatasetRef(
895 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
896 )
898 return ref
900 @transactional
901 def put(
902 self,
903 obj: Any,
904 datasetRefOrType: DatasetRef | DatasetType | str,
905 /,
906 dataId: DataId | None = None,
907 *,
908 run: str | None = None,
909 **kwargs: Any,
910 ) -> DatasetRef:
911 """Store and register a dataset.
913 Parameters
914 ----------
915 obj : `object`
916 The dataset.
917 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
918 When `DatasetRef` is provided, ``dataId`` should be `None`.
919 Otherwise the `DatasetType` or name thereof. If a fully resolved
920 `DatasetRef` is given the run and ID are used directly.
921 dataId : `dict` or `DataCoordinate`
922 A `dict` of `Dimension` link name, value pairs that label the
923 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
924 should be provided as the second argument.
925 run : `str`, optional
926 The name of the run the dataset should be added to, overriding
927 ``self.run``. Not used if a resolved `DatasetRef` is provided.
928 **kwargs
929 Additional keyword arguments used to augment or construct a
930 `DataCoordinate`. See `DataCoordinate.standardize`
931 parameters. Not used if a resolved `DatasetRef` is provided.
933 Returns
934 -------
935 ref : `DatasetRef`
936 A reference to the stored dataset, updated with the correct id if
937 given.
939 Raises
940 ------
941 TypeError
942 Raised if the butler is read-only or if no run has been provided.
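Examples
--------
Two equivalent ways to store a dataset (names and data ID values are
illustrative)::

    butler.put(catalog, "src", instrument="HSC", visit=903334, detector=16, run="u/user/run")
    butler.put(catalog, resolved_ref)  # run and dataset ID taken from the ref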
943 """
944 if isinstance(datasetRefOrType, DatasetRef):
945 # This is a direct put of predefined DatasetRef.
946 _LOG.debug("Butler put direct: %s", datasetRefOrType)
947 if run is not None:
948 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
949 # If registry already has a dataset with the same dataset ID,
950 # dataset type and DataId, then _importDatasets will do nothing and
951 # just return the original ref. We have to raise in this case;
952 # the datastore check below handles that.
953 self._registry._importDatasets([datasetRefOrType], expand=True)
954 # Before trying to write to the datastore check that it does not
955 # know this dataset. This is prone to races, of course.
956 if self._datastore.knows(datasetRefOrType):
957 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
958 # Try to write the dataset to the datastore; if it fails due to a race
959 # with another write, the content of stored data may be
960 # unpredictable.
961 try:
962 self._datastore.put(obj, datasetRefOrType)
963 except IntegrityError as e:
964 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
965 return datasetRefOrType
967 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
968 if not self.isWriteable():
969 raise TypeError("Butler is read-only.")
970 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
972 # Handle dimension records in dataId
973 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
975 # Add Registry Dataset entry.
976 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs)
977 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
978 self._datastore.put(obj, ref)
980 return ref
982 def getDeferred(
983 self,
984 datasetRefOrType: DatasetRef | DatasetType | str,
985 /,
986 dataId: DataId | None = None,
987 *,
988 parameters: dict | None = None,
989 collections: Any = None,
990 storageClass: str | StorageClass | None = None,
991 **kwargs: Any,
992 ) -> DeferredDatasetHandle:
993 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
994 after an immediate registry lookup.
996 Parameters
997 ----------
998 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
999 When `DatasetRef` the `dataId` should be `None`.
1000 Otherwise the `DatasetType` or name thereof.
1001 dataId : `dict` or `DataCoordinate`, optional
1002 A `dict` of `Dimension` link name, value pairs that label the
1003 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1004 should be provided as the first argument.
1005 parameters : `dict`
1006 Additional StorageClass-defined options to control reading,
1007 typically used to efficiently read only a subset of the dataset.
1008 collections : Any, optional
1009 Collections to be searched, overriding ``self.collections``.
1010 Can be any of the types supported by the ``collections`` argument
1011 to butler construction.
1012 storageClass : `StorageClass` or `str`, optional
1013 The storage class to be used to override the Python type
1014 returned by this method. By default the returned type matches
1015 the dataset type definition for this dataset. Specifying a
1016 read `StorageClass` can force a different type to be returned.
1017 This type must be compatible with the original type.
1018 **kwargs
1019 Additional keyword arguments used to augment or construct a
1020 `DataId`. See `DataId` parameters.
1022 Returns
1023 -------
1024 obj : `DeferredDatasetHandle`
1025 A handle which can be used to retrieve a dataset at a later time.
1027 Raises
1028 ------
1029 LookupError
1030 Raised if no matching dataset exists in the `Registry` or
1031 datastore.
1032 ValueError
1033 Raised if a resolved `DatasetRef` was passed as an input, but it
1034 differs from the one found in the registry.
1035 TypeError
1036 Raised if no collections were provided.
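Examples
--------
A sketch of a deferred component read (dataset type and data ID values are
illustrative)::

    handle = butler.getDeferred("calexp.wcs", instrument="HSC", visit=903334, detector=16)
    wcs = handle.get()  # the datastore read happens here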
1037 """
1038 if isinstance(datasetRefOrType, DatasetRef):
1039 # Do the quick check first and if that fails, check for artifact
1040 # existence. This is necessary for datastores that are configured
1041 # in trust mode where there won't be a record but there will be
1042 # a file.
1043 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
1044 ref = datasetRefOrType
1045 else:
1046 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1047 else:
1048 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1049 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1051 def get(
1052 self,
1053 datasetRefOrType: DatasetRef | DatasetType | str,
1054 /,
1055 dataId: DataId | None = None,
1056 *,
1057 parameters: dict[str, Any] | None = None,
1058 collections: Any = None,
1059 storageClass: StorageClass | str | None = None,
1060 **kwargs: Any,
1061 ) -> Any:
1062 """Retrieve a stored dataset.
1064 Parameters
1065 ----------
1066 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1067 When `DatasetRef` the `dataId` should be `None`.
1068 Otherwise the `DatasetType` or name thereof.
1069 If a resolved `DatasetRef`, the associated dataset
1070 is returned directly without additional querying.
1071 dataId : `dict` or `DataCoordinate`
1072 A `dict` of `Dimension` link name, value pairs that label the
1073 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1074 should be provided as the first argument.
1075 parameters : `dict`
1076 Additional StorageClass-defined options to control reading,
1077 typically used to efficiently read only a subset of the dataset.
1078 collections : Any, optional
1079 Collections to be searched, overriding ``self.collections``.
1080 Can be any of the types supported by the ``collections`` argument
1081 to butler construction.
1082 storageClass : `StorageClass` or `str`, optional
1083 The storage class to be used to override the Python type
1084 returned by this method. By default the returned type matches
1085 the dataset type definition for this dataset. Specifying a
1086 read `StorageClass` can force a different type to be returned.
1087 This type must be compatible with the original type.
1088 **kwargs
1089 Additional keyword arguments used to augment or construct a
1090 `DataCoordinate`. See `DataCoordinate.standardize`
1091 parameters.
1093 Returns
1094 -------
1095 obj : `object`
1096 The dataset.
1098 Raises
1099 ------
1100 LookupError
1101 Raised if no matching dataset exists in the `Registry`.
1102 TypeError
1103 Raised if no collections were provided.
1105 Notes
1106 -----
1107 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1108 this method requires that the given data ID include temporal dimensions
1109 beyond the dimensions of the dataset type itself, in order to find the
1110 dataset with the appropriate validity range. For example, a "bias"
1111 dataset with native dimensions ``{instrument, detector}`` could be
1112 fetched with a ``{instrument, detector, exposure}`` data ID, because
1113 ``exposure`` is a temporal dimension.
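Examples
--------
A sketch of the calibration lookup described above (data ID values and
collection name are illustrative)::

    bias = butler.get(
        "bias", instrument="HSC", detector=16, exposure=903334, collections="HSC/calib"
    )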
1114 """
1115 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1116 ref = self._findDatasetRef(
1117 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs
1118 )
1119 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
1121 def getURIs(
1122 self,
1123 datasetRefOrType: DatasetRef | DatasetType | str,
1124 /,
1125 dataId: DataId | None = None,
1126 *,
1127 predict: bool = False,
1128 collections: Any = None,
1129 run: str | None = None,
1130 **kwargs: Any,
1131 ) -> DatasetRefURIs:
1132 """Return the URIs associated with the dataset.
1134 Parameters
1135 ----------
1136 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1137 When `DatasetRef` the `dataId` should be `None`.
1138 Otherwise the `DatasetType` or name thereof.
1139 dataId : `dict` or `DataCoordinate`
1140 A `dict` of `Dimension` link name, value pairs that label the
1141 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1142 should be provided as the first argument.
1143 predict : `bool`
1144 If `True`, allow URIs to be returned of datasets that have not
1145 been written.
1146 collections : Any, optional
1147 Collections to be searched, overriding ``self.collections``.
1148 Can be any of the types supported by the ``collections`` argument
1149 to butler construction.
1150 run : `str`, optional
1151 Run to use for predictions, overriding ``self.run``.
1152 **kwargs
1153 Additional keyword arguments used to augment or construct a
1154 `DataCoordinate`. See `DataCoordinate.standardize`
1155 parameters.
1157 Returns
1158 -------
1159 uris : `DatasetRefURIs`
1160 The URI to the primary artifact associated with this dataset (if
1161 the dataset was disassembled within the datastore this may be
1162 `None`), and the URIs to any components associated with the dataset
1163 artifact (can be empty if there are no components).
1164 """
1164 """
1165 ref = self._findDatasetRef(
1166 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1167 )
1168 return self._datastore.getURIs(ref, predict)
1170 def get_dataset_type(self, name: str) -> DatasetType:
1171 return self._registry.getDatasetType(name)
1173 def get_dataset(
1174 self,
1175 id: DatasetId,
1176 *,
1177 storage_class: str | StorageClass | None = None,
1178 dimension_records: bool = False,
1179 datastore_records: bool = False,
1180 ) -> DatasetRef | None:
1181 ref = self._registry.getDataset(id)
1182 if ref is not None:
1183 if dimension_records:
1184 ref = ref.expanded(
1185 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)
1186 )
1187 if storage_class:
1188 ref = ref.overrideStorageClass(storage_class)
1189 if datastore_records:
1190 ref = self._registry.get_datastore_records(ref)
1191 return ref
1193 def find_dataset(
1194 self,
1195 dataset_type: DatasetType | str,
1196 data_id: DataId | None = None,
1197 *,
1198 collections: str | Sequence[str] | None = None,
1199 timespan: Timespan | None = None,
1200 storage_class: str | StorageClass | None = None,
1201 dimension_records: bool = False,
1202 datastore_records: bool = False,
1203 **kwargs: Any,
1204 ) -> DatasetRef | None:
1205 # Handle any parts of the dataId that are not using primary dimension
1206 # keys.
1207 if isinstance(dataset_type, str):
1208 actual_type = self.get_dataset_type(dataset_type)
1209 else:
1210 actual_type = dataset_type
1212 # Store the component for later.
1213 component_name = actual_type.component()
1214 if actual_type.isComponent():
1215 parent_type = actual_type.makeCompositeDatasetType()
1216 else:
1217 parent_type = actual_type
1219 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)
1221 ref = self._registry.findDataset(
1222 parent_type,
1223 data_id,
1224 collections=collections,
1225 timespan=timespan,
1226 datastore_records=datastore_records,
1227 **kwargs,
1228 )
1229 if ref is not None and dimension_records:
1230 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions))
1231 if ref is not None and component_name:
1232 ref = ref.makeComponentRef(component_name)
1233 if ref is not None and storage_class is not None:
1234 ref = ref.overrideStorageClass(storage_class)
1236 return ref
1238 def retrieveArtifacts(
1239 self,
1240 refs: Iterable[DatasetRef],
1241 destination: ResourcePathExpression,
1242 transfer: str = "auto",
1243 preserve_path: bool = True,
1244 overwrite: bool = False,
1245 ) -> list[ResourcePath]:
1246 # Docstring inherited.
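# Illustrative call (destination path and ``refs`` are hypothetical): copy
# the underlying file artifacts out of the datastore, flattening paths.
#
#     paths = butler.retrieveArtifacts(
#         refs, "/tmp/artifacts", transfer="copy", preserve_path=False
#     )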
1247 return self._datastore.retrieveArtifacts(
1248 refs,
1249 ResourcePath(destination),
1250 transfer=transfer,
1251 preserve_path=preserve_path,
1252 overwrite=overwrite,
1253 )
1255 def exists(
1256 self,
1257 dataset_ref_or_type: DatasetRef | DatasetType | str,
1258 /,
1259 data_id: DataId | None = None,
1260 *,
1261 full_check: bool = True,
1262 collections: Any = None,
1263 **kwargs: Any,
1264 ) -> DatasetExistence:
1265 # Docstring inherited.
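# Illustrative check of the returned flags (dataset type and data ID values
# are hypothetical); DatasetExistence behaves as a flag enum, so individual
# bits can be tested:
#
#     existence = butler.exists("calexp", instrument="HSC", visit=903334, detector=16)
#     if existence & DatasetExistence.RECORDED:
#         ...  # registry has a record of this dataset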
1266 existence = DatasetExistence.UNRECOGNIZED
1268 if isinstance(dataset_ref_or_type, DatasetRef):
1269 if collections is not None:
1270 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1271 if data_id is not None:
1272 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1273 ref = dataset_ref_or_type
1274 registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
1275 if registry_ref is not None:
1276 existence |= DatasetExistence.RECORDED
1278 if dataset_ref_or_type != registry_ref:
1279 # This could mean that storage classes differ, so we should
1280 # check for that but use the registry ref for the rest of
1281 # the method.
1282 if registry_ref.is_compatible_with(dataset_ref_or_type):
1283 # Use the registry version from now on.
1284 ref = registry_ref
1285 else:
1286 raise ValueError(
1287 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1288 f"in registry but has different incompatible values ({registry_ref})."
1289 )
1290 else:
1291 try:
1292 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1293 except (LookupError, TypeError):
1294 return existence
1295 existence |= DatasetExistence.RECORDED
1297 if self._datastore.knows(ref):
1298 existence |= DatasetExistence.DATASTORE
1300 if full_check:
1301 if self._datastore.exists(ref):
1302 existence |= DatasetExistence._ARTIFACT
1303 elif existence.value != DatasetExistence.UNRECOGNIZED.value:
1304 # Do not add this flag if we have no other idea about a dataset.
1305 existence |= DatasetExistence(DatasetExistence._ASSUMED)
1307 return existence
1309 def _exists_many(
1310 self,
1311 refs: Iterable[DatasetRef],
1312 /,
1313 *,
1314 full_check: bool = True,
1315 ) -> dict[DatasetRef, DatasetExistence]:
1316 # Docstring inherited.
1317 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1319 # Registry does not have a bulk API to check for a ref.
1320 for ref in refs:
1321 registry_ref = self._registry.getDataset(ref.id)
1322 if registry_ref is not None:
1323 # It is possible, albeit unlikely, that the given ref does
1324 # not match the one in registry even though the UUID matches.
1325 # When checking a single ref we raise, but it's impolite to
1326 # do that when potentially hundreds of refs are being checked.
1327 # We could change the API to only accept UUIDs and that would
1328 # remove the ability to even check and remove the worry
1329 # about differing storage classes. Given the ongoing discussion
1330 # on refs vs UUIDs and whether to raise or have a new
1331 # private flag, treat this as a private API for now.
1332 existence[ref] |= DatasetExistence.RECORDED
1334 # Ask datastore if it knows about these refs.
1335 knows = self._datastore.knows_these(refs)
1336 for ref, known in knows.items():
1337 if known:
1338 existence[ref] |= DatasetExistence.DATASTORE
1340 if full_check:
1341 mexists = self._datastore.mexists(refs)
1342 for ref, exists in mexists.items():
1343 if exists:
1344 existence[ref] |= DatasetExistence._ARTIFACT
1345 else:
1346 # Do not set this flag if nothing is known about the dataset.
1347 for ref in existence:
1348 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1349 existence[ref] |= DatasetExistence._ASSUMED
1351 return existence
1353 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1354 # Docstring inherited.
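# Illustrative call (run name is hypothetical): remove the RUN collection
# and its dataset entries and, because unstore=True, the stored artifacts
# as well.
#
#     butler.removeRuns(["u/user/scratch_run"], unstore=True)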
1355 if not self.isWriteable():
1356 raise TypeError("Butler is read-only.")
1357 names = list(names)
1358 refs: list[DatasetRef] = []
1359 for name in names:
1360 collectionType = self._registry.getCollectionType(name)
1361 if collectionType is not CollectionType.RUN:
1362 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1363 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
1364 with self._datastore.transaction(), self._registry.transaction():
1365 if unstore:
1366 self._datastore.trash(refs)
1367 else:
1368 self._datastore.forget(refs)
1369 for name in names:
1370 self._registry.removeCollection(name)
1371 if unstore:
1372 # Point of no return for removing artifacts
1373 self._datastore.emptyTrash()
1375 def pruneDatasets(
1376 self,
1377 refs: Iterable[DatasetRef],
1378 *,
1379 disassociate: bool = True,
1380 unstore: bool = False,
1381 tags: Iterable[str] = (),
1382 purge: bool = False,
1383 ) -> None:
1384 # docstring inherited from LimitedButler
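# Illustrative call (``refs`` and the tag name are hypothetical): remove the
# artifacts and drop the TAGGED associations without purging the registry
# entries.
#
#     butler.pruneDatasets(refs, unstore=True, disassociate=True, tags=["u/user/tagged"])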
1386 if not self.isWriteable():
1387 raise TypeError("Butler is read-only.")
1388 if purge:
1389 if not disassociate:
1390 raise TypeError("Cannot pass purge=True without disassociate=True.")
1391 if not unstore:
1392 raise TypeError("Cannot pass purge=True without unstore=True.")
1393 elif disassociate:
1394 tags = tuple(tags)
1395 if not tags:
1396 raise TypeError("No tags provided but disassociate=True.")
1397 for tag in tags:
1398 collectionType = self._registry.getCollectionType(tag)
1399 if collectionType is not CollectionType.TAGGED:
1400 raise TypeError(
1401 f"Cannot disassociate from collection '{tag}' "
1402 f"of non-TAGGED type {collectionType.name}."
1403 )
1404 # Transform possibly-single-pass iterable into something we can iterate
1405 # over multiple times.
1406 refs = list(refs)
1407 # Pruning a component of a DatasetRef makes no sense since registry
1408 # doesn't know about components and datastore might not store
1409 # components in a separate file.
1410 for ref in refs:
1411 if ref.datasetType.component():
1412 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1413 # We don't need an unreliable Datastore transaction for this, because
1414 # we've been extra careful to ensure that Datastore.trash only involves
1415 # mutating the Registry (it can _look_ at Datastore-specific things,
1416 # but shouldn't change them), and hence all operations here are
1417 # Registry operations.
1418 with self._datastore.transaction(), self._registry.transaction():
1419 if unstore:
1420 self._datastore.trash(refs)
1421 if purge:
1422 self._registry.removeDatasets(refs)
1423 elif disassociate:
1424 assert tags, "Guaranteed by earlier logic in this function."
1425 for tag in tags:
1426 self._registry.disassociate(tag, refs)
1427 # We've exited the Registry transaction, and apparently committed.
1428 # (if there was an exception, everything rolled back, and it's as if
1429 # nothing happened - and we never get here).
1430 # Datastore artifacts are not yet gone, but they're clearly marked
1431 # as trash, so if we fail to delete now because of (e.g.) filesystem
1432 # problems we can try again later, and if manual administrative
1433 # intervention is required, it's pretty clear what that should entail:
1434 # deleting everything on disk and in private Datastore tables that is
1435 # in the dataset_location_trash table.
1436 if unstore:
1437 # Point of no return for removing artifacts
1438 self._datastore.emptyTrash()
1440 @transactional
1441 def ingest(
1442 self,
1443 *datasets: FileDataset,
1444 transfer: str | None = "auto",
1445 record_validation_info: bool = True,
1446 ) -> None:
1447 # Docstring inherited.
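# Illustrative ingest of an externally written file (path and ``ref`` are
# hypothetical; the ref must already describe the dataset type, data ID and
# run):
#
#     dataset = FileDataset(path="external/biases/bias_16.fits", refs=[ref])
#     butler.ingest(dataset, transfer="copy")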
1448 if not self.isWriteable():
1449 raise TypeError("Butler is read-only.")
1451 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1452 if not datasets:
1453 return
1455 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1457 # We need to reorganize all the inputs so that they are grouped
1458 # by dataset type and run. Multiple refs in a single FileDataset
1459 # are required to share the run and dataset type.
1460 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list)
1462 # Track DataIDs that are being ingested so we can spot issues early
1463 # with duplication. Retain previous FileDataset so we can report it.
1464 groupedDataIds: MutableMapping[tuple[DatasetType, str], dict[DataCoordinate, FileDataset]] = (
1465 defaultdict(dict)
1466 )
1468 # And the nested loop that populates it:
1469 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1470 # Somewhere to store pre-existing refs if we have an
1471 # execution butler.
1472 existingRefs: list[DatasetRef] = []
1474 for ref in dataset.refs:
1475 group_key = (ref.datasetType, ref.run)
1477 if ref.dataId in groupedDataIds[group_key]:
1478 raise ConflictingDefinitionError(
1479 f"Ingest conflict. Dataset {dataset.path} has same"
1480 " DataId as other ingest dataset"
1481 f" {groupedDataIds[group_key][ref.dataId].path} "
1482 f" ({ref.dataId})"
1483 )
1485 groupedDataIds[group_key][ref.dataId] = dataset
1487 if existingRefs:
1488 if len(dataset.refs) != len(existingRefs):
1489 # Keeping track of partially pre-existing datasets is hard
1490 # and should generally never happen. For now don't allow
1491 # it.
1492 raise ConflictingDefinitionError(
1493 f"For dataset {dataset.path} some dataIds already exist"
1494 " in registry but others do not. This is not supported."
1495 )
1497 # Store expanded form in the original FileDataset.
1498 dataset.refs = existingRefs
1499 else:
1500 groupedData[group_key].append(dataset)
1502 # Now we can bulk-insert into Registry for each DatasetType.
1503 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1504 groupedData.items(), desc="Bulk-inserting datasets by type"
1505 ):
1506 refs_to_import = []
1507 for dataset in grouped_datasets:
1508 refs_to_import.extend(dataset.refs)
1510 n_refs = len(refs_to_import)
1511 _LOG.verbose(
1512 "Importing %d ref%s of dataset type %r into run %r",
1513 n_refs,
1514 "" if n_refs == 1 else "s",
1515 datasetType.name,
1516 this_run,
1517 )
1519 # Import the refs and expand the DataCoordinates since we can't
1520 # guarantee that they are expanded and Datastore will need
1521 # the records.
1522 imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
1523 assert set(imported_refs) == set(refs_to_import)
1525 # Replace all the refs in the FileDataset with expanded versions.
1526 # Pull them off in the order we put them on the list.
1527 for dataset in grouped_datasets:
1528 n_dataset_refs = len(dataset.refs)
1529 dataset.refs = imported_refs[:n_dataset_refs]
1530 del imported_refs[:n_dataset_refs]
1532 # Bulk-insert everything into Datastore.
1533 # We do not know if any of the registry entries already existed
1534 # (_importDatasets only complains if they exist but differ) so
1535 # we have to catch IntegrityError explicitly.
1536 try:
1537 self._datastore.ingest(
1538 *datasets, transfer=transfer, record_validation_info=record_validation_info
1539 )
1540 except IntegrityError as e:
1541 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
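# Usage sketch for ingest(): registering externally produced files with both
# registry and datastore in one call. A minimal sketch only; the repo path,
# run name, "raw" dataset type, data ID values, and file path are assumptions
# and must correspond to an existing repository.
from lsst.daf.butler import Butler, DatasetRef, FileDataset

butler = Butler("/repo", writeable=True)
butler.registry.registerRun("HSC/raw/example")
raw_type = butler.get_dataset_type("raw")  # assumed to be registered already
data_id = butler.registry.expandDataId(instrument="HSC", exposure=903334, detector=10)
ref = DatasetRef(raw_type, data_id, run="HSC/raw/example")
# "copy" leaves the original file in place; "direct" would record the file
# at its current location instead of transferring it.
butler.ingest(FileDataset(path="/data/file.fits", refs=[ref]), transfer="copy")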
1543 @contextlib.contextmanager
1544 def export(
1545 self,
1546 *,
1547 directory: str | None = None,
1548 filename: str | None = None,
1549 format: str | None = None,
1550 transfer: str | None = None,
1551 ) -> Iterator[RepoExportContext]:
1552 # Docstring inherited.
1553 if directory is None and transfer is not None:
1554 raise TypeError("Cannot transfer without providing a directory.")
1555 if transfer == "move":
1556 raise TypeError("Transfer may not be 'move': export is read-only")
1557 if format is None:
1558 if filename is None:
1559 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1560 else:
1561 _, format = os.path.splitext(filename)
1562 if not format:
1563 raise ValueError("Please specify a file extension to determine export format.")
1564 format = format[1:] # Strip leading "."
1565 elif filename is None:
1566 filename = f"export.{format}"
1567 if directory is not None:
1568 filename = os.path.join(directory, filename)
1569 formats = self._config["repo_transfer_formats"]
1570 if format not in formats:
1571 raise ValueError(f"Unknown export format {format!r}, allowed: {', '.join(formats.keys())}")
1572 BackendClass = get_class_of(formats[format, "export"])
1573 with open(filename, "w") as stream:
1574 backend = BackendClass(stream, universe=self.dimensions)
1575 try:
1576 helper = RepoExportContext(
1577 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
1578 )
1579 with self._caching_context():
1580 yield helper
1581 except BaseException:
1582 raise
1583 else:
1584 helper._finish()
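# Usage sketch for export(): writing a YAML export file plus copies of the
# selected file artifacts. Assumes the `butler` instance from the sketch
# above; the collection and dataset type names are illustrative.
with butler.export(directory="/exports", filename="export.yaml", transfer="copy") as export:
    export.saveDatasets(
        butler.registry.queryDatasets("calexp", collections="HSC/runs/example")
    )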
1586 def import_(
1587 self,
1588 *,
1589 directory: ResourcePathExpression | None = None,
1590 filename: ResourcePathExpression | TextIO | None = None,
1591 format: str | None = None,
1592 transfer: str | None = None,
1593 skip_dimensions: set | None = None,
1594 ) -> None:
1595 # Docstring inherited.
1596 if not self.isWriteable():
1597 raise TypeError("Butler is read-only.")
1598 if format is None:
1599 if filename is None:
1600 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1601 else:
1602 _, format = os.path.splitext(filename) # type: ignore
1603 elif filename is None:
1604 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
1605 if directory is not None:
1606 directory = ResourcePath(directory, forceDirectory=True)
1607 # mypy doesn't think this will work but it does in python >= 3.10.
1608 if isinstance(filename, ResourcePathExpression): # type: ignore
1609 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
1610 if not filename.isabs() and directory is not None:
1611 potential = directory.join(filename)
1612 exists_in_cwd = filename.exists()
1613 exists_in_dir = potential.exists()
1614 if exists_in_cwd and exists_in_dir:
1615 _LOG.warning(
1616 "A relative path for filename was specified (%s) which exists relative to cwd. "
1617 "Additionally, the file exists relative to the given search directory (%s). "
1618 "Using the export file in the given directory.",
1619 filename,
1620 potential,
1621 )
1622 # Given they specified an explicit directory and that
1623 # directory has the export file in it, assume that that
1624 # is what was meant despite the file in cwd.
1625 filename = potential
1626 elif exists_in_dir:
1627 filename = potential
1628 elif not exists_in_cwd and not exists_in_dir:
1629 # Raise early.
1630 raise FileNotFoundError(
1631 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
1632 )
1633 BackendClass: type[RepoImportBackend] = get_class_of(
1634 self._config["repo_transfer_formats"][format]["import"]
1635 )
1637 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
1638 with self._caching_context():
1639 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
1640 backend.register()
1641 with self.transaction():
1642 backend.load(
1643 self._datastore,
1644 directory=directory,
1645 transfer=transfer,
1646 skip_dimensions=skip_dimensions,
1647 )
1649 if isinstance(filename, ResourcePath):
1650 # We cannot use open() here at the moment (DM-38589) because
1651 # yaml does stream.read(8192) in a loop.
1652 stream = io.StringIO(filename.read().decode())
1653 doImport(stream)
1654 else:
1655 doImport(filename) # type: ignore
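# Usage sketch for import_(): loading an export file written by export()
# into a second repository. The target repo path is an assumption.
from lsst.daf.butler import Butler

target = Butler("/other_repo", writeable=True)
target.import_(directory="/exports", filename="export.yaml", transfer="symlink")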
1657 def transfer_dimension_records_from(
1658 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
1659 ) -> None:
1660 # Allowed dimensions in the target butler.
1661 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1663 data_ids = {ref.dataId for ref in source_refs}
1665 dimension_records = self._extract_all_dimension_records_from_data_ids(
1666 source_butler, data_ids, elements
1667 )
1669 # Insert order is important.
1670 for element in self.dimensions.sorted(dimension_records.keys()):
1671 records = list(dimension_records[element].values())
1672 # Assume that if the record is already present we can use it
1673 # without having to check that the record metadata is
1674 # consistent.
1675 self._registry.insertDimensionData(element, *records, skip_existing=True)
1676 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records))
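# Usage sketch for transfer_dimension_records_from(): copying the dimension
# records that back a set of refs from one repository into another. The repo
# paths, collection, and dataset type names are assumptions.
from lsst.daf.butler import Butler

source_butler = Butler("/source_repo")
target_butler = Butler("/target_repo", writeable=True)
refs = source_butler.registry.queryDatasets("raw", collections="HSC/raw/all")
target_butler.transfer_dimension_records_from(source_butler, refs)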
1678 def _extract_all_dimension_records_from_data_ids(
1679 self,
1680 source_butler: LimitedButler | Butler,
1681 data_ids: set[DataCoordinate],
1682 allowed_elements: frozenset[DimensionElement],
1683 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1684 primary_records = self._extract_dimension_records_from_data_ids(
1685 source_butler, data_ids, allowed_elements
1686 )
1688 can_query = isinstance(source_butler, Butler)
1690 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1691 for original_element, record_mapping in primary_records.items():
1692 # Get dimensions that depend on this dimension.
1693 populated_by = self.dimensions.get_elements_populated_by(
1694 self.dimensions[original_element.name] # type: ignore
1695 )
1697 for data_id in record_mapping.keys():
1698 for element in populated_by:
1699 if element not in allowed_elements:
1700 continue
1701 if element.name == original_element.name:
1702 continue
1704 if element.name in primary_records:
1705 # If this element has already been stored avoid
1706 # re-finding records since that may lead to additional
1707 # spurious records. e.g. visit is populated_by
1708 # visit_detector_region but querying
1709 # visit_detector_region by visit will return all the
1710 # detectors for this visit -- the visit dataId does not
1711 # constrain this.
1712 # To constrain the query the original dataIds would
1713 # have to be scanned.
1714 continue
1716 if not can_query:
1717 raise RuntimeError(
1718 f"Transferring populated_by records like {element.name} requires a full Butler."
1719 )
1721 records = source_butler.registry.queryDimensionRecords( # type: ignore
1722 element.name,
1723 **data_id.mapping, # type: ignore
1724 )
1725 for record in records:
1726 additional_records[record.definition].setdefault(record.dataId, record)
1728 # The next step is to walk back through the additional records to
1729 # pick up any missing content (such as visit_definition needing to
1730 # know the exposure). We want to ensure we do not request records
1731 # we already have.
1732 missing_data_ids = set()
1733 for name, record_mapping in additional_records.items():
1734 for data_id in record_mapping.keys():
1735 if data_id not in primary_records[name]:
1736 missing_data_ids.add(data_id)
1738 # Fill out the new records. Assume that these new records do not
1739 # also need to carry over additional populated_by records.
1740 secondary_records = self._extract_dimension_records_from_data_ids(
1741 source_butler, missing_data_ids, allowed_elements
1742 )
1744 # Merge the extra sets of records in with the original.
1745 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()):
1746 primary_records[name].update(record_mapping)
1748 return primary_records
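# Sketch of the populated_by lookup used above: for a given dimension the
# universe reports which other elements its records populate. Assumes a
# `butler` instance; the printed names depend on the dimension universe
# version.
universe = butler.dimensions
for element in universe.get_elements_populated_by(universe["visit"]):
    print(element.name)  # e.g. visit_definition, visit_detector_region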
1750 def _extract_dimension_records_from_data_ids(
1751 self,
1752 source_butler: LimitedButler | Butler,
1753 data_ids: set[DataCoordinate],
1754 allowed_elements: frozenset[DimensionElement],
1755 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1756 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1758 for data_id in data_ids:
1759 # Need an expanded record; if it is not expanded we need a full
1760 # butler with a registry (mocks with a registry are allowed too).
1761 if not data_id.hasRecords():
1762 if registry := getattr(source_butler, "registry", None):
1763 data_id = registry.expandDataId(data_id)
1764 else:
1765 raise TypeError("Input butler needs to be a full butler to expand DataId.")
1766 # If this butler doesn't know about a dimension in the source
1767 # butler, things will break later.
1768 for element_name in data_id.dimensions.elements:
1769 record = data_id.records[element_name]
1770 if record is not None and record.definition in allowed_elements:
1771 dimension_records[record.definition].setdefault(record.dataId, record)
1773 return dimension_records
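# Sketch of the record expansion this helper relies on: expanding a data ID
# attaches its dimension records so they can be copied elsewhere. Assumes a
# `butler` instance; the data ID values are illustrative.
data_id = butler.registry.expandDataId(instrument="HSC", exposure=903334, detector=10)
assert data_id.hasRecords()
print(data_id.records["exposure"].timespan)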
1775 def transfer_from(
1776 self,
1777 source_butler: LimitedButler,
1778 source_refs: Iterable[DatasetRef],
1779 transfer: str = "auto",
1780 skip_missing: bool = True,
1781 register_dataset_types: bool = False,
1782 transfer_dimensions: bool = False,
1783 dry_run: bool = False,
1784 ) -> collections.abc.Collection[DatasetRef]:
1785 # Docstring inherited.
1786 if not self.isWriteable():
1787 raise TypeError("Butler is read-only.")
1788 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1790 # We will iterate through the refs multiple times, so convert
1791 # to a list if this isn't already a collection.
1792 if not isinstance(source_refs, collections.abc.Collection):
1793 source_refs = list(source_refs)
1795 original_count = len(source_refs)
1796 _LOG.info("Transferring %d datasets into %s", original_count, str(self))
1798 # In some situations the datastore artifact may be missing
1799 # and we do not want that registry entry to be imported.
1800 # Asking the datastore is not sufficient since the records may
1801 # have been purged; we have to ask for the (predicted) URI and
1802 # check existence explicitly. Execution butler is set up exactly
1803 # like this, with no datastore records.
1804 artifact_existence: dict[ResourcePath, bool] = {}
1805 if skip_missing:
1806 dataset_existence = source_butler._datastore.mexists(
1807 source_refs, artifact_existence=artifact_existence
1808 )
1809 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1810 filtered_count = len(source_refs)
1811 n_missing = original_count - filtered_count
1812 _LOG.verbose(
1813 "%d dataset%s removed because the artifact does not exist. Now have %d.",
1814 n_missing,
1815 "" if n_missing == 1 else "s",
1816 filtered_count,
1817 )
1819 # Importing requires that we group the refs by dataset type and run
1820 # before doing the import.
1821 source_dataset_types = set()
1822 grouped_refs = defaultdict(list)
1823 for ref in source_refs:
1824 grouped_refs[ref.datasetType, ref.run].append(ref)
1825 source_dataset_types.add(ref.datasetType)
1827 # Check that each dataset type in the source butler has the
1828 # same definition in the target butler, and register missing
1829 # ones if requested. Registration must happen outside a transaction.
1830 newly_registered_dataset_types = set()
1831 for datasetType in source_dataset_types:
1832 if register_dataset_types:
1833 # Let this raise immediately if inconsistent. Continuing
1834 # to look for more inconsistent dataset types might result
1835 # in additional unwanted dataset types being registered.
1837 if self._registry.registerDatasetType(datasetType):
1838 newly_registered_dataset_types.add(datasetType)
1839 else:
1840 # If the dataset type is missing, let it fail immediately.
1841 target_dataset_type = self.get_dataset_type(datasetType.name)
1842 if target_dataset_type != datasetType:
1843 raise ConflictingDefinitionError(
1844 "Source butler dataset type differs from definition"
1845 f" in target butler: {datasetType} !="
1846 f" {target_dataset_type}"
1847 )
1848 if newly_registered_dataset_types:
1849 # We may have registered some dataset types even if there were
1850 # inconsistencies, so let people know (or else remove them again).
1851 _LOG.verbose(
1852 "Registered the following dataset types in the target Butler: %s",
1853 ", ".join(d.name for d in newly_registered_dataset_types),
1854 )
1855 else:
1856 _LOG.verbose("All required dataset types are known to the target Butler")
1858 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1859 if transfer_dimensions:
1860 # Collect all the dimension records for these refs.
1861 # All dimensions are to be copied but the list of valid dimensions
1862 # comes from this butler's universe.
1863 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1864 dataIds = {ref.dataId for ref in source_refs}
1865 dimension_records = self._extract_all_dimension_records_from_data_ids(
1866 source_butler, dataIds, elements
1867 )
1869 handled_collections: set[str] = set()
1871 # Do all the importing in a single transaction.
1872 with self.transaction():
1873 if dimension_records and not dry_run:
1874 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
1875 # Order matters.
1876 for element in self.dimensions.sorted(dimension_records.keys()):
1877 records = list(dimension_records[element].values())
1878 # Assume that if the record is already present we can use it
1879 # without having to check that the record metadata is
1880 # consistent.
1881 self._registry.insertDimensionData(element, *records, skip_existing=True)
1883 n_imported = 0
1884 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
1885 grouped_refs.items(), desc="Importing to registry by run and dataset type"
1886 ):
1887 if run not in handled_collections:
1888 # May need to create output collection. If source butler
1889 # has a registry, ask for documentation string.
1890 run_doc = None
1891 if registry := getattr(source_butler, "registry", None):
1892 run_doc = registry.getCollectionDocumentation(run)
1893 if not dry_run:
1894 registered = self._registry.registerRun(run, doc=run_doc)
1895 else:
1896 registered = True
1897 handled_collections.add(run)
1898 if registered:
1899 _LOG.verbose("Creating output run %s", run)
1901 n_refs = len(refs_to_import)
1902 _LOG.verbose(
1903 "Importing %d ref%s of dataset type %s into run %s",
1904 n_refs,
1905 "" if n_refs == 1 else "s",
1906 datasetType.name,
1907 run,
1908 )
1910 # Assume we are using UUIDs and the source refs will match
1911 # those imported.
1912 if not dry_run:
1913 imported_refs = self._registry._importDatasets(refs_to_import)
1914 else:
1915 imported_refs = refs_to_import
1916 assert set(imported_refs) == set(refs_to_import)
1917 n_imported += len(imported_refs)
1919 assert len(source_refs) == n_imported
1920 _LOG.verbose("Imported %d datasets into destination butler", n_imported)
1922 # Ask the datastore to transfer. The datastore has to check that
1923 # the source datastore is compatible with the target datastore.
1924 accepted, rejected = self._datastore.transfer_from(
1925 source_butler._datastore,
1926 source_refs,
1927 transfer=transfer,
1928 artifact_existence=artifact_existence,
1929 dry_run=dry_run,
1930 )
1931 if rejected:
1932 # For now, accept the registry entries but not the files.
1933 _LOG.warning(
1934 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
1935 len(rejected),
1936 len(accepted),
1937 datasetType,
1938 run,
1939 )
1941 return source_refs
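# Usage sketch for transfer_from(): copying registry entries, dimension
# records, and file artifacts between two repositories in one call. The repo
# paths, collection, and dataset type names are assumptions.
from lsst.daf.butler import Butler

source_butler = Butler("/source_repo")
target_butler = Butler("/target_repo", writeable=True)
refs = source_butler.registry.queryDatasets("calexp", collections="HSC/runs/example")
transferred = target_butler.transfer_from(
    source_butler,
    refs,
    transfer="copy",
    register_dataset_types=True,
    transfer_dimensions=True,
)
print(f"Transferred {len(transferred)} datasets")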
1943 def validateConfiguration(
1944 self,
1945 logFailures: bool = False,
1946 datasetTypeNames: Iterable[str] | None = None,
1947 ignore: Iterable[str] | None = None,
1948 ) -> None:
1949 # Docstring inherited.
1950 if datasetTypeNames:
1951 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
1952 else:
1953 datasetTypes = list(self._registry.queryDatasetTypes())
1955 # filter out anything from the ignore list
1956 if ignore:
1957 ignore = set(ignore)
1958 datasetTypes = [
1959 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
1960 ]
1961 else:
1962 ignore = set()
1964 # For each datasetType that has an instrument dimension, create
1965 # a DatasetRef for each defined instrument
1966 datasetRefs = []
1968 # Find all the registered instruments (if "instrument" is in the
1969 # universe).
1970 if "instrument" in self.dimensions:
1971 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
1973 for datasetType in datasetTypes:
1974 if "instrument" in datasetType.dimensions:
1975 # In order to create a conforming dataset ref, create
1976 # fake DataCoordinate values for the non-instrument
1977 # dimensions. The type of the value does not matter here.
1978 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"}
1980 for instrument in instruments:
1981 datasetRef = DatasetRef(
1982 datasetType,
1983 DataCoordinate.standardize(
1984 dataId, instrument=instrument, dimensions=datasetType.dimensions
1985 ),
1986 run="validate",
1987 )
1988 datasetRefs.append(datasetRef)
1990 entities: list[DatasetType | DatasetRef] = []
1991 entities.extend(datasetTypes)
1992 entities.extend(datasetRefs)
1994 datastoreErrorStr = None
1995 try:
1996 self._datastore.validateConfiguration(entities, logFailures=logFailures)
1997 except ValidationError as e:
1998 datastoreErrorStr = str(e)
2000 # Also check that the LookupKeys used by the datastores match
2001 # registry and storage class definitions
2002 keys = self._datastore.getLookupKeys()
2004 failedNames = set()
2005 failedDataId = set()
2006 for key in keys:
2007 if key.name is not None:
2008 if key.name in ignore:
2009 continue
2011 # skip if specific datasetType names were requested and this
2012 # name does not match
2013 if datasetTypeNames and key.name not in datasetTypeNames:
2014 continue
2016 # See if it is a StorageClass or a DatasetType
2017 if key.name in self.storageClasses:
2018 pass
2019 else:
2020 try:
2021 self.get_dataset_type(key.name)
2022 except KeyError:
2023 if logFailures:
2024 _LOG.critical(
2025 "Key '%s' does not correspond to a DatasetType or StorageClass", key
2026 )
2027 failedNames.add(key)
2028 else:
2029 # Dimensions are checked for consistency when the Butler
2030 # is created and rendezvoused with a universe.
2031 pass
2033 # Check that the instrument is a valid instrument.
2034 # Currently only "instrument" is supported as a DataId override, so check for that.
2035 if key.dataId:
2036 dataIdKeys = set(key.dataId)
2037 if {"instrument"} != dataIdKeys:
2038 if logFailures:
2039 _LOG.critical("Key '%s' has unsupported DataId override", key)
2040 failedDataId.add(key)
2041 elif key.dataId["instrument"] not in instruments:
2042 if logFailures:
2043 _LOG.critical("Key '%s' has unknown instrument", key)
2044 failedDataId.add(key)
2046 messages = []
2048 if datastoreErrorStr:
2049 messages.append(datastoreErrorStr)
2051 for failed, msg in (
2052 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2053 (failedDataId, "Keys with bad DataId entries: "),
2054 ):
2055 if failed:
2056 msg += ", ".join(str(k) for k in failed)
2057 messages.append(msg)
2059 if messages:
2060 raise ValidationError(";\n".join(messages))
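# Usage sketch for validateConfiguration(): checking that datastore lookup
# keys and formatter configuration are consistent with the registry. Assumes
# a `butler` instance; the ignored dataset type name is illustrative.
from lsst.daf.butler import ValidationError

try:
    butler.validateConfiguration(logFailures=True, ignore=["unvalidated_type"])
except ValidationError as err:
    print(f"Configuration problems found:\n{err}")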
2062 @property
2063 def collections(self) -> Sequence[str]:
2064 """The collections to search by default, in order
2065 (`~collections.abc.Sequence` [ `str` ]).
2067 This is an alias for ``self.registry.defaults.collections``. It cannot
2068 be set directly in isolation, but all defaults may be changed together
2069 by assigning a new `RegistryDefaults` instance to
2070 ``self.registry.defaults``.
2071 """
2072 return self._registry.defaults.collections
2074 @property
2075 def run(self) -> str | None:
2076 """Name of the run this butler writes outputs to by default (`str` or
2077 `None`).
2079 This is an alias for ``self.registry.defaults.run``. It cannot be set
2080 directly in isolation, but all defaults may be changed together by
2081 assigning a new `RegistryDefaults` instance to
2082 ``self.registry.defaults``.
2083 """
2084 return self._registry.defaults.run
2086 @property
2087 def registry(self) -> Registry:
2088 """The object that manages dataset metadata and relationships
2089 (`Registry`).
2091 Many operations that don't involve reading or writing butler datasets
2092 are accessible only via `Registry` methods. Eventually these methods
2093 will be replaced by equivalent `Butler` methods.
2094 """
2095 return self._registry_shim
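# Usage sketch for the registry shim: legacy query methods remain available
# through this property. Assumes a `butler` instance; the dataset type name
# is illustrative.
for name in butler.registry.queryCollections():
    print(name)
print(butler.registry.getDatasetType("calexp"))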
2097 @property
2098 def dimensions(self) -> DimensionUniverse:
2099 # Docstring inherited.
2100 return self._registry.dimensions
2102 @contextlib.contextmanager
2103 def _query(self) -> Iterator[Query]:
2104 # Docstring inherited.
2105 raise NotImplementedError("TODO DM-41159")
2107 def _preload_cache(self) -> None:
2108 """Immediately load caches that are used for common operations."""
2109 self._registry.preload_cache()
2111 _config: ButlerConfig
2112 """Configuration for this Butler instance."""
2114 _registry: SqlRegistry
2115 """The object that manages dataset metadata and relationships
2116 (`SqlRegistry`).
2118 Most operations that don't involve reading or writing butler datasets are
2119 accessible only via `SqlRegistry` methods.
2120 """
2122 datastore: Datastore
2123 """The object that manages actual dataset storage (`Datastore`).
2125 Direct user access to the datastore should rarely be necessary; the primary
2126 exception is the case where a `Datastore` implementation provides extra
2127 functionality beyond what the base class defines.
2128 """
2130 storageClasses: StorageClassFactory
2131 """An object that maps known storage class names to objects that fully
2132 describe them (`StorageClassFactory`).
2133 """
2135 _registry_shim: RegistryShim
2136 """Shim object to provide a legacy public interface for querying via the
2137 the ``registry`` property.
2138 """