Coverage for python/lsst/daf/butler/direct_butler/_direct_butler.py: 11%
763 statements
coverage.py v7.5.0, created at 2024-04-27 03:00 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Butler top level classes.
29"""
30from __future__ import annotations
32__all__ = (
33 "DirectButler",
34 "ButlerValidationError",
35)
37import collections.abc
38import contextlib
39import io
40import itertools
41import logging
42import numbers
43import os
44import warnings
45from collections import Counter, defaultdict
46from collections.abc import Iterable, Iterator, MutableMapping, Sequence
47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast
49from lsst.resources import ResourcePath, ResourcePathExpression
50from lsst.utils.introspection import get_class_of
51from lsst.utils.logging import VERBOSE, getLogger
52from sqlalchemy.exc import IntegrityError
54from .._butler import Butler
55from .._butler_config import ButlerConfig
56from .._butler_instance_options import ButlerInstanceOptions
57from .._dataset_existence import DatasetExistence
58from .._dataset_ref import DatasetRef
59from .._dataset_type import DatasetType
60from .._deferredDatasetHandle import DeferredDatasetHandle
61from .._exceptions import DatasetNotFoundError, DimensionValueError, ValidationError
62from .._limited_butler import LimitedButler
63from .._registry_shim import RegistryShim
64from .._storage_class import StorageClass, StorageClassFactory
65from .._timespan import Timespan
66from ..datastore import Datastore, NullDatastore
67from ..dimensions import DataCoordinate, Dimension
68from ..direct_query_driver import DirectQueryDriver
69from ..progress import Progress
70from ..queries import Query
71from ..registry import (
72 CollectionType,
73 ConflictingDefinitionError,
74 DataIdError,
75 MissingDatasetTypeError,
76 RegistryDefaults,
77 _RegistryFactory,
78)
79from ..registry.sql_registry import SqlRegistry
80from ..transfers import RepoExportContext
81from ..utils import transactional
82from ._direct_butler_collections import DirectButlerCollections
84if TYPE_CHECKING:
85 from lsst.resources import ResourceHandleProtocol
87 from .._dataset_ref import DatasetId
88 from .._file_dataset import FileDataset
89 from ..datastore import DatasetRefURIs
90 from ..dimensions import DataId, DataIdValue, DimensionElement, DimensionRecord, DimensionUniverse
91 from ..registry import Registry
92 from ..transfers import RepoImportBackend
94_LOG = getLogger(__name__)
97class ButlerValidationError(ValidationError):
98 """There is a problem with the Butler configuration."""
100 pass
103class DirectButler(Butler): # numpydoc ignore=PR02
104 """Main entry point for the data access system.
106 Parameters
107 ----------
108 config : `ButlerConfig`
109 The configuration for this Butler instance.
110 registry : `SqlRegistry`
111 The object that manages dataset metadata and relationships.
112 datastore : `Datastore`
113 The object that manages actual dataset storage.
114 storageClasses : `StorageClassFactory`
115 An object that maps known storage class names to objects that fully
116 describe them.
118 Notes
119 -----
120 Most users should call the top-level `Butler`.``from_config`` instead of
121 using this constructor directly.
122 """
124 # This is __new__ instead of __init__ because we have to support
125 # instantiation via the legacy constructor Butler.__new__(), which
126 # reads the configuration and selects which subclass to instantiate. The
127 # interaction between __new__ and __init__ is kind of wacky in Python. If
128 # we were using __init__ here, __init__ would be called twice (once when
129 # the DirectButler instance is constructed inside Butler.from_config(), and
130 # a second time with the original arguments to Butler() when the instance
131 is returned from Butler.__new__()).
132 def __new__(
133 cls,
134 *,
135 config: ButlerConfig,
136 registry: SqlRegistry,
137 datastore: Datastore,
138 storageClasses: StorageClassFactory,
139 ) -> DirectButler:
140 self = cast(DirectButler, super().__new__(cls))
141 self._config = config
142 self._registry = registry
143 self._datastore = datastore
144 self.storageClasses = storageClasses
146 # For an execution butler the datastore needs a special
147 # dependency-inversion trick. This is not used by a regular butler,
148 # but we do not have a way to distinguish a regular butler from an
149 # execution butler.
150 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
152 self._registry_shim = RegistryShim(self)
154 return self
156 @classmethod
157 def create_from_config(
158 cls,
159 config: ButlerConfig,
160 *,
161 options: ButlerInstanceOptions,
162 without_datastore: bool = False,
163 ) -> DirectButler:
164 """Construct a Butler instance from a configuration file.
166 Parameters
167 ----------
168 config : `ButlerConfig`
169 The configuration for this Butler instance.
170 options : `ButlerInstanceOptions`
171 Default values and other settings for the Butler instance.
172 without_datastore : `bool`, optional
173 If `True` do not attach a datastore to this butler. Any attempts
174 to use a datastore will fail.
176 Notes
177 -----
178 Most users should call the top-level `Butler`.``from_config``
179 instead of using this function directly.
180 """
181 if "run" in config or "collection" in config:
182 raise ValueError("Passing a run or collection via configuration is no longer supported.")
184 defaults = RegistryDefaults(
185 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs
186 )
187 try:
188 butlerRoot = config.get("root", config.configDir)
189 writeable = options.writeable
190 if writeable is None:
191 writeable = options.run is not None
192 registry = _RegistryFactory(config).from_config(
193 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
194 )
195 if without_datastore:
196 datastore: Datastore = NullDatastore(None, None)
197 else:
198 datastore = Datastore.fromConfig(
199 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
200 )
201 # TODO: Once datastore drops dependency on registry we can
202 # construct datastore first and pass opaque tables to registry
203 # constructor.
204 registry.make_datastore_tables(datastore.get_opaque_table_definitions())
205 storageClasses = StorageClassFactory()
206 storageClasses.addFromConfig(config)
208 return DirectButler(
209 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses
210 )
211 except Exception:
212 # Failures here usually mean that the configuration is incomplete;
213 # just issue an error message that includes the config file URI.
214 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.")
215 raise
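# Illustrative sketch only (not part of the source): most callers obtain a
# DirectButler through the top-level ``Butler.from_config`` factory, which reads
# the repository configuration and dispatches to ``create_from_config``. The
# repository path and collection names below are hypothetical.
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler.from_config(
#         "/path/to/repo",
#         collections=["my/collection"],
#         run="u/example/run",
#         writeable=True,
#     )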
217 def _clone(
218 self,
219 *,
220 collections: Any = None,
221 run: str | None = None,
222 inferDefaults: bool = True,
223 **kwargs: Any,
224 ) -> DirectButler:
225 # Docstring inherited
226 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
227 registry = self._registry.copy(defaults)
229 return DirectButler(
230 registry=registry,
231 config=self._config,
232 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()),
233 storageClasses=self.storageClasses,
234 )
236 GENERATION: ClassVar[int] = 3
237 """This is a Generation 3 Butler.
239 This attribute may be removed in the future, once the Generation 2 Butler
240 interface has been fully retired; it should only be used in transitional
241 code.
242 """
244 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
245 """Return DatasetType defined in registry given dataset type name."""
246 try:
247 return self.get_dataset_type(name)
248 except MissingDatasetTypeError:
249 return None
251 @classmethod
252 def _unpickle(
253 cls,
254 config: ButlerConfig,
255 collections: tuple[str, ...] | None,
256 run: str | None,
257 defaultDataId: dict[str, str],
258 writeable: bool,
259 ) -> DirectButler:
260 """Callable used to unpickle a Butler.
262 We prefer not to use ``Butler.__init__`` directly so we can force some
263 of its many arguments to be keyword-only (note that ``__reduce__``
264 can only invoke callables with positional arguments).
266 Parameters
267 ----------
268 config : `ButlerConfig`
269 Butler configuration, already coerced into a true `ButlerConfig`
270 instance (and hence after any search paths for overrides have been
271 utilized).
272 collections : `tuple` [ `str` ]
273 Names of the default collections to read from.
274 run : `str`, optional
275 Name of the default `~CollectionType.RUN` collection to write to.
276 defaultDataId : `dict` [ `str`, `str` ]
277 Default data ID values.
278 writeable : `bool`
279 Whether the Butler should support write operations.
281 Returns
282 -------
283 butler : `Butler`
284 A new `Butler` instance.
285 """
286 return cls.create_from_config(
287 config=config,
288 options=ButlerInstanceOptions(
289 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId
290 ),
291 )
293 def __reduce__(self) -> tuple:
294 """Support pickling."""
295 return (
296 DirectButler._unpickle,
297 (
298 self._config,
299 self.collections,
300 self.run,
301 dict(self._registry.defaults.dataId.required),
302 self._registry.isWriteable(),
303 ),
304 )
306 def __str__(self) -> str:
307 return (
308 f"Butler(collections={self.collections}, run={self.run}, "
309 f"datastore='{self._datastore}', registry='{self._registry}')"
310 )
312 def isWriteable(self) -> bool:
313 # Docstring inherited.
314 return self._registry.isWriteable()
316 def _caching_context(self) -> contextlib.AbstractContextManager[None]:
317 """Context manager that enables caching."""
318 return self._registry.caching_context()
320 @contextlib.contextmanager
321 def transaction(self) -> Iterator[None]:
322 """Context manager supporting `Butler` transactions.
324 Transactions can be nested.
325 """
326 with self._registry.transaction(), self._datastore.transaction():
327 yield
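# Illustrative sketch only (not part of the source): operations performed inside
# ``Butler.transaction`` are rolled back in both registry and datastore if an
# exception escapes the block. The dataset type and data ID values below are
# hypothetical.
#
#     with butler.transaction():
#         butler.put(catalog, "src", instrument="HSC", visit=1228, detector=42)
#         # Raising an exception here would roll back the put() above.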
329 def _standardizeArgs(
330 self,
331 datasetRefOrType: DatasetRef | DatasetType | str,
332 dataId: DataId | None = None,
333 for_put: bool = True,
334 **kwargs: Any,
335 ) -> tuple[DatasetType, DataId | None]:
336 """Standardize the arguments passed to several Butler APIs.
338 Parameters
339 ----------
340 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
341 When `DatasetRef` the `dataId` should be `None`.
342 Otherwise the `DatasetType` or name thereof.
343 dataId : `dict` or `DataCoordinate`
344 A `dict` of `Dimension` link name, value pairs that label the
345 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
346 should be provided as the second argument.
347 for_put : `bool`, optional
348 If `True` this call is invoked as part of a `Butler.put()`.
349 Otherwise it is assumed to be part of a `Butler.get()`. This
350 parameter is only relevant if there is dataset type
351 inconsistency.
352 **kwargs
353 Additional keyword arguments used to augment or construct a
354 `DataCoordinate`. See `DataCoordinate.standardize`
355 parameters.
357 Returns
358 -------
359 datasetType : `DatasetType`
360 A `DatasetType` instance extracted from ``datasetRefOrType``.
361 dataId : `dict` or `DataId`, optional
362 Argument that can be used (along with ``kwargs``) to construct a
363 `DataId`.
365 Notes
366 -----
367 Butler APIs that conceptually need a DatasetRef also allow passing a
368 `DatasetType` (or the name of one) and a `DataId` (or a dict and
369 keyword arguments that can be used to construct one) separately. This
370 method accepts those arguments and always returns a true `DatasetType`
371 and a `DataId` or `dict`.
373 Standardization of `dict` vs `DataId` is best handled by passing the
374 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
375 generally similarly flexible.
376 """
377 externalDatasetType: DatasetType | None = None
378 internalDatasetType: DatasetType | None = None
379 if isinstance(datasetRefOrType, DatasetRef):
380 if dataId is not None or kwargs:
381 raise ValueError("DatasetRef given, cannot use dataId as well")
382 externalDatasetType = datasetRefOrType.datasetType
383 dataId = datasetRefOrType.dataId
384 else:
385 # Don't check whether a DataId is provided, because Registry APIs
386 # can usually construct a better error message when it isn't.
387 if isinstance(datasetRefOrType, DatasetType):
388 externalDatasetType = datasetRefOrType
389 else:
390 internalDatasetType = self.get_dataset_type(datasetRefOrType)
392 # Check that they are self-consistent
393 if externalDatasetType is not None:
394 internalDatasetType = self.get_dataset_type(externalDatasetType.name)
395 if externalDatasetType != internalDatasetType:
396 # We can allow differences if they are compatible, depending
397 # on whether this is a get or a put. A get requires that
398 # the python type associated with the datastore can be
399 # converted to the user type. A put requires that the user
400 # supplied python type can be converted to the internal
401 # type expected by registry.
402 relevantDatasetType = internalDatasetType
403 if for_put:
404 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
405 else:
406 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
407 relevantDatasetType = externalDatasetType
408 if not is_compatible:
409 raise ValueError(
410 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
411 f"registry definition ({internalDatasetType})"
412 )
413 # Override the internal definition.
414 internalDatasetType = relevantDatasetType
416 assert internalDatasetType is not None
417 return internalDatasetType, dataId
419 def _rewrite_data_id(
420 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
421 ) -> tuple[DataId | None, dict[str, Any]]:
422 """Rewrite a data ID taking into account dimension records.
424 Take a Data ID and keyword args and rewrite it if necessary to
425 allow the user to specify dimension records rather than dimension
426 primary values.
428 This allows a user to include a dataId dict with keys of
429 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
430 the integer exposure ID. It also allows a string to be given
431 for a dimension value rather than the integer ID if that is more
432 convenient. For example, rather than having to specify the
433 detector with ``detector.full_name``, a string given for ``detector``
434 will be interpreted as the full name and converted to the integer
435 value.
437 Keyword arguments can also use strings for dimensions like detector
438 and exposure but Python does not allow them to include ``.`` and
439 so the ``exposure.day_obs`` syntax cannot be used in a keyword
440 argument.
442 Parameters
443 ----------
444 dataId : `dict` or `DataCoordinate`
445 A `dict` of `Dimension` link name, value pairs that will label the
446 `DatasetRef` within a Collection.
447 datasetType : `DatasetType`
448 The dataset type associated with this dataId. Required to
449 determine the relevant dimensions.
450 **kwargs
451 Additional keyword arguments used to augment or construct a
452 `DataId`. See `DataId` parameters.
454 Returns
455 -------
456 dataId : `dict` or `DataCoordinate`
457 The possibly rewritten dataId. If given a `DataCoordinate` and
458 no keyword arguments, the original dataId will be returned
459 unchanged.
460 **kwargs : `dict`
461 Any unused keyword arguments (normally an empty dict).
462 """
463 # Do nothing if we have a standalone DataCoordinate.
464 if isinstance(dataId, DataCoordinate) and not kwargs:
465 return dataId, kwargs
467 # Process dimension records that are using record information
468 # rather than ids
469 newDataId: dict[str, DataIdValue] = {}
470 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
472 # If all of the dataId comes from keyword parameters we do not need
473 # to do anything here, because the keys can't be of the form
474 # exposure.obs_id: a "." is not allowed in a keyword parameter.
475 if dataId:
476 for k, v in dataId.items():
477 # If we have a Dimension we do not need to do anything
478 # because it cannot be a compound key.
479 if isinstance(k, str) and "." in k:
480 # Someone is using a more human-readable dataId
481 dimensionName, record = k.split(".", 1)
482 byRecord[dimensionName][record] = v
483 elif isinstance(k, Dimension):
484 newDataId[k.name] = v
485 else:
486 newDataId[k] = v
488 # Go through the updated dataId and check the type in case someone is
489 # using an alternate key. We have already filtered out the compound
490 # key ``dimension.record`` format.
491 not_dimensions = {}
493 # Will need to look in the dataId and the keyword arguments
494 # and will remove them if they need to be fixed or are unrecognized.
495 for dataIdDict in (newDataId, kwargs):
496 # Use a list so we can adjust the dict safely in the loop
497 for dimensionName in list(dataIdDict):
498 value = dataIdDict[dimensionName]
499 try:
500 dimension = self.dimensions.dimensions[dimensionName]
501 except KeyError:
502 # This is not a real dimension
503 not_dimensions[dimensionName] = value
504 del dataIdDict[dimensionName]
505 continue
507 # Convert an integral type to an explicit int to simplify
508 # comparisons here
509 if isinstance(value, numbers.Integral):
510 value = int(value)
512 if not isinstance(value, dimension.primaryKey.getPythonType()):
513 for alternate in dimension.alternateKeys:
514 if isinstance(value, alternate.getPythonType()):
515 byRecord[dimensionName][alternate.name] = value
516 del dataIdDict[dimensionName]
517 _LOG.debug(
518 "Converting dimension %s to %s.%s=%s",
519 dimensionName,
520 dimensionName,
521 alternate.name,
522 value,
523 )
524 break
525 else:
526 _LOG.warning(
527 "Type mismatch found for value '%r' provided for dimension %s. "
528 "Could not find matching alternative (primary key has type %s) "
529 "so attempting to use as-is.",
530 value,
531 dimensionName,
532 dimension.primaryKey.getPythonType(),
533 )
535 # By this point kwargs and newDataId should only include valid
536 # dimensions. Merge kwargs into the new dataId and log if there
537 # are dimensions in both (rather than calling update).
538 for k, v in kwargs.items():
539 if k in newDataId and newDataId[k] != v:
540 _LOG.debug(
541 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
542 )
543 newDataId[k] = v
544 # No need to retain any values in kwargs now.
545 kwargs = {}
547 # If we have some unrecognized dimensions we have to try to connect
548 # them to records in other dimensions. This is made more complicated
549 # by some dimensions having records with clashing names. A mitigation
550 # is that we can tell by this point which dimensions are missing
551 # for the DatasetType but this does not work for calibrations
552 # where additional dimensions can be used to constrain the temporal
553 # axis.
554 if not_dimensions:
555 # Search for all dimensions even if we have been given a value
556 # explicitly. In some cases records are given as well as the
557 # actual dimension, and this should not be an error if they
558 # match.
559 mandatoryDimensions = datasetType.dimensions.names # - provided
561 candidateDimensions: set[str] = set()
562 candidateDimensions.update(mandatoryDimensions)
564 # For calibrations we may well need temporal dimensions,
565 # so rather than always including all dimensions in the scan
566 # restrict things a little. It is still possible for there
567 # to be confusion over day_obs in visit vs exposure, for example.
568 # If we are not searching calibration collections things may
569 # fail, but they are going to fail anyway because of the
570 # ambiguity of the dataId...
571 if datasetType.isCalibration():
572 for dim in self.dimensions.dimensions:
573 if dim.temporal:
574 candidateDimensions.add(str(dim))
576 # Look up table for the first association with a dimension
577 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
579 # Keep track of whether an item is associated with multiple
580 # dimensions.
581 counter: Counter[str] = Counter()
582 assigned: dict[str, set[str]] = defaultdict(set)
584 # Go through the missing dimensions and associate the
585 # given names with records within those dimensions
586 matched_dims = set()
587 for dimensionName in candidateDimensions:
588 dimension = self.dimensions.dimensions[dimensionName]
589 fields = dimension.metadata.names | dimension.uniqueKeys.names
590 for field in not_dimensions:
591 if field in fields:
592 guessedAssociation[dimensionName][field] = not_dimensions[field]
593 counter[dimensionName] += 1
594 assigned[field].add(dimensionName)
595 matched_dims.add(field)
597 # Calculate the fields that matched nothing.
598 never_found = set(not_dimensions) - matched_dims
600 if never_found:
601 raise DimensionValueError(f"Unrecognized keyword args given: {never_found}")
603 # There is a chance we have allocated a single dataId item
604 # to multiple dimensions. Need to decide which should be retained.
605 # For now assume that the most popular alternative wins.
606 # This means that day_obs with seq_num will result in
607 # exposure.day_obs and not visit.day_obs.
608 # Also prefer an explicitly missing dimension over an inferred
609 # temporal dimension.
610 for fieldName, assignedDimensions in assigned.items():
611 if len(assignedDimensions) > 1:
612 # Pick the most popular (preferring mandatory dimensions)
613 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
614 if requiredButMissing:
615 candidateDimensions = requiredButMissing
616 else:
617 candidateDimensions = assignedDimensions
619 # If this is a choice between visit and exposure and
620 # neither was a required part of the dataset type,
621 # (hence in this branch) always prefer exposure over
622 # visit since exposures are always defined and visits
623 # are defined from exposures.
624 if candidateDimensions == {"exposure", "visit"}:
625 candidateDimensions = {"exposure"}
627 # Select the relevant items and get a new restricted
628 # counter.
629 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
630 duplicatesCounter: Counter[str] = Counter()
631 duplicatesCounter.update(theseCounts)
633 # Choose the most common. If they are equally common
634 # we will pick the one that was found first.
635 # (most_common returns a list of tuples.)
636 selected = duplicatesCounter.most_common(1)[0][0]
638 _LOG.debug(
639 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
640 " Removed ambiguity by choosing dimension %s.",
641 fieldName,
642 ", ".join(assignedDimensions),
643 selected,
644 )
646 for candidateDimension in assignedDimensions:
647 if candidateDimension != selected:
648 del guessedAssociation[candidateDimension][fieldName]
650 # Update the record look up dict with the new associations
651 for dimensionName, values in guessedAssociation.items():
652 if values: # A dict might now be empty
653 _LOG.debug(
654 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
655 )
656 byRecord[dimensionName].update(values)
658 if byRecord:
659 # Some record specifiers were found so we need to convert
660 # them to the Id form
661 for dimensionName, values in byRecord.items():
662 if dimensionName in newDataId:
663 _LOG.debug(
664 "DataId specified explicit %s dimension value of %s in addition to"
665 " general record specifiers for it of %s. Ignoring record information.",
666 dimensionName,
667 newDataId[dimensionName],
668 str(values),
669 )
670 # Get the actual record and compare with these values.
671 try:
672 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
673 except DataIdError:
674 raise DimensionValueError(
675 f"Could not find dimension '{dimensionName}'"
676 f" with dataId {newDataId} as part of comparing with"
677 f" record values {byRecord[dimensionName]}"
678 ) from None
679 if len(recs) == 1:
680 errmsg: list[str] = []
681 for k, v in values.items():
682 if (recval := getattr(recs[0], k)) != v:
683 errmsg.append(f"{k}({recval} != {v})")
684 if errmsg:
685 raise DimensionValueError(
686 f"Dimension {dimensionName} in dataId has explicit value"
687 " inconsistent with records: " + ", ".join(errmsg)
688 )
689 else:
690 # Multiple matches for an explicit dimension
691 # should never happen but let downstream complain.
692 pass
693 continue
695 # Build up a WHERE expression
696 bind = dict(values.items())
697 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
699 # Hopefully we get a single record that matches
700 records = set(
701 self._registry.queryDimensionRecords(
702 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
703 )
704 )
706 if len(records) != 1:
707 if len(records) > 1:
708 # visit can have an ambiguous answer without involving
709 # visit_system. The default visit_system is defined
710 # by the instrument.
711 if (
712 dimensionName == "visit"
713 and "visit_system_membership" in self.dimensions
714 and "visit_system" in self.dimensions["instrument"].metadata
715 ):
716 instrument_records = list(
717 self._registry.queryDimensionRecords(
718 "instrument",
719 dataId=newDataId,
720 **kwargs,
721 )
722 )
723 if len(instrument_records) == 1:
724 visit_system = instrument_records[0].visit_system
725 if visit_system is None:
726 # Set to a value that will never match.
727 visit_system = -1
729 # Look up each visit in the
730 # visit_system_membership records.
731 for rec in records:
732 membership = list(
733 self._registry.queryDimensionRecords(
734 # Use bind to allow zero results.
735 # This is a fully-specified query.
736 "visit_system_membership",
737 where="instrument = inst AND visit_system = system AND visit = v",
738 bind=dict(
739 inst=instrument_records[0].name, system=visit_system, v=rec.id
740 ),
741 )
742 )
743 if membership:
744 # This record is the right answer.
745 records = {rec}
746 break
748 # The ambiguity may have been resolved so check again.
749 if len(records) > 1:
750 _LOG.debug(
751 "Received %d records from constraints of %s", len(records), str(values)
752 )
753 for r in records:
754 _LOG.debug("- %s", str(r))
755 raise DimensionValueError(
756 f"DataId specification for dimension {dimensionName} is not"
757 f" uniquely constrained to a single dataset by {values}."
758 f" Got {len(records)} results."
759 )
760 else:
761 raise DimensionValueError(
762 f"DataId specification for dimension {dimensionName} matched no"
763 f" records when constrained by {values}"
764 )
766 # Get the primary key from the real dimension object
767 dimension = self.dimensions.dimensions[dimensionName]
768 if not isinstance(dimension, Dimension):
769 raise RuntimeError(
770 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
771 )
772 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
774 return newDataId, kwargs
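# Illustrative sketch only (not part of the source): the rewriting above is what
# lets callers use dimension-record values instead of primary keys, e.g.
# ``exposure.day_obs``/``exposure.seq_num`` or a detector full-name string
# instead of the integer IDs. The instrument, values, and collection below are
# hypothetical.
#
#     raw = butler.get(
#         "raw",
#         dataId={"exposure.day_obs": 20240101, "exposure.seq_num": 42},
#         instrument="LATISS",
#         detector="RXX_S00",  # full-name string instead of the integer ID
#         collections="LATISS/raw/all",
#     )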
776 def _findDatasetRef(
777 self,
778 datasetRefOrType: DatasetRef | DatasetType | str,
779 dataId: DataId | None = None,
780 *,
781 collections: Any = None,
782 predict: bool = False,
783 run: str | None = None,
784 datastore_records: bool = False,
785 timespan: Timespan | None = None,
786 **kwargs: Any,
787 ) -> DatasetRef:
788 """Shared logic for methods that start with a search for a dataset in
789 the registry.
791 Parameters
792 ----------
793 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
794 When `DatasetRef` the `dataId` should be `None`.
795 Otherwise the `DatasetType` or name thereof.
796 dataId : `dict` or `DataCoordinate`, optional
797 A `dict` of `Dimension` link name, value pairs that label the
798 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
799 should be provided as the first argument.
800 collections : Any, optional
801 Collections to be searched, overriding ``self.collections``.
802 Can be any of the types supported by the ``collections`` argument
803 to butler construction.
804 predict : `bool`, optional
805 If `True`, return a newly created `DatasetRef` with a unique
806 dataset ID if finding a reference in the `Registry` fails.
807 Defaults to `False`.
808 run : `str`, optional
809 Run collection name to use for creating `DatasetRef` for predicted
810 datasets. Only used if ``predict`` is `True`.
811 datastore_records : `bool`, optional
812 If `True` add datastore records to returned `DatasetRef`.
813 timespan : `Timespan` or `None`, optional
814 A timespan that the validity range of the dataset must overlap.
815 If not provided and this is a calibration dataset type, an attempt
816 will be made to find the timespan from any temporal coordinate
817 in the data ID.
818 **kwargs
819 Additional keyword arguments used to augment or construct a
820 `DataId`. See `DataId` parameters.
822 Returns
823 -------
824 ref : `DatasetRef`
825 A reference to the dataset identified by the given arguments.
826 This can be the same dataset reference as given if it was
827 resolved.
829 Raises
830 ------
831 LookupError
832 Raised if no matching dataset exists in the `Registry` (and
833 ``predict`` is `False`).
834 ValueError
835 Raised if a resolved `DatasetRef` was passed as an input, but it
836 differs from the one found in the registry.
837 TypeError
838 Raised if no collections were provided.
839 """
840 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
841 if isinstance(datasetRefOrType, DatasetRef):
842 if collections is not None:
843 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
844 # May need to retrieve datastore records if requested.
845 if datastore_records and datasetRefOrType._datastore_records is None:
846 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
847 return datasetRefOrType
849 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
851 if datasetType.isCalibration():
852 # Because this is a calibration dataset, first try to
853 # standardize the data ID without restricting the dimensions to
854 # those of the dataset type requested, because there may be extra
855 # dimensions that provide temporal information for a validity-range
856 # lookup.
857 dataId = DataCoordinate.standardize(
858 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
859 )
860 if timespan is None:
861 if dataId.dimensions.temporal:
862 dataId = self._registry.expandDataId(dataId)
863 # Use the timespan from the data ID to constrain the
864 # calibration lookup, but only if the caller has not
865 # specified an explicit timespan.
866 timespan = dataId.timespan
867 else:
868 # Try an arbitrary timespan. Downstream will fail if this
869 # results in more than one matching dataset.
870 timespan = Timespan(None, None)
871 else:
872 # Standardize the data ID to just the dimensions of the dataset
873 # type instead of letting registry.findDataset do it, so we get the
874 # result even if no dataset is found.
875 dataId = DataCoordinate.standardize(
876 dataId,
877 dimensions=datasetType.dimensions,
878 defaults=self._registry.defaults.dataId,
879 **kwargs,
880 )
881 # Always look up the DatasetRef, even if one is given, to ensure it is
882 # present in the current collection.
883 ref = self.find_dataset(
884 datasetType,
885 dataId,
886 collections=collections,
887 timespan=timespan,
888 datastore_records=datastore_records,
889 )
890 if ref is None:
891 if predict:
892 if run is None:
893 run = self.run
894 if run is None:
895 raise TypeError("Cannot predict dataset ID/location with run=None.")
896 return DatasetRef(datasetType, dataId, run=run)
897 else:
898 if collections is None:
899 collections = self._registry.defaults.collections
900 raise DatasetNotFoundError(
901 f"Dataset {datasetType.name} with data ID {dataId} "
902 f"could not be found in collections {collections}."
903 )
904 if datasetType != ref.datasetType:
905 # If they differ it is because the user explicitly specified
906 # a compatible dataset type to this call rather than using the
907 # registry definition. The DatasetRef must therefore be recreated
908 # using the user definition such that the expected type is
909 # returned.
910 ref = DatasetRef(
911 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
912 )
914 return ref
916 @transactional
917 def put(
918 self,
919 obj: Any,
920 datasetRefOrType: DatasetRef | DatasetType | str,
921 /,
922 dataId: DataId | None = None,
923 *,
924 run: str | None = None,
925 **kwargs: Any,
926 ) -> DatasetRef:
927 """Store and register a dataset.
929 Parameters
930 ----------
931 obj : `object`
932 The dataset.
933 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
934 When `DatasetRef` is provided, ``dataId`` should be `None`.
935 Otherwise the `DatasetType` or name thereof. If a fully resolved
936 `DatasetRef` is given, the run and ID are used directly.
937 dataId : `dict` or `DataCoordinate`
938 A `dict` of `Dimension` link name, value pairs that label the
939 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
940 should be provided as the second argument.
941 run : `str`, optional
942 The name of the run the dataset should be added to, overriding
943 ``self.run``. Not used if a resolved `DatasetRef` is provided.
944 **kwargs
945 Additional keyword arguments used to augment or construct a
946 `DataCoordinate`. See `DataCoordinate.standardize`
947 parameters. Not used if a resolved `DatasetRef` is provided.
949 Returns
950 -------
951 ref : `DatasetRef`
952 A reference to the stored dataset, updated with the correct id if
953 given.
955 Raises
956 ------
957 TypeError
958 Raised if the butler is read-only or if no run has been provided.
959 """
960 if isinstance(datasetRefOrType, DatasetRef):
961 # This is a direct put of predefined DatasetRef.
962 _LOG.debug("Butler put direct: %s", datasetRefOrType)
963 if run is not None:
964 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
965 # If registry already has a dataset with the same dataset ID,
966 # dataset type and DataId, then _importDatasets will do nothing and
967 # just return the original ref. We have to raise in this case; there
968 # is a datastore check below for that.
969 self._registry._importDatasets([datasetRefOrType], expand=True)
970 # Before trying to write to the datastore check that it does not
971 # know this dataset. This is prone to races, of course.
972 if self._datastore.knows(datasetRefOrType):
973 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
974 # Try to write the dataset to the datastore; if it fails due to a race
975 # with another write, the content of the stored data may be
976 # unpredictable.
977 try:
978 self._datastore.put(obj, datasetRefOrType)
979 except IntegrityError as e:
980 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
981 return datasetRefOrType
983 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
984 if not self.isWriteable():
985 raise TypeError("Butler is read-only.")
986 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
988 # Handle dimension records in dataId
989 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
991 # Add Registry Dataset entry.
992 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs)
993 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
994 self._datastore.put(obj, ref)
996 return ref
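# Illustrative sketch only (not part of the source): a plain put() resolves the
# dataset type and data ID through the registry before writing to the datastore,
# while passing a resolved DatasetRef takes the direct-put branch above. Names
# and values are hypothetical.
#
#     ref = butler.put(exposure, "calexp", instrument="HSC", visit=1228,
#                      detector=42, run="u/example/run")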
998 def getDeferred(
999 self,
1000 datasetRefOrType: DatasetRef | DatasetType | str,
1001 /,
1002 dataId: DataId | None = None,
1003 *,
1004 parameters: dict | None = None,
1005 collections: Any = None,
1006 storageClass: str | StorageClass | None = None,
1007 timespan: Timespan | None = None,
1008 **kwargs: Any,
1009 ) -> DeferredDatasetHandle:
1010 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1011 after an immediate registry lookup.
1013 Parameters
1014 ----------
1015 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1016 When `DatasetRef` the `dataId` should be `None`.
1017 Otherwise the `DatasetType` or name thereof.
1018 dataId : `dict` or `DataCoordinate`, optional
1019 A `dict` of `Dimension` link name, value pairs that label the
1020 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1021 should be provided as the first argument.
1022 parameters : `dict`
1023 Additional StorageClass-defined options to control reading,
1024 typically used to efficiently read only a subset of the dataset.
1025 collections : Any, optional
1026 Collections to be searched, overriding ``self.collections``.
1027 Can be any of the types supported by the ``collections`` argument
1028 to butler construction.
1029 storageClass : `StorageClass` or `str`, optional
1030 The storage class to be used to override the Python type
1031 returned by this method. By default the returned type matches
1032 the dataset type definition for this dataset. Specifying a
1033 read `StorageClass` can force a different type to be returned.
1034 This type must be compatible with the original type.
1035 timespan : `Timespan` or `None`, optional
1036 A timespan that the validity range of the dataset must overlap.
1037 If not provided and this is a calibration dataset type, an attempt
1038 will be made to find the timespan from any temporal coordinate
1039 in the data ID.
1040 **kwargs
1041 Additional keyword arguments used to augment or construct a
1042 `DataId`. See `DataId` parameters.
1044 Returns
1045 -------
1046 obj : `DeferredDatasetHandle`
1047 A handle which can be used to retrieve a dataset at a later time.
1049 Raises
1050 ------
1051 LookupError
1052 Raised if no matching dataset exists in the `Registry` or
1053 datastore.
1054 ValueError
1055 Raised if a resolved `DatasetRef` was passed as an input, but it
1056 differs from the one found in the registry.
1057 TypeError
1058 Raised if no collections were provided.
1059 """
1060 if isinstance(datasetRefOrType, DatasetRef):
1061 # Do the quick check first and if that fails, check for artifact
1062 # existence. This is necessary for datastores that are configured
1063 # in trust mode where there won't be a record but there will be
1064 # a file.
1065 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
1066 ref = datasetRefOrType
1067 else:
1068 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1069 else:
1070 ref = self._findDatasetRef(
1071 datasetRefOrType, dataId, collections=collections, timespan=timespan, **kwargs
1072 )
1073 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
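# Illustrative sketch only (not part of the source): getDeferred() performs the
# registry lookup immediately but defers the datastore read until the handle's
# ``get`` is called. The dataset type, data ID, and read parameters below are
# hypothetical.
#
#     handle = butler.getDeferred("calexp", instrument="HSC", visit=1228, detector=42)
#     cutout = handle.get(parameters={"bbox": bbox})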
1075 def get(
1076 self,
1077 datasetRefOrType: DatasetRef | DatasetType | str,
1078 /,
1079 dataId: DataId | None = None,
1080 *,
1081 parameters: dict[str, Any] | None = None,
1082 collections: Any = None,
1083 storageClass: StorageClass | str | None = None,
1084 timespan: Timespan | None = None,
1085 **kwargs: Any,
1086 ) -> Any:
1087 """Retrieve a stored dataset.
1089 Parameters
1090 ----------
1091 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1092 When `DatasetRef` the `dataId` should be `None`.
1093 Otherwise the `DatasetType` or name thereof.
1094 If a resolved `DatasetRef`, the associated dataset
1095 is returned directly without additional querying.
1096 dataId : `dict` or `DataCoordinate`
1097 A `dict` of `Dimension` link name, value pairs that label the
1098 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1099 should be provided as the first argument.
1100 parameters : `dict`
1101 Additional StorageClass-defined options to control reading,
1102 typically used to efficiently read only a subset of the dataset.
1103 collections : Any, optional
1104 Collections to be searched, overriding ``self.collections``.
1105 Can be any of the types supported by the ``collections`` argument
1106 to butler construction.
1107 storageClass : `StorageClass` or `str`, optional
1108 The storage class to be used to override the Python type
1109 returned by this method. By default the returned type matches
1110 the dataset type definition for this dataset. Specifying a
1111 read `StorageClass` can force a different type to be returned.
1112 This type must be compatible with the original type.
1113 timespan : `Timespan` or `None`, optional
1114 A timespan that the validity range of the dataset must overlap.
1115 If not provided and this is a calibration dataset type, an attempt
1116 will be made to find the timespan from any temporal coordinate
1117 in the data ID.
1118 **kwargs
1119 Additional keyword arguments used to augment or construct a
1120 `DataCoordinate`. See `DataCoordinate.standardize`
1121 parameters.
1123 Returns
1124 -------
1125 obj : `object`
1126 The dataset.
1128 Raises
1129 ------
1130 LookupError
1131 Raised if no matching dataset exists in the `Registry`.
1132 TypeError
1133 Raised if no collections were provided.
1135 Notes
1136 -----
1137 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1138 this method requires that the given data ID include temporal dimensions
1139 beyond the dimensions of the dataset type itself, in order to find the
1140 dataset with the appropriate validity range. For example, a "bias"
1141 dataset with native dimensions ``{instrument, detector}`` could be
1142 fetched with a ``{instrument, detector, exposure}`` data ID, because
1143 ``exposure`` is a temporal dimension.
1144 """
1145 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1146 ref = self._findDatasetRef(
1147 datasetRefOrType,
1148 dataId,
1149 collections=collections,
1150 datastore_records=True,
1151 timespan=timespan,
1152 **kwargs,
1153 )
1154 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
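# Illustrative sketch only (not part of the source): ``storageClass`` can
# override the Python type returned and ``parameters`` can restrict what is
# read. The dataset type, storage class, and collection names below are
# hypothetical.
#
#     table = butler.get("sourceTable", instrument="HSC", visit=1228,
#                        storageClass="ArrowAstropy",
#                        collections="HSC/runs/example")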
1156 def getURIs(
1157 self,
1158 datasetRefOrType: DatasetRef | DatasetType | str,
1159 /,
1160 dataId: DataId | None = None,
1161 *,
1162 predict: bool = False,
1163 collections: Any = None,
1164 run: str | None = None,
1165 **kwargs: Any,
1166 ) -> DatasetRefURIs:
1167 """Return the URIs associated with the dataset.
1169 Parameters
1170 ----------
1171 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1172 When `DatasetRef` the `dataId` should be `None`.
1173 Otherwise the `DatasetType` or name thereof.
1174 dataId : `dict` or `DataCoordinate`
1175 A `dict` of `Dimension` link name, value pairs that label the
1176 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1177 should be provided as the first argument.
1178 predict : `bool`
1179 If `True`, allow URIs to be returned of datasets that have not
1180 been written.
1181 collections : Any, optional
1182 Collections to be searched, overriding ``self.collections``.
1183 Can be any of the types supported by the ``collections`` argument
1184 to butler construction.
1185 run : `str`, optional
1186 Run to use for predictions, overriding ``self.run``.
1187 **kwargs
1188 Additional keyword arguments used to augment or construct a
1189 `DataCoordinate`. See `DataCoordinate.standardize`
1190 parameters.
1192 Returns
1193 -------
1194 uris : `DatasetRefURIs`
1195 The URI to the primary artifact associated with this dataset (if
1196 the dataset was disassembled within the datastore this may be
1197 `None`), and the URIs to any components associated with the dataset
1198 artifact. (can be empty if there are no components).
1199 """
1200 ref = self._findDatasetRef(
1201 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1202 )
1203 return self._datastore.getURIs(ref, predict)
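# Illustrative sketch only (not part of the source): the returned DatasetRefURIs
# carries a primary URI plus a mapping of component URIs; with predict=True a
# URI can be computed for a dataset that has not been written yet. Values below
# are hypothetical.
#
#     uris = butler.getURIs("calexp", instrument="HSC", visit=1228, detector=42)
#     print(uris.primaryURI, dict(uris.componentURIs))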
1205 def get_dataset_type(self, name: str) -> DatasetType:
1206 return self._registry.getDatasetType(name)
1208 def get_dataset(
1209 self,
1210 id: DatasetId,
1211 *,
1212 storage_class: str | StorageClass | None = None,
1213 dimension_records: bool = False,
1214 datastore_records: bool = False,
1215 ) -> DatasetRef | None:
1216 ref = self._registry.getDataset(id)
1217 if ref is not None:
1218 if dimension_records:
1219 ref = ref.expanded(
1220 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)
1221 )
1222 if storage_class:
1223 ref = ref.overrideStorageClass(storage_class)
1224 if datastore_records:
1225 ref = self._registry.get_datastore_records(ref)
1226 return ref
1228 def find_dataset(
1229 self,
1230 dataset_type: DatasetType | str,
1231 data_id: DataId | None = None,
1232 *,
1233 collections: str | Sequence[str] | None = None,
1234 timespan: Timespan | None = None,
1235 storage_class: str | StorageClass | None = None,
1236 dimension_records: bool = False,
1237 datastore_records: bool = False,
1238 **kwargs: Any,
1239 ) -> DatasetRef | None:
1240 # Handle any parts of the dataID that are not using primary dimension
1241 # keys.
1242 if isinstance(dataset_type, str):
1243 actual_type = self.get_dataset_type(dataset_type)
1244 else:
1245 actual_type = dataset_type
1247 # Store the component for later.
1248 component_name = actual_type.component()
1249 if actual_type.isComponent():
1250 parent_type = actual_type.makeCompositeDatasetType()
1251 else:
1252 parent_type = actual_type
1254 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)
1256 ref = self._registry.findDataset(
1257 parent_type,
1258 data_id,
1259 collections=collections,
1260 timespan=timespan,
1261 datastore_records=datastore_records,
1262 **kwargs,
1263 )
1264 if ref is not None and dimension_records:
1265 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions))
1266 if ref is not None and component_name:
1267 ref = ref.makeComponentRef(component_name)
1268 if ref is not None and storage_class is not None:
1269 ref = ref.overrideStorageClass(storage_class)
1271 return ref
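# Illustrative sketch only (not part of the source): find_dataset() returns a
# resolved DatasetRef (or `None`) without reading the artifact, and handles
# component dataset types by resolving the parent first. Names and values below
# are hypothetical.
#
#     ref = butler.find_dataset("calexp.wcs", instrument="HSC", visit=1228,
#                               detector=42, collections="HSC/runs/example")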
1273 def retrieveArtifacts(
1274 self,
1275 refs: Iterable[DatasetRef],
1276 destination: ResourcePathExpression,
1277 transfer: str = "auto",
1278 preserve_path: bool = True,
1279 overwrite: bool = False,
1280 ) -> list[ResourcePath]:
1281 # Docstring inherited.
1282 return self._datastore.retrieveArtifacts(
1283 refs,
1284 ResourcePath(destination),
1285 transfer=transfer,
1286 preserve_path=preserve_path,
1287 overwrite=overwrite,
1288 )
1290 def exists(
1291 self,
1292 dataset_ref_or_type: DatasetRef | DatasetType | str,
1293 /,
1294 data_id: DataId | None = None,
1295 *,
1296 full_check: bool = True,
1297 collections: Any = None,
1298 **kwargs: Any,
1299 ) -> DatasetExistence:
1300 # Docstring inherited.
1301 existence = DatasetExistence.UNRECOGNIZED
1303 if isinstance(dataset_ref_or_type, DatasetRef):
1304 if collections is not None:
1305 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1306 if data_id is not None:
1307 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1308 ref = dataset_ref_or_type
1309 registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
1310 if registry_ref is not None:
1311 existence |= DatasetExistence.RECORDED
1313 if dataset_ref_or_type != registry_ref:
1314 # This could mean that storage classes differ, so we should
1315 # check for that but use the registry ref for the rest of
1316 # the method.
1317 if registry_ref.is_compatible_with(dataset_ref_or_type):
1318 # Use the registry version from now on.
1319 ref = registry_ref
1320 else:
1321 raise ValueError(
1322 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1323 f"in registry but has different incompatible values ({registry_ref})."
1324 )
1325 else:
1326 try:
1327 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1328 except (LookupError, TypeError):
1329 return existence
1330 existence |= DatasetExistence.RECORDED
1332 if self._datastore.knows(ref):
1333 existence |= DatasetExistence.DATASTORE
1335 if full_check:
1336 if self._datastore.exists(ref):
1337 existence |= DatasetExistence._ARTIFACT
1338 elif existence.value != DatasetExistence.UNRECOGNIZED.value:
1339 # Do not add this flag if we have no other idea about a dataset.
1340 existence |= DatasetExistence(DatasetExistence._ASSUMED)
1342 return existence
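# Illustrative sketch only (not part of the source): the returned
# DatasetExistence is a flag enum, so individual facts can be tested with
# bitwise operators. The dataset type and data ID values below are hypothetical.
#
#     existence = butler.exists("calexp", instrument="HSC", visit=1228, detector=42)
#     if existence & DatasetExistence.RECORDED and existence & DatasetExistence.DATASTORE:
#         print("Known to both registry and datastore.")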
1344 def _exists_many(
1345 self,
1346 refs: Iterable[DatasetRef],
1347 /,
1348 *,
1349 full_check: bool = True,
1350 ) -> dict[DatasetRef, DatasetExistence]:
1351 # Docstring inherited.
1352 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1354 # Registry does not have a bulk API to check for a ref.
1355 for ref in refs:
1356 registry_ref = self._registry.getDataset(ref.id)
1357 if registry_ref is not None:
1358 # It is possible, albeit unlikely, that the given ref does
1359 # not match the one in registry even though the UUID matches.
1360 # When checking a single ref we raise, but it's impolite to
1361 # do that when potentially hundreds of refs are being checked.
1362 # We could change the API to only accept UUIDs and that would
1363 # remove the ability to even check and remove the worry
1364 # about differing storage classes. Given the ongoing discussion
1365 # on refs vs UUIDs and whether to raise or have a new
1366 # private flag, treat this as a private API for now.
1367 existence[ref] |= DatasetExistence.RECORDED
1369 # Ask datastore if it knows about these refs.
1370 knows = self._datastore.knows_these(refs)
1371 for ref, known in knows.items():
1372 if known:
1373 existence[ref] |= DatasetExistence.DATASTORE
1375 if full_check:
1376 mexists = self._datastore.mexists(refs)
1377 for ref, exists in mexists.items():
1378 if exists:
1379 existence[ref] |= DatasetExistence._ARTIFACT
1380 else:
1381 # Do not set this flag if nothing is known about the dataset.
1382 for ref in existence:
1383 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1384 existence[ref] |= DatasetExistence._ASSUMED
1386 return existence
1388 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1389 # Docstring inherited.
1390 if not self.isWriteable():
1391 raise TypeError("Butler is read-only.")
1392 names = list(names)
1393 refs: list[DatasetRef] = []
1394 for name in names:
1395 collectionType = self._registry.getCollectionType(name)
1396 if collectionType is not CollectionType.RUN:
1397 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1398 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
1399 with self._datastore.transaction(), self._registry.transaction():
1400 if unstore:
1401 self._datastore.trash(refs)
1402 else:
1403 self._datastore.forget(refs)
1404 for name in names:
1405 self._registry.removeCollection(name)
1406 if unstore:
1407 # Point of no return for removing artifacts
1408 self._datastore.emptyTrash()
1410 def pruneDatasets(
1411 self,
1412 refs: Iterable[DatasetRef],
1413 *,
1414 disassociate: bool = True,
1415 unstore: bool = False,
1416 tags: Iterable[str] = (),
1417 purge: bool = False,
1418 ) -> None:
1419 # docstring inherited from LimitedButler
1421 if not self.isWriteable():
1422 raise TypeError("Butler is read-only.")
1423 if purge:
1424 if not disassociate:
1425 raise TypeError("Cannot pass purge=True without disassociate=True.")
1426 if not unstore:
1427 raise TypeError("Cannot pass purge=True without unstore=True.")
1428 elif disassociate:
1429 tags = tuple(tags)
1430 if not tags:
1431 raise TypeError("No tags provided but disassociate=True.")
1432 for tag in tags:
1433 collectionType = self._registry.getCollectionType(tag)
1434 if collectionType is not CollectionType.TAGGED:
1435 raise TypeError(
1436 f"Cannot disassociate from collection '{tag}' "
1437 f"of non-TAGGED type {collectionType.name}."
1438 )
1439 # Transform possibly-single-pass iterable into something we can iterate
1440 # over multiple times.
1441 refs = list(refs)
1442 # Pruning a component of a DatasetRef makes no sense since registry
1443 # doesn't know about components and datastore might not store
1444 # components in a separate file
1445 for ref in refs:
1446 if ref.datasetType.component():
1447 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1448 # We don't need an unreliable Datastore transaction for this, because
1449 # we've been extra careful to ensure that Datastore.trash only involves
1450 # mutating the Registry (it can _look_ at Datastore-specific things,
1451 # but shouldn't change them), and hence all operations here are
1452 # Registry operations.
1453 with self._datastore.transaction(), self._registry.transaction():
1454 if unstore:
1455 self._datastore.trash(refs)
1456 if purge:
1457 self._registry.removeDatasets(refs)
1458 elif disassociate:
1459 assert tags, "Guaranteed by earlier logic in this function."
1460 for tag in tags:
1461 self._registry.disassociate(tag, refs)
1462 # We've exited the Registry transaction, and apparently committed.
1463 # (if there was an exception, everything rolled back, and it's as if
1464 # nothing happened - and we never get here).
1465 # Datastore artifacts are not yet gone, but they're clearly marked
1466 # as trash, so if we fail to delete now because of (e.g.) filesystem
1467 # problems we can try again later, and if manual administrative
1468 # intervention is required, it's pretty clear what that should entail:
1469 # deleting everything on disk and in private Datastore tables that is
1470 # in the dataset_location_trash table.
1471 if unstore:
1472 # Point of no return for removing artifacts
1473 self._datastore.emptyTrash()
1475 @transactional
1476 def ingest(
1477 self,
1478 *datasets: FileDataset,
1479 transfer: str | None = "auto",
1480 record_validation_info: bool = True,
1481 ) -> None:
1482 # Docstring inherited.
1483 if not self.isWriteable():
1484 raise TypeError("Butler is read-only.")
1486 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1487 if not datasets:
1488 return
1490 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1492 # We need to reorganize all the inputs so that they are grouped
1493 # by dataset type and run. Multiple refs in a single FileDataset
1494 # are required to share the run and dataset type.
1495 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list)
1497 # Track DataIDs that are being ingested so we can spot issues early
1498 # with duplication. Retain previous FileDataset so we can report it.
1499 groupedDataIds: MutableMapping[tuple[DatasetType, str], dict[DataCoordinate, FileDataset]] = (
1500 defaultdict(dict)
1501 )
1503 # And the nested loop that populates it:
1504 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1505 # Somewhere to store pre-existing refs if we have an
1506 # execution butler.
1507 existingRefs: list[DatasetRef] = []
1509 for ref in dataset.refs:
1510 group_key = (ref.datasetType, ref.run)
1512 if ref.dataId in groupedDataIds[group_key]:
1513 raise ConflictingDefinitionError(
1514 f"Ingest conflict. Dataset {dataset.path} has same"
1515 " DataId as other ingest dataset"
1516 f" {groupedDataIds[group_key][ref.dataId].path} "
1517 f" ({ref.dataId})"
1518 )
1520 groupedDataIds[group_key][ref.dataId] = dataset
1522 if existingRefs:
1523 if len(dataset.refs) != len(existingRefs):
1524 # Keeping track of partially pre-existing datasets is hard
1525 # and should generally never happen. For now don't allow
1526 # it.
1527 raise ConflictingDefinitionError(
1528 f"For dataset {dataset.path} some dataIds already exist"
1529 " in registry but others do not. This is not supported."
1530 )
1532 # Store expanded form in the original FileDataset.
1533 dataset.refs = existingRefs
1534 else:
1535 groupedData[group_key].append(dataset)
1537 # Now we can bulk-insert into Registry for each DatasetType.
1538 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1539 groupedData.items(), desc="Bulk-inserting datasets by type"
1540 ):
1541 refs_to_import = []
1542 for dataset in grouped_datasets:
1543 refs_to_import.extend(dataset.refs)
1545 n_refs = len(refs_to_import)
1546 _LOG.verbose(
1547 "Importing %d ref%s of dataset type %r into run %r",
1548 n_refs,
1549 "" if n_refs == 1 else "s",
1550 datasetType.name,
1551 this_run,
1552 )
1554 # Import the refs and expand the DataCoordinates since we can't
1555 # guarantee that they are expanded and Datastore will need
1556 # the records.
1557 imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
1558 assert set(imported_refs) == set(refs_to_import)
1560 # Replace all the refs in the FileDataset with expanded versions.
1561 # Pull them off in the order we put them on the list.
1562 for dataset in grouped_datasets:
1563 n_dataset_refs = len(dataset.refs)
1564 dataset.refs = imported_refs[:n_dataset_refs]
1565 del imported_refs[:n_dataset_refs]
1567 # Bulk-insert everything into Datastore.
1568 # We do not know if any of the registry entries already existed
1569 # (_importDatasets only complains if they exist but differ) so
1570 # we have to catch IntegrityError explicitly.
1571 try:
1572 self._datastore.ingest(
1573 *datasets, transfer=transfer, record_validation_info=record_validation_info
1574 )
1575 except IntegrityError as e:
1576 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
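# Usage sketch (illustrative): ingesting an existing file, assuming ``ref``
# is a resolved ``DatasetRef`` for an already-registered dataset type and run:
#
#     from lsst.daf.butler import FileDataset
#
#     butler.ingest(FileDataset("/data/exposure.fits", ref), transfer="copy")
#
# With ``transfer="direct"`` the file is referenced in place instead of being
# copied into the datastore root.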
1578 @contextlib.contextmanager
1579 def export(
1580 self,
1581 *,
1582 directory: str | None = None,
1583 filename: str | None = None,
1584 format: str | None = None,
1585 transfer: str | None = None,
1586 ) -> Iterator[RepoExportContext]:
1587 # Docstring inherited.
1588 if directory is None and transfer is not None:
1589 raise TypeError("Cannot transfer without providing a directory.")
1590 if transfer == "move":
1591 raise TypeError("Transfer may not be 'move': export is read-only")
1592 if format is None:
1593 if filename is None:
1594 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1595 else:
1596 _, format = os.path.splitext(filename)
1597 if not format:
1598 raise ValueError("Please specify a file extension to determine export format.")
1599 format = format[1:] # Strip leading "."
1600 elif filename is None:
1601 filename = f"export.{format}"
1602 if directory is not None:
1603 filename = os.path.join(directory, filename)
1604 formats = self._config["repo_transfer_formats"]
1605 if format not in formats:
1606 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
1607 BackendClass = get_class_of(formats[format, "export"])
1608 with open(filename, "w") as stream:
1609 backend = BackendClass(stream, universe=self.dimensions)
1610 try:
1611 helper = RepoExportContext(
1612 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
1613 )
1614 with self._caching_context():
1615 yield helper
1616 except BaseException:
1617 raise
1618 else:
1619 helper._finish()
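# Usage sketch (illustrative): exporting selected datasets and the dimension
# records they rely on to a YAML file, assuming ``refs`` is an iterable of
# resolved ``DatasetRef``:
#
#     with butler.export(directory="/tmp/export", filename="export.yaml",
#                        transfer="copy") as export:
#         export.saveDatasets(refs)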
1621 def import_(
1622 self,
1623 *,
1624 directory: ResourcePathExpression | None = None,
1625 filename: ResourcePathExpression | TextIO | None = None,
1626 format: str | None = None,
1627 transfer: str | None = None,
1628 skip_dimensions: set | None = None,
1629 ) -> None:
1630 # Docstring inherited.
1631 if not self.isWriteable():
1632 raise TypeError("Butler is read-only.")
1633 if format is None:
1634 if filename is None:
1635 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1636 else:
1637 _, format = os.path.splitext(filename) # type: ignore
1638 elif filename is None:
1639 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
1640 if directory is not None:
1641 directory = ResourcePath(directory, forceDirectory=True)
1642 # mypy doesn't think this will work but it does in python >= 3.10.
1643 if isinstance(filename, ResourcePathExpression): # type: ignore
1644 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
1645 if not filename.isabs() and directory is not None:
1646 potential = directory.join(filename)
1647 exists_in_cwd = filename.exists()
1648 exists_in_dir = potential.exists()
1649 if exists_in_cwd and exists_in_dir:
1650 _LOG.warning(
1651 "A relative path for filename was specified (%s) which exists relative to cwd. "
1652 "Additionally, the file exists relative to the given search directory (%s). "
1653 "Using the export file in the given directory.",
1654 filename,
1655 potential,
1656 )
1657 # Given they specified an explicit directory and that
1658 # directory has the export file in it, assume that that
1659 # is what was meant despite the file in cwd.
1660 filename = potential
1661 elif exists_in_dir:
1662 filename = potential
1663 elif not exists_in_cwd and not exists_in_dir:
1664 # Raise early.
1665 raise FileNotFoundError(
1666 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
1667 )
1668 BackendClass: type[RepoImportBackend] = get_class_of(
1669 self._config["repo_transfer_formats"][format]["import"]
1670 )
1672 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
1673 with self._caching_context():
1674 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
1675 backend.register()
1676 with self.transaction():
1677 backend.load(
1678 self._datastore,
1679 directory=directory,
1680 transfer=transfer,
1681 skip_dimensions=skip_dimensions,
1682 )
1684 if isinstance(filename, ResourcePath):
1685 # We can not use open() here at the moment because of
1686 # DM-38589 since yaml does stream.read(8192) in a loop.
1687 stream = io.StringIO(filename.read().decode())
1688 doImport(stream)
1689 else:
1690 doImport(filename) # type: ignore
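# Usage sketch (illustrative): importing the file written by ``export`` above
# into another writeable butler:
#
#     other_butler.import_(directory="/tmp/export", filename="export.yaml",
#                          transfer="symlink")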
1692 def transfer_dimension_records_from(
1693 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
1694 ) -> None:
1695 # Allowed dimensions in the target butler.
1696 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1698 data_ids = {ref.dataId for ref in source_refs}
1700 dimension_records = self._extract_all_dimension_records_from_data_ids(
1701 source_butler, data_ids, elements
1702 )
1704 # Insert order is important.
1705 for element in self.dimensions.sorted(dimension_records.keys()):
1706 records = list(dimension_records[element].values())
1707 # Assume that if the record is already present we can use it
1708 # without having to check that the record metadata is
1709 # consistent.
1710 self._registry.insertDimensionData(element, *records, skip_existing=True)
1711 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records))
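# Usage sketch (illustrative): copying the dimension records that back a set
# of refs from another butler ahead of transferring the datasets themselves:
#
#     target_butler.transfer_dimension_records_from(source_butler, refs)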
1713 def _extract_all_dimension_records_from_data_ids(
1714 self,
1715 source_butler: LimitedButler | Butler,
1716 data_ids: set[DataCoordinate],
1717 allowed_elements: frozenset[DimensionElement],
1718 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1719 primary_records = self._extract_dimension_records_from_data_ids(
1720 source_butler, data_ids, allowed_elements
1721 )
1723 can_query = isinstance(source_butler, Butler)
1725 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1726 for original_element, record_mapping in primary_records.items():
1727 # Get dimensions that depend on this dimension.
1728 populated_by = self.dimensions.get_elements_populated_by(
1729 self.dimensions[original_element.name] # type: ignore
1730 )
1732 for data_id in record_mapping.keys():
1733 for element in populated_by:
1734 if element not in allowed_elements:
1735 continue
1736 if element.name == original_element.name:
1737 continue
1739 if element.name in primary_records:
1740 # If this element has already been stored avoid
1741 # re-finding records since that may lead to additional
1742 # spurious records. e.g. visit is populated_by
1743 # visit_detector_region but querying
1744 # visit_detector_region by visit will return all the
1745 # detectors for this visit -- the visit dataId does not
1746 # constrain this.
1747 # To constrain the query the original dataIds would
1748 # have to be scanned.
1749 continue
1751 if not can_query:
1752 raise RuntimeError(
1753 f"Transferring populated_by records like {element.name} requires a full Butler."
1754 )
1756 records = source_butler.registry.queryDimensionRecords( # type: ignore
1757 element.name,
1758 **data_id.mapping, # type: ignore
1759 )
1760 for record in records:
1761 additional_records[record.definition].setdefault(record.dataId, record)
1763 # The next step is to walk back through the additional records to
1764 # pick up any missing content (such as visit_definition needing to
1765 # know the exposure). We want to ensure we do not request records
1766 # we already have.
1767 missing_data_ids = set()
1768 for name, record_mapping in additional_records.items():
1769 for data_id in record_mapping.keys():
1770 if data_id not in primary_records[name]:
1771 missing_data_ids.add(data_id)
1773 # Fill out the new records. Assume that these new records do not
1774 # also need to carry over additional populated_by records.
1775 secondary_records = self._extract_dimension_records_from_data_ids(
1776 source_butler, missing_data_ids, allowed_elements
1777 )
1779 # Merge the extra sets of records in with the original.
1780 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()):
1781 primary_records[name].update(record_mapping)
1783 return primary_records
1785 def _extract_dimension_records_from_data_ids(
1786 self,
1787 source_butler: LimitedButler | Butler,
1788 data_ids: set[DataCoordinate],
1789 allowed_elements: frozenset[DimensionElement],
1790 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1791 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1793 for data_id in data_ids:
1794 # Need an expanded record; if it is not expanded we need a full
1795 # butler with a registry (allow mocks with a registry too).
1796 if not data_id.hasRecords():
1797 if registry := getattr(source_butler, "registry", None):
1798 data_id = registry.expandDataId(data_id)
1799 else:
1800 raise TypeError("Input butler needs to be a full butler to expand DataId.")
1801 # If this butler doesn't know about a dimension in the source
1802 # butler, things will break later.
1803 for element_name in data_id.dimensions.elements:
1804 record = data_id.records[element_name]
1805 if record is not None and record.definition in allowed_elements:
1806 dimension_records[record.definition].setdefault(record.dataId, record)
1808 return dimension_records
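# The two helpers above return a nested mapping keyed first by dimension
# element and then by record data ID, schematically:
#
#     {self.dimensions["exposure"]: {data_id: record, ...}, ...}
#
# which lets callers merge record sets and insert them element by element.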
1810 def transfer_from(
1811 self,
1812 source_butler: LimitedButler,
1813 source_refs: Iterable[DatasetRef],
1814 transfer: str = "auto",
1815 skip_missing: bool = True,
1816 register_dataset_types: bool = False,
1817 transfer_dimensions: bool = False,
1818 dry_run: bool = False,
1819 ) -> collections.abc.Collection[DatasetRef]:
1820 # Docstring inherited.
1821 if not self.isWriteable():
1822 raise TypeError("Butler is read-only.")
1823 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1825 # Will iterate through the refs multiple times so need to convert
1826 # to a list if this isn't a collection.
1827 if not isinstance(source_refs, collections.abc.Collection):
1828 source_refs = list(source_refs)
1830 original_count = len(source_refs)
1831 _LOG.info("Transferring %d datasets into %s", original_count, str(self))
1833 # In some situations the datastore artifact may be missing
1834 # and we do not want that registry entry to be imported.
1835 # Asking the datastore is not sufficient: the records may have
1836 # been purged, so we have to ask for the (predicted) URI and
1837 # check existence explicitly. Execution butler is set up exactly
1838 # like this, with no datastore records.
1839 artifact_existence: dict[ResourcePath, bool] = {}
1840 if skip_missing:
1841 dataset_existence = source_butler._datastore.mexists(
1842 source_refs, artifact_existence=artifact_existence
1843 )
1844 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1845 filtered_count = len(source_refs)
1846 n_missing = original_count - filtered_count
1847 _LOG.verbose(
1848 "%d dataset%s removed because the artifact does not exist. Now have %d.",
1849 n_missing,
1850 "" if n_missing == 1 else "s",
1851 filtered_count,
1852 )
1854 # Importing requires that we group the refs by dataset type and run
1855 # before doing the import.
1856 source_dataset_types = set()
1857 grouped_refs = defaultdict(list)
1858 for ref in source_refs:
1859 grouped_refs[ref.datasetType, ref.run].append(ref)
1860 source_dataset_types.add(ref.datasetType)
1862 # Check to see if the dataset type in the source butler has
1863 # the same definition in the target butler and register missing
1864 # ones if requested. Registration must happen outside a transaction.
1865 newly_registered_dataset_types = set()
1866 for datasetType in source_dataset_types:
1867 if register_dataset_types:
1868 # Let this raise immediately if inconsistent. Continuing
1869 # on to find additional inconsistent dataset types
1870 # might result in additional unwanted dataset types being
1871 # registered.
1872 if self._registry.registerDatasetType(datasetType):
1873 newly_registered_dataset_types.add(datasetType)
1874 else:
1875 # If the dataset type is missing, let it fail immediately.
1876 target_dataset_type = self.get_dataset_type(datasetType.name)
1877 if target_dataset_type != datasetType:
1878 raise ConflictingDefinitionError(
1879 "Source butler dataset type differs from definition"
1880 f" in target butler: {datasetType} !="
1881 f" {target_dataset_type}"
1882 )
1883 if newly_registered_dataset_types:
1884 # We may have registered some even if there were inconsistencies
1885 # but should let people know (or else remove them again).
1886 _LOG.verbose(
1887 "Registered the following dataset types in the target Butler: %s",
1888 ", ".join(d.name for d in newly_registered_dataset_types),
1889 )
1890 else:
1891 _LOG.verbose("All required dataset types are known to the target Butler")
1893 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1894 if transfer_dimensions:
1895 # Collect all the dimension records for these refs.
1896 # All dimensions are to be copied but the list of valid dimensions
1897 # comes from this butler's universe.
1898 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1899 dataIds = {ref.dataId for ref in source_refs}
1900 dimension_records = self._extract_all_dimension_records_from_data_ids(
1901 source_butler, dataIds, elements
1902 )
1904 handled_collections: set[str] = set()
1906 # Do all the importing in a single transaction.
1907 with self.transaction():
1908 if dimension_records and not dry_run:
1909 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
1910 # Order matters.
1911 for element in self.dimensions.sorted(dimension_records.keys()):
1912 records = list(dimension_records[element].values())
1913 # Assume that if the record is already present we can use it
1914 # without having to check that the record metadata is
1915 # consistent.
1916 self._registry.insertDimensionData(element, *records, skip_existing=True)
1918 n_imported = 0
1919 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
1920 grouped_refs.items(), desc="Importing to registry by run and dataset type"
1921 ):
1922 if run not in handled_collections:
1923 # May need to create output collection. If source butler
1924 # has a registry, ask for documentation string.
1925 run_doc = None
1926 if registry := getattr(source_butler, "registry", None):
1927 run_doc = registry.getCollectionDocumentation(run)
1928 if not dry_run:
1929 registered = self._registry.registerRun(run, doc=run_doc)
1930 else:
1931 registered = True
1932 handled_collections.add(run)
1933 if registered:
1934 _LOG.verbose("Creating output run %s", run)
1936 n_refs = len(refs_to_import)
1937 _LOG.verbose(
1938 "Importing %d ref%s of dataset type %s into run %s",
1939 n_refs,
1940 "" if n_refs == 1 else "s",
1941 datasetType.name,
1942 run,
1943 )
1945 # Assume we are using UUIDs and the source refs will match
1946 # those imported.
1947 if not dry_run:
1948 imported_refs = self._registry._importDatasets(refs_to_import)
1949 else:
1950 imported_refs = refs_to_import
1951 assert set(imported_refs) == set(refs_to_import)
1952 n_imported += len(imported_refs)
1954 assert len(source_refs) == n_imported
1955 _LOG.verbose("Imported %d datasets into destination butler", n_imported)
1957 # Ask the datastore to transfer. The datastore has to check that
1958 # the source datastore is compatible with the target datastore.
1959 accepted, rejected = self._datastore.transfer_from(
1960 source_butler._datastore,
1961 source_refs,
1962 transfer=transfer,
1963 artifact_existence=artifact_existence,
1964 dry_run=dry_run,
1965 )
1966 if rejected:
1967 # For now, accept the registry entries but not the files.
1968 _LOG.warning(
1969 "%d datasets were rejected and %d accepted for transfer into this butler.",
1970 len(rejected),
1971 len(accepted),
1974 )
1976 return source_refs
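# Usage sketch (illustrative): transferring datasets between two butlers,
# registering missing dataset types and copying dimension records as needed:
#
#     transferred = target_butler.transfer_from(
#         source_butler,
#         refs,
#         transfer="copy",
#         register_dataset_types=True,
#         transfer_dimensions=True,
#     )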
1978 def validateConfiguration(
1979 self,
1980 logFailures: bool = False,
1981 datasetTypeNames: Iterable[str] | None = None,
1982 ignore: Iterable[str] | None = None,
1983 ) -> None:
1984 # Docstring inherited.
1985 if datasetTypeNames:
1986 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
1987 else:
1988 datasetTypes = list(self._registry.queryDatasetTypes())
1990 # filter out anything from the ignore list
1991 if ignore:
1992 ignore = set(ignore)
1993 datasetTypes = [
1994 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
1995 ]
1996 else:
1997 ignore = set()
1999 # For each datasetType that has an instrument dimension, create
2000 # a DatasetRef for each defined instrument
2001 datasetRefs = []
2003 # Find all the registered instruments (if "instrument" is in the
2004 # universe).
2005 if "instrument" in self.dimensions:
2006 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
2008 for datasetType in datasetTypes:
2009 if "instrument" in datasetType.dimensions:
2010 # In order to create a conforming dataset ref, create
2011 # fake DataCoordinate values for the non-instrument
2012 # dimensions. The type of the value does not matter here.
2013 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"}
2015 for instrument in instruments:
2016 datasetRef = DatasetRef(
2017 datasetType,
2018 DataCoordinate.standardize(
2019 dataId, instrument=instrument, dimensions=datasetType.dimensions
2020 ),
2021 run="validate",
2022 )
2023 datasetRefs.append(datasetRef)
2025 entities: list[DatasetType | DatasetRef] = []
2026 entities.extend(datasetTypes)
2027 entities.extend(datasetRefs)
2029 datastoreErrorStr = None
2030 try:
2031 self._datastore.validateConfiguration(entities, logFailures=logFailures)
2032 except ValidationError as e:
2033 datastoreErrorStr = str(e)
2035 # Also check that the LookupKeys used by the datastores match
2036 # registry and storage class definitions
2037 keys = self._datastore.getLookupKeys()
2039 failedNames = set()
2040 failedDataId = set()
2041 for key in keys:
2042 if key.name is not None:
2043 if key.name in ignore:
2044 continue
2046 # skip if specific datasetType names were requested and this
2047 # name does not match
2048 if datasetTypeNames and key.name not in datasetTypeNames:
2049 continue
2051 # See if it is a StorageClass or a DatasetType
2052 if key.name in self.storageClasses:
2053 pass
2054 else:
2055 try:
2056 self.get_dataset_type(key.name)
2057 except KeyError:
2058 if logFailures:
2059 _LOG.critical(
2060 "Key '%s' does not correspond to a DatasetType or StorageClass", key
2061 )
2062 failedNames.add(key)
2063 else:
2064 # Dimensions are checked for consistency when the Butler
2065 # is created and rendezvoused with a universe.
2066 pass
2068 # Check that the instrument is a valid instrument.
2069 # Currently only "instrument" DataId overrides are supported.
2070 if key.dataId:
2071 dataIdKeys = set(key.dataId)
2072 if {"instrument"} != dataIdKeys:
2073 if logFailures:
2074 _LOG.critical("Key '%s' has unsupported DataId override", key)
2075 failedDataId.add(key)
2076 elif key.dataId["instrument"] not in instruments:
2077 if logFailures:
2078 _LOG.critical("Key '%s' has unknown instrument", key)
2079 failedDataId.add(key)
2081 messages = []
2083 if datastoreErrorStr:
2084 messages.append(datastoreErrorStr)
2086 for failed, msg in (
2087 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2088 (failedDataId, "Keys with bad DataId entries: "),
2089 ):
2090 if failed:
2091 msg += ", ".join(str(k) for k in failed)
2092 messages.append(msg)
2094 if messages:
2095 raise ValidationError(";\n".join(messages))
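# Usage sketch (illustrative): checking that datastore, registry, and storage
# class definitions agree, logging details of any problems:
#
#     try:
#         butler.validateConfiguration(logFailures=True)
#     except ValidationError as err:
#         print(f"Repository configuration problems: {err}")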
2097 @property
2098 def collection_chains(self) -> DirectButlerCollections:
2099 """Object with methods for modifying collection chains."""
2100 return DirectButlerCollections(self._registry)
2102 @property
2103 def collections(self) -> Sequence[str]:
2104 """The collections to search by default, in order
2105 (`~collections.abc.Sequence` [ `str` ]).
2107 This is an alias for ``self.registry.defaults.collections``. It cannot
2108 be set directly in isolation, but all defaults may be changed together
2109 by assigning a new `RegistryDefaults` instance to
2110 ``self.registry.defaults``.
2111 """
2112 return self._registry.defaults.collections
2114 @property
2115 def run(self) -> str | None:
2116 """Name of the run this butler writes outputs to by default (`str` or
2117 `None`).
2119 This is an alias for ``self.registry.defaults.run``. It cannot be set
2120 directly in isolation, but all defaults may be changed together by
2121 assigning a new `RegistryDefaults` instance to
2122 ``self.registry.defaults``.
2123 """
2124 return self._registry.defaults.run
2126 @property
2127 def registry(self) -> Registry:
2128 """The object that manages dataset metadata and relationships
2129 (`Registry`).
2131 Many operations that don't involve reading or writing butler datasets
2132 are accessible only via `Registry` methods. Eventually these methods
2133 will be replaced by equivalent `Butler` methods.
2134 """
2135 return self._registry_shim
2137 @property
2138 def dimensions(self) -> DimensionUniverse:
2139 # Docstring inherited.
2140 return self._registry.dimensions
2142 @contextlib.contextmanager
2143 def _query(self) -> Iterator[Query]:
2144 # Docstring inherited.
2145 with self._query_driver(self._registry.defaults.collections, self.registry.defaults.dataId) as driver:
2146 yield Query(driver)
2148 @contextlib.contextmanager
2149 def _query_driver(
2150 self,
2151 default_collections: Iterable[str],
2152 default_data_id: DataCoordinate,
2153 ) -> Iterator[DirectQueryDriver]:
2154 """Set up a QueryDriver instance for use with this Butler. Although
2155 this is marked as a private method, it is also used by Butler server.
2156 """
2157 with self._caching_context():
2158 driver = DirectQueryDriver(
2159 self._registry._db,
2160 self.dimensions,
2161 self._registry._managers,
2162 default_collections=default_collections,
2163 default_data_id=default_data_id,
2164 )
2165 with driver:
2166 yield driver
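# Usage sketch (illustrative): callers are expected to keep all use of the
# yielded ``Query`` inside the ``with`` block; the query method shown below is
# an assumption about the evolving query interface, not something defined in
# this module:
#
#     with butler._query() as query:
#         refs = list(query.datasets("raw", collections=["HSC/raw/all"]))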
2168 def _preload_cache(self) -> None:
2169 """Immediately load caches that are used for common operations."""
2170 self._registry.preload_cache()
2172 _config: ButlerConfig
2173 """Configuration for this Butler instance."""
2175 _registry: SqlRegistry
2176 """The object that manages dataset metadata and relationships
2177 (`SqlRegistry`).
2179 Most operations that don't involve reading or writing butler datasets are
2180 accessible only via `SqlRegistry` methods.
2181 """
2183 datastore: Datastore
2184 """The object that manages actual dataset storage (`Datastore`).
2186 Direct user access to the datastore should rarely be necessary; the primary
2187 exception is the case where a `Datastore` implementation provides extra
2188 functionality beyond what the base class defines.
2189 """
2191 storageClasses: StorageClassFactory
2192 """An object that maps known storage class names to objects that fully
2193 describe them (`StorageClassFactory`).
2194 """
2196 _registry_shim: RegistryShim
2197 """Shim object to provide a legacy public interface for querying via the
2198 the ``registry`` property.
2199 """