Coverage for python/lsst/daf/butler/direct_butler.py: 10%
789 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-25 10:50 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-25 10:50 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Butler top level classes.
29"""
30from __future__ import annotations
32__all__ = (
33 "DirectButler",
34 "ButlerValidationError",
35)
37import collections.abc
38import contextlib
39import io
40import itertools
41import logging
42import numbers
43import os
44import warnings
45from collections import Counter, defaultdict
46from collections.abc import Iterable, Iterator, Mapping, MutableMapping, Sequence
47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast
49from lsst.resources import ResourcePath, ResourcePathExpression
50from lsst.utils.introspection import get_class_of
51from lsst.utils.iteration import ensure_iterable
52from lsst.utils.logging import VERBOSE, getLogger
53from sqlalchemy.exc import IntegrityError
55from ._butler import Butler
56from ._butler_config import ButlerConfig
57from ._butler_instance_options import ButlerInstanceOptions
58from ._dataset_existence import DatasetExistence
59from ._dataset_ref import DatasetRef
60from ._dataset_type import DatasetType
61from ._deferredDatasetHandle import DeferredDatasetHandle
62from ._exceptions import EmptyQueryResultError, ValidationError
63from ._limited_butler import LimitedButler
64from ._registry_shim import RegistryShim
65from ._storage_class import StorageClass, StorageClassFactory
66from ._timespan import Timespan
67from .datastore import Datastore, NullDatastore
68from .dimensions import DataCoordinate, Dimension
69from .direct_query import DirectQuery
70from .progress import Progress
71from .registry import (
72 CollectionType,
73 ConflictingDefinitionError,
74 DataIdError,
75 MissingDatasetTypeError,
76 NoDefaultCollectionError,
77 RegistryDefaults,
78 _RegistryFactory,
79)
80from .registry.sql_registry import SqlRegistry
81from .transfers import RepoExportContext
82from .utils import transactional
84if TYPE_CHECKING:
85 from lsst.resources import ResourceHandleProtocol
87 from ._dataset_ref import DatasetId
88 from ._file_dataset import FileDataset
89 from ._query import Query
90 from .datastore import DatasetRefURIs
91 from .dimensions import (
92 DataId,
93 DataIdValue,
94 DimensionElement,
95 DimensionGroup,
96 DimensionRecord,
97 DimensionUniverse,
98 )
99 from .registry import CollectionArgType, Registry
100 from .transfers import RepoImportBackend
102_LOG = getLogger(__name__)
105class ButlerValidationError(ValidationError):
106 """There is a problem with the Butler configuration."""
108 pass
111class DirectButler(Butler): # numpydoc ignore=PR02
112 """Main entry point for the data access system.
114 Parameters
115 ----------
116 config : `ButlerConfig`
117 The configuration for this Butler instance.
118 registry : `SqlRegistry`
119 The object that manages dataset metadata and relationships.
120 datastore : Datastore
121 The object that manages actual dataset storage.
122 storageClasses : StorageClassFactory
123 An object that maps known storage class names to objects that fully
124 describe them.
126 Notes
127 -----
128 Most users should call the top-level `Butler`.``from_config`` instead of
129 using this constructor directly.
130 """
132 # This is __new__ instead of __init__ because we have to support
133 # instantiation via the legacy constructor Butler.__new__(), which
134 # reads the configuration and selects which subclass to instantiate. The
135 # interaction between __new__ and __init__ is kind of wacky in Python. If
136 # we were using __init__ here, __init__ would be called twice (once when
137 # the DirectButler instance is constructed inside Butler.from_config(), and
138 # a second time with the original arguments to Butler() when the instance
139 # is returned from Butler.__new__()
140 def __new__(
141 cls,
142 *,
143 config: ButlerConfig,
144 registry: SqlRegistry,
145 datastore: Datastore,
146 storageClasses: StorageClassFactory,
147 ) -> DirectButler:
148 self = cast(DirectButler, super().__new__(cls))
149 self._config = config
150 self._registry = registry
151 self._datastore = datastore
152 self.storageClasses = storageClasses
154 # For execution butler the datastore needs a special
155 # dependency-inversion trick. This is not used by regular butler,
156 # but we do not have a way to distinguish regular butler from execution
157 # butler.
158 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
160 self._registry_shim = RegistryShim(self)
162 return self
164 @classmethod
165 def create_from_config(
166 cls,
167 config: ButlerConfig,
168 *,
169 options: ButlerInstanceOptions,
170 without_datastore: bool = False,
171 ) -> DirectButler:
172 """Construct a Butler instance from a configuration file.
174 Parameters
175 ----------
176 config : `ButlerConfig`
177 The configuration for this Butler instance.
178 options : `ButlerInstanceOptions`
179 Default values and other settings for the Butler instance.
180 without_datastore : `bool`, optional
181 If `True` do not attach a datastore to this butler. Any attempts
182 to use a datastore will fail.
184 Notes
185 -----
186 Most users should call the top-level `Butler`.``from_config``
187 instead of using this function directly.
188 """
189 if "run" in config or "collection" in config:
190 raise ValueError("Passing a run or collection via configuration is no longer supported.")
192 defaults = RegistryDefaults(
193 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs
194 )
195 try:
196 butlerRoot = config.get("root", config.configDir)
197 writeable = options.writeable
198 if writeable is None:
199 writeable = options.run is not None
200 registry = _RegistryFactory(config).from_config(
201 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
202 )
203 if without_datastore:
204 datastore: Datastore = NullDatastore(None, None)
205 else:
206 datastore = Datastore.fromConfig(
207 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
208 )
209 # TODO: Once datastore drops dependency on registry we can
210 # construct datastore first and pass opaque tables to registry
211 # constructor.
212 registry.make_datastore_tables(datastore.get_opaque_table_definitions())
213 storageClasses = StorageClassFactory()
214 storageClasses.addFromConfig(config)
216 return DirectButler(
217 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses
218 )
219 except Exception:
220 # Failures here usually mean that configuration is incomplete,
221 # just issue an error message which includes config file URI.
222 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.")
223 raise
225 def _clone(
226 self,
227 *,
228 collections: Any = None,
229 run: str | None = None,
230 inferDefaults: bool = True,
231 **kwargs: Any,
232 ) -> DirectButler:
233 # Docstring inherited
234 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
235 registry = self._registry.copy(defaults)
237 return DirectButler(
238 registry=registry,
239 config=self._config,
240 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()),
241 storageClasses=self.storageClasses,
242 )
244 GENERATION: ClassVar[int] = 3
245 """This is a Generation 3 Butler.
247 This attribute may be removed in the future, once the Generation 2 Butler
248 interface has been fully retired; it should only be used in transitional
249 code.
250 """
252 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
253 """Return DatasetType defined in registry given dataset type name."""
254 try:
255 return self.get_dataset_type(name)
256 except MissingDatasetTypeError:
257 return None
259 @classmethod
260 def _unpickle(
261 cls,
262 config: ButlerConfig,
263 collections: tuple[str, ...] | None,
264 run: str | None,
265 defaultDataId: dict[str, str],
266 writeable: bool,
267 ) -> DirectButler:
268 """Callable used to unpickle a Butler.
270 We prefer not to use ``Butler.__init__`` directly so we can force some
271 of its many arguments to be keyword-only (note that ``__reduce__``
272 can only invoke callables with positional arguments).
274 Parameters
275 ----------
276 config : `ButlerConfig`
277 Butler configuration, already coerced into a true `ButlerConfig`
278 instance (and hence after any search paths for overrides have been
279 utilized).
280 collections : `tuple` [ `str` ]
281 Names of the default collections to read from.
282 run : `str`, optional
283 Name of the default `~CollectionType.RUN` collection to write to.
284 defaultDataId : `dict` [ `str`, `str` ]
285 Default data ID values.
286 writeable : `bool`
287 Whether the Butler should support write operations.
289 Returns
290 -------
291 butler : `Butler`
292 A new `Butler` instance.
293 """
294 return cls.create_from_config(
295 config=config,
296 options=ButlerInstanceOptions(
297 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId
298 ),
299 )
301 def __reduce__(self) -> tuple:
302 """Support pickling."""
303 return (
304 DirectButler._unpickle,
305 (
306 self._config,
307 self.collections,
308 self.run,
309 dict(self._registry.defaults.dataId.required),
310 self._registry.isWriteable(),
311 ),
312 )
314 def __str__(self) -> str:
315 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
316 self.collections, self.run, self._datastore, self._registry
317 )
319 def isWriteable(self) -> bool:
320 # Docstring inherited.
321 return self._registry.isWriteable()
323 def _caching_context(self) -> contextlib.AbstractContextManager[None]:
324 """Context manager that enables caching."""
325 return self._registry.caching_context()
327 @contextlib.contextmanager
328 def transaction(self) -> Iterator[None]:
329 """Context manager supporting `Butler` transactions.
331 Transactions can be nested.
332 """
333 with self._registry.transaction(), self._datastore.transaction():
334 yield
336 def _standardizeArgs(
337 self,
338 datasetRefOrType: DatasetRef | DatasetType | str,
339 dataId: DataId | None = None,
340 for_put: bool = True,
341 **kwargs: Any,
342 ) -> tuple[DatasetType, DataId | None]:
343 """Standardize the arguments passed to several Butler APIs.
345 Parameters
346 ----------
347 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
348 When `DatasetRef` the `dataId` should be `None`.
349 Otherwise the `DatasetType` or name thereof.
350 dataId : `dict` or `DataCoordinate`
351 A `dict` of `Dimension` link name, value pairs that label the
352 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
353 should be provided as the second argument.
354 for_put : `bool`, optional
355 If `True` this call is invoked as part of a `Butler.put()`.
356 Otherwise it is assumed to be part of a `Butler.get()`. This
357 parameter is only relevant if there is dataset type
358 inconsistency.
359 **kwargs
360 Additional keyword arguments used to augment or construct a
361 `DataCoordinate`. See `DataCoordinate.standardize`
362 parameters.
364 Returns
365 -------
366 datasetType : `DatasetType`
367 A `DatasetType` instance extracted from ``datasetRefOrType``.
368 dataId : `dict` or `DataId`, optional
369 Argument that can be used (along with ``kwargs``) to construct a
370 `DataId`.
372 Notes
373 -----
374 Butler APIs that conceptually need a DatasetRef also allow passing a
375 `DatasetType` (or the name of one) and a `DataId` (or a dict and
376 keyword arguments that can be used to construct one) separately. This
377 method accepts those arguments and always returns a true `DatasetType`
378 and a `DataId` or `dict`.
380 Standardization of `dict` vs `DataId` is best handled by passing the
381 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
382 generally similarly flexible.
383 """
384 externalDatasetType: DatasetType | None = None
385 internalDatasetType: DatasetType | None = None
386 if isinstance(datasetRefOrType, DatasetRef):
387 if dataId is not None or kwargs:
388 raise ValueError("DatasetRef given, cannot use dataId as well")
389 externalDatasetType = datasetRefOrType.datasetType
390 dataId = datasetRefOrType.dataId
391 else:
392 # Don't check whether DataId is provided, because Registry APIs
393 # can usually construct a better error message when it wasn't.
394 if isinstance(datasetRefOrType, DatasetType):
395 externalDatasetType = datasetRefOrType
396 else:
397 internalDatasetType = self.get_dataset_type(datasetRefOrType)
399 # Check that they are self-consistent
400 if externalDatasetType is not None:
401 internalDatasetType = self.get_dataset_type(externalDatasetType.name)
402 if externalDatasetType != internalDatasetType:
403 # We can allow differences if they are compatible, depending
404 # on whether this is a get or a put. A get requires that
405 # the python type associated with the datastore can be
406 # converted to the user type. A put requires that the user
407 # supplied python type can be converted to the internal
408 # type expected by registry.
409 relevantDatasetType = internalDatasetType
410 if for_put:
411 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
412 else:
413 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
414 relevantDatasetType = externalDatasetType
415 if not is_compatible:
416 raise ValueError(
417 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
418 f"registry definition ({internalDatasetType})"
419 )
420 # Override the internal definition.
421 internalDatasetType = relevantDatasetType
423 assert internalDatasetType is not None
424 return internalDatasetType, dataId
426 def _rewrite_data_id(
427 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
428 ) -> tuple[DataId | None, dict[str, Any]]:
429 """Rewrite a data ID taking into account dimension records.
431 Take a Data ID and keyword args and rewrite it if necessary to
432 allow the user to specify dimension records rather than dimension
433 primary values.
435 This allows a user to include a dataId dict with keys of
436 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
437 the integer exposure ID. It also allows a string to be given
438 for a dimension value rather than the integer ID if that is more
439 convenient. For example, rather than having to specifying the
440 detector with ``detector.full_name``, a string given for ``detector``
441 will be interpreted as the full name and converted to the integer
442 value.
444 Keyword arguments can also use strings for dimensions like detector
445 and exposure but python does not allow them to include ``.`` and
446 so the ``exposure.day_obs`` syntax can not be used in a keyword
447 argument.
449 Parameters
450 ----------
451 dataId : `dict` or `DataCoordinate`
452 A `dict` of `Dimension` link name, value pairs that will label the
453 `DatasetRef` within a Collection.
454 datasetType : `DatasetType`
455 The dataset type associated with this dataId. Required to
456 determine the relevant dimensions.
457 **kwargs
458 Additional keyword arguments used to augment or construct a
459 `DataId`. See `DataId` parameters.
461 Returns
462 -------
463 dataId : `dict` or `DataCoordinate`
464 The, possibly rewritten, dataId. If given a `DataCoordinate` and
465 no keyword arguments, the original dataId will be returned
466 unchanged.
467 **kwargs : `dict`
468 Any unused keyword arguments (would normally be empty dict).
469 """
470 # Do nothing if we have a standalone DataCoordinate.
471 if isinstance(dataId, DataCoordinate) and not kwargs:
472 return dataId, kwargs
474 # Process dimension records that are using record information
475 # rather than ids
476 newDataId: dict[str, DataIdValue] = {}
477 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
479 # if all the dataId comes from keyword parameters we do not need
480 # to do anything here because they can't be of the form
481 # exposure.obs_id because a "." is not allowed in a keyword parameter.
482 if dataId:
483 for k, v in dataId.items():
484 # If we have a Dimension we do not need to do anything
485 # because it cannot be a compound key.
486 if isinstance(k, str) and "." in k:
487 # Someone is using a more human-readable dataId
488 dimensionName, record = k.split(".", 1)
489 byRecord[dimensionName][record] = v
490 elif isinstance(k, Dimension):
491 newDataId[k.name] = v
492 else:
493 newDataId[k] = v
495 # Go through the updated dataId and check the type in case someone is
496 # using an alternate key. We have already filtered out the compound
497 # keys dimensions.record format.
498 not_dimensions = {}
500 # Will need to look in the dataId and the keyword arguments
501 # and will remove them if they need to be fixed or are unrecognized.
502 for dataIdDict in (newDataId, kwargs):
503 # Use a list so we can adjust the dict safely in the loop
504 for dimensionName in list(dataIdDict):
505 value = dataIdDict[dimensionName]
506 try:
507 dimension = self.dimensions.dimensions[dimensionName]
508 except KeyError:
509 # This is not a real dimension
510 not_dimensions[dimensionName] = value
511 del dataIdDict[dimensionName]
512 continue
514 # Convert an integral type to an explicit int to simplify
515 # comparisons here
516 if isinstance(value, numbers.Integral):
517 value = int(value)
519 if not isinstance(value, dimension.primaryKey.getPythonType()):
520 for alternate in dimension.alternateKeys:
521 if isinstance(value, alternate.getPythonType()):
522 byRecord[dimensionName][alternate.name] = value
523 del dataIdDict[dimensionName]
524 _LOG.debug(
525 "Converting dimension %s to %s.%s=%s",
526 dimensionName,
527 dimensionName,
528 alternate.name,
529 value,
530 )
531 break
532 else:
533 _LOG.warning(
534 "Type mismatch found for value '%r' provided for dimension %s. "
535 "Could not find matching alternative (primary key has type %s) "
536 "so attempting to use as-is.",
537 value,
538 dimensionName,
539 dimension.primaryKey.getPythonType(),
540 )
542 # By this point kwargs and newDataId should only include valid
543 # dimensions. Merge kwargs in to the new dataId and log if there
544 # are dimensions in both (rather than calling update).
545 for k, v in kwargs.items():
546 if k in newDataId and newDataId[k] != v:
547 _LOG.debug(
548 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
549 )
550 newDataId[k] = v
551 # No need to retain any values in kwargs now.
552 kwargs = {}
554 # If we have some unrecognized dimensions we have to try to connect
555 # them to records in other dimensions. This is made more complicated
556 # by some dimensions having records with clashing names. A mitigation
557 # is that we can tell by this point which dimensions are missing
558 # for the DatasetType but this does not work for calibrations
559 # where additional dimensions can be used to constrain the temporal
560 # axis.
561 if not_dimensions:
562 # Search for all dimensions even if we have been given a value
563 # explicitly. In some cases records are given as well as the
564 # actually dimension and this should not be an error if they
565 # match.
566 mandatoryDimensions = datasetType.dimensions.names # - provided
568 candidateDimensions: set[str] = set()
569 candidateDimensions.update(mandatoryDimensions)
571 # For calibrations we may well be needing temporal dimensions
572 # so rather than always including all dimensions in the scan
573 # restrict things a little. It is still possible for there
574 # to be confusion over day_obs in visit vs exposure for example.
575 # If we are not searching calibration collections things may
576 # fail but they are going to fail anyway because of the
577 # ambiguousness of the dataId...
578 if datasetType.isCalibration():
579 for dim in self.dimensions.dimensions:
580 if dim.temporal:
581 candidateDimensions.add(str(dim))
583 # Look up table for the first association with a dimension
584 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
586 # Keep track of whether an item is associated with multiple
587 # dimensions.
588 counter: Counter[str] = Counter()
589 assigned: dict[str, set[str]] = defaultdict(set)
591 # Go through the missing dimensions and associate the
592 # given names with records within those dimensions
593 matched_dims = set()
594 for dimensionName in candidateDimensions:
595 dimension = self.dimensions.dimensions[dimensionName]
596 fields = dimension.metadata.names | dimension.uniqueKeys.names
597 for field in not_dimensions:
598 if field in fields:
599 guessedAssociation[dimensionName][field] = not_dimensions[field]
600 counter[dimensionName] += 1
601 assigned[field].add(dimensionName)
602 matched_dims.add(field)
604 # Calculate the fields that matched nothing.
605 never_found = set(not_dimensions) - matched_dims
607 if never_found:
608 raise ValueError(f"Unrecognized keyword args given: {never_found}")
610 # There is a chance we have allocated a single dataId item
611 # to multiple dimensions. Need to decide which should be retained.
612 # For now assume that the most popular alternative wins.
613 # This means that day_obs with seq_num will result in
614 # exposure.day_obs and not visit.day_obs
615 # Also prefer an explicitly missing dimension over an inferred
616 # temporal dimension.
617 for fieldName, assignedDimensions in assigned.items():
618 if len(assignedDimensions) > 1:
619 # Pick the most popular (preferring mandatory dimensions)
620 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
621 if requiredButMissing:
622 candidateDimensions = requiredButMissing
623 else:
624 candidateDimensions = assignedDimensions
626 # If this is a choice between visit and exposure and
627 # neither was a required part of the dataset type,
628 # (hence in this branch) always prefer exposure over
629 # visit since exposures are always defined and visits
630 # are defined from exposures.
631 if candidateDimensions == {"exposure", "visit"}:
632 candidateDimensions = {"exposure"}
634 # Select the relevant items and get a new restricted
635 # counter.
636 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
637 duplicatesCounter: Counter[str] = Counter()
638 duplicatesCounter.update(theseCounts)
640 # Choose the most common. If they are equally common
641 # we will pick the one that was found first.
642 # Returns a list of tuples
643 selected = duplicatesCounter.most_common(1)[0][0]
645 _LOG.debug(
646 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
647 " Removed ambiguity by choosing dimension %s.",
648 fieldName,
649 ", ".join(assignedDimensions),
650 selected,
651 )
653 for candidateDimension in assignedDimensions:
654 if candidateDimension != selected:
655 del guessedAssociation[candidateDimension][fieldName]
657 # Update the record look up dict with the new associations
658 for dimensionName, values in guessedAssociation.items():
659 if values: # A dict might now be empty
660 _LOG.debug(
661 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
662 )
663 byRecord[dimensionName].update(values)
665 if byRecord:
666 # Some record specifiers were found so we need to convert
667 # them to the Id form
668 for dimensionName, values in byRecord.items():
669 if dimensionName in newDataId:
670 _LOG.debug(
671 "DataId specified explicit %s dimension value of %s in addition to"
672 " general record specifiers for it of %s. Ignoring record information.",
673 dimensionName,
674 newDataId[dimensionName],
675 str(values),
676 )
677 # Get the actual record and compare with these values.
678 try:
679 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
680 except DataIdError:
681 raise ValueError(
682 f"Could not find dimension '{dimensionName}'"
683 f" with dataId {newDataId} as part of comparing with"
684 f" record values {byRecord[dimensionName]}"
685 ) from None
686 if len(recs) == 1:
687 errmsg: list[str] = []
688 for k, v in values.items():
689 if (recval := getattr(recs[0], k)) != v:
690 errmsg.append(f"{k}({recval} != {v})")
691 if errmsg:
692 raise ValueError(
693 f"Dimension {dimensionName} in dataId has explicit value"
694 " inconsistent with records: " + ", ".join(errmsg)
695 )
696 else:
697 # Multiple matches for an explicit dimension
698 # should never happen but let downstream complain.
699 pass
700 continue
702 # Build up a WHERE expression
703 bind = dict(values.items())
704 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
706 # Hopefully we get a single record that matches
707 records = set(
708 self._registry.queryDimensionRecords(
709 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
710 )
711 )
713 if len(records) != 1:
714 if len(records) > 1:
715 # visit can have an ambiguous answer without involving
716 # visit_system. The default visit_system is defined
717 # by the instrument.
718 if (
719 dimensionName == "visit"
720 and "visit_system_membership" in self.dimensions
721 and "visit_system" in self.dimensions["instrument"].metadata
722 ):
723 instrument_records = list(
724 self._registry.queryDimensionRecords(
725 "instrument",
726 dataId=newDataId,
727 **kwargs,
728 )
729 )
730 if len(instrument_records) == 1:
731 visit_system = instrument_records[0].visit_system
732 if visit_system is None:
733 # Set to a value that will never match.
734 visit_system = -1
736 # Look up each visit in the
737 # visit_system_membership records.
738 for rec in records:
739 membership = list(
740 self._registry.queryDimensionRecords(
741 # Use bind to allow zero results.
742 # This is a fully-specified query.
743 "visit_system_membership",
744 where="instrument = inst AND visit_system = system AND visit = v",
745 bind=dict(
746 inst=instrument_records[0].name, system=visit_system, v=rec.id
747 ),
748 )
749 )
750 if membership:
751 # This record is the right answer.
752 records = {rec}
753 break
755 # The ambiguity may have been resolved so check again.
756 if len(records) > 1:
757 _LOG.debug(
758 "Received %d records from constraints of %s", len(records), str(values)
759 )
760 for r in records:
761 _LOG.debug("- %s", str(r))
762 raise ValueError(
763 f"DataId specification for dimension {dimensionName} is not"
764 f" uniquely constrained to a single dataset by {values}."
765 f" Got {len(records)} results."
766 )
767 else:
768 raise ValueError(
769 f"DataId specification for dimension {dimensionName} matched no"
770 f" records when constrained by {values}"
771 )
773 # Get the primary key from the real dimension object
774 dimension = self.dimensions.dimensions[dimensionName]
775 if not isinstance(dimension, Dimension):
776 raise RuntimeError(
777 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
778 )
779 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
781 return newDataId, kwargs
783 def _findDatasetRef(
784 self,
785 datasetRefOrType: DatasetRef | DatasetType | str,
786 dataId: DataId | None = None,
787 *,
788 collections: Any = None,
789 predict: bool = False,
790 run: str | None = None,
791 datastore_records: bool = False,
792 **kwargs: Any,
793 ) -> DatasetRef:
794 """Shared logic for methods that start with a search for a dataset in
795 the registry.
797 Parameters
798 ----------
799 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
800 When `DatasetRef` the `dataId` should be `None`.
801 Otherwise the `DatasetType` or name thereof.
802 dataId : `dict` or `DataCoordinate`, optional
803 A `dict` of `Dimension` link name, value pairs that label the
804 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
805 should be provided as the first argument.
806 collections : Any, optional
807 Collections to be searched, overriding ``self.collections``.
808 Can be any of the types supported by the ``collections`` argument
809 to butler construction.
810 predict : `bool`, optional
811 If `True`, return a newly created `DatasetRef` with a unique
812 dataset ID if finding a reference in the `Registry` fails.
813 Defaults to `False`.
814 run : `str`, optional
815 Run collection name to use for creating `DatasetRef` for predicted
816 datasets. Only used if ``predict`` is `True`.
817 datastore_records : `bool`, optional
818 If `True` add datastore records to returned `DatasetRef`.
819 **kwargs
820 Additional keyword arguments used to augment or construct a
821 `DataId`. See `DataId` parameters.
823 Returns
824 -------
825 ref : `DatasetRef`
826 A reference to the dataset identified by the given arguments.
827 This can be the same dataset reference as given if it was
828 resolved.
830 Raises
831 ------
832 LookupError
833 Raised if no matching dataset exists in the `Registry` (and
834 ``predict`` is `False`).
835 ValueError
836 Raised if a resolved `DatasetRef` was passed as an input, but it
837 differs from the one found in the registry.
838 TypeError
839 Raised if no collections were provided.
840 """
841 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
842 if isinstance(datasetRefOrType, DatasetRef):
843 if collections is not None:
844 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
845 # May need to retrieve datastore records if requested.
846 if datastore_records and datasetRefOrType._datastore_records is None:
847 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
848 return datasetRefOrType
849 timespan: Timespan | None = None
851 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
853 if datasetType.isCalibration():
854 # Because this is a calibration dataset, first try to make a
855 # standardize the data ID without restricting the dimensions to
856 # those of the dataset type requested, because there may be extra
857 # dimensions that provide temporal information for a validity-range
858 # lookup.
859 dataId = DataCoordinate.standardize(
860 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
861 )
862 if dataId.dimensions.temporal:
863 dataId = self._registry.expandDataId(dataId)
864 timespan = dataId.timespan
865 else:
866 # Standardize the data ID to just the dimensions of the dataset
867 # type instead of letting registry.findDataset do it, so we get the
868 # result even if no dataset is found.
869 dataId = DataCoordinate.standardize(
870 dataId,
871 dimensions=datasetType.dimensions,
872 defaults=self._registry.defaults.dataId,
873 **kwargs,
874 )
875 # Always lookup the DatasetRef, even if one is given, to ensure it is
876 # present in the current collection.
877 ref = self.find_dataset(
878 datasetType,
879 dataId,
880 collections=collections,
881 timespan=timespan,
882 datastore_records=datastore_records,
883 )
884 if ref is None:
885 if predict:
886 if run is None:
887 run = self.run
888 if run is None:
889 raise TypeError("Cannot predict dataset ID/location with run=None.")
890 return DatasetRef(datasetType, dataId, run=run)
891 else:
892 if collections is None:
893 collections = self._registry.defaults.collections
894 raise LookupError(
895 f"Dataset {datasetType.name} with data ID {dataId} "
896 f"could not be found in collections {collections}."
897 )
898 if datasetType != ref.datasetType:
899 # If they differ it is because the user explicitly specified
900 # a compatible dataset type to this call rather than using the
901 # registry definition. The DatasetRef must therefore be recreated
902 # using the user definition such that the expected type is
903 # returned.
904 ref = DatasetRef(
905 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
906 )
908 return ref
910 @transactional
911 def put(
912 self,
913 obj: Any,
914 datasetRefOrType: DatasetRef | DatasetType | str,
915 /,
916 dataId: DataId | None = None,
917 *,
918 run: str | None = None,
919 **kwargs: Any,
920 ) -> DatasetRef:
921 """Store and register a dataset.
923 Parameters
924 ----------
925 obj : `object`
926 The dataset.
927 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
928 When `DatasetRef` is provided, ``dataId`` should be `None`.
929 Otherwise the `DatasetType` or name thereof. If a fully resolved
930 `DatasetRef` is given the run and ID are used directly.
931 dataId : `dict` or `DataCoordinate`
932 A `dict` of `Dimension` link name, value pairs that label the
933 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
934 should be provided as the second argument.
935 run : `str`, optional
936 The name of the run the dataset should be added to, overriding
937 ``self.run``. Not used if a resolved `DatasetRef` is provided.
938 **kwargs
939 Additional keyword arguments used to augment or construct a
940 `DataCoordinate`. See `DataCoordinate.standardize`
941 parameters. Not used if a resolve `DatasetRef` is provided.
943 Returns
944 -------
945 ref : `DatasetRef`
946 A reference to the stored dataset, updated with the correct id if
947 given.
949 Raises
950 ------
951 TypeError
952 Raised if the butler is read-only or if no run has been provided.
953 """
954 if isinstance(datasetRefOrType, DatasetRef):
955 # This is a direct put of predefined DatasetRef.
956 _LOG.debug("Butler put direct: %s", datasetRefOrType)
957 if run is not None:
958 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
959 # If registry already has a dataset with the same dataset ID,
960 # dataset type and DataId, then _importDatasets will do nothing and
961 # just return an original ref. We have to raise in this case, there
962 # is a datastore check below for that.
963 self._registry._importDatasets([datasetRefOrType], expand=True)
964 # Before trying to write to the datastore check that it does not
965 # know this dataset. This is prone to races, of course.
966 if self._datastore.knows(datasetRefOrType):
967 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
968 # Try to write dataset to the datastore, if it fails due to a race
969 # with another write, the content of stored data may be
970 # unpredictable.
971 try:
972 self._datastore.put(obj, datasetRefOrType)
973 except IntegrityError as e:
974 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
975 return datasetRefOrType
977 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
978 if not self.isWriteable():
979 raise TypeError("Butler is read-only.")
980 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
982 # Handle dimension records in dataId
983 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
985 # Add Registry Dataset entry.
986 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs)
987 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
988 self._datastore.put(obj, ref)
990 return ref
992 def getDeferred(
993 self,
994 datasetRefOrType: DatasetRef | DatasetType | str,
995 /,
996 dataId: DataId | None = None,
997 *,
998 parameters: dict | None = None,
999 collections: Any = None,
1000 storageClass: str | StorageClass | None = None,
1001 **kwargs: Any,
1002 ) -> DeferredDatasetHandle:
1003 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1004 after an immediate registry lookup.
1006 Parameters
1007 ----------
1008 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1009 When `DatasetRef` the `dataId` should be `None`.
1010 Otherwise the `DatasetType` or name thereof.
1011 dataId : `dict` or `DataCoordinate`, optional
1012 A `dict` of `Dimension` link name, value pairs that label the
1013 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1014 should be provided as the first argument.
1015 parameters : `dict`
1016 Additional StorageClass-defined options to control reading,
1017 typically used to efficiently read only a subset of the dataset.
1018 collections : Any, optional
1019 Collections to be searched, overriding ``self.collections``.
1020 Can be any of the types supported by the ``collections`` argument
1021 to butler construction.
1022 storageClass : `StorageClass` or `str`, optional
1023 The storage class to be used to override the Python type
1024 returned by this method. By default the returned type matches
1025 the dataset type definition for this dataset. Specifying a
1026 read `StorageClass` can force a different type to be returned.
1027 This type must be compatible with the original type.
1028 **kwargs
1029 Additional keyword arguments used to augment or construct a
1030 `DataId`. See `DataId` parameters.
1032 Returns
1033 -------
1034 obj : `DeferredDatasetHandle`
1035 A handle which can be used to retrieve a dataset at a later time.
1037 Raises
1038 ------
1039 LookupError
1040 Raised if no matching dataset exists in the `Registry` or
1041 datastore.
1042 ValueError
1043 Raised if a resolved `DatasetRef` was passed as an input, but it
1044 differs from the one found in the registry.
1045 TypeError
1046 Raised if no collections were provided.
1047 """
1048 if isinstance(datasetRefOrType, DatasetRef):
1049 # Do the quick check first and if that fails, check for artifact
1050 # existence. This is necessary for datastores that are configured
1051 # in trust mode where there won't be a record but there will be
1052 # a file.
1053 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
1054 ref = datasetRefOrType
1055 else:
1056 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1057 else:
1058 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1059 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1061 def get(
1062 self,
1063 datasetRefOrType: DatasetRef | DatasetType | str,
1064 /,
1065 dataId: DataId | None = None,
1066 *,
1067 parameters: dict[str, Any] | None = None,
1068 collections: Any = None,
1069 storageClass: StorageClass | str | None = None,
1070 **kwargs: Any,
1071 ) -> Any:
1072 """Retrieve a stored dataset.
1074 Parameters
1075 ----------
1076 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1077 When `DatasetRef` the `dataId` should be `None`.
1078 Otherwise the `DatasetType` or name thereof.
1079 If a resolved `DatasetRef`, the associated dataset
1080 is returned directly without additional querying.
1081 dataId : `dict` or `DataCoordinate`
1082 A `dict` of `Dimension` link name, value pairs that label the
1083 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1084 should be provided as the first argument.
1085 parameters : `dict`
1086 Additional StorageClass-defined options to control reading,
1087 typically used to efficiently read only a subset of the dataset.
1088 collections : Any, optional
1089 Collections to be searched, overriding ``self.collections``.
1090 Can be any of the types supported by the ``collections`` argument
1091 to butler construction.
1092 storageClass : `StorageClass` or `str`, optional
1093 The storage class to be used to override the Python type
1094 returned by this method. By default the returned type matches
1095 the dataset type definition for this dataset. Specifying a
1096 read `StorageClass` can force a different type to be returned.
1097 This type must be compatible with the original type.
1098 **kwargs
1099 Additional keyword arguments used to augment or construct a
1100 `DataCoordinate`. See `DataCoordinate.standardize`
1101 parameters.
1103 Returns
1104 -------
1105 obj : `object`
1106 The dataset.
1108 Raises
1109 ------
1110 LookupError
1111 Raised if no matching dataset exists in the `Registry`.
1112 TypeError
1113 Raised if no collections were provided.
1115 Notes
1116 -----
1117 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1118 this method requires that the given data ID include temporal dimensions
1119 beyond the dimensions of the dataset type itself, in order to find the
1120 dataset with the appropriate validity range. For example, a "bias"
1121 dataset with native dimensions ``{instrument, detector}`` could be
1122 fetched with a ``{instrument, detector, exposure}`` data ID, because
1123 ``exposure`` is a temporal dimension.
1124 """
1125 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1126 ref = self._findDatasetRef(
1127 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs
1128 )
1129 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
1131 def getURIs(
1132 self,
1133 datasetRefOrType: DatasetRef | DatasetType | str,
1134 /,
1135 dataId: DataId | None = None,
1136 *,
1137 predict: bool = False,
1138 collections: Any = None,
1139 run: str | None = None,
1140 **kwargs: Any,
1141 ) -> DatasetRefURIs:
1142 """Return the URIs associated with the dataset.
1144 Parameters
1145 ----------
1146 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1147 When `DatasetRef` the `dataId` should be `None`.
1148 Otherwise the `DatasetType` or name thereof.
1149 dataId : `dict` or `DataCoordinate`
1150 A `dict` of `Dimension` link name, value pairs that label the
1151 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1152 should be provided as the first argument.
1153 predict : `bool`
1154 If `True`, allow URIs to be returned of datasets that have not
1155 been written.
1156 collections : Any, optional
1157 Collections to be searched, overriding ``self.collections``.
1158 Can be any of the types supported by the ``collections`` argument
1159 to butler construction.
1160 run : `str`, optional
1161 Run to use for predictions, overriding ``self.run``.
1162 **kwargs
1163 Additional keyword arguments used to augment or construct a
1164 `DataCoordinate`. See `DataCoordinate.standardize`
1165 parameters.
1167 Returns
1168 -------
1169 uris : `DatasetRefURIs`
1170 The URI to the primary artifact associated with this dataset (if
1171 the dataset was disassembled within the datastore this may be
1172 `None`), and the URIs to any components associated with the dataset
1173 artifact. (can be empty if there are no components).
1174 """
1175 ref = self._findDatasetRef(
1176 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1177 )
1178 return self._datastore.getURIs(ref, predict)
1180 def get_dataset_type(self, name: str) -> DatasetType:
1181 return self._registry.getDatasetType(name)
1183 def get_dataset(
1184 self,
1185 id: DatasetId,
1186 *,
1187 storage_class: str | StorageClass | None = None,
1188 dimension_records: bool = False,
1189 datastore_records: bool = False,
1190 ) -> DatasetRef | None:
1191 ref = self._registry.getDataset(id)
1192 if ref is not None:
1193 if dimension_records:
1194 ref = ref.expanded(
1195 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)
1196 )
1197 if storage_class:
1198 ref = ref.overrideStorageClass(storage_class)
1199 if datastore_records:
1200 ref = self._registry.get_datastore_records(ref)
1201 return ref
1203 def find_dataset(
1204 self,
1205 dataset_type: DatasetType | str,
1206 data_id: DataId | None = None,
1207 *,
1208 collections: str | Sequence[str] | None = None,
1209 timespan: Timespan | None = None,
1210 storage_class: str | StorageClass | None = None,
1211 dimension_records: bool = False,
1212 datastore_records: bool = False,
1213 **kwargs: Any,
1214 ) -> DatasetRef | None:
1215 # Handle any parts of the dataID that are not using primary dimension
1216 # keys.
1217 if isinstance(dataset_type, str):
1218 actual_type = self.get_dataset_type(dataset_type)
1219 else:
1220 actual_type = dataset_type
1222 # Store the component for later.
1223 component_name = actual_type.component()
1224 if actual_type.isComponent():
1225 parent_type = actual_type.makeCompositeDatasetType()
1226 else:
1227 parent_type = actual_type
1229 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)
1231 ref = self._registry.findDataset(
1232 parent_type,
1233 data_id,
1234 collections=collections,
1235 timespan=timespan,
1236 datastore_records=datastore_records,
1237 **kwargs,
1238 )
1239 if ref is not None and dimension_records:
1240 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions))
1241 if ref is not None and component_name:
1242 ref = ref.makeComponentRef(component_name)
1243 if ref is not None and storage_class is not None:
1244 ref = ref.overrideStorageClass(storage_class)
1246 return ref
1248 def retrieveArtifacts(
1249 self,
1250 refs: Iterable[DatasetRef],
1251 destination: ResourcePathExpression,
1252 transfer: str = "auto",
1253 preserve_path: bool = True,
1254 overwrite: bool = False,
1255 ) -> list[ResourcePath]:
1256 # Docstring inherited.
1257 return self._datastore.retrieveArtifacts(
1258 refs,
1259 ResourcePath(destination),
1260 transfer=transfer,
1261 preserve_path=preserve_path,
1262 overwrite=overwrite,
1263 )
1265 def exists(
1266 self,
1267 dataset_ref_or_type: DatasetRef | DatasetType | str,
1268 /,
1269 data_id: DataId | None = None,
1270 *,
1271 full_check: bool = True,
1272 collections: Any = None,
1273 **kwargs: Any,
1274 ) -> DatasetExistence:
1275 # Docstring inherited.
1276 existence = DatasetExistence.UNRECOGNIZED
1278 if isinstance(dataset_ref_or_type, DatasetRef):
1279 if collections is not None:
1280 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1281 if data_id is not None:
1282 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1283 ref = dataset_ref_or_type
1284 registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
1285 if registry_ref is not None:
1286 existence |= DatasetExistence.RECORDED
1288 if dataset_ref_or_type != registry_ref:
1289 # This could mean that storage classes differ, so we should
1290 # check for that but use the registry ref for the rest of
1291 # the method.
1292 if registry_ref.is_compatible_with(dataset_ref_or_type):
1293 # Use the registry version from now on.
1294 ref = registry_ref
1295 else:
1296 raise ValueError(
1297 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1298 f"in registry but has different incompatible values ({registry_ref})."
1299 )
1300 else:
1301 try:
1302 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1303 except (LookupError, TypeError, NoDefaultCollectionError):
1304 return existence
1305 existence |= DatasetExistence.RECORDED
1307 if self._datastore.knows(ref):
1308 existence |= DatasetExistence.DATASTORE
1310 if full_check:
1311 if self._datastore.exists(ref):
1312 existence |= DatasetExistence._ARTIFACT
1313 elif existence.value != DatasetExistence.UNRECOGNIZED.value:
1314 # Do not add this flag if we have no other idea about a dataset.
1315 existence |= DatasetExistence(DatasetExistence._ASSUMED)
1317 return existence
1319 def _exists_many(
1320 self,
1321 refs: Iterable[DatasetRef],
1322 /,
1323 *,
1324 full_check: bool = True,
1325 ) -> dict[DatasetRef, DatasetExistence]:
1326 # Docstring inherited.
1327 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1329 # Registry does not have a bulk API to check for a ref.
1330 for ref in refs:
1331 registry_ref = self._registry.getDataset(ref.id)
1332 if registry_ref is not None:
1333 # It is possible, albeit unlikely, that the given ref does
1334 # not match the one in registry even though the UUID matches.
1335 # When checking a single ref we raise, but it's impolite to
1336 # do that when potentially hundreds of refs are being checked.
1337 # We could change the API to only accept UUIDs and that would
1338 # remove the ability to even check and remove the worry
1339 # about differing storage classes. Given the ongoing discussion
1340 # on refs vs UUIDs and whether to raise or have a new
1341 # private flag, treat this as a private API for now.
1342 existence[ref] |= DatasetExistence.RECORDED
1344 # Ask datastore if it knows about these refs.
1345 knows = self._datastore.knows_these(refs)
1346 for ref, known in knows.items():
1347 if known:
1348 existence[ref] |= DatasetExistence.DATASTORE
1350 if full_check:
1351 mexists = self._datastore.mexists(refs)
1352 for ref, exists in mexists.items():
1353 if exists:
1354 existence[ref] |= DatasetExistence._ARTIFACT
1355 else:
1356 # Do not set this flag if nothing is known about the dataset.
1357 for ref in existence:
1358 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1359 existence[ref] |= DatasetExistence._ASSUMED
1361 return existence
1363 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1364 # Docstring inherited.
1365 if not self.isWriteable():
1366 raise TypeError("Butler is read-only.")
1367 names = list(names)
1368 refs: list[DatasetRef] = []
1369 for name in names:
1370 collectionType = self._registry.getCollectionType(name)
1371 if collectionType is not CollectionType.RUN:
1372 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1373 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
1374 with self._datastore.transaction(), self._registry.transaction():
1375 if unstore:
1376 self._datastore.trash(refs)
1377 else:
1378 self._datastore.forget(refs)
1379 for name in names:
1380 self._registry.removeCollection(name)
1381 if unstore:
1382 # Point of no return for removing artifacts
1383 self._datastore.emptyTrash()
1385 def pruneDatasets(
1386 self,
1387 refs: Iterable[DatasetRef],
1388 *,
1389 disassociate: bool = True,
1390 unstore: bool = False,
1391 tags: Iterable[str] = (),
1392 purge: bool = False,
1393 ) -> None:
1394 # docstring inherited from LimitedButler
1396 if not self.isWriteable():
1397 raise TypeError("Butler is read-only.")
1398 if purge:
1399 if not disassociate:
1400 raise TypeError("Cannot pass purge=True without disassociate=True.")
1401 if not unstore:
1402 raise TypeError("Cannot pass purge=True without unstore=True.")
1403 elif disassociate:
1404 tags = tuple(tags)
1405 if not tags:
1406 raise TypeError("No tags provided but disassociate=True.")
1407 for tag in tags:
1408 collectionType = self._registry.getCollectionType(tag)
1409 if collectionType is not CollectionType.TAGGED:
1410 raise TypeError(
1411 f"Cannot disassociate from collection '{tag}' "
1412 f"of non-TAGGED type {collectionType.name}."
1413 )
1414 # Transform possibly-single-pass iterable into something we can iterate
1415 # over multiple times.
1416 refs = list(refs)
1417 # Pruning a component of a DatasetRef makes no sense since registry
1418 # doesn't know about components and datastore might not store
1419 # components in a separate file
1420 for ref in refs:
1421 if ref.datasetType.component():
1422 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1423 # We don't need an unreliable Datastore transaction for this, because
1424 # we've been extra careful to ensure that Datastore.trash only involves
1425 # mutating the Registry (it can _look_ at Datastore-specific things,
1426 # but shouldn't change them), and hence all operations here are
1427 # Registry operations.
1428 with self._datastore.transaction(), self._registry.transaction():
1429 if unstore:
1430 self._datastore.trash(refs)
1431 if purge:
1432 self._registry.removeDatasets(refs)
1433 elif disassociate:
1434 assert tags, "Guaranteed by earlier logic in this function."
1435 for tag in tags:
1436 self._registry.disassociate(tag, refs)
1437 # We've exited the Registry transaction, and apparently committed.
1438 # (if there was an exception, everything rolled back, and it's as if
1439 # nothing happened - and we never get here).
1440 # Datastore artifacts are not yet gone, but they're clearly marked
1441 # as trash, so if we fail to delete now because of (e.g.) filesystem
1442 # problems we can try again later, and if manual administrative
1443 # intervention is required, it's pretty clear what that should entail:
1444 # deleting everything on disk and in private Datastore tables that is
1445 # in the dataset_location_trash table.
1446 if unstore:
1447 # Point of no return for removing artifacts
1448 self._datastore.emptyTrash()
1450 @transactional
1451 def ingest(
1452 self,
1453 *datasets: FileDataset,
1454 transfer: str | None = "auto",
1455 record_validation_info: bool = True,
1456 ) -> None:
1457 # Docstring inherited.
1458 if not self.isWriteable():
1459 raise TypeError("Butler is read-only.")
1461 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1462 if not datasets:
1463 return
1465 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1467 # We need to reorganize all the inputs so that they are grouped
1468 # by dataset type and run. Multiple refs in a single FileDataset
1469 # are required to share the run and dataset type.
1470 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list)
1472 # Track DataIDs that are being ingested so we can spot issues early
1473 # with duplication. Retain previous FileDataset so we can report it.
1474 groupedDataIds: MutableMapping[
1475 tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
1476 ] = defaultdict(dict)
1478 # And the nested loop that populates it:
1479 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1480 # Somewhere to store pre-existing refs if we have an
1481 # execution butler.
1482 existingRefs: list[DatasetRef] = []
1484 for ref in dataset.refs:
1485 group_key = (ref.datasetType, ref.run)
1487 if ref.dataId in groupedDataIds[group_key]:
1488 raise ConflictingDefinitionError(
1489 f"Ingest conflict. Dataset {dataset.path} has same"
1490 " DataId as other ingest dataset"
1491 f" {groupedDataIds[group_key][ref.dataId].path} "
1492 f" ({ref.dataId})"
1493 )
1495 groupedDataIds[group_key][ref.dataId] = dataset
1497 if existingRefs:
1498 if len(dataset.refs) != len(existingRefs):
1499 # Keeping track of partially pre-existing datasets is hard
1500 # and should generally never happen. For now don't allow
1501 # it.
1502 raise ConflictingDefinitionError(
1503 f"For dataset {dataset.path} some dataIds already exist"
1504 " in registry but others do not. This is not supported."
1505 )
1507 # Store expanded form in the original FileDataset.
1508 dataset.refs = existingRefs
1509 else:
1510 groupedData[group_key].append(dataset)
1512 # Now we can bulk-insert into Registry for each DatasetType.
1513 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1514 groupedData.items(), desc="Bulk-inserting datasets by type"
1515 ):
1516 refs_to_import = []
1517 for dataset in grouped_datasets:
1518 refs_to_import.extend(dataset.refs)
1520 n_refs = len(refs_to_import)
1521 _LOG.verbose(
1522 "Importing %d ref%s of dataset type %r into run %r",
1523 n_refs,
1524 "" if n_refs == 1 else "s",
1525 datasetType.name,
1526 this_run,
1527 )
1529 # Import the refs and expand the DataCoordinates since we can't
1530 # guarantee that they are expanded and Datastore will need
1531 # the records.
1532 imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
1533 assert set(imported_refs) == set(refs_to_import)
1535 # Replace all the refs in the FileDataset with expanded versions.
1536 # Pull them off in the order we put them on the list.
1537 for dataset in grouped_datasets:
1538 n_dataset_refs = len(dataset.refs)
1539 dataset.refs = imported_refs[:n_dataset_refs]
1540 del imported_refs[:n_dataset_refs]
1542 # Bulk-insert everything into Datastore.
1543 # We do not know if any of the registry entries already existed
1544 # (_importDatasets only complains if they exist but differ) so
1545 # we have to catch IntegrityError explicitly.
1546 try:
1547 self._datastore.ingest(
1548 *datasets, transfer=transfer, record_validation_info=record_validation_info
1549 )
1550 except IntegrityError as e:
1551 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
1553 @contextlib.contextmanager
1554 def export(
1555 self,
1556 *,
1557 directory: str | None = None,
1558 filename: str | None = None,
1559 format: str | None = None,
1560 transfer: str | None = None,
1561 ) -> Iterator[RepoExportContext]:
1562 # Docstring inherited.
1563 if directory is None and transfer is not None:
1564 raise TypeError("Cannot transfer without providing a directory.")
1565 if transfer == "move":
1566 raise TypeError("Transfer may not be 'move': export is read-only")
1567 if format is None:
1568 if filename is None:
1569 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1570 else:
1571 _, format = os.path.splitext(filename)
1572 if not format:
1573 raise ValueError("Please specify a file extension to determine export format.")
1574 format = format[1:] # Strip leading ".""
1575 elif filename is None:
1576 filename = f"export.{format}"
1577 if directory is not None:
1578 filename = os.path.join(directory, filename)
1579 formats = self._config["repo_transfer_formats"]
1580 if format not in formats:
1581 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
1582 BackendClass = get_class_of(formats[format, "export"])
1583 with open(filename, "w") as stream:
1584 backend = BackendClass(stream, universe=self.dimensions)
1585 try:
1586 helper = RepoExportContext(
1587 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
1588 )
1589 with self._caching_context():
1590 yield helper
1591 except BaseException:
1592 raise
1593 else:
1594 helper._finish()
1596 def import_(
1597 self,
1598 *,
1599 directory: ResourcePathExpression | None = None,
1600 filename: ResourcePathExpression | TextIO | None = None,
1601 format: str | None = None,
1602 transfer: str | None = None,
1603 skip_dimensions: set | None = None,
1604 ) -> None:
1605 # Docstring inherited.
1606 if not self.isWriteable():
1607 raise TypeError("Butler is read-only.")
1608 if format is None:
1609 if filename is None:
1610 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1611 else:
1612 _, format = os.path.splitext(filename) # type: ignore
1613 elif filename is None:
1614 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
1615 if directory is not None:
1616 directory = ResourcePath(directory, forceDirectory=True)
1617 # mypy doesn't think this will work but it does in python >= 3.10.
1618 if isinstance(filename, ResourcePathExpression): # type: ignore
1619 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
1620 if not filename.isabs() and directory is not None:
1621 potential = directory.join(filename)
1622 exists_in_cwd = filename.exists()
1623 exists_in_dir = potential.exists()
1624 if exists_in_cwd and exists_in_dir:
1625 _LOG.warning(
1626 "A relative path for filename was specified (%s) which exists relative to cwd. "
1627 "Additionally, the file exists relative to the given search directory (%s). "
1628 "Using the export file in the given directory.",
1629 filename,
1630 potential,
1631 )
1632 # Given they specified an explicit directory and that
1633 # directory has the export file in it, assume that that
1634 # is what was meant despite the file in cwd.
1635 filename = potential
1636 elif exists_in_dir:
1637 filename = potential
1638 elif not exists_in_cwd and not exists_in_dir:
1639 # Raise early.
1640 raise FileNotFoundError(
1641 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
1642 )
1643 BackendClass: type[RepoImportBackend] = get_class_of(
1644 self._config["repo_transfer_formats"][format]["import"]
1645 )
1647 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
1648 with self._caching_context():
1649 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
1650 backend.register()
1651 with self.transaction():
1652 backend.load(
1653 self._datastore,
1654 directory=directory,
1655 transfer=transfer,
1656 skip_dimensions=skip_dimensions,
1657 )
1659 if isinstance(filename, ResourcePath):
1660 # We can not use open() here at the moment because of
1661 # DM-38589 since yaml does stream.read(8192) in a loop.
1662 stream = io.StringIO(filename.read().decode())
1663 doImport(stream)
1664 else:
1665 doImport(filename) # type: ignore
1667 def transfer_dimension_records_from(
1668 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
1669 ) -> None:
1670 # Allowed dimensions in the target butler.
1671 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1673 data_ids = {ref.dataId for ref in source_refs}
1675 dimension_records = self._extract_all_dimension_records_from_data_ids(
1676 source_butler, data_ids, elements
1677 )
1679 # Insert order is important.
1680 for element in self.dimensions.sorted(dimension_records.keys()):
1681 records = [r for r in dimension_records[element].values()]
1682 # Assume that if the record is already present that we can
1683 # use it without having to check that the record metadata
1684 # is consistent.
1685 self._registry.insertDimensionData(element, *records, skip_existing=True)
1686 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records))
1688 def _extract_all_dimension_records_from_data_ids(
1689 self,
1690 source_butler: LimitedButler | Butler,
1691 data_ids: set[DataCoordinate],
1692 allowed_elements: frozenset[DimensionElement],
1693 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1694 primary_records = self._extract_dimension_records_from_data_ids(
1695 source_butler, data_ids, allowed_elements
1696 )
1698 can_query = True if isinstance(source_butler, Butler) else False
1700 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1701 for original_element, record_mapping in primary_records.items():
1702 # Get dimensions that depend on this dimension.
1703 populated_by = self.dimensions.get_elements_populated_by(
1704 self.dimensions[original_element.name] # type: ignore
1705 )
1707 for data_id in record_mapping.keys():
1708 for element in populated_by:
1709 if element not in allowed_elements:
1710 continue
1711 if element.name == original_element.name:
1712 continue
1714 if element.name in primary_records:
1715 # If this element has already been stored avoid
1716 # re-finding records since that may lead to additional
1717 # spurious records. e.g. visit is populated_by
1718 # visit_detector_region but querying
1719 # visit_detector_region by visit will return all the
1720 # detectors for this visit -- the visit dataId does not
1721 # constrain this.
1722 # To constrain the query the original dataIds would
1723 # have to be scanned.
1724 continue
1726 if not can_query:
1727 raise RuntimeError(
1728 f"Transferring populated_by records like {element.name} requires a full Butler."
1729 )
1731 records = source_butler.registry.queryDimensionRecords( # type: ignore
1732 element.name, **data_id.mapping # type: ignore
1733 )
1734 for record in records:
1735 additional_records[record.definition].setdefault(record.dataId, record)
1737 # The next step is to walk back through the additional records to
1738 # pick up any missing content (such as visit_definition needing to
1739 # know the exposure). Want to ensure we do not request records we
1740 # already have.
1741 missing_data_ids = set()
1742 for name, record_mapping in additional_records.items():
1743 for data_id in record_mapping.keys():
1744 if data_id not in primary_records[name]:
1745 missing_data_ids.add(data_id)
1747 # Fill out the new records. Assume that these new records do not
1748 # also need to carry over additional populated_by records.
1749 secondary_records = self._extract_dimension_records_from_data_ids(
1750 source_butler, missing_data_ids, allowed_elements
1751 )
1753 # Merge the extra sets of records in with the original.
1754 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()):
1755 primary_records[name].update(record_mapping)
1757 return primary_records
1759 def _extract_dimension_records_from_data_ids(
1760 self,
1761 source_butler: LimitedButler | Butler,
1762 data_ids: set[DataCoordinate],
1763 allowed_elements: frozenset[DimensionElement],
1764 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1765 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1767 for data_id in data_ids:
1768 # Need an expanded record, if not expanded that we need a full
1769 # butler with registry (allow mocks with registry too).
1770 if not data_id.hasRecords():
1771 if registry := getattr(source_butler, "registry", None):
1772 data_id = registry.expandDataId(data_id)
1773 else:
1774 raise TypeError("Input butler needs to be a full butler to expand DataId.")
1775 # If this butler doesn't know about a dimension in the source
1776 # butler things will break later.
1777 for element_name in data_id.dimensions.elements:
1778 record = data_id.records[element_name]
1779 if record is not None and record.definition in allowed_elements:
1780 dimension_records[record.definition].setdefault(record.dataId, record)
1782 return dimension_records
1784 def transfer_from(
1785 self,
1786 source_butler: LimitedButler,
1787 source_refs: Iterable[DatasetRef],
1788 transfer: str = "auto",
1789 skip_missing: bool = True,
1790 register_dataset_types: bool = False,
1791 transfer_dimensions: bool = False,
1792 dry_run: bool = False,
1793 ) -> collections.abc.Collection[DatasetRef]:
1794 # Docstring inherited.
1795 if not self.isWriteable():
1796 raise TypeError("Butler is read-only.")
1797 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1799 # Will iterate through the refs multiple times so need to convert
1800 # to a list if this isn't a collection.
1801 if not isinstance(source_refs, collections.abc.Collection):
1802 source_refs = list(source_refs)
1804 original_count = len(source_refs)
1805 _LOG.info("Transferring %d datasets into %s", original_count, str(self))
1807 # In some situations the datastore artifact may be missing
1808 # and we do not want that registry entry to be imported.
1809 # Asking datastore is not sufficient, the records may have been
1810 # purged, we have to ask for the (predicted) URI and check
1811 # existence explicitly. Execution butler is set up exactly like
1812 # this with no datastore records.
1813 artifact_existence: dict[ResourcePath, bool] = {}
1814 if skip_missing:
1815 dataset_existence = source_butler._datastore.mexists(
1816 source_refs, artifact_existence=artifact_existence
1817 )
1818 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1819 filtered_count = len(source_refs)
1820 n_missing = original_count - filtered_count
1821 _LOG.verbose(
1822 "%d dataset%s removed because the artifact does not exist. Now have %d.",
1823 n_missing,
1824 "" if n_missing == 1 else "s",
1825 filtered_count,
1826 )
1828 # Importing requires that we group the refs by dataset type and run
1829 # before doing the import.
1830 source_dataset_types = set()
1831 grouped_refs = defaultdict(list)
1832 for ref in source_refs:
1833 grouped_refs[ref.datasetType, ref.run].append(ref)
1834 source_dataset_types.add(ref.datasetType)
1836 # Check to see if the dataset type in the source butler has
1837 # the same definition in the target butler and register missing
1838 # ones if requested. Registration must happen outside a transaction.
1839 newly_registered_dataset_types = set()
1840 for datasetType in source_dataset_types:
1841 if register_dataset_types:
1842 # Let this raise immediately if inconsistent. Continuing
1843 # on to find additional inconsistent dataset types
1844 # might result in additional unwanted dataset types being
1845 # registered.
1846 if self._registry.registerDatasetType(datasetType):
1847 newly_registered_dataset_types.add(datasetType)
1848 else:
1849 # If the dataset type is missing, let it fail immediately.
1850 target_dataset_type = self.get_dataset_type(datasetType.name)
1851 if target_dataset_type != datasetType:
1852 raise ConflictingDefinitionError(
1853 "Source butler dataset type differs from definition"
1854 f" in target butler: {datasetType} !="
1855 f" {target_dataset_type}"
1856 )
1857 if newly_registered_dataset_types:
1858 # We may have registered some even if there were inconsistencies
1859 # but should let people know (or else remove them again).
1860 _LOG.verbose(
1861 "Registered the following dataset types in the target Butler: %s",
1862 ", ".join(d.name for d in newly_registered_dataset_types),
1863 )
1864 else:
1865 _LOG.verbose("All required dataset types are known to the target Butler")
1867 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1868 if transfer_dimensions:
1869 # Collect all the dimension records for these refs.
1870 # All dimensions are to be copied but the list of valid dimensions
1871 # come from this butler's universe.
1872 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1873 dataIds = {ref.dataId for ref in source_refs}
1874 dimension_records = self._extract_all_dimension_records_from_data_ids(
1875 source_butler, dataIds, elements
1876 )
1878 handled_collections: set[str] = set()
1880 # Do all the importing in a single transaction.
1881 with self.transaction():
1882 if dimension_records and not dry_run:
1883 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
1884 # Order matters.
1885 for element in self.dimensions.sorted(dimension_records.keys()):
1886 records = [r for r in dimension_records[element].values()]
1887 # Assume that if the record is already present that we can
1888 # use it without having to check that the record metadata
1889 # is consistent.
1890 self._registry.insertDimensionData(element, *records, skip_existing=True)
1892 n_imported = 0
1893 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
1894 grouped_refs.items(), desc="Importing to registry by run and dataset type"
1895 ):
1896 if run not in handled_collections:
1897 # May need to create output collection. If source butler
1898 # has a registry, ask for documentation string.
1899 run_doc = None
1900 if registry := getattr(source_butler, "registry", None):
1901 run_doc = registry.getCollectionDocumentation(run)
1902 if not dry_run:
1903 registered = self._registry.registerRun(run, doc=run_doc)
1904 else:
1905 registered = True
1906 handled_collections.add(run)
1907 if registered:
1908 _LOG.verbose("Creating output run %s", run)
1910 n_refs = len(refs_to_import)
1911 _LOG.verbose(
1912 "Importing %d ref%s of dataset type %s into run %s",
1913 n_refs,
1914 "" if n_refs == 1 else "s",
1915 datasetType.name,
1916 run,
1917 )
1919 # Assume we are using UUIDs and the source refs will match
1920 # those imported.
1921 if not dry_run:
1922 imported_refs = self._registry._importDatasets(refs_to_import)
1923 else:
1924 imported_refs = refs_to_import
1925 assert set(imported_refs) == set(refs_to_import)
1926 n_imported += len(imported_refs)
1928 assert len(source_refs) == n_imported
1929 _LOG.verbose("Imported %d datasets into destination butler", n_imported)
1931 # Ask the datastore to transfer. The datastore has to check that
1932 # the source datastore is compatible with the target datastore.
1933 accepted, rejected = self._datastore.transfer_from(
1934 source_butler._datastore,
1935 source_refs,
1936 transfer=transfer,
1937 artifact_existence=artifact_existence,
1938 dry_run=dry_run,
1939 )
1940 if rejected:
1941 # For now, accept the registry entries but not the files.
1942 _LOG.warning(
1943 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
1944 len(rejected),
1945 len(accepted),
1946 datasetType,
1947 run,
1948 )
1950 return source_refs
1952 def validateConfiguration(
1953 self,
1954 logFailures: bool = False,
1955 datasetTypeNames: Iterable[str] | None = None,
1956 ignore: Iterable[str] | None = None,
1957 ) -> None:
1958 # Docstring inherited.
1959 if datasetTypeNames:
1960 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
1961 else:
1962 datasetTypes = list(self._registry.queryDatasetTypes())
1964 # filter out anything from the ignore list
1965 if ignore:
1966 ignore = set(ignore)
1967 datasetTypes = [
1968 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
1969 ]
1970 else:
1971 ignore = set()
1973 # For each datasetType that has an instrument dimension, create
1974 # a DatasetRef for each defined instrument
1975 datasetRefs = []
1977 # Find all the registered instruments (if "instrument" is in the
1978 # universe).
1979 if "instrument" in self.dimensions:
1980 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
1982 for datasetType in datasetTypes:
1983 if "instrument" in datasetType.dimensions:
1984 # In order to create a conforming dataset ref, create
1985 # fake DataCoordinate values for the non-instrument
1986 # dimensions. The type of the value does not matter here.
1987 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"}
1989 for instrument in instruments:
1990 datasetRef = DatasetRef(
1991 datasetType,
1992 DataCoordinate.standardize(
1993 dataId, instrument=instrument, dimensions=datasetType.dimensions
1994 ),
1995 run="validate",
1996 )
1997 datasetRefs.append(datasetRef)
1999 entities: list[DatasetType | DatasetRef] = []
2000 entities.extend(datasetTypes)
2001 entities.extend(datasetRefs)
2003 datastoreErrorStr = None
2004 try:
2005 self._datastore.validateConfiguration(entities, logFailures=logFailures)
2006 except ValidationError as e:
2007 datastoreErrorStr = str(e)
2009 # Also check that the LookupKeys used by the datastores match
2010 # registry and storage class definitions
2011 keys = self._datastore.getLookupKeys()
2013 failedNames = set()
2014 failedDataId = set()
2015 for key in keys:
2016 if key.name is not None:
2017 if key.name in ignore:
2018 continue
2020 # skip if specific datasetType names were requested and this
2021 # name does not match
2022 if datasetTypeNames and key.name not in datasetTypeNames:
2023 continue
2025 # See if it is a StorageClass or a DatasetType
2026 if key.name in self.storageClasses:
2027 pass
2028 else:
2029 try:
2030 self.get_dataset_type(key.name)
2031 except KeyError:
2032 if logFailures:
2033 _LOG.critical(
2034 "Key '%s' does not correspond to a DatasetType or StorageClass", key
2035 )
2036 failedNames.add(key)
2037 else:
2038 # Dimensions are checked for consistency when the Butler
2039 # is created and rendezvoused with a universe.
2040 pass
2042 # Check that the instrument is a valid instrument
2043 # Currently only support instrument so check for that
2044 if key.dataId:
2045 dataIdKeys = set(key.dataId)
2046 if {"instrument"} != dataIdKeys:
2047 if logFailures:
2048 _LOG.critical("Key '%s' has unsupported DataId override", key)
2049 failedDataId.add(key)
2050 elif key.dataId["instrument"] not in instruments:
2051 if logFailures:
2052 _LOG.critical("Key '%s' has unknown instrument", key)
2053 failedDataId.add(key)
2055 messages = []
2057 if datastoreErrorStr:
2058 messages.append(datastoreErrorStr)
2060 for failed, msg in (
2061 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2062 (failedDataId, "Keys with bad DataId entries: "),
2063 ):
2064 if failed:
2065 msg += ", ".join(str(k) for k in failed)
2066 messages.append(msg)
2068 if messages:
2069 raise ValidationError(";\n".join(messages))
2071 @property
2072 def collections(self) -> Sequence[str]:
2073 """The collections to search by default, in order
2074 (`~collections.abc.Sequence` [ `str` ]).
2076 This is an alias for ``self.registry.defaults.collections``. It cannot
2077 be set directly in isolation, but all defaults may be changed together
2078 by assigning a new `RegistryDefaults` instance to
2079 ``self.registry.defaults``.
2080 """
2081 return self._registry.defaults.collections
2083 @property
2084 def run(self) -> str | None:
2085 """Name of the run this butler writes outputs to by default (`str` or
2086 `None`).
2088 This is an alias for ``self.registry.defaults.run``. It cannot be set
2089 directly in isolation, but all defaults may be changed together by
2090 assigning a new `RegistryDefaults` instance to
2091 ``self.registry.defaults``.
2092 """
2093 return self._registry.defaults.run
2095 @property
2096 def registry(self) -> Registry:
2097 """The object that manages dataset metadata and relationships
2098 (`Registry`).
2100 Many operations that don't involve reading or writing butler datasets
2101 are accessible only via `Registry` methods. Eventually these methods
2102 will be replaced by equivalent `Butler` methods.
2103 """
2104 return self._registry_shim
2106 @property
2107 def dimensions(self) -> DimensionUniverse:
2108 # Docstring inherited.
2109 return self._registry.dimensions
2111 @contextlib.contextmanager
2112 def _query(self) -> Iterator[Query]:
2113 # Docstring inherited.
2114 with self._caching_context():
2115 yield DirectQuery(self._registry)
2117 def _query_data_ids(
2118 self,
2119 dimensions: DimensionGroup | Iterable[str] | str,
2120 *,
2121 data_id: DataId | None = None,
2122 where: str = "",
2123 bind: Mapping[str, Any] | None = None,
2124 expanded: bool = False,
2125 order_by: Iterable[str] | str | None = None,
2126 limit: int | None = None,
2127 offset: int | None = None,
2128 explain: bool = True,
2129 **kwargs: Any,
2130 ) -> list[DataCoordinate]:
2131 # Docstring inherited.
2132 query = DirectQuery(self._registry)
2133 result = query.data_ids(dimensions, data_id=data_id, where=where, bind=bind, **kwargs)
2134 if expanded:
2135 result = result.expanded()
2136 if order_by:
2137 result = result.order_by(*ensure_iterable(order_by))
2138 if limit is not None:
2139 result = result.limit(limit, offset)
2140 else:
2141 if offset is not None:
2142 raise TypeError("offset is specified without limit")
2143 data_ids = list(result)
2144 if explain and not data_ids:
2145 raise EmptyQueryResultError(list(result.explain_no_results()))
2146 return data_ids
2148 def _query_datasets(
2149 self,
2150 dataset_type: Any,
2151 collections: CollectionArgType | None = None,
2152 *,
2153 find_first: bool = True,
2154 data_id: DataId | None = None,
2155 where: str = "",
2156 bind: Mapping[str, Any] | None = None,
2157 expanded: bool = False,
2158 explain: bool = True,
2159 **kwargs: Any,
2160 ) -> list[DatasetRef]:
2161 # Docstring inherited.
2162 query = DirectQuery(self._registry)
2163 result = query.datasets(
2164 dataset_type,
2165 collections,
2166 find_first=find_first,
2167 data_id=data_id,
2168 where=where,
2169 bind=bind,
2170 **kwargs,
2171 )
2172 if expanded:
2173 result = result.expanded()
2174 refs = list(result)
2175 if explain and not refs:
2176 raise EmptyQueryResultError(list(result.explain_no_results()))
2177 return refs
2179 def _query_dimension_records(
2180 self,
2181 element: str,
2182 *,
2183 data_id: DataId | None = None,
2184 where: str = "",
2185 bind: Mapping[str, Any] | None = None,
2186 order_by: Iterable[str] | str | None = None,
2187 limit: int | None = None,
2188 offset: int | None = None,
2189 explain: bool = True,
2190 **kwargs: Any,
2191 ) -> list[DimensionRecord]:
2192 # Docstring inherited.
2193 query = DirectQuery(self._registry)
2194 result = query.dimension_records(element, data_id=data_id, where=where, bind=bind, **kwargs)
2195 if order_by:
2196 result = result.order_by(*ensure_iterable(order_by))
2197 if limit is not None:
2198 result = result.limit(limit, offset)
2199 else:
2200 if offset is not None:
2201 raise TypeError("offset is specified without limit")
2202 data_ids = list(result)
2203 if explain and not data_ids:
2204 raise EmptyQueryResultError(list(result.explain_no_results()))
2205 return data_ids
2207 def _preload_cache(self) -> None:
2208 """Immediately load caches that are used for common operations."""
2209 self._registry.preload_cache()
2211 _config: ButlerConfig
2212 """Configuration for this Butler instance."""
2214 _registry: SqlRegistry
2215 """The object that manages dataset metadata and relationships
2216 (`SqlRegistry`).
2218 Most operations that don't involve reading or writing butler datasets are
2219 accessible only via `SqlRegistry` methods.
2220 """
2222 datastore: Datastore
2223 """The object that manages actual dataset storage (`Datastore`).
2225 Direct user access to the datastore should rarely be necessary; the primary
2226 exception is the case where a `Datastore` implementation provides extra
2227 functionality beyond what the base class defines.
2228 """
2230 storageClasses: StorageClassFactory
2231 """An object that maps known storage class names to objects that fully
2232 describe them (`StorageClassFactory`).
2233 """
2235 _registry_shim: RegistryShim
2236 """Shim object to provide a legacy public interface for querying via the
2237 the ``registry`` property.
2238 """