Coverage for python/lsst/daf/butler/direct_butler.py: 10%
782 statements
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Butler top level classes.
29"""
30from __future__ import annotations
32__all__ = (
33 "DirectButler",
34 "ButlerValidationError",
35)
37import collections.abc
38import contextlib
39import io
40import itertools
41import logging
42import numbers
43import os
44import warnings
45from collections import Counter, defaultdict
46from collections.abc import Iterable, Iterator, Mapping, MutableMapping, Sequence
47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast
49from lsst.resources import ResourcePath, ResourcePathExpression
50from lsst.utils.introspection import get_class_of
51from lsst.utils.iteration import ensure_iterable
52from lsst.utils.logging import VERBOSE, getLogger
53from sqlalchemy.exc import IntegrityError
55from ._butler import Butler
56from ._butler_config import ButlerConfig
57from ._butler_instance_options import ButlerInstanceOptions
58from ._dataset_existence import DatasetExistence
59from ._dataset_ref import DatasetRef
60from ._dataset_type import DatasetType
61from ._deferredDatasetHandle import DeferredDatasetHandle
62from ._exceptions import EmptyQueryResultError, ValidationError
63from ._limited_butler import LimitedButler
64from ._registry_shim import RegistryShim
65from ._storage_class import StorageClass, StorageClassFactory
66from ._timespan import Timespan
67from .datastore import Datastore, NullDatastore
68from .dimensions import DataCoordinate, Dimension
69from .direct_query import DirectQuery
70from .progress import Progress
71from .registry import (
72 CollectionType,
73 ConflictingDefinitionError,
74 DataIdError,
75 MissingDatasetTypeError,
76 NoDefaultCollectionError,
77 RegistryDefaults,
78 _RegistryFactory,
79)
80from .registry.sql_registry import SqlRegistry
81from .transfers import RepoExportContext
82from .utils import transactional
84if TYPE_CHECKING:
85 from lsst.resources import ResourceHandleProtocol
87 from ._dataset_ref import DatasetId
88 from ._file_dataset import FileDataset
89 from ._query import Query
90 from .datastore import DatasetRefURIs
91 from .dimensions import (
92 DataId,
93 DataIdValue,
94 DimensionElement,
95 DimensionGroup,
96 DimensionRecord,
97 DimensionUniverse,
98 )
99 from .registry import CollectionArgType, Registry
100 from .transfers import RepoImportBackend
102_LOG = getLogger(__name__)
105class ButlerValidationError(ValidationError):
106 """There is a problem with the Butler configuration."""
108 pass
111class DirectButler(Butler): # numpydoc ignore=PR02
112 """Main entry point for the data access system.
114 Parameters
115 ----------
116 config : `ButlerConfig`
117 The configuration for this Butler instance.
118 registry : `SqlRegistry`
119 The object that manages dataset metadata and relationships.
120 datastore : Datastore
121 The object that manages actual dataset storage.
122 storageClasses : StorageClassFactory
123 An object that maps known storage class names to objects that fully
124 describe them.
126 Notes
127 -----
128 Most users should call the top-level `Butler`.``from_config`` instead of
129 using this constructor directly.
130 """
132 # This is __new__ instead of __init__ because we have to support
133 # instantiation via the legacy constructor Butler.__new__(), which
134 # reads the configuration and selects which subclass to instantiate. The
135 # interaction between __new__ and __init__ is kind of wacky in Python. If
136 # we were using __init__ here, __init__ would be called twice (once when
137 # the DirectButler instance is constructed inside Butler.from_config(), and
138 # a second time with the original arguments to Butler() when the instance
139 # is returned from Butler.__new__()).
140 def __new__(
141 cls,
142 *,
143 config: ButlerConfig,
144 registry: SqlRegistry,
145 datastore: Datastore,
146 storageClasses: StorageClassFactory,
147 ) -> DirectButler:
148 self = cast(DirectButler, super().__new__(cls))
149 self._config = config
150 self._registry = registry
151 self._datastore = datastore
152 self.storageClasses = storageClasses
154 # For execution butler the datastore needs a special
155 # dependency-inversion trick. This is not used by regular butler,
156 # but we do not have a way to distinguish regular butler from execution
157 # butler.
158 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
160 self._registry_shim = RegistryShim(self)
162 return self
164 @classmethod
165 def create_from_config(
166 cls,
167 config: ButlerConfig,
168 *,
169 options: ButlerInstanceOptions,
170 without_datastore: bool = False,
171 ) -> DirectButler:
172 """Construct a Butler instance from a configuration file.
174 Parameters
175 ----------
176 config : `ButlerConfig`
177 The configuration for this Butler instance.
178 options : `ButlerInstanceOptions`
179 Default values and other settings for the Butler instance.
180 without_datastore : `bool`, optional
181 If `True` do not attach a datastore to this butler. Any attempts
182 to use a datastore will fail.
184 Notes
185 -----
186 Most users should call the top-level `Butler`.``from_config``
187 instead of using this function directly.
188 """
189 if "run" in config or "collection" in config:
190 raise ValueError("Passing a run or collection via configuration is no longer supported.")
192 defaults = RegistryDefaults(
193 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs
194 )
195 try:
196 butlerRoot = config.get("root", config.configDir)
197 writeable = options.writeable
198 if writeable is None:
199 writeable = options.run is not None
200 registry = _RegistryFactory(config).from_config(
201 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
202 )
203 if without_datastore:
204 datastore: Datastore = NullDatastore(None, None)
205 else:
206 datastore = Datastore.fromConfig(
207 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
208 )
209 # TODO: Once datastore drops dependency on registry we can
210 # construct datastore first and pass opaque tables to registry
211 # constructor.
212 registry.make_datastore_tables(datastore.get_opaque_table_definitions())
213 storageClasses = StorageClassFactory()
214 storageClasses.addFromConfig(config)
216 return DirectButler(
217 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses
218 )
219 except Exception:
220 # Failures here usually mean that configuration is incomplete,
221 # just issue an error message which includes config file URI.
222 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.")
223 raise
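# Minimal sketch of building a DirectButler from an existing repository
# configuration; the path and collection name are hypothetical, and the
# ButlerInstanceOptions arguments mirror those used by ``_unpickle`` below:
#
#     config = ButlerConfig("/path/to/repo")
#     options = ButlerInstanceOptions(collections="my/run", run=None, writeable=False, kwargs={})
#     butler = DirectButler.create_from_config(config, options=options)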
225 def _clone(
226 self,
227 *,
228 collections: Any = None,
229 run: str | None = None,
230 inferDefaults: bool = True,
231 **kwargs: Any,
232 ) -> DirectButler:
233 # Docstring inherited
234 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
236 return DirectButler(
237 registry=self._registry.copy(defaults),
238 config=self._config,
239 datastore=self._datastore,
240 storageClasses=self.storageClasses,
241 )
243 GENERATION: ClassVar[int] = 3
244 """This is a Generation 3 Butler.
246 This attribute may be removed in the future, once the Generation 2 Butler
247 interface has been fully retired; it should only be used in transitional
248 code.
249 """
251 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
252 """Return DatasetType defined in registry given dataset type name."""
253 try:
254 return self.get_dataset_type(name)
255 except MissingDatasetTypeError:
256 return None
258 @classmethod
259 def _unpickle(
260 cls,
261 config: ButlerConfig,
262 collections: tuple[str, ...] | None,
263 run: str | None,
264 defaultDataId: dict[str, str],
265 writeable: bool,
266 ) -> DirectButler:
267 """Callable used to unpickle a Butler.
269 We prefer not to use ``Butler.__init__`` directly so we can force some
270 of its many arguments to be keyword-only (note that ``__reduce__``
271 can only invoke callables with positional arguments).
273 Parameters
274 ----------
275 config : `ButlerConfig`
276 Butler configuration, already coerced into a true `ButlerConfig`
277 instance (and hence after any search paths for overrides have been
278 utilized).
279 collections : `tuple` [ `str` ]
280 Names of the default collections to read from.
281 run : `str`, optional
282 Name of the default `~CollectionType.RUN` collection to write to.
283 defaultDataId : `dict` [ `str`, `str` ]
284 Default data ID values.
285 writeable : `bool`
286 Whether the Butler should support write operations.
288 Returns
289 -------
290 butler : `Butler`
291 A new `Butler` instance.
292 """
293 return cls.create_from_config(
294 config=config,
295 options=ButlerInstanceOptions(
296 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId
297 ),
298 )
300 def __reduce__(self) -> tuple:
301 """Support pickling."""
302 return (
303 DirectButler._unpickle,
304 (
305 self._config,
306 self.collections,
307 self.run,
308 dict(self._registry.defaults.dataId.required),
309 self._registry.isWriteable(),
310 ),
311 )
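# Because __reduce__ delegates to _unpickle (and hence create_from_config),
# a DirectButler survives a pickle round trip; a quick sketch, assuming
# ``butler`` is an existing instance:
#
#     import pickle
#
#     clone = pickle.loads(pickle.dumps(butler))
#     assert clone.isWriteable() == butler.isWriteable()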
313 def __str__(self) -> str:
314 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
315 self.collections, self.run, self._datastore, self._registry
316 )
318 def isWriteable(self) -> bool:
319 # Docstring inherited.
320 return self._registry.isWriteable()
322 def _caching_context(self) -> contextlib.AbstractContextManager[None]:
323 """Context manager that enables caching."""
324 return self._registry.caching_context()
326 @contextlib.contextmanager
327 def transaction(self) -> Iterator[None]:
328 """Context manager supporting `Butler` transactions.
330 Transactions can be nested.
331 """
332 with self._registry.transaction(), self._datastore.transaction():
333 yield
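# Sketch of transactional use (dataset type names, data ID and objects are
# hypothetical); if the block raises, both the registry and the datastore
# transactions roll back:
#
#     with butler.transaction():
#         butler.put(calexp, "calexp", data_id, run="my/run")
#         with butler.transaction():  # transactions can be nested
#             butler.put(catalog, "src", data_id, run="my/run")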
335 def _standardizeArgs(
336 self,
337 datasetRefOrType: DatasetRef | DatasetType | str,
338 dataId: DataId | None = None,
339 for_put: bool = True,
340 **kwargs: Any,
341 ) -> tuple[DatasetType, DataId | None]:
342 """Standardize the arguments passed to several Butler APIs.
344 Parameters
345 ----------
346 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
347 When a `DatasetRef` is provided, the `dataId` should be `None`.
348 Otherwise the `DatasetType` or name thereof.
349 dataId : `dict` or `DataCoordinate`
350 A `dict` of `Dimension` link name, value pairs that label the
351 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
352 should be provided as the first argument.
353 for_put : `bool`, optional
354 If `True` this call is invoked as part of a `Butler.put()`.
355 Otherwise it is assumed to be part of a `Butler.get()`. This
356 parameter is only relevant if there is dataset type
357 inconsistency.
358 **kwargs
359 Additional keyword arguments used to augment or construct a
360 `DataCoordinate`. See `DataCoordinate.standardize`
361 parameters.
363 Returns
364 -------
365 datasetType : `DatasetType`
366 A `DatasetType` instance extracted from ``datasetRefOrType``.
367 dataId : `dict` or `DataId`, optional
368 Argument that can be used (along with ``kwargs``) to construct a
369 `DataId`.
371 Notes
372 -----
373 Butler APIs that conceptually need a DatasetRef also allow passing a
374 `DatasetType` (or the name of one) and a `DataId` (or a dict and
375 keyword arguments that can be used to construct one) separately. This
376 method accepts those arguments and always returns a true `DatasetType`
377 and a `DataId` or `dict`.
379 Standardization of `dict` vs `DataId` is best handled by passing the
380 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
381 generally similarly flexible.
382 """
383 externalDatasetType: DatasetType | None = None
384 internalDatasetType: DatasetType | None = None
385 if isinstance(datasetRefOrType, DatasetRef):
386 if dataId is not None or kwargs:
387 raise ValueError("DatasetRef given, cannot use dataId as well")
388 externalDatasetType = datasetRefOrType.datasetType
389 dataId = datasetRefOrType.dataId
390 else:
391 # Don't check whether DataId is provided, because Registry APIs
392 # can usually construct a better error message when it wasn't.
393 if isinstance(datasetRefOrType, DatasetType):
394 externalDatasetType = datasetRefOrType
395 else:
396 internalDatasetType = self.get_dataset_type(datasetRefOrType)
398 # Check that they are self-consistent
399 if externalDatasetType is not None:
400 internalDatasetType = self.get_dataset_type(externalDatasetType.name)
401 if externalDatasetType != internalDatasetType:
402 # We can allow differences if they are compatible, depending
403 # on whether this is a get or a put. A get requires that
404 # the python type associated with the datastore can be
405 # converted to the user type. A put requires that the user
406 # supplied python type can be converted to the internal
407 # type expected by registry.
408 relevantDatasetType = internalDatasetType
409 if for_put:
410 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
411 else:
412 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
413 relevantDatasetType = externalDatasetType
414 if not is_compatible:
415 raise ValueError(
416 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
417 f"registry definition ({internalDatasetType})"
418 )
419 # Override the internal definition.
420 internalDatasetType = relevantDatasetType
422 assert internalDatasetType is not None
423 return internalDatasetType, dataId
425 def _rewrite_data_id(
426 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
427 ) -> tuple[DataId | None, dict[str, Any]]:
428 """Rewrite a data ID taking into account dimension records.
430 Take a Data ID and keyword args and rewrite it if necessary to
431 allow the user to specify dimension records rather than dimension
432 primary values.
434 This allows a user to include a dataId dict with keys of
435 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
436 the integer exposure ID. It also allows a string to be given
437 for a dimension value rather than the integer ID if that is more
438 convenient. For example, rather than having to specify the
439 detector with ``detector.full_name``, a string given for ``detector``
440 will be interpreted as the full name and converted to the integer
441 value.
443 Keyword arguments can also use strings for dimensions like detector
444 and exposure but python does not allow them to include ``.`` and
445 so the ``exposure.day_obs`` syntax can not be used in a keyword
446 argument.
448 Parameters
449 ----------
450 dataId : `dict` or `DataCoordinate`
451 A `dict` of `Dimension` link name, value pairs that will label the
452 `DatasetRef` within a Collection.
453 datasetType : `DatasetType`
454 The dataset type associated with this dataId. Required to
455 determine the relevant dimensions.
456 **kwargs
457 Additional keyword arguments used to augment or construct a
458 `DataId`. See `DataId` parameters.
460 Returns
461 -------
462 dataId : `dict` or `DataCoordinate`
463 The possibly rewritten dataId. If given a `DataCoordinate` and
464 no keyword arguments, the original dataId will be returned
465 unchanged.
466 **kwargs : `dict`
467 Any unused keyword arguments (would normally be empty dict).
468 """
469 # Do nothing if we have a standalone DataCoordinate.
470 if isinstance(dataId, DataCoordinate) and not kwargs:
471 return dataId, kwargs
473 # Process dimension records that are using record information
474 # rather than ids
475 newDataId: dict[str, DataIdValue] = {}
476 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
478 # If all of the dataId comes from keyword parameters we do not need
479 # to do anything here, because they cannot be of the form
480 # exposure.obs_id (a "." is not allowed in a keyword parameter).
481 if dataId:
482 for k, v in dataId.items():
483 # If we have a Dimension we do not need to do anything
484 # because it cannot be a compound key.
485 if isinstance(k, str) and "." in k:
486 # Someone is using a more human-readable dataId
487 dimensionName, record = k.split(".", 1)
488 byRecord[dimensionName][record] = v
489 elif isinstance(k, Dimension):
490 newDataId[k.name] = v
491 else:
492 newDataId[k] = v
494 # Go through the updated dataId and check the type in case someone is
495 # using an alternate key. We have already filtered out the compound
496 # keys dimensions.record format.
497 not_dimensions = {}
499 # Will need to look in the dataId and the keyword arguments
500 # and will remove them if they need to be fixed or are unrecognized.
501 for dataIdDict in (newDataId, kwargs):
502 # Use a list so we can adjust the dict safely in the loop
503 for dimensionName in list(dataIdDict):
504 value = dataIdDict[dimensionName]
505 try:
506 dimension = self.dimensions.dimensions[dimensionName]
507 except KeyError:
508 # This is not a real dimension
509 not_dimensions[dimensionName] = value
510 del dataIdDict[dimensionName]
511 continue
513 # Convert an integral type to an explicit int to simplify
514 # comparisons here
515 if isinstance(value, numbers.Integral):
516 value = int(value)
518 if not isinstance(value, dimension.primaryKey.getPythonType()):
519 for alternate in dimension.alternateKeys:
520 if isinstance(value, alternate.getPythonType()):
521 byRecord[dimensionName][alternate.name] = value
522 del dataIdDict[dimensionName]
523 _LOG.debug(
524 "Converting dimension %s to %s.%s=%s",
525 dimensionName,
526 dimensionName,
527 alternate.name,
528 value,
529 )
530 break
531 else:
532 _LOG.warning(
533 "Type mismatch found for value '%r' provided for dimension %s. "
534 "Could not find matching alternative (primary key has type %s) "
535 "so attempting to use as-is.",
536 value,
537 dimensionName,
538 dimension.primaryKey.getPythonType(),
539 )
541 # By this point kwargs and newDataId should only include valid
542 # dimensions. Merge kwargs in to the new dataId and log if there
543 # are dimensions in both (rather than calling update).
544 for k, v in kwargs.items():
545 if k in newDataId and newDataId[k] != v:
546 _LOG.debug(
547 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
548 )
549 newDataId[k] = v
550 # No need to retain any values in kwargs now.
551 kwargs = {}
553 # If we have some unrecognized dimensions we have to try to connect
554 # them to records in other dimensions. This is made more complicated
555 # by some dimensions having records with clashing names. A mitigation
556 # is that we can tell by this point which dimensions are missing
557 # for the DatasetType but this does not work for calibrations
558 # where additional dimensions can be used to constrain the temporal
559 # axis.
560 if not_dimensions:
561 # Search for all dimensions even if we have been given a value
562 # explicitly. In some cases records are given as well as the
563 # actual dimension and this should not be an error if they
564 # match.
565 mandatoryDimensions = datasetType.dimensions.names # - provided
567 candidateDimensions: set[str] = set()
568 candidateDimensions.update(mandatoryDimensions)
570 # For calibrations we may well be needing temporal dimensions
571 # so rather than always including all dimensions in the scan
572 # restrict things a little. It is still possible for there
573 # to be confusion over day_obs in visit vs exposure for example.
574 # If we are not searching calibration collections things may
575 # fail but they are going to fail anyway because of the
576 # ambiguity of the dataId...
577 if datasetType.isCalibration():
578 for dim in self.dimensions.dimensions:
579 if dim.temporal:
580 candidateDimensions.add(str(dim))
582 # Look up table for the first association with a dimension
583 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
585 # Keep track of whether an item is associated with multiple
586 # dimensions.
587 counter: Counter[str] = Counter()
588 assigned: dict[str, set[str]] = defaultdict(set)
590 # Go through the missing dimensions and associate the
591 # given names with records within those dimensions
592 matched_dims = set()
593 for dimensionName in candidateDimensions:
594 dimension = self.dimensions.dimensions[dimensionName]
595 fields = dimension.metadata.names | dimension.uniqueKeys.names
596 for field in not_dimensions:
597 if field in fields:
598 guessedAssociation[dimensionName][field] = not_dimensions[field]
599 counter[dimensionName] += 1
600 assigned[field].add(dimensionName)
601 matched_dims.add(field)
603 # Calculate the fields that matched nothing.
604 never_found = set(not_dimensions) - matched_dims
606 if never_found:
607 raise ValueError(f"Unrecognized keyword args given: {never_found}")
609 # There is a chance we have allocated a single dataId item
610 # to multiple dimensions. Need to decide which should be retained.
611 # For now assume that the most popular alternative wins.
612 # This means that day_obs with seq_num will result in
613 # exposure.day_obs and not visit.day_obs
614 # Also prefer an explicitly missing dimension over an inferred
615 # temporal dimension.
616 for fieldName, assignedDimensions in assigned.items():
617 if len(assignedDimensions) > 1:
618 # Pick the most popular (preferring mandatory dimensions)
619 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
620 if requiredButMissing:
621 candidateDimensions = requiredButMissing
622 else:
623 candidateDimensions = assignedDimensions
625 # If this is a choice between visit and exposure and
626 # neither was a required part of the dataset type,
627 # (hence in this branch) always prefer exposure over
628 # visit since exposures are always defined and visits
629 # are defined from exposures.
630 if candidateDimensions == {"exposure", "visit"}:
631 candidateDimensions = {"exposure"}
633 # Select the relevant items and get a new restricted
634 # counter.
635 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
636 duplicatesCounter: Counter[str] = Counter()
637 duplicatesCounter.update(theseCounts)
639 # Choose the most common. If they are equally common
640 # we will pick the one that was found first.
641 # Returns a list of tuples
642 selected = duplicatesCounter.most_common(1)[0][0]
644 _LOG.debug(
645 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
646 " Removed ambiguity by choosing dimension %s.",
647 fieldName,
648 ", ".join(assignedDimensions),
649 selected,
650 )
652 for candidateDimension in assignedDimensions:
653 if candidateDimension != selected:
654 del guessedAssociation[candidateDimension][fieldName]
656 # Update the record look up dict with the new associations
657 for dimensionName, values in guessedAssociation.items():
658 if values: # A dict might now be empty
659 _LOG.debug(
660 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
661 )
662 byRecord[dimensionName].update(values)
664 if byRecord:
665 # Some record specifiers were found so we need to convert
666 # them to the Id form
667 for dimensionName, values in byRecord.items():
668 if dimensionName in newDataId:
669 _LOG.debug(
670 "DataId specified explicit %s dimension value of %s in addition to"
671 " general record specifiers for it of %s. Ignoring record information.",
672 dimensionName,
673 newDataId[dimensionName],
674 str(values),
675 )
676 # Get the actual record and compare with these values.
677 try:
678 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
679 except DataIdError:
680 raise ValueError(
681 f"Could not find dimension '{dimensionName}'"
682 f" with dataId {newDataId} as part of comparing with"
683 f" record values {byRecord[dimensionName]}"
684 ) from None
685 if len(recs) == 1:
686 errmsg: list[str] = []
687 for k, v in values.items():
688 if (recval := getattr(recs[0], k)) != v:
689 errmsg.append(f"{k}({recval} != {v})")
690 if errmsg:
691 raise ValueError(
692 f"Dimension {dimensionName} in dataId has explicit value"
693 " inconsistent with records: " + ", ".join(errmsg)
694 )
695 else:
696 # Multiple matches for an explicit dimension
697 # should never happen but let downstream complain.
698 pass
699 continue
701 # Build up a WHERE expression
702 bind = dict(values.items())
703 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
705 # Hopefully we get a single record that matches
706 records = set(
707 self._registry.queryDimensionRecords(
708 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
709 )
710 )
712 if len(records) != 1:
713 if len(records) > 1:
714 # visit can have an ambiguous answer without involving
715 # visit_system. The default visit_system is defined
716 # by the instrument.
717 if (
718 dimensionName == "visit"
719 and "visit_system_membership" in self.dimensions
720 and "visit_system" in self.dimensions["instrument"].metadata
721 ):
722 instrument_records = list(
723 self._registry.queryDimensionRecords(
724 "instrument",
725 dataId=newDataId,
726 **kwargs,
727 )
728 )
729 if len(instrument_records) == 1:
730 visit_system = instrument_records[0].visit_system
731 if visit_system is None:
732 # Set to a value that will never match.
733 visit_system = -1
735 # Look up each visit in the
736 # visit_system_membership records.
737 for rec in records:
738 membership = list(
739 self._registry.queryDimensionRecords(
740 # Use bind to allow zero results.
741 # This is a fully-specified query.
742 "visit_system_membership",
743 where="instrument = inst AND visit_system = system AND visit = v",
744 bind=dict(
745 inst=instrument_records[0].name, system=visit_system, v=rec.id
746 ),
747 )
748 )
749 if membership:
750 # This record is the right answer.
751 records = {rec}
752 break
754 # The ambiguity may have been resolved so check again.
755 if len(records) > 1:
756 _LOG.debug(
757 "Received %d records from constraints of %s", len(records), str(values)
758 )
759 for r in records:
760 _LOG.debug("- %s", str(r))
761 raise ValueError(
762 f"DataId specification for dimension {dimensionName} is not"
763 f" uniquely constrained to a single dataset by {values}."
764 f" Got {len(records)} results."
765 )
766 else:
767 raise ValueError(
768 f"DataId specification for dimension {dimensionName} matched no"
769 f" records when constrained by {values}"
770 )
772 # Get the primary key from the real dimension object
773 dimension = self.dimensions.dimensions[dimensionName]
774 if not isinstance(dimension, Dimension):
775 raise RuntimeError(
776 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
777 )
778 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
780 return newDataId, kwargs
782 def _findDatasetRef(
783 self,
784 datasetRefOrType: DatasetRef | DatasetType | str,
785 dataId: DataId | None = None,
786 *,
787 collections: Any = None,
788 predict: bool = False,
789 run: str | None = None,
790 datastore_records: bool = False,
791 **kwargs: Any,
792 ) -> DatasetRef:
793 """Shared logic for methods that start with a search for a dataset in
794 the registry.
796 Parameters
797 ----------
798 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
799 When a `DatasetRef` is provided, the `dataId` should be `None`.
800 Otherwise the `DatasetType` or name thereof.
801 dataId : `dict` or `DataCoordinate`, optional
802 A `dict` of `Dimension` link name, value pairs that label the
803 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
804 should be provided as the first argument.
805 collections : Any, optional
806 Collections to be searched, overriding ``self.collections``.
807 Can be any of the types supported by the ``collections`` argument
808 to butler construction.
809 predict : `bool`, optional
810 If `True`, return a newly created `DatasetRef` with a unique
811 dataset ID if finding a reference in the `Registry` fails.
812 Defaults to `False`.
813 run : `str`, optional
814 Run collection name to use for creating `DatasetRef` for predicted
815 datasets. Only used if ``predict`` is `True`.
816 datastore_records : `bool`, optional
817 If `True` add datastore records to returned `DatasetRef`.
818 **kwargs
819 Additional keyword arguments used to augment or construct a
820 `DataId`. See `DataId` parameters.
822 Returns
823 -------
824 ref : `DatasetRef`
825 A reference to the dataset identified by the given arguments.
826 This can be the same dataset reference as given if it was
827 resolved.
829 Raises
830 ------
831 LookupError
832 Raised if no matching dataset exists in the `Registry` (and
833 ``predict`` is `False`).
834 ValueError
835 Raised if a resolved `DatasetRef` was passed as an input, but it
836 differs from the one found in the registry.
837 TypeError
838 Raised if no collections were provided.
839 """
840 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
841 if isinstance(datasetRefOrType, DatasetRef):
842 if collections is not None:
843 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
844 # May need to retrieve datastore records if requested.
845 if datastore_records and datasetRefOrType._datastore_records is None:
846 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
847 return datasetRefOrType
848 timespan: Timespan | None = None
850 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
852 if datasetType.isCalibration():
853 # Because this is a calibration dataset, first try to standardize
854 # the data ID without restricting the dimensions to
855 # those of the dataset type requested, because there may be extra
856 # dimensions that provide temporal information for a validity-range
857 # lookup.
858 dataId = DataCoordinate.standardize(
859 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
860 )
861 if dataId.dimensions.temporal:
862 dataId = self._registry.expandDataId(dataId)
863 timespan = dataId.timespan
864 else:
865 # Standardize the data ID to just the dimensions of the dataset
866 # type instead of letting registry.findDataset do it, so we get the
867 # result even if no dataset is found.
868 dataId = DataCoordinate.standardize(
869 dataId,
870 dimensions=datasetType.dimensions,
871 defaults=self._registry.defaults.dataId,
872 **kwargs,
873 )
874 # Always lookup the DatasetRef, even if one is given, to ensure it is
875 # present in the current collection.
876 ref = self.find_dataset(
877 datasetType,
878 dataId,
879 collections=collections,
880 timespan=timespan,
881 datastore_records=datastore_records,
882 )
883 if ref is None:
884 if predict:
885 if run is None:
886 run = self.run
887 if run is None:
888 raise TypeError("Cannot predict dataset ID/location with run=None.")
889 return DatasetRef(datasetType, dataId, run=run)
890 else:
891 if collections is None:
892 collections = self._registry.defaults.collections
893 raise LookupError(
894 f"Dataset {datasetType.name} with data ID {dataId} "
895 f"could not be found in collections {collections}."
896 )
897 if datasetType != ref.datasetType:
898 # If they differ it is because the user explicitly specified
899 # a compatible dataset type to this call rather than using the
900 # registry definition. The DatasetRef must therefore be recreated
901 # using the user definition such that the expected type is
902 # returned.
903 ref = DatasetRef(
904 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
905 )
907 return ref
909 @transactional
910 def put(
911 self,
912 obj: Any,
913 datasetRefOrType: DatasetRef | DatasetType | str,
914 /,
915 dataId: DataId | None = None,
916 *,
917 run: str | None = None,
918 **kwargs: Any,
919 ) -> DatasetRef:
920 """Store and register a dataset.
922 Parameters
923 ----------
924 obj : `object`
925 The dataset.
926 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
927 When `DatasetRef` is provided, ``dataId`` should be `None`.
928 Otherwise the `DatasetType` or name thereof. If a fully resolved
929 `DatasetRef` is given the run and ID are used directly.
930 dataId : `dict` or `DataCoordinate`
931 A `dict` of `Dimension` link name, value pairs that label the
932 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
933 should be provided as the second argument.
934 run : `str`, optional
935 The name of the run the dataset should be added to, overriding
936 ``self.run``. Not used if a resolved `DatasetRef` is provided.
937 **kwargs
938 Additional keyword arguments used to augment or construct a
939 `DataCoordinate`. See `DataCoordinate.standardize`
940 parameters. Not used if a resolved `DatasetRef` is provided.
942 Returns
943 -------
944 ref : `DatasetRef`
945 A reference to the stored dataset, updated with the correct id if
946 given.
948 Raises
949 ------
950 TypeError
951 Raised if the butler is read-only or if no run has been provided.
952 """
953 if isinstance(datasetRefOrType, DatasetRef):
954 # This is a direct put of predefined DatasetRef.
955 _LOG.debug("Butler put direct: %s", datasetRefOrType)
956 if run is not None:
957 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
958 # If registry already has a dataset with the same dataset ID,
959 # dataset type and DataId, then _importDatasets will do nothing and
960 # just return the original ref. We have to raise in this case;
961 # the datastore check below handles that.
962 self._registry._importDatasets([datasetRefOrType], expand=True)
963 # Before trying to write to the datastore check that it does not
964 # know this dataset. This is prone to races, of course.
965 if self._datastore.knows(datasetRefOrType):
966 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
967 # Try to write the dataset to the datastore; if it fails due to a race
968 # with another write, the content of stored data may be
969 # unpredictable.
970 try:
971 self._datastore.put(obj, datasetRefOrType)
972 except IntegrityError as e:
973 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
974 return datasetRefOrType
976 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
977 if not self.isWriteable():
978 raise TypeError("Butler is read-only.")
979 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
981 # Handle dimension records in dataId
982 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
984 # Add Registry Dataset entry.
985 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs)
986 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
987 self._datastore.put(obj, ref)
989 return ref
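# Usage sketches for put(); the dataset type, dimension values and run
# collection are hypothetical:
#
#     # Unresolved form: registry assigns the dataset ID.
#     ref = butler.put(calexp, "calexp", instrument="HSC", detector=42,
#                      visit=903334, run="my/run")
#
#     # Resolved form: a predefined DatasetRef supplies its own run and ID.
#     butler.put(calexp, existing_ref)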
991 def getDeferred(
992 self,
993 datasetRefOrType: DatasetRef | DatasetType | str,
994 /,
995 dataId: DataId | None = None,
996 *,
997 parameters: dict | None = None,
998 collections: Any = None,
999 storageClass: str | StorageClass | None = None,
1000 **kwargs: Any,
1001 ) -> DeferredDatasetHandle:
1002 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1003 after an immediate registry lookup.
1005 Parameters
1006 ----------
1007 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1008 When a `DatasetRef` is provided, the `dataId` should be `None`.
1009 Otherwise the `DatasetType` or name thereof.
1010 dataId : `dict` or `DataCoordinate`, optional
1011 A `dict` of `Dimension` link name, value pairs that label the
1012 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1013 should be provided as the first argument.
1014 parameters : `dict`
1015 Additional StorageClass-defined options to control reading,
1016 typically used to efficiently read only a subset of the dataset.
1017 collections : Any, optional
1018 Collections to be searched, overriding ``self.collections``.
1019 Can be any of the types supported by the ``collections`` argument
1020 to butler construction.
1021 storageClass : `StorageClass` or `str`, optional
1022 The storage class to be used to override the Python type
1023 returned by this method. By default the returned type matches
1024 the dataset type definition for this dataset. Specifying a
1025 read `StorageClass` can force a different type to be returned.
1026 This type must be compatible with the original type.
1027 **kwargs
1028 Additional keyword arguments used to augment or construct a
1029 `DataId`. See `DataId` parameters.
1031 Returns
1032 -------
1033 obj : `DeferredDatasetHandle`
1034 A handle which can be used to retrieve a dataset at a later time.
1036 Raises
1037 ------
1038 LookupError
1039 Raised if no matching dataset exists in the `Registry` or
1040 datastore.
1041 ValueError
1042 Raised if a resolved `DatasetRef` was passed as an input, but it
1043 differs from the one found in the registry.
1044 TypeError
1045 Raised if no collections were provided.
1046 """
1047 if isinstance(datasetRefOrType, DatasetRef):
1048 # Do the quick check first and if that fails, check for artifact
1049 # existence. This is necessary for datastores that are configured
1050 # in trust mode where there won't be a record but there will be
1051 # a file.
1052 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
1053 ref = datasetRefOrType
1054 else:
1055 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1056 else:
1057 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1058 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
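# Sketch of deferred retrieval (names and parameters hypothetical): the
# registry lookup happens immediately, the datastore read only when get()
# is called on the handle:
#
#     handle = butler.getDeferred("calexp", data_id, collections="my/run",
#                                 parameters={"bbox": bbox})
#     ...
#     image = handle.get()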
1060 def get(
1061 self,
1062 datasetRefOrType: DatasetRef | DatasetType | str,
1063 /,
1064 dataId: DataId | None = None,
1065 *,
1066 parameters: dict[str, Any] | None = None,
1067 collections: Any = None,
1068 storageClass: StorageClass | str | None = None,
1069 **kwargs: Any,
1070 ) -> Any:
1071 """Retrieve a stored dataset.
1073 Parameters
1074 ----------
1075 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1076 When a `DatasetRef` is provided, the `dataId` should be `None`.
1077 Otherwise the `DatasetType` or name thereof.
1078 If a resolved `DatasetRef`, the associated dataset
1079 is returned directly without additional querying.
1080 dataId : `dict` or `DataCoordinate`
1081 A `dict` of `Dimension` link name, value pairs that label the
1082 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1083 should be provided as the first argument.
1084 parameters : `dict`
1085 Additional StorageClass-defined options to control reading,
1086 typically used to efficiently read only a subset of the dataset.
1087 collections : Any, optional
1088 Collections to be searched, overriding ``self.collections``.
1089 Can be any of the types supported by the ``collections`` argument
1090 to butler construction.
1091 storageClass : `StorageClass` or `str`, optional
1092 The storage class to be used to override the Python type
1093 returned by this method. By default the returned type matches
1094 the dataset type definition for this dataset. Specifying a
1095 read `StorageClass` can force a different type to be returned.
1096 This type must be compatible with the original type.
1097 **kwargs
1098 Additional keyword arguments used to augment or construct a
1099 `DataCoordinate`. See `DataCoordinate.standardize`
1100 parameters.
1102 Returns
1103 -------
1104 obj : `object`
1105 The dataset.
1107 Raises
1108 ------
1109 LookupError
1110 Raised if no matching dataset exists in the `Registry`.
1111 TypeError
1112 Raised if no collections were provided.
1114 Notes
1115 -----
1116 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1117 this method requires that the given data ID include temporal dimensions
1118 beyond the dimensions of the dataset type itself, in order to find the
1119 dataset with the appropriate validity range. For example, a "bias"
1120 dataset with native dimensions ``{instrument, detector}`` could be
1121 fetched with a ``{instrument, detector, exposure}`` data ID, because
1122 ``exposure`` is a temporal dimension.
1123 """
1124 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1125 ref = self._findDatasetRef(
1126 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs
1127 )
1128 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
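# Sketch of the calibration lookup described in the Notes above; the
# instrument, detector and exposure values are hypothetical:
#
#     bias = butler.get("bias", instrument="HSC", detector=42, exposure=903334,
#                       collections="calib/runs")
#
# The extra ``exposure`` dimension supplies the timespan used to pick the
# dataset with the matching validity range.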
1130 def getURIs(
1131 self,
1132 datasetRefOrType: DatasetRef | DatasetType | str,
1133 /,
1134 dataId: DataId | None = None,
1135 *,
1136 predict: bool = False,
1137 collections: Any = None,
1138 run: str | None = None,
1139 **kwargs: Any,
1140 ) -> DatasetRefURIs:
1141 """Return the URIs associated with the dataset.
1143 Parameters
1144 ----------
1145 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1146 When a `DatasetRef` is provided, the `dataId` should be `None`.
1147 Otherwise the `DatasetType` or name thereof.
1148 dataId : `dict` or `DataCoordinate`
1149 A `dict` of `Dimension` link name, value pairs that label the
1150 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1151 should be provided as the first argument.
1152 predict : `bool`
1153 If `True`, allow URIs to be returned of datasets that have not
1154 been written.
1155 collections : Any, optional
1156 Collections to be searched, overriding ``self.collections``.
1157 Can be any of the types supported by the ``collections`` argument
1158 to butler construction.
1159 run : `str`, optional
1160 Run to use for predictions, overriding ``self.run``.
1161 **kwargs
1162 Additional keyword arguments used to augment or construct a
1163 `DataCoordinate`. See `DataCoordinate.standardize`
1164 parameters.
1166 Returns
1167 -------
1168 uris : `DatasetRefURIs`
1169 The URI to the primary artifact associated with this dataset (if
1170 the dataset was disassembled within the datastore this may be
1171 `None`), and the URIs to any components associated with the dataset
1172 artifact (which can be empty if there are no components).
1173 """
1174 ref = self._findDatasetRef(
1175 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1176 )
1177 return self._datastore.getURIs(ref, predict)
1179 def get_dataset_type(self, name: str) -> DatasetType:
1180 return self._registry.getDatasetType(name)
1182 def get_dataset(
1183 self,
1184 id: DatasetId,
1185 *,
1186 storage_class: str | StorageClass | None = None,
1187 dimension_records: bool = False,
1188 datastore_records: bool = False,
1189 ) -> DatasetRef | None:
1190 ref = self._registry.getDataset(id)
1191 if ref is not None:
1192 if dimension_records:
1193 ref = ref.expanded(
1194 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)
1195 )
1196 if storage_class:
1197 ref = ref.overrideStorageClass(storage_class)
1198 if datastore_records:
1199 ref = self._registry.get_datastore_records(ref)
1200 return ref
1202 def find_dataset(
1203 self,
1204 dataset_type: DatasetType | str,
1205 data_id: DataId | None = None,
1206 *,
1207 collections: str | Sequence[str] | None = None,
1208 timespan: Timespan | None = None,
1209 storage_class: str | StorageClass | None = None,
1210 dimension_records: bool = False,
1211 datastore_records: bool = False,
1212 **kwargs: Any,
1213 ) -> DatasetRef | None:
1214 # Handle any parts of the dataID that are not using primary dimension
1215 # keys.
1216 if isinstance(dataset_type, str):
1217 actual_type = self.get_dataset_type(dataset_type)
1218 else:
1219 actual_type = dataset_type
1221 # Store the component for later.
1222 component_name = actual_type.component()
1223 if actual_type.isComponent():
1224 parent_type = actual_type.makeCompositeDatasetType()
1225 else:
1226 parent_type = actual_type
1228 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)
1230 ref = self._registry.findDataset(
1231 parent_type,
1232 data_id,
1233 collections=collections,
1234 timespan=timespan,
1235 datastore_records=datastore_records,
1236 **kwargs,
1237 )
1238 if ref is not None and dimension_records:
1239 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions))
1240 if ref is not None and component_name:
1241 ref = ref.makeComponentRef(component_name)
1242 if ref is not None and storage_class is not None:
1243 ref = ref.overrideStorageClass(storage_class)
1245 return ref
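# Sketch of a registry-only lookup that returns a resolved DatasetRef (or
# None) without reading any data; the names and values are hypothetical:
#
#     ref = butler.find_dataset("calexp", instrument="HSC", detector=42,
#                               visit=903334, collections="my/run")
#     if ref is not None:
#         calexp = butler.get(ref)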
1247 def retrieveArtifacts(
1248 self,
1249 refs: Iterable[DatasetRef],
1250 destination: ResourcePathExpression,
1251 transfer: str = "auto",
1252 preserve_path: bool = True,
1253 overwrite: bool = False,
1254 ) -> list[ResourcePath]:
1255 # Docstring inherited.
1256 return self._datastore.retrieveArtifacts(
1257 refs,
1258 ResourcePath(destination),
1259 transfer=transfer,
1260 preserve_path=preserve_path,
1261 overwrite=overwrite,
1262 )
1264 def exists(
1265 self,
1266 dataset_ref_or_type: DatasetRef | DatasetType | str,
1267 /,
1268 data_id: DataId | None = None,
1269 *,
1270 full_check: bool = True,
1271 collections: Any = None,
1272 **kwargs: Any,
1273 ) -> DatasetExistence:
1274 # Docstring inherited.
1275 existence = DatasetExistence.UNRECOGNIZED
1277 if isinstance(dataset_ref_or_type, DatasetRef):
1278 if collections is not None:
1279 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1280 if data_id is not None:
1281 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1282 ref = dataset_ref_or_type
1283 registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
1284 if registry_ref is not None:
1285 existence |= DatasetExistence.RECORDED
1287 if dataset_ref_or_type != registry_ref:
1288 # This could mean that storage classes differ, so we should
1289 # check for that but use the registry ref for the rest of
1290 # the method.
1291 if registry_ref.is_compatible_with(dataset_ref_or_type):
1292 # Use the registry version from now on.
1293 ref = registry_ref
1294 else:
1295 raise ValueError(
1296 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1297 f"in registry but has different incompatible values ({registry_ref})."
1298 )
1299 else:
1300 try:
1301 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1302 except (LookupError, TypeError, NoDefaultCollectionError):
1303 return existence
1304 existence |= DatasetExistence.RECORDED
1306 if self._datastore.knows(ref):
1307 existence |= DatasetExistence.DATASTORE
1309 if full_check:
1310 if self._datastore.exists(ref):
1311 existence |= DatasetExistence._ARTIFACT
1312 elif existence.value != DatasetExistence.UNRECOGNIZED.value:
1313 # Do not add this flag if we have no other idea about a dataset.
1314 existence |= DatasetExistence(DatasetExistence._ASSUMED)
1316 return existence
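# Sketch of interpreting the result; DatasetExistence is used as a flag
# enum above (|=), so individual bits are assumed testable with "&":
#
#     existence = butler.exists("calexp", data_id, collections="my/run")
#     if existence & DatasetExistence.RECORDED and existence & DatasetExistence.DATASTORE:
#         ...  # known to both registry and datastore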
1318 def _exists_many(
1319 self,
1320 refs: Iterable[DatasetRef],
1321 /,
1322 *,
1323 full_check: bool = True,
1324 ) -> dict[DatasetRef, DatasetExistence]:
1325 # Docstring inherited.
1326 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1328 # Registry does not have a bulk API to check for a ref.
1329 for ref in refs:
1330 registry_ref = self._registry.getDataset(ref.id)
1331 if registry_ref is not None:
1332 # It is possible, albeit unlikely, that the given ref does
1333 # not match the one in registry even though the UUID matches.
1334 # When checking a single ref we raise, but it's impolite to
1335 # do that when potentially hundreds of refs are being checked.
1336 # We could change the API to only accept UUIDs and that would
1337 # remove the ability to even check and remove the worry
1338 # about differing storage classes. Given the ongoing discussion
1339 # on refs vs UUIDs and whether to raise or have a new
1340 # private flag, treat this as a private API for now.
1341 existence[ref] |= DatasetExistence.RECORDED
1343 # Ask datastore if it knows about these refs.
1344 knows = self._datastore.knows_these(refs)
1345 for ref, known in knows.items():
1346 if known:
1347 existence[ref] |= DatasetExistence.DATASTORE
1349 if full_check:
1350 mexists = self._datastore.mexists(refs)
1351 for ref, exists in mexists.items():
1352 if exists:
1353 existence[ref] |= DatasetExistence._ARTIFACT
1354 else:
1355 # Do not set this flag if nothing is known about the dataset.
1356 for ref in existence:
1357 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1358 existence[ref] |= DatasetExistence._ASSUMED
1360 return existence
1362 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1363 # Docstring inherited.
1364 if not self.isWriteable():
1365 raise TypeError("Butler is read-only.")
1366 names = list(names)
1367 refs: list[DatasetRef] = []
1368 for name in names:
1369 collectionType = self._registry.getCollectionType(name)
1370 if collectionType is not CollectionType.RUN:
1371 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1372 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
1373 with self._datastore.transaction(), self._registry.transaction():
1374 if unstore:
1375 self._datastore.trash(refs)
1376 else:
1377 self._datastore.forget(refs)
1378 for name in names:
1379 self._registry.removeCollection(name)
1380 if unstore:
1381 # Point of no return for removing artifacts
1382 self._datastore.emptyTrash()
1384 def pruneDatasets(
1385 self,
1386 refs: Iterable[DatasetRef],
1387 *,
1388 disassociate: bool = True,
1389 unstore: bool = False,
1390 tags: Iterable[str] = (),
1391 purge: bool = False,
1392 ) -> None:
1393 # docstring inherited from LimitedButler
1395 if not self.isWriteable():
1396 raise TypeError("Butler is read-only.")
1397 if purge:
1398 if not disassociate:
1399 raise TypeError("Cannot pass purge=True without disassociate=True.")
1400 if not unstore:
1401 raise TypeError("Cannot pass purge=True without unstore=True.")
1402 elif disassociate:
1403 tags = tuple(tags)
1404 if not tags:
1405 raise TypeError("No tags provided but disassociate=True.")
1406 for tag in tags:
1407 collectionType = self._registry.getCollectionType(tag)
1408 if collectionType is not CollectionType.TAGGED:
1409 raise TypeError(
1410 f"Cannot disassociate from collection '{tag}' "
1411 f"of non-TAGGED type {collectionType.name}."
1412 )
1413 # Transform possibly-single-pass iterable into something we can iterate
1414 # over multiple times.
1415 refs = list(refs)
1416 # Pruning a component of a DatasetRef makes no sense since registry
1417 # doesn't know about components and datastore might not store
1418 # components in a separate file
1419 for ref in refs:
1420 if ref.datasetType.component():
1421 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1422 # We don't need an unreliable Datastore transaction for this, because
1423 # we've been extra careful to ensure that Datastore.trash only involves
1424 # mutating the Registry (it can _look_ at Datastore-specific things,
1425 # but shouldn't change them), and hence all operations here are
1426 # Registry operations.
1427 with self._datastore.transaction(), self._registry.transaction():
1428 if unstore:
1429 self._datastore.trash(refs)
1430 if purge:
1431 self._registry.removeDatasets(refs)
1432 elif disassociate:
1433 assert tags, "Guaranteed by earlier logic in this function."
1434 for tag in tags:
1435 self._registry.disassociate(tag, refs)
1436 # We've exited the Registry transaction, and apparently committed.
1437 # (if there was an exception, everything rolled back, and it's as if
1438 # nothing happened - and we never get here).
1439 # Datastore artifacts are not yet gone, but they're clearly marked
1440 # as trash, so if we fail to delete now because of (e.g.) filesystem
1441 # problems we can try again later, and if manual administrative
1442 # intervention is required, it's pretty clear what that should entail:
1443 # deleting everything on disk and in private Datastore tables that is
1444 # in the dataset_location_trash table.
1445 if unstore:
1446 # Point of no return for removing artifacts
1447 self._datastore.emptyTrash()
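# Usage sketches for pruneDatasets (the tagged collection name is hypothetical):
#
#     # Fully delete datasets: purge requires disassociate=True and unstore=True.
#     butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)
#
#     # Only remove associations from a TAGGED collection.
#     butler.pruneDatasets(refs, disassociate=True, tags=["my/tagged"])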
1449 @transactional
1450 def ingest(
1451 self,
1452 *datasets: FileDataset,
1453 transfer: str | None = "auto",
1454 record_validation_info: bool = True,
1455 ) -> None:
1456 # Docstring inherited.
1457 if not self.isWriteable():
1458 raise TypeError("Butler is read-only.")
1460 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1461 if not datasets:
1462 return
1464 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1466 # We need to reorganize all the inputs so that they are grouped
1467 # by dataset type and run. Multiple refs in a single FileDataset
1468 # are required to share the run and dataset type.
1469 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list)
1471 # Track DataIDs that are being ingested so we can spot issues early
1472 # with duplication. Retain previous FileDataset so we can report it.
1473 groupedDataIds: MutableMapping[
1474 tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
1475 ] = defaultdict(dict)
1477 # And the nested loop that populates it:
1478 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1479 # Somewhere to store pre-existing refs if we have an
1480 # execution butler.
1481 existingRefs: list[DatasetRef] = []
1483 for ref in dataset.refs:
1484 group_key = (ref.datasetType, ref.run)
1486 if ref.dataId in groupedDataIds[group_key]:
1487 raise ConflictingDefinitionError(
1488 f"Ingest conflict. Dataset {dataset.path} has same"
1489 " DataId as other ingest dataset"
1490 f" {groupedDataIds[group_key][ref.dataId].path} "
1491 f" ({ref.dataId})"
1492 )
1494 groupedDataIds[group_key][ref.dataId] = dataset
1496 if existingRefs:
1497 if len(dataset.refs) != len(existingRefs):
1498 # Keeping track of partially pre-existing datasets is hard
1499 # and should generally never happen. For now don't allow
1500 # it.
1501 raise ConflictingDefinitionError(
1502 f"For dataset {dataset.path} some dataIds already exist"
1503 " in registry but others do not. This is not supported."
1504 )
1506 # Store expanded form in the original FileDataset.
1507 dataset.refs = existingRefs
1508 else:
1509 groupedData[group_key].append(dataset)
1511 # Now we can bulk-insert into Registry for each DatasetType.
1512 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1513 groupedData.items(), desc="Bulk-inserting datasets by type"
1514 ):
1515 refs_to_import = []
1516 for dataset in grouped_datasets:
1517 refs_to_import.extend(dataset.refs)
1519 n_refs = len(refs_to_import)
1520 _LOG.verbose(
1521 "Importing %d ref%s of dataset type %r into run %r",
1522 n_refs,
1523 "" if n_refs == 1 else "s",
1524 datasetType.name,
1525 this_run,
1526 )
1528 # Import the refs and expand the DataCoordinates since we can't
1529 # guarantee that they are expanded and Datastore will need
1530 # the records.
1531 imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
1532 assert set(imported_refs) == set(refs_to_import)
1534 # Replace all the refs in the FileDataset with expanded versions.
1535 # Pull them off in the order we put them on the list.
1536 for dataset in grouped_datasets:
1537 n_dataset_refs = len(dataset.refs)
1538 dataset.refs = imported_refs[:n_dataset_refs]
1539 del imported_refs[:n_dataset_refs]
1541 # Bulk-insert everything into Datastore.
1542 # We do not know if any of the registry entries already existed
1543 # (_importDatasets only complains if they exist but differ) so
1544 # we have to catch IntegrityError explicitly.
1545 try:
1546 self._datastore.ingest(
1547 *datasets, transfer=transfer, record_validation_info=record_validation_info
1548 )
1549 except IntegrityError as e:
1550 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
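# Illustrative usage sketch for ingest() (not part of the original source).
# The repository path, run name, and file path below are hypothetical
# placeholders; ``my_dataset_type`` and ``my_data_id`` are assumed to have
# been constructed elsewhere.
#
#     butler = Butler("/path/to/repo", writeable=True, run="my_run")
#     ref = DatasetRef(my_dataset_type, my_data_id, run="my_run")
#     butler.ingest(
#         FileDataset(path="file:///data/raw.fits", refs=[ref]),
#         transfer="copy",
#     )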
1552 @contextlib.contextmanager
1553 def export(
1554 self,
1555 *,
1556 directory: str | None = None,
1557 filename: str | None = None,
1558 format: str | None = None,
1559 transfer: str | None = None,
1560 ) -> Iterator[RepoExportContext]:
1561 # Docstring inherited.
1562 if directory is None and transfer is not None:
1563 raise TypeError("Cannot transfer without providing a directory.")
1564 if transfer == "move":
1565 raise TypeError("Transfer may not be 'move': export is read-only")
1566 if format is None:
1567 if filename is None:
1568 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1569 else:
1570 _, format = os.path.splitext(filename)
1571 if not format:
1572 raise ValueError("Please specify a file extension to determine export format.")
1573 format = format[1:] # Strip leading "."
1574 elif filename is None:
1575 filename = f"export.{format}"
1576 if directory is not None:
1577 filename = os.path.join(directory, filename)
1578 formats = self._config["repo_transfer_formats"]
1579 if format not in formats:
1580 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
1581 BackendClass = get_class_of(formats[format, "export"])
1582 with open(filename, "w") as stream:
1583 backend = BackendClass(stream, universe=self.dimensions)
1584 try:
1585 helper = RepoExportContext(
1586 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
1587 )
1588 with self._caching_context():
1589 yield helper
1590 except BaseException:
1591 raise
1592 else:
1593 helper._finish()
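# Illustrative usage sketch for export() (not part of the original source);
# ``butler`` and the ``refs`` iterable are assumed to exist and the filename
# is a placeholder. The export file is only finalized when the ``with`` block
# exits without an exception, which is why _finish() runs in the ``else``
# branch above.
#
#     with butler.export(directory="exports", filename="export.yaml") as ctx:
#         ctx.saveDatasets(refs)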
1595 def import_(
1596 self,
1597 *,
1598 directory: ResourcePathExpression | None = None,
1599 filename: ResourcePathExpression | TextIO | None = None,
1600 format: str | None = None,
1601 transfer: str | None = None,
1602 skip_dimensions: set | None = None,
1603 ) -> None:
1604 # Docstring inherited.
1605 if not self.isWriteable():
1606 raise TypeError("Butler is read-only.")
1607 if format is None:
1608 if filename is None:
1609 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1610 else:
1611 _, format = os.path.splitext(filename) # type: ignore
1612 elif filename is None:
1613 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
1614 if directory is not None:
1615 directory = ResourcePath(directory, forceDirectory=True)
1616 # mypy doesn't think this will work but it does in python >= 3.10.
1617 if isinstance(filename, ResourcePathExpression): # type: ignore
1618 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
1619 if not filename.isabs() and directory is not None:
1620 potential = directory.join(filename)
1621 exists_in_cwd = filename.exists()
1622 exists_in_dir = potential.exists()
1623 if exists_in_cwd and exists_in_dir:
1624 _LOG.warning(
1625 "A relative path for filename was specified (%s) which exists relative to cwd. "
1626 "Additionally, the file exists relative to the given search directory (%s). "
1627 "Using the export file in the given directory.",
1628 filename,
1629 potential,
1630 )
1631 # Given that an explicit directory was specified and that
1632 # directory has the export file in it, assume that is what
1633 # was meant despite the file in cwd.
1634 filename = potential
1635 elif exists_in_dir:
1636 filename = potential
1637 elif not exists_in_cwd and not exists_in_dir:
1638 # Raise early.
1639 raise FileNotFoundError(
1640 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
1641 )
1642 BackendClass: type[RepoImportBackend] = get_class_of(
1643 self._config["repo_transfer_formats"][format]["import"]
1644 )
1646 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
1647 with self._caching_context():
1648 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
1649 backend.register()
1650 with self.transaction():
1651 backend.load(
1652 self._datastore,
1653 directory=directory,
1654 transfer=transfer,
1655 skip_dimensions=skip_dimensions,
1656 )
1658 if isinstance(filename, ResourcePath):
1659 # We cannot use open() here at the moment because of
1660 # DM-38589, since yaml does stream.read(8192) in a loop.
1661 stream = io.StringIO(filename.read().decode())
1662 doImport(stream)
1663 else:
1664 doImport(filename) # type: ignore
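# Illustrative usage sketch for import_() (not part of the original source);
# the directory and filename are placeholders. The format is inferred from
# the file extension when not given explicitly, and a relative filename is
# searched for in the given directory as implemented above.
#
#     butler.import_(
#         directory="exports", filename="export.yaml", transfer="direct"
#     )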
1666 def transfer_dimension_records_from(
1667 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
1668 ) -> None:
1669 # Allowed dimensions in the target butler.
1670 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1672 data_ids = {ref.dataId for ref in source_refs}
1674 dimension_records = self._extract_all_dimension_records_from_data_ids(
1675 source_butler, data_ids, elements
1676 )
1678 # Insert order is important.
1679 for element in self.dimensions.sorted(dimension_records.keys()):
1680 records = list(dimension_records[element].values())
1681 # Assume that if the record is already present that we can
1682 # use it without having to check that the record metadata
1683 # is consistent.
1684 self._registry.insertDimensionData(element, *records, skip_existing=True)
1685 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records))
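# Illustrative sketch (not part of the original source): copy only the
# dimension records needed by a set of refs, without transferring the
# datasets themselves. ``target_butler``, ``source_butler`` and ``refs``
# are assumed placeholders.
#
#     target_butler.transfer_dimension_records_from(source_butler, refs)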
1687 def _extract_all_dimension_records_from_data_ids(
1688 self,
1689 source_butler: LimitedButler | Butler,
1690 data_ids: set[DataCoordinate],
1691 allowed_elements: frozenset[DimensionElement],
1692 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1693 primary_records = self._extract_dimension_records_from_data_ids(
1694 source_butler, data_ids, allowed_elements
1695 )
1697 can_query = isinstance(source_butler, Butler)
1699 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1700 for original_element, record_mapping in primary_records.items():
1701 # Get dimensions that depend on this dimension.
1702 populated_by = self.dimensions.get_elements_populated_by(
1703 self.dimensions[original_element.name] # type: ignore
1704 )
1706 for data_id in record_mapping.keys():
1707 for element in populated_by:
1708 if element not in allowed_elements:
1709 continue
1710 if element.name == original_element.name:
1711 continue
1713 if element.name in primary_records:
1714 # If this element has already been stored, avoid
1715 # re-finding records since that may lead to additional
1716 # spurious records. E.g., visit is populated_by
1717 # visit_detector_region, but querying
1718 # visit_detector_region by visit will return all the
1719 # detectors for this visit -- the visit dataId does not
1720 # constrain this.
1721 # To constrain the query, the original dataIds would
1722 # have to be scanned.
1723 continue
1725 if not can_query:
1726 raise RuntimeError(
1727 f"Transferring populated_by records like {element.name} requires a full Butler."
1728 )
1730 records = source_butler.registry.queryDimensionRecords( # type: ignore
1731 element.name, **data_id.mapping # type: ignore
1732 )
1733 for record in records:
1734 additional_records[record.definition].setdefault(record.dataId, record)
1736 # The next step is to walk back through the additional records to
1737 # pick up any missing content (such as visit_definition needing to
1738 # know the exposure). We want to ensure we do not request records we
1739 # already have.
1740 missing_data_ids = set()
1741 for name, record_mapping in additional_records.items():
1742 for data_id in record_mapping.keys():
1743 if data_id not in primary_records[name]:
1744 missing_data_ids.add(data_id)
1746 # Fill out the new records. Assume that these new records do not
1747 # also need to carry over additional populated_by records.
1748 secondary_records = self._extract_dimension_records_from_data_ids(
1749 source_butler, missing_data_ids, allowed_elements
1750 )
1752 # Merge the extra sets of records in with the original.
1753 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()):
1754 primary_records[name].update(record_mapping)
1756 return primary_records
1758 def _extract_dimension_records_from_data_ids(
1759 self,
1760 source_butler: LimitedButler | Butler,
1761 data_ids: set[DataCoordinate],
1762 allowed_elements: frozenset[DimensionElement],
1763 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1764 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1766 for data_id in data_ids:
1767 # Need an expanded record; if it is not expanded we need a full
1768 # butler with a registry (allow mocks with registry too).
1769 if not data_id.hasRecords():
1770 if registry := getattr(source_butler, "registry", None):
1771 data_id = registry.expandDataId(data_id)
1772 else:
1773 raise TypeError("Input butler needs to be a full butler to expand DataId.")
1774 # If this butler doesn't know about a dimension in the source
1775 # butler, things will break later.
1776 for element_name in data_id.dimensions.elements:
1777 record = data_id.records[element_name]
1778 if record is not None and record.definition in allowed_elements:
1779 dimension_records[record.definition].setdefault(record.dataId, record)
1781 return dimension_records
1783 def transfer_from(
1784 self,
1785 source_butler: LimitedButler,
1786 source_refs: Iterable[DatasetRef],
1787 transfer: str = "auto",
1788 skip_missing: bool = True,
1789 register_dataset_types: bool = False,
1790 transfer_dimensions: bool = False,
1791 ) -> collections.abc.Collection[DatasetRef]:
1792 # Docstring inherited.
1793 if not self.isWriteable():
1794 raise TypeError("Butler is read-only.")
1795 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1797 # Will iterate through the refs multiple times so need to convert
1798 # to a list if this isn't a collection.
1799 if not isinstance(source_refs, collections.abc.Collection):
1800 source_refs = list(source_refs)
1802 original_count = len(source_refs)
1803 _LOG.info("Transferring %d datasets into %s", original_count, str(self))
1805 # In some situations the datastore artifact may be missing
1806 # and we do not want that registry entry to be imported.
1807 # Asking the datastore is not sufficient: the records may have been
1808 # purged, so we have to ask for the (predicted) URI and check
1809 # existence explicitly. An execution butler is set up exactly like
1810 # this, with no datastore records.
1811 artifact_existence: dict[ResourcePath, bool] = {}
1812 if skip_missing:
1813 dataset_existence = source_butler._datastore.mexists(
1814 source_refs, artifact_existence=artifact_existence
1815 )
1816 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1817 filtered_count = len(source_refs)
1818 n_missing = original_count - filtered_count
1819 _LOG.verbose(
1820 "%d dataset%s removed because the artifact does not exist. Now have %d.",
1821 n_missing,
1822 "" if n_missing == 1 else "s",
1823 filtered_count,
1824 )
1826 # Importing requires that we group the refs by dataset type and run
1827 # before doing the import.
1828 source_dataset_types = set()
1829 grouped_refs = defaultdict(list)
1830 for ref in source_refs:
1831 grouped_refs[ref.datasetType, ref.run].append(ref)
1832 source_dataset_types.add(ref.datasetType)
1834 # Check to see if the dataset type in the source butler has
1835 # the same definition in the target butler and register missing
1836 # ones if requested. Registration must happen outside a transaction.
1837 newly_registered_dataset_types = set()
1838 for datasetType in source_dataset_types:
1839 if register_dataset_types:
1840 # Let this raise immediately if inconsistent. Continuing
1841 # on to find additional inconsistent dataset types
1842 # might result in additional unwanted dataset types being
1843 # registered.
1844 if self._registry.registerDatasetType(datasetType):
1845 newly_registered_dataset_types.add(datasetType)
1846 else:
1847 # If the dataset type is missing, let it fail immediately.
1848 target_dataset_type = self.get_dataset_type(datasetType.name)
1849 if target_dataset_type != datasetType:
1850 raise ConflictingDefinitionError(
1851 "Source butler dataset type differs from definition"
1852 f" in target butler: {datasetType} !="
1853 f" {target_dataset_type}"
1854 )
1855 if newly_registered_dataset_types:
1856 # We may have registered some even if there were inconsistencies,
1857 # but we should let people know (or else remove them again).
1858 _LOG.verbose(
1859 "Registered the following dataset types in the target Butler: %s",
1860 ", ".join(d.name for d in newly_registered_dataset_types),
1861 )
1862 else:
1863 _LOG.verbose("All required dataset types are known to the target Butler")
1865 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1866 if transfer_dimensions:
1867 # Collect all the dimension records for these refs.
1868 # All dimensions are to be copied but the list of valid dimensions
1869 # comes from this butler's universe.
1870 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table)
1871 dataIds = {ref.dataId for ref in source_refs}
1872 dimension_records = self._extract_all_dimension_records_from_data_ids(
1873 source_butler, dataIds, elements
1874 )
1876 handled_collections: set[str] = set()
1878 # Do all the importing in a single transaction.
1879 with self.transaction():
1880 if dimension_records:
1881 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
1882 # Order matters.
1883 for element in self.dimensions.sorted(dimension_records.keys()):
1884 records = list(dimension_records[element].values())
1885 # Assume that if the record is already present that we can
1886 # use it without having to check that the record metadata
1887 # is consistent.
1888 self._registry.insertDimensionData(element, *records, skip_existing=True)
1890 n_imported = 0
1891 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
1892 grouped_refs.items(), desc="Importing to registry by run and dataset type"
1893 ):
1894 if run not in handled_collections:
1895 # May need to create output collection. If source butler
1896 # has a registry, ask for documentation string.
1897 run_doc = None
1898 if registry := getattr(source_butler, "registry", None):
1899 run_doc = registry.getCollectionDocumentation(run)
1900 registered = self._registry.registerRun(run, doc=run_doc)
1901 handled_collections.add(run)
1902 if registered:
1903 _LOG.verbose("Creating output run %s", run)
1905 n_refs = len(refs_to_import)
1906 _LOG.verbose(
1907 "Importing %d ref%s of dataset type %s into run %s",
1908 n_refs,
1909 "" if n_refs == 1 else "s",
1910 datasetType.name,
1911 run,
1912 )
1914 # Assume we are using UUIDs and the source refs will match
1915 # those imported.
1916 imported_refs = self._registry._importDatasets(refs_to_import)
1917 assert set(imported_refs) == set(refs_to_import)
1918 n_imported += len(imported_refs)
1920 assert len(source_refs) == n_imported
1921 _LOG.verbose("Imported %d datasets into destination butler", n_imported)
1923 # Ask the datastore to transfer. The datastore has to check that
1924 # the source datastore is compatible with the target datastore.
1925 accepted, rejected = self._datastore.transfer_from(
1926 source_butler._datastore,
1927 source_refs,
1928 transfer=transfer,
1929 artifact_existence=artifact_existence,
1930 )
1931 if rejected:
1932 # For now, accept the registry entries but not the files.
1933 _LOG.warning(
1934 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
1935 len(rejected),
1936 len(accepted),
1937 datasetType,
1938 run,
1939 )
1941 return source_refs
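# Illustrative usage sketch for transfer_from() (not part of the original
# source); both butlers and ``refs`` are assumed placeholders. Dataset types
# and dimension records can be created in the target repository as part of
# the transfer:
#
#     transferred = target_butler.transfer_from(
#         source_butler,
#         refs,
#         transfer="copy",
#         register_dataset_types=True,
#         transfer_dimensions=True,
#     )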
1943 def validateConfiguration(
1944 self,
1945 logFailures: bool = False,
1946 datasetTypeNames: Iterable[str] | None = None,
1947 ignore: Iterable[str] | None = None,
1948 ) -> None:
1949 # Docstring inherited.
1950 if datasetTypeNames:
1951 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
1952 else:
1953 datasetTypes = list(self._registry.queryDatasetTypes())
1955 # filter out anything from the ignore list
1956 if ignore:
1957 ignore = set(ignore)
1958 datasetTypes = [
1959 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
1960 ]
1961 else:
1962 ignore = set()
1964 # For each datasetType that has an instrument dimension, create
1965 # a DatasetRef for each defined instrument
1966 datasetRefs = []
1968 # Find all the registered instruments (if "instrument" is in the
1969 # universe).
1970 if "instrument" in self.dimensions:
1971 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
1973 for datasetType in datasetTypes:
1974 if "instrument" in datasetType.dimensions:
1975 # In order to create a conforming dataset ref, create
1976 # fake DataCoordinate values for the non-instrument
1977 # dimensions. The type of the value does not matter here.
1978 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"}
1980 for instrument in instruments:
1981 datasetRef = DatasetRef(
1982 datasetType,
1983 DataCoordinate.standardize(
1984 dataId, instrument=instrument, dimensions=datasetType.dimensions
1985 ),
1986 run="validate",
1987 )
1988 datasetRefs.append(datasetRef)
1990 entities: list[DatasetType | DatasetRef] = []
1991 entities.extend(datasetTypes)
1992 entities.extend(datasetRefs)
1994 datastoreErrorStr = None
1995 try:
1996 self._datastore.validateConfiguration(entities, logFailures=logFailures)
1997 except ValidationError as e:
1998 datastoreErrorStr = str(e)
2000 # Also check that the LookupKeys used by the datastores match
2001 # registry and storage class definitions
2002 keys = self._datastore.getLookupKeys()
2004 failedNames = set()
2005 failedDataId = set()
2006 for key in keys:
2007 if key.name is not None:
2008 if key.name in ignore:
2009 continue
2011 # skip if specific datasetType names were requested and this
2012 # name does not match
2013 if datasetTypeNames and key.name not in datasetTypeNames:
2014 continue
2016 # See if it is a StorageClass or a DatasetType
2017 if key.name in self.storageClasses:
2018 pass
2019 else:
2020 try:
2021 self.get_dataset_type(key.name)
2022 except KeyError:
2023 if logFailures:
2024 _LOG.critical(
2025 "Key '%s' does not correspond to a DatasetType or StorageClass", key
2026 )
2027 failedNames.add(key)
2028 else:
2029 # Dimensions are checked for consistency when the Butler
2030 # is created and rendezvoused with a universe.
2031 pass
2033 # Check that the instrument is a valid instrument.
2034 # Currently only "instrument" DataId overrides are supported.
2035 if key.dataId:
2036 dataIdKeys = set(key.dataId)
2037 if {"instrument"} != dataIdKeys:
2038 if logFailures:
2039 _LOG.critical("Key '%s' has unsupported DataId override", key)
2040 failedDataId.add(key)
2041 elif key.dataId["instrument"] not in instruments:
2042 if logFailures:
2043 _LOG.critical("Key '%s' has unknown instrument", key)
2044 failedDataId.add(key)
2046 messages = []
2048 if datastoreErrorStr:
2049 messages.append(datastoreErrorStr)
2051 for failed, msg in (
2052 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2053 (failedDataId, "Keys with bad DataId entries: "),
2054 ):
2055 if failed:
2056 msg += ", ".join(str(k) for k in failed)
2057 messages.append(msg)
2059 if messages:
2060 raise ValidationError(";\n".join(messages))
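# Illustrative usage sketch for validateConfiguration() (not part of the
# original source); the dataset type names are placeholders. All detected
# problems are aggregated into a single ValidationError:
#
#     butler.validateConfiguration(
#         logFailures=True,
#         datasetTypeNames=["raw", "calexp"],
#         ignore=["packages"],
#     )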
2062 @property
2063 def collections(self) -> Sequence[str]:
2064 """The collections to search by default, in order
2065 (`~collections.abc.Sequence` [ `str` ]).
2067 This is an alias for ``self.registry.defaults.collections``. It cannot
2068 be set directly in isolation, but all defaults may be changed together
2069 by assigning a new `RegistryDefaults` instance to
2070 ``self.registry.defaults``.
2071 """
2072 return self._registry.defaults.collections
2074 @property
2075 def run(self) -> str | None:
2076 """Name of the run this butler writes outputs to by default (`str` or
2077 `None`).
2079 This is an alias for ``self.registry.defaults.run``. It cannot be set
2080 directly in isolation, but all defaults may be changed together by
2081 assigning a new `RegistryDefaults` instance to
2082 ``self.registry.defaults``.
2083 """
2084 return self._registry.defaults.run
2086 @property
2087 def registry(self) -> Registry:
2088 """The object that manages dataset metadata and relationships
2089 (`Registry`).
2091 Many operations that don't involve reading or writing butler datasets
2092 are accessible only via `Registry` methods. Eventually these methods
2093 will be replaced by equivalent `Butler` methods.
2094 """
2095 return self._registry_shim
2097 @property
2098 def dimensions(self) -> DimensionUniverse:
2099 # Docstring inherited.
2100 return self._registry.dimensions
2102 @contextlib.contextmanager
2103 def _query(self) -> Iterator[Query]:
2104 # Docstring inherited.
2105 with self._caching_context():
2106 yield DirectQuery(self._registry)
2108 def _query_data_ids(
2109 self,
2110 dimensions: DimensionGroup | Iterable[str] | str,
2111 *,
2112 data_id: DataId | None = None,
2113 where: str = "",
2114 bind: Mapping[str, Any] | None = None,
2115 expanded: bool = False,
2116 order_by: Iterable[str] | str | None = None,
2117 limit: int | None = None,
2118 offset: int | None = None,
2119 explain: bool = True,
2120 **kwargs: Any,
2121 ) -> list[DataCoordinate]:
2122 # Docstring inherited.
2123 query = DirectQuery(self._registry)
2124 result = query.data_ids(dimensions, data_id=data_id, where=where, bind=bind, **kwargs)
2125 if expanded:
2126 result = result.expanded()
2127 if order_by:
2128 result = result.order_by(*ensure_iterable(order_by))
2129 if limit is not None:
2130 result = result.limit(limit, offset)
2131 else:
2132 if offset is not None:
2133 raise TypeError("offset is specified without limit")
2134 data_ids = list(result)
2135 if explain and not data_ids:
2136 raise EmptyQueryResultError(list(result.explain_no_results()))
2137 return data_ids
2139 def _query_datasets(
2140 self,
2141 dataset_type: Any,
2142 collections: CollectionArgType | None = None,
2143 *,
2144 find_first: bool = True,
2145 data_id: DataId | None = None,
2146 where: str = "",
2147 bind: Mapping[str, Any] | None = None,
2148 expanded: bool = False,
2149 explain: bool = True,
2150 **kwargs: Any,
2151 ) -> list[DatasetRef]:
2152 # Docstring inherited.
2153 query = DirectQuery(self._registry)
2154 result = query.datasets(
2155 dataset_type,
2156 collections,
2157 find_first=find_first,
2158 data_id=data_id,
2159 where=where,
2160 bind=bind,
2161 **kwargs,
2162 )
2163 if expanded:
2164 result = result.expanded()
2165 refs = list(result)
2166 if explain and not refs:
2167 raise EmptyQueryResultError(list(result.explain_no_results()))
2168 return refs
2170 def _query_dimension_records(
2171 self,
2172 element: str,
2173 *,
2174 data_id: DataId | None = None,
2175 where: str = "",
2176 bind: Mapping[str, Any] | None = None,
2177 order_by: Iterable[str] | str | None = None,
2178 limit: int | None = None,
2179 offset: int | None = None,
2180 explain: bool = True,
2181 **kwargs: Any,
2182 ) -> list[DimensionRecord]:
2183 # Docstring inherited.
2184 query = DirectQuery(self._registry)
2185 result = query.dimension_records(element, data_id=data_id, where=where, bind=bind, **kwargs)
2186 if order_by:
2187 result = result.order_by(*ensure_iterable(order_by))
2188 if limit is not None:
2189 result = result.limit(limit, offset)
2190 else:
2191 if offset is not None:
2192 raise TypeError("offset is specified without limit")
2193 data_ids = list(result)
2194 if explain and not data_ids:
2195 raise EmptyQueryResultError(list(result.explain_no_results()))
2196 return data_ids
2198 _config: ButlerConfig
2199 """Configuration for this Butler instance."""
2201 _registry: SqlRegistry
2202 """The object that manages dataset metadata and relationships
2203 (`SqlRegistry`).
2205 Most operations that don't involve reading or writing butler datasets are
2206 accessible only via `SqlRegistry` methods.
2207 """
2209 datastore: Datastore
2210 """The object that manages actual dataset storage (`Datastore`).
2212 Direct user access to the datastore should rarely be necessary; the primary
2213 exception is the case where a `Datastore` implementation provides extra
2214 functionality beyond what the base class defines.
2215 """
2217 storageClasses: StorageClassFactory
2218 """An object that maps known storage class names to objects that fully
2219 describe them (`StorageClassFactory`).
2220 """
2222 _registry_shim: RegistryShim
2223 """Shim object to provide a legacy public interface for querying via the
2224 the ``registry`` property.
2225 """