Coverage for python/lsst/daf/butler/direct_butler.py: 11%
795 statements
coverage.py v7.3.2, created at 2023-12-08 10:56 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Butler top level classes.
29"""
30from __future__ import annotations
32__all__ = (
33 "DirectButler",
34 "ButlerValidationError",
35)
37import collections.abc
38import contextlib
39import io
40import itertools
41import logging
42import numbers
43import os
44import warnings
45from collections import Counter, defaultdict
46from collections.abc import Iterable, Iterator, Mapping, MutableMapping, Sequence
47from typing import TYPE_CHECKING, Any, ClassVar, TextIO
49from deprecated.sphinx import deprecated
50from lsst.resources import ResourcePath, ResourcePathExpression
51from lsst.utils.introspection import get_class_of
52from lsst.utils.iteration import ensure_iterable
53from lsst.utils.logging import VERBOSE, getLogger
54from sqlalchemy.exc import IntegrityError
56from ._butler import Butler
57from ._butler_config import ButlerConfig
58from ._dataset_existence import DatasetExistence
59from ._dataset_ref import DatasetRef
60from ._dataset_type import DatasetType
61from ._deferredDatasetHandle import DeferredDatasetHandle
62from ._exceptions import EmptyQueryResultError, ValidationError
63from ._limited_butler import LimitedButler
64from ._registry_shim import RegistryShim
65from ._storage_class import StorageClass, StorageClassFactory
66from ._timespan import Timespan
67from .datastore import Datastore, NullDatastore
68from .dimensions import DataCoordinate, Dimension
69from .direct_query import DirectQuery
70from .progress import Progress
71from .registry import (
72 CollectionType,
73 ConflictingDefinitionError,
74 DataIdError,
75 MissingDatasetTypeError,
76 NoDefaultCollectionError,
77 RegistryDefaults,
78 _RegistryFactory,
79)
80from .registry.sql_registry import SqlRegistry
81from .transfers import RepoExportContext
82from .utils import transactional
84if TYPE_CHECKING:
85 from lsst.resources import ResourceHandleProtocol
87 from ._config import Config
88 from ._dataset_ref import DatasetId, DatasetIdGenEnum
89 from ._file_dataset import FileDataset
90 from ._query import Query
91 from .datastore import DatasetRefURIs
92 from .dimensions import (
93 DataId,
94 DataIdValue,
95 DimensionElement,
96 DimensionGroup,
97 DimensionRecord,
98 DimensionUniverse,
99 )
100 from .registry import CollectionArgType, Registry
101 from .transfers import RepoImportBackend
103_LOG = getLogger(__name__)
106class ButlerValidationError(ValidationError):
107 """There is a problem with the Butler configuration."""
109 pass
112class DirectButler(Butler):
113 """Main entry point for the data access system.
115 Parameters
116 ----------
117 config : `ButlerConfig`, `Config` or `str`, optional
118 Configuration. Anything acceptable to the
119 `ButlerConfig` constructor. If a directory path
120 is given the configuration will be read from a ``butler.yaml`` file in
121 that location. If `None` is given default values will be used.
122 butler : `DirectButler`, optional
123 If provided, construct a new Butler that uses the same registry and
124 datastore as the given one, but with the given collection and run.
125 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
126 arguments.
127 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
128 An expression specifying the collections to be searched (in order) when
129 reading datasets.
130 This may be a `str` collection name or an iterable thereof.
131 See :ref:`daf_butler_collection_expressions` for more information.
132 These collections are not registered automatically and must be
133 manually registered before they are used by any method, but they may be
134 manually registered after the `Butler` is initialized.
135 run : `str`, optional
136 Name of the `~CollectionType.RUN` collection new datasets should be
137 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
138 ``collections`` will be set to ``[run]``. If not `None`, this
139 collection will automatically be registered. If this is not set (and
140 ``writeable`` is not set either), a read-only butler will be created.
141 searchPaths : `list` of `str`, optional
142 Directory paths to search when calculating the full Butler
143 configuration. Not used if the supplied config is already a
144 `ButlerConfig`.
145 writeable : `bool`, optional
146 Explicitly sets whether the butler supports write operations. If not
147 provided, a read-write butler is created if any of ``run``, ``tags``,
148 or ``chains`` is non-empty.
149 inferDefaults : `bool`, optional
150 If `True` (default) infer default data ID values from the values
151 present in the datasets in ``collections``: if all collections have the
152 same value (or no value) for a governor dimension, that value will be
153 the default for that dimension. Nonexistent collections are ignored.
154 If a default value is provided explicitly for a governor dimension via
155 ``**kwargs``, no default will be inferred for that dimension.
156 without_datastore : `bool`, optional
157 If `True` do not attach a datastore to this butler. Any attempts
158 to use a datastore will fail.
159 **kwargs : `str`
160 Default data ID key-value pairs. These may only identify "governor"
161 dimensions like ``instrument`` and ``skymap``.
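Examples
--------
A minimal construction sketch; the repository path, collection name, run
name, and default data ID below are illustrative placeholders rather than
values from a real repository.

>>> from lsst.daf.butler import Butler
>>> # Read-only butler with a default search collection and a default
>>> # governor-dimension value.
>>> butler = Butler("/path/to/repo", collections="HSC/defaults", instrument="HSC")
>>> # Read-write butler that inserts new datasets into the given run.
>>> writeable_butler = Butler("/path/to/repo", run="u/someone/my-run")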
162 """
164 def __init__(
165 self,
166 config: Config | ResourcePathExpression | None = None,
167 *,
168 butler: DirectButler | None = None,
169 collections: Any = None,
170 run: str | None = None,
171 searchPaths: Sequence[ResourcePathExpression] | None = None,
172 writeable: bool | None = None,
173 inferDefaults: bool = True,
174 without_datastore: bool = False,
175 **kwargs: str,
176 ):
177 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
178 # Load registry, datastore, etc. from config or existing butler.
179 if butler is not None:
180 if config is not None or searchPaths is not None or writeable is not None:
181 raise TypeError(
182 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
183 )
184 self._registry = butler._registry.copy(defaults)
185 self._datastore = butler._datastore
186 self.storageClasses = butler.storageClasses
187 self._config: ButlerConfig = butler._config
188 else:
189 self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore)
190 try:
191 butlerRoot = self._config.get("root", self._config.configDir)
192 if writeable is None:
193 writeable = run is not None
194 self._registry = _RegistryFactory(self._config).from_config(
195 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
196 )
197 if without_datastore:
198 self._datastore = NullDatastore(None, None)
199 else:
200 self._datastore = Datastore.fromConfig(
201 self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
202 )
203 # TODO: Once datastore drops dependency on registry we can
204 # construct datastore first and pass opaque tables to registry
205 # constructor.
206 self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions())
207 self.storageClasses = StorageClassFactory()
208 self.storageClasses.addFromConfig(self._config)
209 except Exception:
210 # Failures here usually mean that the configuration is incomplete,
211 # so just issue an error message that includes the config file URI.
212 _LOG.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
213 raise
215 # For execution butler the datastore needs a special
216 # dependency-inversion trick. This is not used by regular butler,
217 # but we do not have a way to distinguish regular butler from execution
218 # butler.
219 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
221 if "run" in self._config or "collection" in self._config:
222 raise ValueError("Passing a run or collection via configuration is no longer supported.")
224 self._registry_shim = RegistryShim(self)
226 GENERATION: ClassVar[int] = 3
227 """This is a Generation 3 Butler.
229 This attribute may be removed in the future, once the Generation 2 Butler
230 interface has been fully retired; it should only be used in transitional
231 code.
232 """
234 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
235 """Return DatasetType defined in registry given dataset type name."""
236 try:
237 return self.get_dataset_type(name)
238 except MissingDatasetTypeError:
239 return None
241 @classmethod
242 def _unpickle(
243 cls,
244 config: ButlerConfig,
245 collections: tuple[str, ...] | None,
246 run: str | None,
247 defaultDataId: dict[str, str],
248 writeable: bool,
249 ) -> DirectButler:
250 """Callable used to unpickle a Butler.
252 We prefer not to use ``Butler.__init__`` directly so we can force some
253 of its many arguments to be keyword-only (note that ``__reduce__``
254 can only invoke callables with positional arguments).
256 Parameters
257 ----------
258 config : `ButlerConfig`
259 Butler configuration, already coerced into a true `ButlerConfig`
260 instance (and hence after any search paths for overrides have been
261 utilized).
262 collections : `tuple` [ `str` ]
263 Names of the default collections to read from.
264 run : `str`, optional
265 Name of the default `~CollectionType.RUN` collection to write to.
266 defaultDataId : `dict` [ `str`, `str` ]
267 Default data ID values.
268 writeable : `bool`
269 Whether the Butler should support write operations.
271 Returns
272 -------
273 butler : `Butler`
274 A new `Butler` instance.
275 """
276 # MyPy doesn't recognize that the kwargs below are totally valid; it
277 # seems to think ``**defaultDataId`` is a _positional_ argument!
278 return cls(
279 config=config,
280 collections=collections,
281 run=run,
282 writeable=writeable,
283 **defaultDataId, # type: ignore
284 )
286 def __reduce__(self) -> tuple:
287 """Support pickling."""
288 return (
289 DirectButler._unpickle,
290 (
291 self._config,
292 self.collections,
293 self.run,
294 dict(self._registry.defaults.dataId.required),
295 self._registry.isWriteable(),
296 ),
297 )
299 def __str__(self) -> str:
300 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
301 self.collections, self.run, self._datastore, self._registry
302 )
304 def isWriteable(self) -> bool:
305 # Docstring inherited.
306 return self._registry.isWriteable()
308 def _caching_context(self) -> contextlib.AbstractContextManager[None]:
309 """Context manager that enables caching."""
310 return self._registry.caching_context()
312 @contextlib.contextmanager
313 def transaction(self) -> Iterator[None]:
314 """Context manager supporting `Butler` transactions.
316 Transactions can be nested.
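Examples
--------
A minimal sketch of grouping writes into a single transaction; ``catalog``,
the dataset type, data ID, and run below are illustrative placeholders. If
an exception propagates out of the block, both registry and datastore
changes are rolled back.

>>> with butler.transaction():
...     butler.put(catalog, "sourceTable", visit=903334, instrument="HSC",
...                detector=10, run="u/someone/my-run")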
317 """
318 with self._registry.transaction(), self._datastore.transaction():
319 yield
321 def _standardizeArgs(
322 self,
323 datasetRefOrType: DatasetRef | DatasetType | str,
324 dataId: DataId | None = None,
325 for_put: bool = True,
326 **kwargs: Any,
327 ) -> tuple[DatasetType, DataId | None]:
328 """Standardize the arguments passed to several Butler APIs.
330 Parameters
331 ----------
332 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
333 When `DatasetRef` the `dataId` should be `None`.
334 Otherwise the `DatasetType` or name thereof.
335 dataId : `dict` or `DataCoordinate`
336 A `dict` of `Dimension` link name, value pairs that label the
337 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
338 should be provided as the first argument.
339 for_put : `bool`, optional
340 If `True` this call is invoked as part of a `Butler.put()`.
341 Otherwise it is assumed to be part of a `Butler.get()`. This
342 parameter is only relevant if there is dataset type
343 inconsistency.
344 **kwargs
345 Additional keyword arguments used to augment or construct a
346 `DataCoordinate`. See `DataCoordinate.standardize`
347 parameters.
349 Returns
350 -------
351 datasetType : `DatasetType`
352 A `DatasetType` instance extracted from ``datasetRefOrType``.
353 dataId : `dict` or `DataId`, optional
354 Argument that can be used (along with ``kwargs``) to construct a
355 `DataId`.
357 Notes
358 -----
359 Butler APIs that conceptually need a DatasetRef also allow passing a
360 `DatasetType` (or the name of one) and a `DataId` (or a dict and
361 keyword arguments that can be used to construct one) separately. This
362 method accepts those arguments and always returns a true `DatasetType`
363 and a `DataId` or `dict`.
365 Standardization of `dict` vs `DataId` is best handled by passing the
366 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
367 generally similarly flexible.
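Examples
--------
As an illustration of the flexibility described above, the following
`Butler.get` calls identify the same dataset (the dataset type name and
data ID values are placeholders): once with the data ID passed as keyword
arguments, and once with the same data ID passed as a `dict`.

>>> calexp = butler.get("calexp", instrument="HSC", detector=10, visit=903334)
>>> calexp = butler.get("calexp", {"instrument": "HSC", "detector": 10, "visit": 903334})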
368 """
369 externalDatasetType: DatasetType | None = None
370 internalDatasetType: DatasetType | None = None
371 if isinstance(datasetRefOrType, DatasetRef):
372 if dataId is not None or kwargs:
373 raise ValueError("DatasetRef given, cannot use dataId as well")
374 externalDatasetType = datasetRefOrType.datasetType
375 dataId = datasetRefOrType.dataId
376 else:
377 # Don't check whether DataId is provided, because Registry APIs
378 # can usually construct a better error message when it isn't.
379 if isinstance(datasetRefOrType, DatasetType):
380 externalDatasetType = datasetRefOrType
381 else:
382 internalDatasetType = self.get_dataset_type(datasetRefOrType)
384 # Check that they are self-consistent
385 if externalDatasetType is not None:
386 internalDatasetType = self.get_dataset_type(externalDatasetType.name)
387 if externalDatasetType != internalDatasetType:
388 # We can allow differences if they are compatible, depending
389 # on whether this is a get or a put. A get requires that
390 # the python type associated with the datastore can be
391 # converted to the user type. A put requires that the user
392 # supplied python type can be converted to the internal
393 # type expected by registry.
394 relevantDatasetType = internalDatasetType
395 if for_put:
396 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
397 else:
398 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
399 relevantDatasetType = externalDatasetType
400 if not is_compatible:
401 raise ValueError(
402 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
403 f"registry definition ({internalDatasetType})"
404 )
405 # Override the internal definition.
406 internalDatasetType = relevantDatasetType
408 assert internalDatasetType is not None
409 return internalDatasetType, dataId
411 def _rewrite_data_id(
412 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
413 ) -> tuple[DataId | None, dict[str, Any]]:
414 """Rewrite a data ID taking into account dimension records.
416 Take a Data ID and keyword args and rewrite it if necessary to
417 allow the user to specify dimension records rather than dimension
418 primary values.
420 This allows a user to include a dataId dict with keys of
421 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
422 the integer exposure ID. It also allows a string to be given
423 for a dimension value rather than the integer ID if that is more
424 convenient. For example, rather than having to specify the
425 detector with ``detector.full_name``, a string given for ``detector``
426 will be interpreted as the full name and converted to the integer
427 value.
429 Keyword arguments can also use strings for dimensions like detector
430 and exposure, but Python does not allow them to include ``.``, so
431 the ``exposure.day_obs`` syntax cannot be used in a keyword
432 argument.
434 Parameters
435 ----------
436 dataId : `dict` or `DataCoordinate`
437 A `dict` of `Dimension` link name, value pairs that will label the
438 `DatasetRef` within a Collection.
439 datasetType : `DatasetType`
440 The dataset type associated with this dataId. Required to
441 determine the relevant dimensions.
442 **kwargs
443 Additional keyword arguments used to augment or construct a
444 `DataId`. See `DataId` parameters.
446 Returns
447 -------
448 dataId : `dict` or `DataCoordinate`
449 The possibly rewritten dataId. If given a `DataCoordinate` and
450 no keyword arguments, the original dataId will be returned
451 unchanged.
452 **kwargs : `dict`
453 Any unused keyword arguments (normally an empty dict).
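Examples
--------
An illustrative sketch of the rewriting described above; the dimension
record names follow the default dimension universe and the values are
placeholders.

>>> dataId = {"instrument": "HSC",
...           "exposure.day_obs": 20230615,
...           "exposure.seq_num": 42,
...           "detector": "1_03"}
>>> # The record-based keys and the detector name are resolved to the
>>> # corresponding primary-key values by querying dimension records, so
>>> # the rewritten dataId uses plain "exposure" and "detector" keys.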
454 """
455 # Do nothing if we have a standalone DataCoordinate.
456 if isinstance(dataId, DataCoordinate) and not kwargs:
457 return dataId, kwargs
459 # Process dimension records that are using record information
460 # rather than ids
461 newDataId: dict[str, DataIdValue] = {}
462 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
464 # If all of the dataId comes from keyword parameters we do not need
465 # to do anything here, because they cannot be of the form
466 # exposure.obs_id ("." is not allowed in a keyword parameter).
467 if dataId:
468 for k, v in dataId.items():
469 # If we have a Dimension we do not need to do anything
470 # because it cannot be a compound key.
471 if isinstance(k, str) and "." in k:
472 # Someone is using a more human-readable dataId
473 dimensionName, record = k.split(".", 1)
474 byRecord[dimensionName][record] = v
475 elif isinstance(k, Dimension):
476 newDataId[k.name] = v
477 else:
478 newDataId[k] = v
480 # Go through the updated dataId and check the type in case someone is
481 # using an alternate key. We have already filtered out the compound
482 # dimension.record keys.
483 not_dimensions = {}
485 # Will need to look in the dataId and the keyword arguments
486 # and will remove them if they need to be fixed or are unrecognized.
487 for dataIdDict in (newDataId, kwargs):
488 # Use a list so we can adjust the dict safely in the loop
489 for dimensionName in list(dataIdDict):
490 value = dataIdDict[dimensionName]
491 try:
492 dimension = self.dimensions.dimensions[dimensionName]
493 except KeyError:
494 # This is not a real dimension
495 not_dimensions[dimensionName] = value
496 del dataIdDict[dimensionName]
497 continue
499 # Convert an integral type to an explicit int to simplify
500 # comparisons here
501 if isinstance(value, numbers.Integral):
502 value = int(value)
504 if not isinstance(value, dimension.primaryKey.getPythonType()):
505 for alternate in dimension.alternateKeys:
506 if isinstance(value, alternate.getPythonType()):
507 byRecord[dimensionName][alternate.name] = value
508 del dataIdDict[dimensionName]
509 _LOG.debug(
510 "Converting dimension %s to %s.%s=%s",
511 dimensionName,
512 dimensionName,
513 alternate.name,
514 value,
515 )
516 break
517 else:
518 _LOG.warning(
519 "Type mismatch found for value '%r' provided for dimension %s. "
520 "Could not find matching alternative (primary key has type %s) "
521 "so attempting to use as-is.",
522 value,
523 dimensionName,
524 dimension.primaryKey.getPythonType(),
525 )
527 # By this point kwargs and newDataId should only include valid
528 # dimensions. Merge kwargs in to the new dataId and log if there
529 # are dimensions in both (rather than calling update).
530 for k, v in kwargs.items():
531 if k in newDataId and newDataId[k] != v:
532 _LOG.debug(
533 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
534 )
535 newDataId[k] = v
536 # No need to retain any values in kwargs now.
537 kwargs = {}
539 # If we have some unrecognized dimensions we have to try to connect
540 # them to records in other dimensions. This is made more complicated
541 # by some dimensions having records with clashing names. A mitigation
542 # is that we can tell by this point which dimensions are missing
543 # for the DatasetType but this does not work for calibrations
544 # where additional dimensions can be used to constrain the temporal
545 # axis.
546 if not_dimensions:
547 # Search for all dimensions even if we have been given a value
548 # explicitly. In some cases records are given as well as the
549 # actual dimension and this should not be an error if they
550 # match.
551 mandatoryDimensions = datasetType.dimensions.names # - provided
553 candidateDimensions: set[str] = set()
554 candidateDimensions.update(mandatoryDimensions)
556 # For calibrations we may well be needing temporal dimensions
557 # so rather than always including all dimensions in the scan
558 # restrict things a little. It is still possible for there
559 # to be confusion over day_obs in visit vs exposure for example.
560 # If we are not searching calibration collections things may
561 # fail but they are going to fail anyway because of the
562 # ambiguity of the dataId...
563 if datasetType.isCalibration():
564 for dim in self.dimensions.dimensions:
565 if dim.temporal:
566 candidateDimensions.add(str(dim))
568 # Look up table for the first association with a dimension
569 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
571 # Keep track of whether an item is associated with multiple
572 # dimensions.
573 counter: Counter[str] = Counter()
574 assigned: dict[str, set[str]] = defaultdict(set)
576 # Go through the missing dimensions and associate the
577 # given names with records within those dimensions
578 matched_dims = set()
579 for dimensionName in candidateDimensions:
580 dimension = self.dimensions.dimensions[dimensionName]
581 fields = dimension.metadata.names | dimension.uniqueKeys.names
582 for field in not_dimensions:
583 if field in fields:
584 guessedAssociation[dimensionName][field] = not_dimensions[field]
585 counter[dimensionName] += 1
586 assigned[field].add(dimensionName)
587 matched_dims.add(field)
589 # Calculate the fields that matched nothing.
590 never_found = set(not_dimensions) - matched_dims
592 if never_found:
593 raise ValueError(f"Unrecognized keyword args given: {never_found}")
595 # There is a chance we have allocated a single dataId item
596 # to multiple dimensions. Need to decide which should be retained.
597 # For now assume that the most popular alternative wins.
598 # This means that day_obs with seq_num will result in
599 # exposure.day_obs and not visit.day_obs
600 # Also prefer an explicitly missing dimension over an inferred
601 # temporal dimension.
602 for fieldName, assignedDimensions in assigned.items():
603 if len(assignedDimensions) > 1:
604 # Pick the most popular (preferring mandatory dimensions)
605 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
606 if requiredButMissing:
607 candidateDimensions = requiredButMissing
608 else:
609 candidateDimensions = assignedDimensions
611 # If this is a choice between visit and exposure and
612 # neither was a required part of the dataset type,
613 # (hence in this branch) always prefer exposure over
614 # visit since exposures are always defined and visits
615 # are defined from exposures.
616 if candidateDimensions == {"exposure", "visit"}:
617 candidateDimensions = {"exposure"}
619 # Select the relevant items and get a new restricted
620 # counter.
621 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
622 duplicatesCounter: Counter[str] = Counter()
623 duplicatesCounter.update(theseCounts)
625 # Choose the most common. If they are equally common
626 # we will pick the one that was found first.
627 # most_common() returns a list of (key, count) tuples.
628 selected = duplicatesCounter.most_common(1)[0][0]
630 _LOG.debug(
631 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
632 " Removed ambiguity by choosing dimension %s.",
633 fieldName,
634 ", ".join(assignedDimensions),
635 selected,
636 )
638 for candidateDimension in assignedDimensions:
639 if candidateDimension != selected:
640 del guessedAssociation[candidateDimension][fieldName]
642 # Update the record look up dict with the new associations
643 for dimensionName, values in guessedAssociation.items():
644 if values: # A dict might now be empty
645 _LOG.debug(
646 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
647 )
648 byRecord[dimensionName].update(values)
650 if byRecord:
651 # Some record specifiers were found so we need to convert
652 # them to the Id form
653 for dimensionName, values in byRecord.items():
654 if dimensionName in newDataId:
655 _LOG.debug(
656 "DataId specified explicit %s dimension value of %s in addition to"
657 " general record specifiers for it of %s. Ignoring record information.",
658 dimensionName,
659 newDataId[dimensionName],
660 str(values),
661 )
662 # Get the actual record and compare with these values.
663 try:
664 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
665 except DataIdError:
666 raise ValueError(
667 f"Could not find dimension '{dimensionName}'"
668 f" with dataId {newDataId} as part of comparing with"
669 f" record values {byRecord[dimensionName]}"
670 ) from None
671 if len(recs) == 1:
672 errmsg: list[str] = []
673 for k, v in values.items():
674 if (recval := getattr(recs[0], k)) != v:
675 errmsg.append(f"{k}({recval} != {v})")
676 if errmsg:
677 raise ValueError(
678 f"Dimension {dimensionName} in dataId has explicit value"
679 " inconsistent with records: " + ", ".join(errmsg)
680 )
681 else:
682 # Multiple matches for an explicit dimension
683 # should never happen but let downstream complain.
684 pass
685 continue
687 # Build up a WHERE expression
688 bind = dict(values.items())
689 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
691 # Hopefully we get a single record that matches
692 records = set(
693 self._registry.queryDimensionRecords(
694 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
695 )
696 )
698 if len(records) != 1:
699 if len(records) > 1:
700 # visit can have an ambiguous answer without involving
701 # visit_system. The default visit_system is defined
702 # by the instrument.
703 if (
704 dimensionName == "visit"
705 and "visit_system_membership" in self.dimensions
706 and "visit_system" in self.dimensions["instrument"].metadata
707 ):
708 instrument_records = list(
709 self._registry.queryDimensionRecords(
710 "instrument",
711 dataId=newDataId,
712 **kwargs,
713 )
714 )
715 if len(instrument_records) == 1:
716 visit_system = instrument_records[0].visit_system
717 if visit_system is None:
718 # Set to a value that will never match.
719 visit_system = -1
721 # Look up each visit in the
722 # visit_system_membership records.
723 for rec in records:
724 membership = list(
725 self._registry.queryDimensionRecords(
726 # Use bind to allow zero results.
727 # This is a fully-specified query.
728 "visit_system_membership",
729 where="instrument = inst AND visit_system = system AND visit = v",
730 bind=dict(
731 inst=instrument_records[0].name, system=visit_system, v=rec.id
732 ),
733 )
734 )
735 if membership:
736 # This record is the right answer.
737 records = {rec}
738 break
740 # The ambiguity may have been resolved so check again.
741 if len(records) > 1:
742 _LOG.debug(
743 "Received %d records from constraints of %s", len(records), str(values)
744 )
745 for r in records:
746 _LOG.debug("- %s", str(r))
747 raise ValueError(
748 f"DataId specification for dimension {dimensionName} is not"
749 f" uniquely constrained to a single dataset by {values}."
750 f" Got {len(records)} results."
751 )
752 else:
753 raise ValueError(
754 f"DataId specification for dimension {dimensionName} matched no"
755 f" records when constrained by {values}"
756 )
758 # Get the primary key from the real dimension object
759 dimension = self.dimensions.dimensions[dimensionName]
760 if not isinstance(dimension, Dimension):
761 raise RuntimeError(
762 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
763 )
764 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
766 return newDataId, kwargs
768 def _findDatasetRef(
769 self,
770 datasetRefOrType: DatasetRef | DatasetType | str,
771 dataId: DataId | None = None,
772 *,
773 collections: Any = None,
774 predict: bool = False,
775 run: str | None = None,
776 datastore_records: bool = False,
777 **kwargs: Any,
778 ) -> DatasetRef:
779 """Shared logic for methods that start with a search for a dataset in
780 the registry.
782 Parameters
783 ----------
784 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
785 When `DatasetRef` the `dataId` should be `None`.
786 Otherwise the `DatasetType` or name thereof.
787 dataId : `dict` or `DataCoordinate`, optional
788 A `dict` of `Dimension` link name, value pairs that label the
789 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
790 should be provided as the first argument.
791 collections : Any, optional
792 Collections to be searched, overriding ``self.collections``.
793 Can be any of the types supported by the ``collections`` argument
794 to butler construction.
795 predict : `bool`, optional
796 If `True`, return a newly created `DatasetRef` with a unique
797 dataset ID if finding a reference in the `Registry` fails.
798 Defaults to `False`.
799 run : `str`, optional
800 Run collection name to use for creating `DatasetRef` for predicted
801 datasets. Only used if ``predict`` is `True`.
802 datastore_records : `bool`, optional
803 If `True` add datastore records to returned `DatasetRef`.
804 **kwargs
805 Additional keyword arguments used to augment or construct a
806 `DataId`. See `DataId` parameters.
808 Returns
809 -------
810 ref : `DatasetRef`
811 A reference to the dataset identified by the given arguments.
812 This can be the same dataset reference as given if it was
813 resolved.
815 Raises
816 ------
817 LookupError
818 Raised if no matching dataset exists in the `Registry` (and
819 ``predict`` is `False`).
820 ValueError
821 Raised if a resolved `DatasetRef` was passed as an input, but it
822 differs from the one found in the registry.
823 TypeError
824 Raised if no collections were provided.
825 """
826 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
827 if isinstance(datasetRefOrType, DatasetRef):
828 if collections is not None:
829 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
830 # May need to retrieve datastore records if requested.
831 if datastore_records and datasetRefOrType._datastore_records is None:
832 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
833 return datasetRefOrType
834 timespan: Timespan | None = None
836 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
838 if datasetType.isCalibration():
839 # Because this is a calibration dataset, first try to
840 # standardize the data ID without restricting the dimensions to
841 # those of the dataset type requested, because there may be extra
842 # dimensions that provide temporal information for a validity-range
843 # lookup.
844 dataId = DataCoordinate.standardize(
845 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
846 )
847 if dataId.dimensions.temporal:
848 dataId = self._registry.expandDataId(dataId)
849 timespan = dataId.timespan
850 else:
851 # Standardize the data ID to just the dimensions of the dataset
852 # type instead of letting registry.findDataset do it, so we get the
853 # result even if no dataset is found.
854 dataId = DataCoordinate.standardize(
855 dataId,
856 dimensions=datasetType.dimensions,
857 defaults=self._registry.defaults.dataId,
858 **kwargs,
859 )
860 # Always lookup the DatasetRef, even if one is given, to ensure it is
861 # present in the current collection.
862 ref = self.find_dataset(
863 datasetType,
864 dataId,
865 collections=collections,
866 timespan=timespan,
867 datastore_records=datastore_records,
868 )
869 if ref is None:
870 if predict:
871 if run is None:
872 run = self.run
873 if run is None:
874 raise TypeError("Cannot predict dataset ID/location with run=None.")
875 return DatasetRef(datasetType, dataId, run=run)
876 else:
877 if collections is None:
878 collections = self._registry.defaults.collections
879 raise LookupError(
880 f"Dataset {datasetType.name} with data ID {dataId} "
881 f"could not be found in collections {collections}."
882 )
883 if datasetType != ref.datasetType:
884 # If they differ it is because the user explicitly specified
885 # a compatible dataset type to this call rather than using the
886 # registry definition. The DatasetRef must therefore be recreated
887 # using the user definition such that the expected type is
888 # returned.
889 ref = DatasetRef(
890 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
891 )
893 return ref
895 # TODO: remove on DM-40067.
896 @transactional
897 @deprecated(
898 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
899 " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
900 " were relying on the run parameter to determine the run."
901 " Will be removed after v26.0.",
902 version="v26.0",
903 category=FutureWarning,
904 )
905 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
906 # Docstring inherited.
907 return self.put(obj, ref)
909 @transactional
910 def put(
911 self,
912 obj: Any,
913 datasetRefOrType: DatasetRef | DatasetType | str,
914 /,
915 dataId: DataId | None = None,
916 *,
917 run: str | None = None,
918 **kwargs: Any,
919 ) -> DatasetRef:
920 """Store and register a dataset.
922 Parameters
923 ----------
924 obj : `object`
925 The dataset.
926 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
927 When `DatasetRef` is provided, ``dataId`` should be `None`.
928 Otherwise the `DatasetType` or name thereof. If a fully resolved
929 `DatasetRef` is given the run and ID are used directly.
930 dataId : `dict` or `DataCoordinate`
931 A `dict` of `Dimension` link name, value pairs that label the
932 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
933 should be provided as the second argument.
934 run : `str`, optional
935 The name of the run the dataset should be added to, overriding
936 ``self.run``. Not used if a resolved `DatasetRef` is provided.
937 **kwargs
938 Additional keyword arguments used to augment or construct a
939 `DataCoordinate`. See `DataCoordinate.standardize`
940 parameters. Not used if a resolved `DatasetRef` is provided.
942 Returns
943 -------
944 ref : `DatasetRef`
945 A reference to the stored dataset, updated with the correct id if
946 given.
948 Raises
949 ------
950 TypeError
951 Raised if the butler is read-only or if no run has been provided.
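Examples
--------
A minimal sketch; ``catalog``, the dataset type, data ID, and run below
are illustrative placeholders.

>>> ref = butler.put(catalog, "sourceTable", instrument="HSC", visit=903334,
...                  detector=10, run="u/someone/my-run")
>>> # Passing a fully resolved DatasetRef as the second argument instead
>>> # stores the object using that ref's run and dataset ID directly.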
952 """
953 if isinstance(datasetRefOrType, DatasetRef):
954 # This is a direct put of predefined DatasetRef.
955 _LOG.debug("Butler put direct: %s", datasetRefOrType)
956 if run is not None:
957 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
958 # If registry already has a dataset with the same dataset ID,
959 # dataset type and DataId, then _importDatasets will do nothing and
960 # just return the original ref. We have to raise in this case;
961 # the datastore check below handles that.
962 self._registry._importDatasets([datasetRefOrType], expand=True)
963 # Before trying to write to the datastore check that it does not
964 # know this dataset. This is prone to races, of course.
965 if self._datastore.knows(datasetRefOrType):
966 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
967 # Try to write dataset to the datastore, if it fails due to a race
968 # with another write, the content of stored data may be
969 # unpredictable.
970 try:
971 self._datastore.put(obj, datasetRefOrType)
972 except IntegrityError as e:
973 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
974 return datasetRefOrType
976 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
977 if not self.isWriteable():
978 raise TypeError("Butler is read-only.")
979 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
981 # Handle dimension records in dataId
982 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
984 # Add Registry Dataset entry.
985 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs)
986 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
987 self._datastore.put(obj, ref)
989 return ref
991 # TODO: remove on DM-40067.
992 @deprecated(
993 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
994 " Please use Butler.get(). Will be removed after v26.0.",
995 version="v26.0",
996 category=FutureWarning,
997 )
998 def getDirect(
999 self,
1000 ref: DatasetRef,
1001 *,
1002 parameters: dict[str, Any] | None = None,
1003 storageClass: StorageClass | str | None = None,
1004 ) -> Any:
1005 """Retrieve a stored dataset.
1007 Parameters
1008 ----------
1009 ref : `DatasetRef`
1010 Resolved reference to an already stored dataset.
1011 parameters : `dict`
1012 Additional StorageClass-defined options to control reading,
1013 typically used to efficiently read only a subset of the dataset.
1014 storageClass : `StorageClass` or `str`, optional
1015 The storage class to be used to override the Python type
1016 returned by this method. By default the returned type matches
1017 the dataset type definition for this dataset. Specifying a
1018 read `StorageClass` can force a different type to be returned.
1019 This type must be compatible with the original type.
1021 Returns
1022 -------
1023 obj : `object`
1024 The dataset.
1025 """
1026 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
1028 # TODO: remove on DM-40067.
1029 @deprecated(
1030 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
1031 "Please use Butler.getDeferred(). Will be removed after v26.0.",
1032 version="v26.0",
1033 category=FutureWarning,
1034 )
1035 def getDirectDeferred(
1036 self,
1037 ref: DatasetRef,
1038 *,
1039 parameters: dict[str, Any] | None = None,
1040 storageClass: str | StorageClass | None = None,
1041 ) -> DeferredDatasetHandle:
1042 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1043 from a resolved `DatasetRef`.
1045 Parameters
1046 ----------
1047 ref : `DatasetRef`
1048 Resolved reference to an already stored dataset.
1049 parameters : `dict`
1050 Additional StorageClass-defined options to control reading,
1051 typically used to efficiently read only a subset of the dataset.
1052 storageClass : `StorageClass` or `str`, optional
1053 The storage class to be used to override the Python type
1054 returned by this method. By default the returned type matches
1055 the dataset type definition for this dataset. Specifying a
1056 read `StorageClass` can force a different type to be returned.
1057 This type must be compatible with the original type.
1059 Returns
1060 -------
1061 obj : `DeferredDatasetHandle`
1062 A handle which can be used to retrieve a dataset at a later time.
1064 Raises
1065 ------
1066 LookupError
1067 Raised if no matching dataset exists in the `Registry`.
1068 """
1069 # Check that dataset is known to the datastore.
1070 if not self._datastore.knows(ref):
1071 raise LookupError(f"Dataset reference {ref} is not known to datastore.")
1072 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1074 def getDeferred(
1075 self,
1076 datasetRefOrType: DatasetRef | DatasetType | str,
1077 /,
1078 dataId: DataId | None = None,
1079 *,
1080 parameters: dict | None = None,
1081 collections: Any = None,
1082 storageClass: str | StorageClass | None = None,
1083 **kwargs: Any,
1084 ) -> DeferredDatasetHandle:
1085 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1086 after an immediate registry lookup.
1088 Parameters
1089 ----------
1090 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1091 When `DatasetRef` the `dataId` should be `None`.
1092 Otherwise the `DatasetType` or name thereof.
1093 dataId : `dict` or `DataCoordinate`, optional
1094 A `dict` of `Dimension` link name, value pairs that label the
1095 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1096 should be provided as the first argument.
1097 parameters : `dict`
1098 Additional StorageClass-defined options to control reading,
1099 typically used to efficiently read only a subset of the dataset.
1100 collections : Any, optional
1101 Collections to be searched, overriding ``self.collections``.
1102 Can be any of the types supported by the ``collections`` argument
1103 to butler construction.
1104 storageClass : `StorageClass` or `str`, optional
1105 The storage class to be used to override the Python type
1106 returned by this method. By default the returned type matches
1107 the dataset type definition for this dataset. Specifying a
1108 read `StorageClass` can force a different type to be returned.
1109 This type must be compatible with the original type.
1110 **kwargs
1111 Additional keyword arguments used to augment or construct a
1112 `DataId`. See `DataId` parameters.
1114 Returns
1115 -------
1116 obj : `DeferredDatasetHandle`
1117 A handle which can be used to retrieve a dataset at a later time.
1119 Raises
1120 ------
1121 LookupError
1122 Raised if no matching dataset exists in the `Registry` or
1123 datastore.
1124 ValueError
1125 Raised if a resolved `DatasetRef` was passed as an input, but it
1126 differs from the one found in the registry.
1127 TypeError
1128 Raised if no collections were provided.
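Examples
--------
A minimal sketch; the dataset type, data ID, and collection below are
illustrative placeholders.

>>> handle = butler.getDeferred("calexp", instrument="HSC", detector=10,
...                             visit=903334, collections="HSC/runs/RC2")
>>> # No dataset I/O has happened yet; the read occurs on get().
>>> calexp = handle.get()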
1129 """
1130 if isinstance(datasetRefOrType, DatasetRef):
1131 # Do the quick check first and if that fails, check for artifact
1132 # existence. This is necessary for datastores that are configured
1133 # in trust mode where there won't be a record but there will be
1134 # a file.
1135 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
1136 ref = datasetRefOrType
1137 else:
1138 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1139 else:
1140 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1141 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1143 def get(
1144 self,
1145 datasetRefOrType: DatasetRef | DatasetType | str,
1146 /,
1147 dataId: DataId | None = None,
1148 *,
1149 parameters: dict[str, Any] | None = None,
1150 collections: Any = None,
1151 storageClass: StorageClass | str | None = None,
1152 **kwargs: Any,
1153 ) -> Any:
1154 """Retrieve a stored dataset.
1156 Parameters
1157 ----------
1158 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1159 When `DatasetRef` the `dataId` should be `None`.
1160 Otherwise the `DatasetType` or name thereof.
1161 If a resolved `DatasetRef`, the associated dataset
1162 is returned directly without additional querying.
1163 dataId : `dict` or `DataCoordinate`
1164 A `dict` of `Dimension` link name, value pairs that label the
1165 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1166 should be provided as the first argument.
1167 parameters : `dict`
1168 Additional StorageClass-defined options to control reading,
1169 typically used to efficiently read only a subset of the dataset.
1170 collections : Any, optional
1171 Collections to be searched, overriding ``self.collections``.
1172 Can be any of the types supported by the ``collections`` argument
1173 to butler construction.
1174 storageClass : `StorageClass` or `str`, optional
1175 The storage class to be used to override the Python type
1176 returned by this method. By default the returned type matches
1177 the dataset type definition for this dataset. Specifying a
1178 read `StorageClass` can force a different type to be returned.
1179 This type must be compatible with the original type.
1180 **kwargs
1181 Additional keyword arguments used to augment or construct a
1182 `DataCoordinate`. See `DataCoordinate.standardize`
1183 parameters.
1185 Returns
1186 -------
1187 obj : `object`
1188 The dataset.
1190 Raises
1191 ------
1192 LookupError
1193 Raised if no matching dataset exists in the `Registry`.
1194 TypeError
1195 Raised if no collections were provided.
1197 Notes
1198 -----
1199 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1200 this method requires that the given data ID include temporal dimensions
1201 beyond the dimensions of the dataset type itself, in order to find the
1202 dataset with the appropriate validity range. For example, a "bias"
1203 dataset with native dimensions ``{instrument, detector}`` could be
1204 fetched with a ``{instrument, detector, exposure}`` data ID, because
1205 ``exposure`` is a temporal dimension.
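An illustrative sketch of such a calibration lookup (collection name and
data ID values are placeholders), where ``exposure`` supplies the temporal
information used to select the matching validity range:

>>> bias = butler.get("bias", instrument="HSC", detector=10, exposure=903334,
...                   collections="HSC/calib")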
1206 """
1207 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1208 ref = self._findDatasetRef(
1209 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs
1210 )
1211 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
1213 def getURIs(
1214 self,
1215 datasetRefOrType: DatasetRef | DatasetType | str,
1216 /,
1217 dataId: DataId | None = None,
1218 *,
1219 predict: bool = False,
1220 collections: Any = None,
1221 run: str | None = None,
1222 **kwargs: Any,
1223 ) -> DatasetRefURIs:
1224 """Return the URIs associated with the dataset.
1226 Parameters
1227 ----------
1228 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1229 When `DatasetRef` the `dataId` should be `None`.
1230 Otherwise the `DatasetType` or name thereof.
1231 dataId : `dict` or `DataCoordinate`
1232 A `dict` of `Dimension` link name, value pairs that label the
1233 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1234 should be provided as the first argument.
1235 predict : `bool`
1236 If `True`, allow URIs to be returned of datasets that have not
1237 been written.
1238 collections : Any, optional
1239 Collections to be searched, overriding ``self.collections``.
1240 Can be any of the types supported by the ``collections`` argument
1241 to butler construction.
1242 run : `str`, optional
1243 Run to use for predictions, overriding ``self.run``.
1244 **kwargs
1245 Additional keyword arguments used to augment or construct a
1246 `DataCoordinate`. See `DataCoordinate.standardize`
1247 parameters.
1249 Returns
1250 -------
1251 uris : `DatasetRefURIs`
1252 The URI to the primary artifact associated with this dataset (if
1253 the dataset was disassembled within the datastore this may be
1254 `None`), and the URIs to any components associated with the dataset
1255 artifact (can be empty if there are no components).
1256 """
1257 ref = self._findDatasetRef(
1258 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1259 )
1260 return self._datastore.getURIs(ref, predict)
1262 def getURI(
1263 self,
1264 datasetRefOrType: DatasetRef | DatasetType | str,
1265 /,
1266 dataId: DataId | None = None,
1267 *,
1268 predict: bool = False,
1269 collections: Any = None,
1270 run: str | None = None,
1271 **kwargs: Any,
1272 ) -> ResourcePath:
1273 """Return the URI to the Dataset.
1275 Parameters
1276 ----------
1277 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1278 When `DatasetRef` the `dataId` should be `None`.
1279 Otherwise the `DatasetType` or name thereof.
1280 dataId : `dict` or `DataCoordinate`
1281 A `dict` of `Dimension` link name, value pairs that label the
1282 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1283 should be provided as the first argument.
1284 predict : `bool`
1285 If `True`, allow URIs to be returned of datasets that have not
1286 been written.
1287 collections : Any, optional
1288 Collections to be searched, overriding ``self.collections``.
1289 Can be any of the types supported by the ``collections`` argument
1290 to butler construction.
1291 run : `str`, optional
1292 Run to use for predictions, overriding ``self.run``.
1293 **kwargs
1294 Additional keyword arguments used to augment or construct a
1295 `DataCoordinate`. See `DataCoordinate.standardize`
1296 parameters.
1298 Returns
1299 -------
1300 uri : `lsst.resources.ResourcePath`
1301 URI pointing to the Dataset within the datastore. If the
1302 Dataset does not exist in the datastore, and if ``predict`` is
1303 `True`, the URI will be a prediction and will include a URI
1304 fragment "#predicted".
1305 If the datastore does not have entities that relate well
1306 to the concept of a URI the returned URI string will be
1307 descriptive. The returned URI is not guaranteed to be obtainable.
1309 Raises
1310 ------
1311 LookupError
1312 Raised if a URI has been requested for a dataset that does not exist and
1313 guessing is not allowed.
1314 ValueError
1315 Raised if a resolved `DatasetRef` was passed as an input, but it
1316 differs from the one found in the registry.
1317 TypeError
1318 Raised if no collections were provided.
1319 RuntimeError
1320 Raised if a URI is requested for a dataset that consists of
1321 multiple artifacts.
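Examples
--------
A minimal sketch; the dataset type, data ID, and collection below are
illustrative placeholders.

>>> uri = butler.getURI("calexp", instrument="HSC", detector=10, visit=903334,
...                     collections="HSC/runs/RC2")
>>> # With predict=True a URI is returned even for a dataset that has not
>>> # been written yet; the predicted URI carries a "#predicted" fragment.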
1322 """
1323 primary, components = self.getURIs(
1324 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1325 )
1327 if primary is None or components:
1328 raise RuntimeError(
1329 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1330 "Use Butler.getURIs() instead."
1331 )
1332 return primary
1334 def get_dataset_type(self, name: str) -> DatasetType:
1335 return self._registry.getDatasetType(name)
1337 def get_dataset(
1338 self,
1339 id: DatasetId,
1340 storage_class: str | StorageClass | None = None,
1341 dimension_records: bool = False,
1342 datastore_records: bool = False,
1343 ) -> DatasetRef | None:
1344 ref = self._registry.getDataset(id)
1345 if ref is not None:
1346 if dimension_records:
1347 ref = ref.expanded(
1348 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)
1349 )
1350 if storage_class:
1351 ref = ref.overrideStorageClass(storage_class)
1352 if datastore_records:
1353 ref = self._registry.get_datastore_records(ref)
1354 return ref
1356 def find_dataset(
1357 self,
1358 dataset_type: DatasetType | str,
1359 data_id: DataId | None = None,
1360 *,
1361 collections: str | Sequence[str] | None = None,
1362 timespan: Timespan | None = None,
1363 storage_class: str | StorageClass | None = None,
1364 dimension_records: bool = False,
1365 datastore_records: bool = False,
1366 **kwargs: Any,
1367 ) -> DatasetRef | None:
1368 # Handle any parts of the dataID that are not using primary dimension
1369 # keys.
1370 if isinstance(dataset_type, str):
1371 actual_type = self.get_dataset_type(dataset_type)
1372 else:
1373 actual_type = dataset_type
1374 data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs)
1376 ref = self._registry.findDataset(
1377 dataset_type,
1378 data_id,
1379 collections=collections,
1380 timespan=timespan,
1381 datastore_records=datastore_records,
1382 **kwargs,
1383 )
1384 if ref is not None and dimension_records:
1385 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions))
1386 if ref is not None and storage_class is not None:
1387 ref = ref.overrideStorageClass(storage_class)
1388 return ref
1390 def retrieveArtifacts(
1391 self,
1392 refs: Iterable[DatasetRef],
1393 destination: ResourcePathExpression,
1394 transfer: str = "auto",
1395 preserve_path: bool = True,
1396 overwrite: bool = False,
1397 ) -> list[ResourcePath]:
1398 # Docstring inherited.
1399 return self._datastore.retrieveArtifacts(
1400 refs,
1401 ResourcePath(destination),
1402 transfer=transfer,
1403 preserve_path=preserve_path,
1404 overwrite=overwrite,
1405 )
1407 def exists(
1408 self,
1409 dataset_ref_or_type: DatasetRef | DatasetType | str,
1410 /,
1411 data_id: DataId | None = None,
1412 *,
1413 full_check: bool = True,
1414 collections: Any = None,
1415 **kwargs: Any,
1416 ) -> DatasetExistence:
1417 # Docstring inherited.
1418 existence = DatasetExistence.UNRECOGNIZED
1420 if isinstance(dataset_ref_or_type, DatasetRef):
1421 if collections is not None:
1422 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1423 if data_id is not None:
1424 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1425 ref = dataset_ref_or_type
1426 registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
1427 if registry_ref is not None:
1428 existence |= DatasetExistence.RECORDED
1430 if dataset_ref_or_type != registry_ref:
1431 # This could mean that storage classes differ, so we should
1432 # check for that but use the registry ref for the rest of
1433 # the method.
1434 if registry_ref.is_compatible_with(dataset_ref_or_type):
1435 # Use the registry version from now on.
1436 ref = registry_ref
1437 else:
1438 raise ValueError(
1439 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1440 f"in registry but has different incompatible values ({registry_ref})."
1441 )
1442 else:
1443 try:
1444 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1445 except (LookupError, TypeError, NoDefaultCollectionError):
1446 return existence
1447 existence |= DatasetExistence.RECORDED
1449 if self._datastore.knows(ref):
1450 existence |= DatasetExistence.DATASTORE
1452 if full_check:
1453 if self._datastore.exists(ref):
1454 existence |= DatasetExistence._ARTIFACT
1455 elif existence.value != DatasetExistence.UNRECOGNIZED.value:
1456 # Do not add this flag if we have no other idea about a dataset.
1457 existence |= DatasetExistence(DatasetExistence._ASSUMED)
1459 return existence
1461 def _exists_many(
1462 self,
1463 refs: Iterable[DatasetRef],
1464 /,
1465 *,
1466 full_check: bool = True,
1467 ) -> dict[DatasetRef, DatasetExistence]:
1468 # Docstring inherited.
1469 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1471 # Registry does not have a bulk API to check for a ref.
1472 for ref in refs:
1473 registry_ref = self._registry.getDataset(ref.id)
1474 if registry_ref is not None:
1475 # It is possible, albeit unlikely, that the given ref does
1476 # not match the one in registry even though the UUID matches.
1477 # When checking a single ref we raise, but it's impolite to
1478 # do that when potentially hundreds of refs are being checked.
1479 # We could change the API to only accept UUIDs and that would
1480 # remove the ability to even check and remove the worry
1481 # about differing storage classes. Given the ongoing discussion
1482 # on refs vs UUIDs and whether to raise or have a new
1483 # private flag, treat this as a private API for now.
1484 existence[ref] |= DatasetExistence.RECORDED
1486 # Ask datastore if it knows about these refs.
1487 knows = self._datastore.knows_these(refs)
1488 for ref, known in knows.items():
1489 if known:
1490 existence[ref] |= DatasetExistence.DATASTORE
1492 if full_check:
1493 mexists = self._datastore.mexists(refs)
1494 for ref, exists in mexists.items():
1495 if exists:
1496 existence[ref] |= DatasetExistence._ARTIFACT
1497 else:
1498 # Do not set this flag if nothing is known about the dataset.
1499 for ref in existence:
1500 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1501 existence[ref] |= DatasetExistence._ASSUMED
1503 return existence
1505 # TODO: remove on DM-40079.
1506 @deprecated(
1507 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.",
1508 version="v26.0",
1509 category=FutureWarning,
1510 )
1511 def datasetExists(
1512 self,
1513 datasetRefOrType: DatasetRef | DatasetType | str,
1514 dataId: DataId | None = None,
1515 *,
1516 collections: Any = None,
1517 **kwargs: Any,
1518 ) -> bool:
1519 """Return True if the Dataset is actually present in the Datastore.
1521 Parameters
1522 ----------
1523 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1524 When `DatasetRef`, the `dataId` should be `None`.
1525 Otherwise the `DatasetType` or name thereof.
1526 dataId : `dict` or `DataCoordinate`
1527 A `dict` of `Dimension` link name, value pairs that label the
1528 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1529 should be provided as the first argument.
1530 collections : Any, optional
1531 Collections to be searched, overriding ``self.collections``.
1532 Can be any of the types supported by the ``collections`` argument
1533 to butler construction.
1534 **kwargs
1535 Additional keyword arguments used to augment or construct a
1536 `DataCoordinate`. See `DataCoordinate.standardize`
1537 parameters.
1539 Raises
1540 ------
1541 LookupError
1542 Raised if the dataset is not even present in the Registry.
1543 ValueError
1544 Raised if a resolved `DatasetRef` was passed as an input, but it
1545 differs from the one found in the registry.
1546 NoDefaultCollectionError
1547 Raised if no collections were provided.
1548 """
1549 # A resolved ref may be given that is not known to this butler.
1550 if isinstance(datasetRefOrType, DatasetRef):
1551 ref = self._registry.getDataset(datasetRefOrType.id)
1552 if ref is None:
1553 raise LookupError(
1554 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
1555 )
1556 else:
1557 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1558 return self._datastore.exists(ref)
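Since datasetExists() is deprecated in favor of exists(), a migration might look like this hedged sketch; the repository path, collection, and data ID are placeholders.

from lsst.daf.butler import Butler, DatasetExistence

butler = Butler("repo", collections=["HSC/defaults"])

# Old (deprecated, emits FutureWarning):
# in_datastore = butler.datasetExists("calexp", instrument="HSC", visit=903334, detector=22)

# New: exists() reports registry and datastore state in one call.
existence = butler.exists("calexp", instrument="HSC", visit=903334, detector=22)
in_datastore = bool(existence & DatasetExistence.DATASTORE)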
1560 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1561 # Docstring inherited.
1562 if not self.isWriteable():
1563 raise TypeError("Butler is read-only.")
1564 names = list(names)
1565 refs: list[DatasetRef] = []
1566 for name in names:
1567 collectionType = self._registry.getCollectionType(name)
1568 if collectionType is not CollectionType.RUN:
1569 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1570 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
1571 with self._datastore.transaction(), self._registry.transaction():
1572 if unstore:
1573 self._datastore.trash(refs)
1574 else:
1575 self._datastore.forget(refs)
1576 for name in names:
1577 self._registry.removeCollection(name)
1578 if unstore:
1579 # Point of no return for removing artifacts
1580 self._datastore.emptyTrash()
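A hedged usage sketch of removing an obsolete RUN collection together with its artifacts; the repository path and run name are placeholders.

from lsst.daf.butler import Butler

butler = Butler("repo", writeable=True)

# Removes the RUN collection and its registry entries, and (because
# unstore=True) trashes and then deletes the datastore artifacts.
butler.removeRuns(["u/someone/scratch-run"], unstore=True)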
1582 def pruneDatasets(
1583 self,
1584 refs: Iterable[DatasetRef],
1585 *,
1586 disassociate: bool = True,
1587 unstore: bool = False,
1588 tags: Iterable[str] = (),
1589 purge: bool = False,
1590 ) -> None:
1591 # docstring inherited from LimitedButler
1593 if not self.isWriteable():
1594 raise TypeError("Butler is read-only.")
1595 if purge:
1596 if not disassociate:
1597 raise TypeError("Cannot pass purge=True without disassociate=True.")
1598 if not unstore:
1599 raise TypeError("Cannot pass purge=True without unstore=True.")
1600 elif disassociate:
1601 tags = tuple(tags)
1602 if not tags:
1603 raise TypeError("No tags provided but disassociate=True.")
1604 for tag in tags:
1605 collectionType = self._registry.getCollectionType(tag)
1606 if collectionType is not CollectionType.TAGGED:
1607 raise TypeError(
1608 f"Cannot disassociate from collection '{tag}' "
1609 f"of non-TAGGED type {collectionType.name}."
1610 )
1611 # Transform possibly-single-pass iterable into something we can iterate
1612 # over multiple times.
1613 refs = list(refs)
1614 # Pruning a component of a DatasetRef makes no sense since registry
1615 # doesn't know about components and datastore might not store
1616 # components in a separate file.
1617 for ref in refs:
1618 if ref.datasetType.component():
1619 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")
1620 # We don't need an unreliable Datastore transaction for this, because
1621 # we've been extra careful to ensure that Datastore.trash only involves
1622 # mutating the Registry (it can _look_ at Datastore-specific things,
1623 # but shouldn't change them), and hence all operations here are
1624 # Registry operations.
1625 with self._datastore.transaction(), self._registry.transaction():
1626 if unstore:
1627 self._datastore.trash(refs)
1628 if purge:
1629 self._registry.removeDatasets(refs)
1630 elif disassociate:
1631 assert tags, "Guaranteed by earlier logic in this function."
1632 for tag in tags:
1633 self._registry.disassociate(tag, refs)
1634 # We've exited the Registry transaction, and apparently committed.
1635 # (if there was an exception, everything rolled back, and it's as if
1636 # nothing happened - and we never get here).
1637 # Datastore artifacts are not yet gone, but they're clearly marked
1638 # as trash, so if we fail to delete now because of (e.g.) filesystem
1639 # problems we can try again later, and if manual administrative
1640 # intervention is required, it's pretty clear what that should entail:
1641 # deleting everything on disk and in private Datastore tables that is
1642 # in the dataset_location_trash table.
1643 if unstore:
1644 # Point of no return for removing artifacts
1645 self._datastore.emptyTrash()
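Two hedged usage sketches of pruneDatasets: dropping a TAGGED association and fully purging. The repository path, collection names, and dataset type are placeholders; in practice you would normally do one or the other.

from lsst.daf.butler import Butler

butler = Butler("repo", writeable=True)
refs = list(butler.registry.queryDatasets("calexp", collections="u/someone/tagged-best"))

# Remove the TAGGED association only; registry entries and artifacts remain.
butler.pruneDatasets(refs, disassociate=True, tags=["u/someone/tagged-best"], unstore=False)

# Alternatively, remove everything: registry entries and datastore artifacts.
butler.pruneDatasets(refs, purge=True, unstore=True)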
1647 @transactional
1648 def ingest(
1649 self,
1650 *datasets: FileDataset,
1651 transfer: str | None = "auto",
1652 run: str | None = None,
1653 idGenerationMode: DatasetIdGenEnum | None = None,
1654 record_validation_info: bool = True,
1655 ) -> None:
1656 # Docstring inherited.
1657 if not self.isWriteable():
1658 raise TypeError("Butler is read-only.")
1660 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1661 if not datasets:
1662 return
1664 if idGenerationMode is not None:
1665 warnings.warn(
1666 "The idGenerationMode parameter is no longer used and is ignored. "
1667 " Will be removed after v26.0",
1668 FutureWarning,
1669 stacklevel=2,
1670 )
1672 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1674 # We need to reorganize all the inputs so that they are grouped
1675 # by dataset type and run. Multiple refs in a single FileDataset
1676 # are required to share the run and dataset type.
1677 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list)
1679 # Track DataIDs that are being ingested so we can spot issues early
1680 # with duplication. Retain previous FileDataset so we can report it.
1681 groupedDataIds: MutableMapping[
1682 tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
1683 ] = defaultdict(dict)
1685 used_run = False
1687 # And the nested loop that populates it:
1688 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1689 # Somewhere to store pre-existing refs if we have an
1690 # execution butler.
1691 existingRefs: list[DatasetRef] = []
1693 for ref in dataset.refs:
1694 assert ref.run is not None # For mypy
1695 group_key = (ref.datasetType, ref.run)
1697 if ref.dataId in groupedDataIds[group_key]:
1698 raise ConflictingDefinitionError(
1699 f"Ingest conflict. Dataset {dataset.path} has same"
1700 " DataId as other ingest dataset"
1701 f" {groupedDataIds[group_key][ref.dataId].path} "
1702 f" ({ref.dataId})"
1703 )
1705 groupedDataIds[group_key][ref.dataId] = dataset
1707 if existingRefs:
1708 if len(dataset.refs) != len(existingRefs):
1709 # Keeping track of partially pre-existing datasets is hard
1710 # and should generally never happen. For now don't allow
1711 # it.
1712 raise ConflictingDefinitionError(
1713 f"For dataset {dataset.path} some dataIds already exist"
1714 " in registry but others do not. This is not supported."
1715 )
1717 # Store expanded form in the original FileDataset.
1718 dataset.refs = existingRefs
1719 else:
1720 groupedData[group_key].append(dataset)
1722 if not used_run and run is not None:
1723 warnings.warn(
1724 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the "
1725 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.",
1726 category=FutureWarning,
1727 stacklevel=3, # Take into account the @transactional decorator.
1728 )
1730 # Now we can bulk-insert into Registry for each DatasetType.
1731 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1732 groupedData.items(), desc="Bulk-inserting datasets by type"
1733 ):
1734 refs_to_import = []
1735 for dataset in grouped_datasets:
1736 refs_to_import.extend(dataset.refs)
1738 n_refs = len(refs_to_import)
1739 _LOG.verbose(
1740 "Importing %d ref%s of dataset type %r into run %r",
1741 n_refs,
1742 "" if n_refs == 1 else "s",
1743 datasetType.name,
1744 this_run,
1745 )
1747 # Import the refs and expand the DataCoordinates since we can't
1748 # guarantee that they are expanded and Datastore will need
1749 # the records.
1750 imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
1751 assert set(imported_refs) == set(refs_to_import)
1753 # Replace all the refs in the FileDataset with expanded versions.
1754 # Pull them off in the order we put them on the list.
1755 for dataset in grouped_datasets:
1756 n_dataset_refs = len(dataset.refs)
1757 dataset.refs = imported_refs[:n_dataset_refs]
1758 del imported_refs[:n_dataset_refs]
1760 # Bulk-insert everything into Datastore.
1761 # We do not know if any of the registry entries already existed
1762 # (_importDatasets only complains if they exist but differ) so
1763 # we have to catch IntegrityError explicitly.
1764 try:
1765 self._datastore.ingest(
1766 *datasets, transfer=transfer, record_validation_info=record_validation_info
1767 )
1768 except IntegrityError as e:
1769 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
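A hedged sketch of ingesting an external file: build a resolved DatasetRef, wrap it in a FileDataset, then call ingest(). The repository path, run name, dataset type, file path, and dimension values are placeholders, and the FileDataset keyword arguments shown are assumptions.

from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, FileDataset

butler = Butler("repo", writeable=True)

dataset_type = butler.registry.getDatasetType("raw")  # must already be registered
data_id = DataCoordinate.standardize(
    {"instrument": "HSC", "exposure": 903334, "detector": 22},
    dimensions=dataset_type.dimensions,
)
ref = DatasetRef(dataset_type, data_id, run="u/someone/ingest-run")

butler.ingest(
    FileDataset(path="/data/staging/HSC-903334-22.fits", refs=[ref]),
    transfer="copy",  # copy the file into the datastore root
)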
1771 @contextlib.contextmanager
1772 def export(
1773 self,
1774 *,
1775 directory: str | None = None,
1776 filename: str | None = None,
1777 format: str | None = None,
1778 transfer: str | None = None,
1779 ) -> Iterator[RepoExportContext]:
1780 # Docstring inherited.
1781 if directory is None and transfer is not None:
1782 raise TypeError("Cannot transfer without providing a directory.")
1783 if transfer == "move":
1784 raise TypeError("Transfer may not be 'move': export is read-only")
1785 if format is None:
1786 if filename is None:
1787 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1788 else:
1789 _, format = os.path.splitext(filename)
1790 if not format:
1791 raise ValueError("Please specify a file extension to determine export format.")
1792 format = format[1:] # Strip leading "."
1793 elif filename is None:
1794 filename = f"export.{format}"
1795 if directory is not None:
1796 filename = os.path.join(directory, filename)
1797 formats = self._config["repo_transfer_formats"]
1798 if format not in formats:
1799 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
1800 BackendClass = get_class_of(formats[format, "export"])
1801 with open(filename, "w") as stream:
1802 backend = BackendClass(stream, universe=self.dimensions)
1803 try:
1804 helper = RepoExportContext(
1805 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
1806 )
1807 yield helper
1808 except BaseException:
1809 raise
1810 else:
1811 helper._finish()
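A hedged sketch of exporting a set of datasets to a directory; saveDatasets is shown as the commonly used RepoExportContext method, but treat the exact context-manager API as an assumption, and the repository path, collection, and output directory as placeholders.

from lsst.daf.butler import Butler

butler = Butler("repo")
refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/example")

with butler.export(directory="/tmp/export", filename="export.yaml", transfer="copy") as export:
    # Records both registry content and datastore file information
    # for the selected datasets.
    export.saveDatasets(refs)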
1813 def import_(
1814 self,
1815 *,
1816 directory: ResourcePathExpression | None = None,
1817 filename: ResourcePathExpression | TextIO | None = None,
1818 format: str | None = None,
1819 transfer: str | None = None,
1820 skip_dimensions: set | None = None,
1821 ) -> None:
1822 # Docstring inherited.
1823 if not self.isWriteable():
1824 raise TypeError("Butler is read-only.")
1825 if format is None:
1826 if filename is None:
1827 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1828 else:
1829 _, format = os.path.splitext(filename) # type: ignore
1830 elif filename is None:
1831 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
1832 if directory is not None:
1833 directory = ResourcePath(directory, forceDirectory=True)
1834 # mypy doesn't think this will work but it does in python >= 3.10.
1835 if isinstance(filename, ResourcePathExpression): # type: ignore
1836 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
1837 if not filename.isabs() and directory is not None:
1838 potential = directory.join(filename)
1839 exists_in_cwd = filename.exists()
1840 exists_in_dir = potential.exists()
1841 if exists_in_cwd and exists_in_dir:
1842 _LOG.warning(
1843 "A relative path for filename was specified (%s) which exists relative to cwd. "
1844 "Additionally, the file exists relative to the given search directory (%s). "
1845 "Using the export file in the given directory.",
1846 filename,
1847 potential,
1848 )
1849 # Given they specified an explicit directory and that
1850 # directory has the export file in it, assume that that
1851 # is what was meant despite the file in cwd.
1852 filename = potential
1853 elif exists_in_dir:
1854 filename = potential
1855 elif not exists_in_cwd and not exists_in_dir:
1856 # Raise early.
1857 raise FileNotFoundError(
1858 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
1859 )
1860 BackendClass: type[RepoImportBackend] = get_class_of(
1861 self._config["repo_transfer_formats"][format]["import"]
1862 )
1864 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
1865 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
1866 backend.register()
1867 with self.transaction():
1868 backend.load(
1869 self._datastore,
1870 directory=directory,
1871 transfer=transfer,
1872 skip_dimensions=skip_dimensions,
1873 )
1875 if isinstance(filename, ResourcePath):
1876 # We can not use open() here at the moment because of
1877 # DM-38589 since yaml does stream.read(8192) in a loop.
1878 stream = io.StringIO(filename.read().decode())
1879 doImport(stream)
1880 else:
1881 doImport(filename) # type: ignore
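And the matching import into another repository, as a hedged sketch with placeholder paths; the format is given explicitly to avoid relying on extension inference.

from lsst.daf.butler import Butler

target = Butler("other-repo", writeable=True)

# Reads /tmp/export/export.yaml (located via the directory argument) and
# copies the exported artifacts into the target datastore.
target.import_(directory="/tmp/export", filename="export.yaml", format="yaml", transfer="copy")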
1883 def transfer_dimension_records_from(
1884 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef]
1885 ) -> None:
1886 # Allowed dimensions in the target butler.
1887 elements = frozenset(
1888 element for element in self.dimensions.elements if element.hasTable() and element.viewOf is None
1889 )
1891 data_ids = {ref.dataId for ref in source_refs}
1893 dimension_records = self._extract_all_dimension_records_from_data_ids(
1894 source_butler, data_ids, elements
1895 )
1897 # Insert order is important.
1898 for element in self.dimensions.sorted(dimension_records.keys()):
1899 records = list(dimension_records[element].values())
1900 # Assume that if the record is already present we can use it
1901 # without having to check that the record metadata is
1902 # consistent.
1903 self._registry.insertDimensionData(element, *records, skip_existing=True)
1904 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records))
1906 def _extract_all_dimension_records_from_data_ids(
1907 self,
1908 source_butler: LimitedButler | Butler,
1909 data_ids: set[DataCoordinate],
1910 allowed_elements: frozenset[DimensionElement],
1911 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1912 primary_records = self._extract_dimension_records_from_data_ids(
1913 source_butler, data_ids, allowed_elements
1914 )
1916 can_query = isinstance(source_butler, Butler)
1918 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1919 for original_element, record_mapping in primary_records.items():
1920 # Get dimensions that depend on this dimension.
1921 populated_by = self.dimensions.get_elements_populated_by(
1922 self.dimensions[original_element.name] # type: ignore
1923 )
1925 for data_id in record_mapping.keys():
1926 for element in populated_by:
1927 if element not in allowed_elements:
1928 continue
1929 if element.name == original_element.name:
1930 continue
1932 if element.name in primary_records:
1933 # If this element has already been stored avoid
1934 # re-finding records since that may lead to additional
1935 # spurious records. e.g. visit is populated_by
1936 # visit_detector_region but querying
1937 # visit_detector_region by visit will return all the
1938 # detectors for this visit -- the visit dataId does not
1939 # constrain this.
1940 # To constrain the query the original dataIds would
1941 # have to be scanned.
1942 continue
1944 if not can_query:
1945 raise RuntimeError(
1946 f"Transferring populated_by records like {element.name} requires a full Butler."
1947 )
1949 records = source_butler.registry.queryDimensionRecords( # type: ignore
1950 element.name, **data_id.mapping # type: ignore
1951 )
1952 for record in records:
1953 additional_records[record.definition].setdefault(record.dataId, record)
1955 # The next step is to walk back through the additional records to
1956 # pick up any missing content (such as visit_definition needing to
1957 # know the exposure). We want to ensure we do not request records
1958 # we already have.
1959 missing_data_ids = set()
1960 for name, record_mapping in additional_records.items():
1961 for data_id in record_mapping.keys():
1962 if data_id not in primary_records[name]:
1963 missing_data_ids.add(data_id)
1965 # Fill out the new records. Assume that these new records do not
1966 # also need to carry over additional populated_by records.
1967 secondary_records = self._extract_dimension_records_from_data_ids(
1968 source_butler, missing_data_ids, allowed_elements
1969 )
1971 # Merge the extra sets of records in with the original.
1972 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()):
1973 primary_records[name].update(record_mapping)
1975 return primary_records
1977 def _extract_dimension_records_from_data_ids(
1978 self,
1979 source_butler: LimitedButler | Butler,
1980 data_ids: set[DataCoordinate],
1981 allowed_elements: frozenset[DimensionElement],
1982 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]:
1983 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1985 for data_id in data_ids:
1986 # Need an expanded record; if it is not expanded we need a full
1987 # butler with a registry (mocks with a registry are allowed too).
1988 if not data_id.hasRecords():
1989 if registry := getattr(source_butler, "registry", None):
1990 data_id = registry.expandDataId(data_id)
1991 else:
1992 raise TypeError("Input butler needs to be a full butler to expand DataId.")
1993 # If this butler doesn't know about a dimension in the source
1994 # butler, things will break later.
1995 for element_name in data_id.dimensions.elements:
1996 record = data_id.records[element_name]
1997 if record is not None and record.definition in allowed_elements:
1998 dimension_records[record.definition].setdefault(record.dataId, record)
2000 return dimension_records
2002 def transfer_from(
2003 self,
2004 source_butler: LimitedButler,
2005 source_refs: Iterable[DatasetRef],
2006 transfer: str = "auto",
2007 skip_missing: bool = True,
2008 register_dataset_types: bool = False,
2009 transfer_dimensions: bool = False,
2010 ) -> collections.abc.Collection[DatasetRef]:
2011 # Docstring inherited.
2012 if not self.isWriteable():
2013 raise TypeError("Butler is read-only.")
2014 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
2016 # Will iterate through the refs multiple times so need to convert
2017 # to a list if this isn't a collection.
2018 if not isinstance(source_refs, collections.abc.Collection):
2019 source_refs = list(source_refs)
2021 original_count = len(source_refs)
2022 _LOG.info("Transferring %d datasets into %s", original_count, str(self))
2024 # In some situations the datastore artifact may be missing
2025 # and we do not want that registry entry to be imported.
2026 # Asking the datastore is not sufficient because the records may
2027 # have been purged; we have to ask for the (predicted) URI and
2028 # check existence explicitly. An execution butler is set up
2029 # exactly like this, with no datastore records.
2030 artifact_existence: dict[ResourcePath, bool] = {}
2031 if skip_missing:
2032 dataset_existence = source_butler._datastore.mexists(
2033 source_refs, artifact_existence=artifact_existence
2034 )
2035 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
2036 filtered_count = len(source_refs)
2037 n_missing = original_count - filtered_count
2038 _LOG.verbose(
2039 "%d dataset%s removed because the artifact does not exist. Now have %d.",
2040 n_missing,
2041 "" if n_missing == 1 else "s",
2042 filtered_count,
2043 )
2045 # Importing requires that we group the refs by dataset type and run
2046 # before doing the import.
2047 source_dataset_types = set()
2048 grouped_refs = defaultdict(list)
2049 for ref in source_refs:
2050 grouped_refs[ref.datasetType, ref.run].append(ref)
2051 source_dataset_types.add(ref.datasetType)
2053 # Check to see if the dataset type in the source butler has
2054 # the same definition in the target butler and register missing
2055 # ones if requested. Registration must happen outside a transaction.
2056 newly_registered_dataset_types = set()
2057 for datasetType in source_dataset_types:
2058 if register_dataset_types:
2059 # Let this raise immediately if inconsistent. Continuing
2060 # on to find additional inconsistent dataset types
2061 # might result in additional unwanted dataset types being
2062 # registered.
2063 if self._registry.registerDatasetType(datasetType):
2064 newly_registered_dataset_types.add(datasetType)
2065 else:
2066 # If the dataset type is missing, let it fail immediately.
2067 target_dataset_type = self.get_dataset_type(datasetType.name)
2068 if target_dataset_type != datasetType:
2069 raise ConflictingDefinitionError(
2070 "Source butler dataset type differs from definition"
2071 f" in target butler: {datasetType} !="
2072 f" {target_dataset_type}"
2073 )
2074 if newly_registered_dataset_types:
2075 # We may have registered some even if there were inconsistencies
2076 # but should let people know (or else remove them again).
2077 _LOG.verbose(
2078 "Registered the following dataset types in the target Butler: %s",
2079 ", ".join(d.name for d in newly_registered_dataset_types),
2080 )
2081 else:
2082 _LOG.verbose("All required dataset types are known to the target Butler")
2084 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
2085 if transfer_dimensions:
2086 # Collect all the dimension records for these refs.
2087 # All dimensions are to be copied but the list of valid dimensions
2088 # comes from this butler's universe.
2089 elements = frozenset(
2090 element
2091 for element in self.dimensions.elements
2092 if element.hasTable() and element.viewOf is None
2093 )
2094 dataIds = {ref.dataId for ref in source_refs}
2095 dimension_records = self._extract_all_dimension_records_from_data_ids(
2096 source_butler, dataIds, elements
2097 )
2099 handled_collections: set[str] = set()
2101 # Do all the importing in a single transaction.
2102 with self.transaction():
2103 if dimension_records:
2104 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
2105 # Order matters.
2106 for element in self.dimensions.sorted(dimension_records.keys()):
2107 records = list(dimension_records[element].values())
2108 # Assume that if the record is already present we can use it
2109 # without having to check that the record metadata is
2110 # consistent.
2111 self._registry.insertDimensionData(element, *records, skip_existing=True)
2113 n_imported = 0
2114 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2115 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2116 ):
2117 if run not in handled_collections:
2118 # May need to create output collection. If source butler
2119 # has a registry, ask for documentation string.
2120 run_doc = None
2121 if registry := getattr(source_butler, "registry", None):
2122 run_doc = registry.getCollectionDocumentation(run)
2123 registered = self._registry.registerRun(run, doc=run_doc)
2124 handled_collections.add(run)
2125 if registered:
2126 _LOG.verbose("Creating output run %s", run)
2128 n_refs = len(refs_to_import)
2129 _LOG.verbose(
2130 "Importing %d ref%s of dataset type %s into run %s",
2131 n_refs,
2132 "" if n_refs == 1 else "s",
2133 datasetType.name,
2134 run,
2135 )
2137 # Assume we are using UUIDs and the source refs will match
2138 # those imported.
2139 imported_refs = self._registry._importDatasets(refs_to_import)
2140 assert set(imported_refs) == set(refs_to_import)
2141 n_imported += len(imported_refs)
2143 assert len(source_refs) == n_imported
2144 _LOG.verbose("Imported %d datasets into destination butler", n_imported)
2146 # Ask the datastore to transfer. The datastore has to check that
2147 # the source datastore is compatible with the target datastore.
2148 accepted, rejected = self._datastore.transfer_from(
2149 source_butler._datastore,
2150 source_refs,
2151 transfer=transfer,
2152 artifact_existence=artifact_existence,
2153 )
2154 if rejected:
2155 # For now, accept the registry entries but not the files.
2156 _LOG.warning(
2157 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
2158 len(rejected),
2159 len(accepted),
2160 datasetType,
2161 run,
2162 )
2164 return source_refs
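A hedged sketch of a direct repository-to-repository transfer that skips the intermediate export file; the repository paths, dataset type, and collection are placeholders.

from lsst.daf.butler import Butler

source = Butler("source-repo")
target = Butler("target-repo", writeable=True)

refs = source.registry.queryDatasets("calexp", collections="HSC/runs/example")
transferred = target.transfer_from(
    source,
    refs,
    transfer="copy",
    register_dataset_types=True,  # create missing dataset types in the target
    transfer_dimensions=True,     # copy the dimension records the refs need
)
print(f"Transferred {len(transferred)} datasets")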
2166 def validateConfiguration(
2167 self,
2168 logFailures: bool = False,
2169 datasetTypeNames: Iterable[str] | None = None,
2170 ignore: Iterable[str] | None = None,
2171 ) -> None:
2172 # Docstring inherited.
2173 if datasetTypeNames:
2174 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
2175 else:
2176 datasetTypes = list(self._registry.queryDatasetTypes())
2178 # filter out anything from the ignore list
2179 if ignore:
2180 ignore = set(ignore)
2181 datasetTypes = [
2182 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2183 ]
2184 else:
2185 ignore = set()
2187 # For each datasetType that has an instrument dimension, create
2188 # a DatasetRef for each defined instrument
2189 datasetRefs = []
2191 # Find all the registered instruments (if "instrument" is in the
2192 # universe).
2193 if "instrument" in self.dimensions:
2194 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
2196 for datasetType in datasetTypes:
2197 if "instrument" in datasetType.dimensions:
2198 # In order to create a conforming dataset ref, create
2199 # fake DataCoordinate values for the non-instrument
2200 # dimensions. The type of the value does not matter here.
2201 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"}
2203 for instrument in instruments:
2204 datasetRef = DatasetRef(
2205 datasetType,
2206 DataCoordinate.standardize(
2207 dataId, instrument=instrument, dimensions=datasetType.dimensions
2208 ),
2209 run="validate",
2210 )
2211 datasetRefs.append(datasetRef)
2213 entities: list[DatasetType | DatasetRef] = []
2214 entities.extend(datasetTypes)
2215 entities.extend(datasetRefs)
2217 datastoreErrorStr = None
2218 try:
2219 self._datastore.validateConfiguration(entities, logFailures=logFailures)
2220 except ValidationError as e:
2221 datastoreErrorStr = str(e)
2223 # Also check that the LookupKeys used by the datastores match
2224 # registry and storage class definitions
2225 keys = self._datastore.getLookupKeys()
2227 failedNames = set()
2228 failedDataId = set()
2229 for key in keys:
2230 if key.name is not None:
2231 if key.name in ignore:
2232 continue
2234 # skip if specific datasetType names were requested and this
2235 # name does not match
2236 if datasetTypeNames and key.name not in datasetTypeNames:
2237 continue
2239 # See if it is a StorageClass or a DatasetType
2240 if key.name in self.storageClasses:
2241 pass
2242 else:
2243 try:
2244 self.get_dataset_type(key.name)
2245 except KeyError:
2246 if logFailures:
2247 _LOG.critical(
2248 "Key '%s' does not correspond to a DatasetType or StorageClass", key
2249 )
2250 failedNames.add(key)
2251 else:
2252 # Dimensions are checked for consistency when the Butler
2253 # is created and rendezvoused with a universe.
2254 pass
2256 # Check that the instrument is a valid instrument.
2257 # Currently only instrument overrides are supported, so check for that.
2258 if key.dataId:
2259 dataIdKeys = set(key.dataId)
2260 if {"instrument"} != dataIdKeys:
2261 if logFailures:
2262 _LOG.critical("Key '%s' has unsupported DataId override", key)
2263 failedDataId.add(key)
2264 elif key.dataId["instrument"] not in instruments:
2265 if logFailures:
2266 _LOG.critical("Key '%s' has unknown instrument", key)
2267 failedDataId.add(key)
2269 messages = []
2271 if datastoreErrorStr:
2272 messages.append(datastoreErrorStr)
2274 for failed, msg in (
2275 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2276 (failedDataId, "Keys with bad DataId entries: "),
2277 ):
2278 if failed:
2279 msg += ", ".join(str(k) for k in failed)
2280 messages.append(msg)
2282 if messages:
2283 raise ValidationError(";\n".join(messages))
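A hedged sketch of running the configuration check and reporting any problems; the repository path and ignore list are placeholders, and it is assumed that ValidationError is importable from the package top level.

from lsst.daf.butler import Butler, ValidationError

butler = Butler("repo")
try:
    butler.validateConfiguration(logFailures=True, ignore=["raw"])
except ValidationError as err:
    print(f"Repository configuration problems:\n{err}")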
2285 @property
2286 def collections(self) -> Sequence[str]:
2287 """The collections to search by default, in order
2288 (`~collections.abc.Sequence` [ `str` ]).
2290 This is an alias for ``self.registry.defaults.collections``. It cannot
2291 be set directly in isolation, but all defaults may be changed together
2292 by assigning a new `RegistryDefaults` instance to
2293 ``self.registry.defaults``.
2294 """
2295 return self._registry.defaults.collections
2297 @property
2298 def run(self) -> str | None:
2299 """Name of the run this butler writes outputs to by default (`str` or
2300 `None`).
2302 This is an alias for ``self.registry.defaults.run``. It cannot be set
2303 directly in isolation, but all defaults may be changed together by
2304 assigning a new `RegistryDefaults` instance to
2305 ``self.registry.defaults``.
2306 """
2307 return self._registry.defaults.run
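As the docstrings above note, collections and run cannot be assigned individually; a hedged sketch of changing all registry defaults together, with placeholder repository path and collection names.

from lsst.daf.butler import Butler
from lsst.daf.butler.registry import RegistryDefaults

butler = Butler("repo", writeable=True)

# Replace all registry defaults at once; collections and run change together.
butler.registry.defaults = RegistryDefaults(
    collections=["HSC/defaults"], run="u/someone/output-run"
)
print(butler.collections, butler.run)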
2309 @property
2310 def registry(self) -> Registry:
2311 """The object that manages dataset metadata and relationships
2312 (`Registry`).
2314 Many operations that don't involve reading or writing butler datasets
2315 are accessible only via `Registry` methods. Eventually these methods
2316 will be replaced by equivalent `Butler` methods.
2317 """
2318 return self._registry_shim
2320 @property
2321 def dimensions(self) -> DimensionUniverse:
2322 # Docstring inherited.
2323 return self._registry.dimensions
2325 @contextlib.contextmanager
2326 def _query(self) -> Iterator[Query]:
2327 # Docstring inherited.
2328 yield DirectQuery(self._registry)
2330 def _query_data_ids(
2331 self,
2332 dimensions: DimensionGroup | Iterable[str] | str,
2333 *,
2334 data_id: DataId | None = None,
2335 where: str = "",
2336 bind: Mapping[str, Any] | None = None,
2337 expanded: bool = False,
2338 order_by: Iterable[str] | str | None = None,
2339 limit: int | None = None,
2340 offset: int | None = None,
2341 explain: bool = True,
2342 **kwargs: Any,
2343 ) -> list[DataCoordinate]:
2344 # Docstring inherited.
2345 query = DirectQuery(self._registry)
2346 result = query.data_ids(dimensions, data_id=data_id, where=where, bind=bind, **kwargs)
2347 if expanded:
2348 result = result.expanded()
2349 if order_by:
2350 result = result.order_by(*ensure_iterable(order_by))
2351 if limit is not None:
2352 result = result.limit(limit, offset)
2353 else:
2354 if offset is not None:
2355 raise TypeError("offset is specified without limit")
2356 data_ids = list(result)
2357 if explain and not data_ids:
2358 raise EmptyQueryResultError(list(result.explain_no_results()))
2359 return data_ids
2361 def _query_datasets(
2362 self,
2363 dataset_type: Any,
2364 collections: CollectionArgType | None = None,
2365 *,
2366 find_first: bool = True,
2367 data_id: DataId | None = None,
2368 where: str = "",
2369 bind: Mapping[str, Any] | None = None,
2370 expanded: bool = False,
2371 explain: bool = True,
2372 **kwargs: Any,
2373 ) -> list[DatasetRef]:
2374 # Docstring inherited.
2375 query = DirectQuery(self._registry)
2376 result = query.datasets(
2377 dataset_type,
2378 collections,
2379 find_first=find_first,
2380 data_id=data_id,
2381 where=where,
2382 bind=bind,
2383 **kwargs,
2384 )
2385 if expanded:
2386 result = result.expanded()
2387 refs = list(result)
2388 if explain and not refs:
2389 raise EmptyQueryResultError(list(result.explain_no_results()))
2390 return refs
2392 def _query_dimension_records(
2393 self,
2394 element: str,
2395 *,
2396 data_id: DataId | None = None,
2397 where: str = "",
2398 bind: Mapping[str, Any] | None = None,
2399 order_by: Iterable[str] | str | None = None,
2400 limit: int | None = None,
2401 offset: int | None = None,
2402 explain: bool = True,
2403 **kwargs: Any,
2404 ) -> list[DimensionRecord]:
2405 # Docstring inherited.
2406 query = DirectQuery(self._registry)
2407 result = query.dimension_records(element, data_id=data_id, where=where, bind=bind, **kwargs)
2408 if order_by:
2409 result = result.order_by(*ensure_iterable(order_by))
2410 if limit is not None:
2411 result = result.limit(limit, offset)
2412 else:
2413 if offset is not None:
2414 raise TypeError("offset is specified without limit")
2415 data_ids = list(result)
2416 if explain and not data_ids:
2417 raise EmptyQueryResultError(list(result.explain_no_results()))
2418 return data_ids
2420 _registry: SqlRegistry
2421 """The object that manages dataset metadata and relationships
2422 (`SqlRegistry`).
2424 Most operations that don't involve reading or writing butler datasets are
2425 accessible only via `SqlRegistry` methods.
2426 """
2428 datastore: Datastore
2429 """The object that manages actual dataset storage (`Datastore`).
2431 Direct user access to the datastore should rarely be necessary; the primary
2432 exception is the case where a `Datastore` implementation provides extra
2433 functionality beyond what the base class defines.
2434 """
2436 storageClasses: StorageClassFactory
2437 """An object that maps known storage class names to objects that fully
2438 describe them (`StorageClassFactory`).
2439 """