Coverage for python/lsst/daf/butler/direct_butler.py: 11%
715 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-01 11:00 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-01 11:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Butler top level classes.
29"""
30from __future__ import annotations
32__all__ = (
33 "DirectButler",
34 "ButlerValidationError",
35)
37import collections.abc
38import contextlib
39import io
40import logging
41import numbers
42import os
43import warnings
44from collections import Counter, defaultdict
45from collections.abc import Iterable, Iterator, MutableMapping, Sequence
46from typing import TYPE_CHECKING, Any, ClassVar, TextIO
48from deprecated.sphinx import deprecated
49from lsst.resources import ResourcePath, ResourcePathExpression
50from lsst.utils.introspection import get_class_of
51from lsst.utils.logging import VERBOSE, getLogger
52from sqlalchemy.exc import IntegrityError
54from ._butler import Butler
55from ._butler_config import ButlerConfig
56from ._config import Config
57from ._dataset_existence import DatasetExistence
58from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
59from ._dataset_type import DatasetType
60from ._deferredDatasetHandle import DeferredDatasetHandle
61from ._exceptions import ValidationError
62from ._file_dataset import FileDataset
63from ._limited_butler import LimitedButler
64from ._registry_shim import RegistryShim
65from ._storage_class import StorageClass, StorageClassFactory
66from ._timespan import Timespan
67from .datastore import DatasetRefURIs, Datastore, NullDatastore
68from .dimensions import (
69 DataCoordinate,
70 DataId,
71 DataIdValue,
72 Dimension,
73 DimensionElement,
74 DimensionRecord,
75 DimensionUniverse,
76)
77from .progress import Progress
78from .registry import (
79 CollectionType,
80 ConflictingDefinitionError,
81 DataIdError,
82 MissingDatasetTypeError,
83 NoDefaultCollectionError,
84 Registry,
85 RegistryDefaults,
86 _RegistryFactory,
87)
88from .registry.sql_registry import SqlRegistry
89from .transfers import RepoExportContext
90from .utils import transactional
92if TYPE_CHECKING:
93 from lsst.resources import ResourceHandleProtocol
95 from .transfers import RepoImportBackend
97_LOG = getLogger(__name__)
100class ButlerValidationError(ValidationError):
101 """There is a problem with the Butler configuration."""
103 pass
106class DirectButler(Butler):
107 """Main entry point for the data access system.
109 Parameters
110 ----------
111 config : `ButlerConfig`, `Config` or `str`, optional.
112 Configuration. Anything acceptable to the
113 `ButlerConfig` constructor. If a directory path
114 is given the configuration will be read from a ``butler.yaml`` file in
115 that location. If `None` is given default values will be used.
116 butler : `DirectButler`, optional.
117 If provided, construct a new Butler that uses the same registry and
118 datastore as the given one, but with the given collection and run.
119 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
120 arguments.
121 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
122 An expression specifying the collections to be searched (in order) when
123 reading datasets.
124 This may be a `str` collection name or an iterable thereof.
125 See :ref:`daf_butler_collection_expressions` for more information.
126 These collections are not registered automatically and must be
127 manually registered before they are used by any method, but they may be
128 manually registered after the `Butler` is initialized.
129 run : `str`, optional
130 Name of the `~CollectionType.RUN` collection new datasets should be
131 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
132 ``collections`` will be set to ``[run]``. If not `None`, this
133 collection will automatically be registered. If this is not set (and
134 ``writeable`` is not set either), a read-only butler will be created.
135 searchPaths : `list` of `str`, optional
136 Directory paths to search when calculating the full Butler
137 configuration. Not used if the supplied config is already a
138 `ButlerConfig`.
139 writeable : `bool`, optional
140 Explicitly sets whether the butler supports write operations. If not
141 provided, a read-write butler is created if any of ``run``, ``tags``,
142 or ``chains`` is non-empty.
143 inferDefaults : `bool`, optional
144 If `True` (default) infer default data ID values from the values
145 present in the datasets in ``collections``: if all collections have the
146 same value (or no value) for a governor dimension, that value will be
147 the default for that dimension. Nonexistent collections are ignored.
148 If a default value is provided explicitly for a governor dimension via
149 ``**kwargs``, no default will be inferred for that dimension.
150 without_datastore : `bool`, optional
151 If `True` do not attach a datastore to this butler. Any attempts
152 to use a datastore will fail.
153 **kwargs : `str`
154 Default data ID key-value pairs. These may only identify "governor"
155 dimensions like ``instrument`` and ``skymap``.
156 """
158 def __init__(
159 self,
160 config: Config | ResourcePathExpression | None = None,
161 *,
162 butler: DirectButler | None = None,
163 collections: Any = None,
164 run: str | None = None,
165 searchPaths: Sequence[ResourcePathExpression] | None = None,
166 writeable: bool | None = None,
167 inferDefaults: bool = True,
168 without_datastore: bool = False,
169 **kwargs: str,
170 ):
171 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
172 # Load registry, datastore, etc. from config or existing butler.
173 if butler is not None:
174 if config is not None or searchPaths is not None or writeable is not None:
175 raise TypeError(
176 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
177 )
178 self._registry = butler._registry.copy(defaults)
179 self._datastore = butler._datastore
180 self.storageClasses = butler.storageClasses
181 self._config: ButlerConfig = butler._config
182 else:
183 self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore)
184 try:
185 butlerRoot = self._config.get("root", self._config.configDir)
186 if writeable is None:
187 writeable = run is not None
188 self._registry = _RegistryFactory(self._config).from_config(
189 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
190 )
191 if without_datastore:
192 self._datastore = NullDatastore(None, None)
193 else:
194 self._datastore = Datastore.fromConfig(
195 self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
196 )
197 # TODO: Once datastore drops dependency on registry we can
198 # construct datastore first and pass opaque tables to registry
199 # constructor.
200 self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions())
201 self.storageClasses = StorageClassFactory()
202 self.storageClasses.addFromConfig(self._config)
203 except Exception:
204 # Failures here usually mean that configuration is incomplete,
205 # just issue an error message which includes config file URI.
206 _LOG.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
207 raise
209 # For execution butler the datastore needs a special
210 # dependency-inversion trick. This is not used by regular butler,
211 # but we do not have a way to distinguish regular butler from execution
212 # butler.
213 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
215 if "run" in self._config or "collection" in self._config:
216 raise ValueError("Passing a run or collection via configuration is no longer supported.")
218 self._registry_shim = RegistryShim(self)
220 GENERATION: ClassVar[int] = 3
221 """This is a Generation 3 Butler.
223 This attribute may be removed in the future, once the Generation 2 Butler
224 interface has been fully retired; it should only be used in transitional
225 code.
226 """
228 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
229 """Return DatasetType defined in registry given dataset type name."""
230 try:
231 return self.get_dataset_type(name)
232 except MissingDatasetTypeError:
233 return None
235 @classmethod
236 def _unpickle(
237 cls,
238 config: ButlerConfig,
239 collections: tuple[str, ...] | None,
240 run: str | None,
241 defaultDataId: dict[str, str],
242 writeable: bool,
243 ) -> DirectButler:
244 """Callable used to unpickle a Butler.
246 We prefer not to use ``Butler.__init__`` directly so we can force some
247 of its many arguments to be keyword-only (note that ``__reduce__``
248 can only invoke callables with positional arguments).
250 Parameters
251 ----------
252 config : `ButlerConfig`
253 Butler configuration, already coerced into a true `ButlerConfig`
254 instance (and hence after any search paths for overrides have been
255 utilized).
256 collections : `tuple` [ `str` ]
257 Names of the default collections to read from.
258 run : `str`, optional
259 Name of the default `~CollectionType.RUN` collection to write to.
260 defaultDataId : `dict` [ `str`, `str` ]
261 Default data ID values.
262 writeable : `bool`
263 Whether the Butler should support write operations.
265 Returns
266 -------
267 butler : `Butler`
268 A new `Butler` instance.
269 """
270 # MyPy doesn't recognize that the kwargs below are totally valid; it
271 # seems to think '**defaultDataId* is a _positional_ argument!
272 return cls(
273 config=config,
274 collections=collections,
275 run=run,
276 writeable=writeable,
277 **defaultDataId, # type: ignore
278 )
280 def __reduce__(self) -> tuple:
281 """Support pickling."""
282 return (
283 DirectButler._unpickle,
284 (
285 self._config,
286 self.collections,
287 self.run,
288 dict(self._registry.defaults.dataId.required),
289 self._registry.isWriteable(),
290 ),
291 )
293 def __str__(self) -> str:
294 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
295 self.collections, self.run, self._datastore, self._registry
296 )
298 def isWriteable(self) -> bool:
299 # Docstring inherited.
300 return self._registry.isWriteable()
302 def _caching_context(self) -> contextlib.AbstractContextManager[None]:
303 """Context manager that enables caching."""
304 return self._registry.caching_context()
306 @contextlib.contextmanager
307 def transaction(self) -> Iterator[None]:
308 """Context manager supporting `Butler` transactions.
310 Transactions can be nested.
311 """
312 with self._registry.transaction(), self._datastore.transaction():
313 yield
315 def _standardizeArgs(
316 self,
317 datasetRefOrType: DatasetRef | DatasetType | str,
318 dataId: DataId | None = None,
319 for_put: bool = True,
320 **kwargs: Any,
321 ) -> tuple[DatasetType, DataId | None]:
322 """Standardize the arguments passed to several Butler APIs.
324 Parameters
325 ----------
326 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
327 When `DatasetRef` the `dataId` should be `None`.
328 Otherwise the `DatasetType` or name thereof.
329 dataId : `dict` or `DataCoordinate`
330 A `dict` of `Dimension` link name, value pairs that label the
331 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
332 should be provided as the second argument.
333 for_put : `bool`, optional
334 If `True` this call is invoked as part of a `Butler.put()`.
335 Otherwise it is assumed to be part of a `Butler.get()`. This
336 parameter is only relevant if there is dataset type
337 inconsistency.
338 **kwargs
339 Additional keyword arguments used to augment or construct a
340 `DataCoordinate`. See `DataCoordinate.standardize`
341 parameters.
343 Returns
344 -------
345 datasetType : `DatasetType`
346 A `DatasetType` instance extracted from ``datasetRefOrType``.
347 dataId : `dict` or `DataId`, optional
348 Argument that can be used (along with ``kwargs``) to construct a
349 `DataId`.
351 Notes
352 -----
353 Butler APIs that conceptually need a DatasetRef also allow passing a
354 `DatasetType` (or the name of one) and a `DataId` (or a dict and
355 keyword arguments that can be used to construct one) separately. This
356 method accepts those arguments and always returns a true `DatasetType`
357 and a `DataId` or `dict`.
359 Standardization of `dict` vs `DataId` is best handled by passing the
360 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
361 generally similarly flexible.
362 """
363 externalDatasetType: DatasetType | None = None
364 internalDatasetType: DatasetType | None = None
365 if isinstance(datasetRefOrType, DatasetRef):
366 if dataId is not None or kwargs:
367 raise ValueError("DatasetRef given, cannot use dataId as well")
368 externalDatasetType = datasetRefOrType.datasetType
369 dataId = datasetRefOrType.dataId
370 else:
371 # Don't check whether DataId is provided, because Registry APIs
372 # can usually construct a better error message when it wasn't.
373 if isinstance(datasetRefOrType, DatasetType):
374 externalDatasetType = datasetRefOrType
375 else:
376 internalDatasetType = self.get_dataset_type(datasetRefOrType)
378 # Check that they are self-consistent
379 if externalDatasetType is not None:
380 internalDatasetType = self.get_dataset_type(externalDatasetType.name)
381 if externalDatasetType != internalDatasetType:
382 # We can allow differences if they are compatible, depending
383 # on whether this is a get or a put. A get requires that
384 # the python type associated with the datastore can be
385 # converted to the user type. A put requires that the user
386 # supplied python type can be converted to the internal
387 # type expected by registry.
388 relevantDatasetType = internalDatasetType
389 if for_put:
390 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
391 else:
392 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
393 relevantDatasetType = externalDatasetType
394 if not is_compatible:
395 raise ValueError(
396 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
397 f"registry definition ({internalDatasetType})"
398 )
399 # Override the internal definition.
400 internalDatasetType = relevantDatasetType
402 assert internalDatasetType is not None
403 return internalDatasetType, dataId
405 def _rewrite_data_id(
406 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
407 ) -> tuple[DataId | None, dict[str, Any]]:
408 """Rewrite a data ID taking into account dimension records.
410 Take a Data ID and keyword args and rewrite it if necessary to
411 allow the user to specify dimension records rather than dimension
412 primary values.
414 This allows a user to include a dataId dict with keys of
415 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
416 the integer exposure ID. It also allows a string to be given
417 for a dimension value rather than the integer ID if that is more
418 convenient. For example, rather than having to specifying the
419 detector with ``detector.full_name``, a string given for ``detector``
420 will be interpreted as the full name and converted to the integer
421 value.
423 Keyword arguments can also use strings for dimensions like detector
424 and exposure but python does not allow them to include ``.`` and
425 so the ``exposure.day_obs`` syntax can not be used in a keyword
426 argument.
428 Parameters
429 ----------
430 dataId : `dict` or `DataCoordinate`
431 A `dict` of `Dimension` link name, value pairs that will label the
432 `DatasetRef` within a Collection.
433 datasetType : `DatasetType`
434 The dataset type associated with this dataId. Required to
435 determine the relevant dimensions.
436 **kwargs
437 Additional keyword arguments used to augment or construct a
438 `DataId`. See `DataId` parameters.
440 Returns
441 -------
442 dataId : `dict` or `DataCoordinate`
443 The, possibly rewritten, dataId. If given a `DataCoordinate` and
444 no keyword arguments, the original dataId will be returned
445 unchanged.
446 **kwargs : `dict`
447 Any unused keyword arguments (would normally be empty dict).
448 """
449 # Do nothing if we have a standalone DataCoordinate.
450 if isinstance(dataId, DataCoordinate) and not kwargs:
451 return dataId, kwargs
453 # Process dimension records that are using record information
454 # rather than ids
455 newDataId: dict[str, DataIdValue] = {}
456 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
458 # if all the dataId comes from keyword parameters we do not need
459 # to do anything here because they can't be of the form
460 # exposure.obs_id because a "." is not allowed in a keyword parameter.
461 if dataId:
462 for k, v in dataId.items():
463 # If we have a Dimension we do not need to do anything
464 # because it cannot be a compound key.
465 if isinstance(k, str) and "." in k:
466 # Someone is using a more human-readable dataId
467 dimensionName, record = k.split(".", 1)
468 byRecord[dimensionName][record] = v
469 elif isinstance(k, Dimension):
470 newDataId[k.name] = v
471 else:
472 newDataId[k] = v
474 # Go through the updated dataId and check the type in case someone is
475 # using an alternate key. We have already filtered out the compound
476 # keys dimensions.record format.
477 not_dimensions = {}
479 # Will need to look in the dataId and the keyword arguments
480 # and will remove them if they need to be fixed or are unrecognized.
481 for dataIdDict in (newDataId, kwargs):
482 # Use a list so we can adjust the dict safely in the loop
483 for dimensionName in list(dataIdDict):
484 value = dataIdDict[dimensionName]
485 try:
486 dimension = self.dimensions.dimensions[dimensionName]
487 except KeyError:
488 # This is not a real dimension
489 not_dimensions[dimensionName] = value
490 del dataIdDict[dimensionName]
491 continue
493 # Convert an integral type to an explicit int to simplify
494 # comparisons here
495 if isinstance(value, numbers.Integral):
496 value = int(value)
498 if not isinstance(value, dimension.primaryKey.getPythonType()):
499 for alternate in dimension.alternateKeys:
500 if isinstance(value, alternate.getPythonType()):
501 byRecord[dimensionName][alternate.name] = value
502 del dataIdDict[dimensionName]
503 _LOG.debug(
504 "Converting dimension %s to %s.%s=%s",
505 dimensionName,
506 dimensionName,
507 alternate.name,
508 value,
509 )
510 break
511 else:
512 _LOG.warning(
513 "Type mismatch found for value '%r' provided for dimension %s. "
514 "Could not find matching alternative (primary key has type %s) "
515 "so attempting to use as-is.",
516 value,
517 dimensionName,
518 dimension.primaryKey.getPythonType(),
519 )
521 # By this point kwargs and newDataId should only include valid
522 # dimensions. Merge kwargs in to the new dataId and log if there
523 # are dimensions in both (rather than calling update).
524 for k, v in kwargs.items():
525 if k in newDataId and newDataId[k] != v:
526 _LOG.debug(
527 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
528 )
529 newDataId[k] = v
530 # No need to retain any values in kwargs now.
531 kwargs = {}
533 # If we have some unrecognized dimensions we have to try to connect
534 # them to records in other dimensions. This is made more complicated
535 # by some dimensions having records with clashing names. A mitigation
536 # is that we can tell by this point which dimensions are missing
537 # for the DatasetType but this does not work for calibrations
538 # where additional dimensions can be used to constrain the temporal
539 # axis.
540 if not_dimensions:
541 # Search for all dimensions even if we have been given a value
542 # explicitly. In some cases records are given as well as the
543 # actually dimension and this should not be an error if they
544 # match.
545 mandatoryDimensions = datasetType.dimensions.names # - provided
547 candidateDimensions: set[str] = set()
548 candidateDimensions.update(mandatoryDimensions)
550 # For calibrations we may well be needing temporal dimensions
551 # so rather than always including all dimensions in the scan
552 # restrict things a little. It is still possible for there
553 # to be confusion over day_obs in visit vs exposure for example.
554 # If we are not searching calibration collections things may
555 # fail but they are going to fail anyway because of the
556 # ambiguousness of the dataId...
557 if datasetType.isCalibration():
558 for dim in self.dimensions.dimensions:
559 if dim.temporal:
560 candidateDimensions.add(str(dim))
562 # Look up table for the first association with a dimension
563 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
565 # Keep track of whether an item is associated with multiple
566 # dimensions.
567 counter: Counter[str] = Counter()
568 assigned: dict[str, set[str]] = defaultdict(set)
570 # Go through the missing dimensions and associate the
571 # given names with records within those dimensions
572 matched_dims = set()
573 for dimensionName in candidateDimensions:
574 dimension = self.dimensions.dimensions[dimensionName]
575 fields = dimension.metadata.names | dimension.uniqueKeys.names
576 for field in not_dimensions:
577 if field in fields:
578 guessedAssociation[dimensionName][field] = not_dimensions[field]
579 counter[dimensionName] += 1
580 assigned[field].add(dimensionName)
581 matched_dims.add(field)
583 # Calculate the fields that matched nothing.
584 never_found = set(not_dimensions) - matched_dims
586 if never_found:
587 raise ValueError(f"Unrecognized keyword args given: {never_found}")
589 # There is a chance we have allocated a single dataId item
590 # to multiple dimensions. Need to decide which should be retained.
591 # For now assume that the most popular alternative wins.
592 # This means that day_obs with seq_num will result in
593 # exposure.day_obs and not visit.day_obs
594 # Also prefer an explicitly missing dimension over an inferred
595 # temporal dimension.
596 for fieldName, assignedDimensions in assigned.items():
597 if len(assignedDimensions) > 1:
598 # Pick the most popular (preferring mandatory dimensions)
599 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
600 if requiredButMissing:
601 candidateDimensions = requiredButMissing
602 else:
603 candidateDimensions = assignedDimensions
605 # If this is a choice between visit and exposure and
606 # neither was a required part of the dataset type,
607 # (hence in this branch) always prefer exposure over
608 # visit since exposures are always defined and visits
609 # are defined from exposures.
610 if candidateDimensions == {"exposure", "visit"}:
611 candidateDimensions = {"exposure"}
613 # Select the relevant items and get a new restricted
614 # counter.
615 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
616 duplicatesCounter: Counter[str] = Counter()
617 duplicatesCounter.update(theseCounts)
619 # Choose the most common. If they are equally common
620 # we will pick the one that was found first.
621 # Returns a list of tuples
622 selected = duplicatesCounter.most_common(1)[0][0]
624 _LOG.debug(
625 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
626 " Removed ambiguity by choosing dimension %s.",
627 fieldName,
628 ", ".join(assignedDimensions),
629 selected,
630 )
632 for candidateDimension in assignedDimensions:
633 if candidateDimension != selected:
634 del guessedAssociation[candidateDimension][fieldName]
636 # Update the record look up dict with the new associations
637 for dimensionName, values in guessedAssociation.items():
638 if values: # A dict might now be empty
639 _LOG.debug(
640 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
641 )
642 byRecord[dimensionName].update(values)
644 if byRecord:
645 # Some record specifiers were found so we need to convert
646 # them to the Id form
647 for dimensionName, values in byRecord.items():
648 if dimensionName in newDataId:
649 _LOG.debug(
650 "DataId specified explicit %s dimension value of %s in addition to"
651 " general record specifiers for it of %s. Ignoring record information.",
652 dimensionName,
653 newDataId[dimensionName],
654 str(values),
655 )
656 # Get the actual record and compare with these values.
657 try:
658 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
659 except DataIdError:
660 raise ValueError(
661 f"Could not find dimension '{dimensionName}'"
662 f" with dataId {newDataId} as part of comparing with"
663 f" record values {byRecord[dimensionName]}"
664 ) from None
665 if len(recs) == 1:
666 errmsg: list[str] = []
667 for k, v in values.items():
668 if (recval := getattr(recs[0], k)) != v:
669 errmsg.append(f"{k}({recval} != {v})")
670 if errmsg:
671 raise ValueError(
672 f"Dimension {dimensionName} in dataId has explicit value"
673 " inconsistent with records: " + ", ".join(errmsg)
674 )
675 else:
676 # Multiple matches for an explicit dimension
677 # should never happen but let downstream complain.
678 pass
679 continue
681 # Build up a WHERE expression
682 bind = dict(values.items())
683 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
685 # Hopefully we get a single record that matches
686 records = set(
687 self._registry.queryDimensionRecords(
688 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
689 )
690 )
692 if len(records) != 1:
693 if len(records) > 1:
694 # visit can have an ambiguous answer without involving
695 # visit_system. The default visit_system is defined
696 # by the instrument.
697 if (
698 dimensionName == "visit"
699 and "visit_system_membership" in self.dimensions
700 and "visit_system" in self.dimensions["instrument"].metadata
701 ):
702 instrument_records = list(
703 self._registry.queryDimensionRecords(
704 "instrument",
705 dataId=newDataId,
706 **kwargs,
707 )
708 )
709 if len(instrument_records) == 1:
710 visit_system = instrument_records[0].visit_system
711 if visit_system is None:
712 # Set to a value that will never match.
713 visit_system = -1
715 # Look up each visit in the
716 # visit_system_membership records.
717 for rec in records:
718 membership = list(
719 self._registry.queryDimensionRecords(
720 # Use bind to allow zero results.
721 # This is a fully-specified query.
722 "visit_system_membership",
723 where="instrument = inst AND visit_system = system AND visit = v",
724 bind=dict(
725 inst=instrument_records[0].name, system=visit_system, v=rec.id
726 ),
727 )
728 )
729 if membership:
730 # This record is the right answer.
731 records = {rec}
732 break
734 # The ambiguity may have been resolved so check again.
735 if len(records) > 1:
736 _LOG.debug(
737 "Received %d records from constraints of %s", len(records), str(values)
738 )
739 for r in records:
740 _LOG.debug("- %s", str(r))
741 raise ValueError(
742 f"DataId specification for dimension {dimensionName} is not"
743 f" uniquely constrained to a single dataset by {values}."
744 f" Got {len(records)} results."
745 )
746 else:
747 raise ValueError(
748 f"DataId specification for dimension {dimensionName} matched no"
749 f" records when constrained by {values}"
750 )
752 # Get the primary key from the real dimension object
753 dimension = self.dimensions.dimensions[dimensionName]
754 if not isinstance(dimension, Dimension):
755 raise RuntimeError(
756 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
757 )
758 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
760 return newDataId, kwargs
762 def _findDatasetRef(
763 self,
764 datasetRefOrType: DatasetRef | DatasetType | str,
765 dataId: DataId | None = None,
766 *,
767 collections: Any = None,
768 predict: bool = False,
769 run: str | None = None,
770 datastore_records: bool = False,
771 **kwargs: Any,
772 ) -> DatasetRef:
773 """Shared logic for methods that start with a search for a dataset in
774 the registry.
776 Parameters
777 ----------
778 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
779 When `DatasetRef` the `dataId` should be `None`.
780 Otherwise the `DatasetType` or name thereof.
781 dataId : `dict` or `DataCoordinate`, optional
782 A `dict` of `Dimension` link name, value pairs that label the
783 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
784 should be provided as the first argument.
785 collections : Any, optional
786 Collections to be searched, overriding ``self.collections``.
787 Can be any of the types supported by the ``collections`` argument
788 to butler construction.
789 predict : `bool`, optional
790 If `True`, return a newly created `DatasetRef` with a unique
791 dataset ID if finding a reference in the `Registry` fails.
792 Defaults to `False`.
793 run : `str`, optional
794 Run collection name to use for creating `DatasetRef` for predicted
795 datasets. Only used if ``predict`` is `True`.
796 datastore_records : `bool`, optional
797 If `True` add datastore records to returned `DatasetRef`.
798 **kwargs
799 Additional keyword arguments used to augment or construct a
800 `DataId`. See `DataId` parameters.
802 Returns
803 -------
804 ref : `DatasetRef`
805 A reference to the dataset identified by the given arguments.
806 This can be the same dataset reference as given if it was
807 resolved.
809 Raises
810 ------
811 LookupError
812 Raised if no matching dataset exists in the `Registry` (and
813 ``predict`` is `False`).
814 ValueError
815 Raised if a resolved `DatasetRef` was passed as an input, but it
816 differs from the one found in the registry.
817 TypeError
818 Raised if no collections were provided.
819 """
820 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
821 if isinstance(datasetRefOrType, DatasetRef):
822 if collections is not None:
823 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
824 # May need to retrieve datastore records if requested.
825 if datastore_records and datasetRefOrType._datastore_records is None:
826 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
827 return datasetRefOrType
828 timespan: Timespan | None = None
830 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
832 if datasetType.isCalibration():
833 # Because this is a calibration dataset, first try to make a
834 # standardize the data ID without restricting the dimensions to
835 # those of the dataset type requested, because there may be extra
836 # dimensions that provide temporal information for a validity-range
837 # lookup.
838 dataId = DataCoordinate.standardize(
839 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
840 )
841 if dataId.dimensions.temporal:
842 dataId = self._registry.expandDataId(dataId)
843 timespan = dataId.timespan
844 else:
845 # Standardize the data ID to just the dimensions of the dataset
846 # type instead of letting registry.findDataset do it, so we get the
847 # result even if no dataset is found.
848 dataId = DataCoordinate.standardize(
849 dataId,
850 dimensions=datasetType.dimensions,
851 defaults=self._registry.defaults.dataId,
852 **kwargs,
853 )
854 # Always lookup the DatasetRef, even if one is given, to ensure it is
855 # present in the current collection.
856 ref = self.find_dataset(
857 datasetType,
858 dataId,
859 collections=collections,
860 timespan=timespan,
861 datastore_records=datastore_records,
862 )
863 if ref is None:
864 if predict:
865 if run is None:
866 run = self.run
867 if run is None:
868 raise TypeError("Cannot predict dataset ID/location with run=None.")
869 return DatasetRef(datasetType, dataId, run=run)
870 else:
871 if collections is None:
872 collections = self._registry.defaults.collections
873 raise LookupError(
874 f"Dataset {datasetType.name} with data ID {dataId} "
875 f"could not be found in collections {collections}."
876 )
877 if datasetType != ref.datasetType:
878 # If they differ it is because the user explicitly specified
879 # a compatible dataset type to this call rather than using the
880 # registry definition. The DatasetRef must therefore be recreated
881 # using the user definition such that the expected type is
882 # returned.
883 ref = DatasetRef(
884 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
885 )
887 return ref
889 # TODO: remove on DM-40067.
890 @transactional
891 @deprecated(
892 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
893 " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
894 " were relying on the run parameter to determine the run."
895 " Will be removed after v26.0.",
896 version="v26.0",
897 category=FutureWarning,
898 )
899 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
900 # Docstring inherited.
901 return self.put(obj, ref)
903 @transactional
904 def put(
905 self,
906 obj: Any,
907 datasetRefOrType: DatasetRef | DatasetType | str,
908 /,
909 dataId: DataId | None = None,
910 *,
911 run: str | None = None,
912 **kwargs: Any,
913 ) -> DatasetRef:
914 """Store and register a dataset.
916 Parameters
917 ----------
918 obj : `object`
919 The dataset.
920 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
921 When `DatasetRef` is provided, ``dataId`` should be `None`.
922 Otherwise the `DatasetType` or name thereof. If a fully resolved
923 `DatasetRef` is given the run and ID are used directly.
924 dataId : `dict` or `DataCoordinate`
925 A `dict` of `Dimension` link name, value pairs that label the
926 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
927 should be provided as the second argument.
928 run : `str`, optional
929 The name of the run the dataset should be added to, overriding
930 ``self.run``. Not used if a resolved `DatasetRef` is provided.
931 **kwargs
932 Additional keyword arguments used to augment or construct a
933 `DataCoordinate`. See `DataCoordinate.standardize`
934 parameters. Not used if a resolve `DatasetRef` is provided.
936 Returns
937 -------
938 ref : `DatasetRef`
939 A reference to the stored dataset, updated with the correct id if
940 given.
942 Raises
943 ------
944 TypeError
945 Raised if the butler is read-only or if no run has been provided.
946 """
947 if isinstance(datasetRefOrType, DatasetRef):
948 # This is a direct put of predefined DatasetRef.
949 _LOG.debug("Butler put direct: %s", datasetRefOrType)
950 if run is not None:
951 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
952 # If registry already has a dataset with the same dataset ID,
953 # dataset type and DataId, then _importDatasets will do nothing and
954 # just return an original ref. We have to raise in this case, there
955 # is a datastore check below for that.
956 self._registry._importDatasets([datasetRefOrType], expand=True)
957 # Before trying to write to the datastore check that it does not
958 # know this dataset. This is prone to races, of course.
959 if self._datastore.knows(datasetRefOrType):
960 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
961 # Try to write dataset to the datastore, if it fails due to a race
962 # with another write, the content of stored data may be
963 # unpredictable.
964 try:
965 self._datastore.put(obj, datasetRefOrType)
966 except IntegrityError as e:
967 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
968 return datasetRefOrType
970 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
971 if not self.isWriteable():
972 raise TypeError("Butler is read-only.")
973 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
975 # Handle dimension records in dataId
976 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
978 # Add Registry Dataset entry.
979 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs)
980 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
981 self._datastore.put(obj, ref)
983 return ref
985 # TODO: remove on DM-40067.
986 @deprecated(
987 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
988 " Please use Butler.get(). Will be removed after v26.0.",
989 version="v26.0",
990 category=FutureWarning,
991 )
992 def getDirect(
993 self,
994 ref: DatasetRef,
995 *,
996 parameters: dict[str, Any] | None = None,
997 storageClass: StorageClass | str | None = None,
998 ) -> Any:
999 """Retrieve a stored dataset.
1001 Parameters
1002 ----------
1003 ref : `DatasetRef`
1004 Resolved reference to an already stored dataset.
1005 parameters : `dict`
1006 Additional StorageClass-defined options to control reading,
1007 typically used to efficiently read only a subset of the dataset.
1008 storageClass : `StorageClass` or `str`, optional
1009 The storage class to be used to override the Python type
1010 returned by this method. By default the returned type matches
1011 the dataset type definition for this dataset. Specifying a
1012 read `StorageClass` can force a different type to be returned.
1013 This type must be compatible with the original type.
1015 Returns
1016 -------
1017 obj : `object`
1018 The dataset.
1019 """
1020 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
1022 # TODO: remove on DM-40067.
1023 @deprecated(
1024 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
1025 "Please use Butler.getDeferred(). Will be removed after v26.0.",
1026 version="v26.0",
1027 category=FutureWarning,
1028 )
1029 def getDirectDeferred(
1030 self,
1031 ref: DatasetRef,
1032 *,
1033 parameters: dict[str, Any] | None = None,
1034 storageClass: str | StorageClass | None = None,
1035 ) -> DeferredDatasetHandle:
1036 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1037 from a resolved `DatasetRef`.
1039 Parameters
1040 ----------
1041 ref : `DatasetRef`
1042 Resolved reference to an already stored dataset.
1043 parameters : `dict`
1044 Additional StorageClass-defined options to control reading,
1045 typically used to efficiently read only a subset of the dataset.
1046 storageClass : `StorageClass` or `str`, optional
1047 The storage class to be used to override the Python type
1048 returned by this method. By default the returned type matches
1049 the dataset type definition for this dataset. Specifying a
1050 read `StorageClass` can force a different type to be returned.
1051 This type must be compatible with the original type.
1053 Returns
1054 -------
1055 obj : `DeferredDatasetHandle`
1056 A handle which can be used to retrieve a dataset at a later time.
1058 Raises
1059 ------
1060 LookupError
1061 Raised if no matching dataset exists in the `Registry`.
1062 """
1063 # Check that dataset is known to the datastore.
1064 if not self._datastore.knows(ref):
1065 raise LookupError(f"Dataset reference {ref} is not known to datastore.")
1066 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1068 def getDeferred(
1069 self,
1070 datasetRefOrType: DatasetRef | DatasetType | str,
1071 /,
1072 dataId: DataId | None = None,
1073 *,
1074 parameters: dict | None = None,
1075 collections: Any = None,
1076 storageClass: str | StorageClass | None = None,
1077 **kwargs: Any,
1078 ) -> DeferredDatasetHandle:
1079 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1080 after an immediate registry lookup.
1082 Parameters
1083 ----------
1084 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1085 When `DatasetRef` the `dataId` should be `None`.
1086 Otherwise the `DatasetType` or name thereof.
1087 dataId : `dict` or `DataCoordinate`, optional
1088 A `dict` of `Dimension` link name, value pairs that label the
1089 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1090 should be provided as the first argument.
1091 parameters : `dict`
1092 Additional StorageClass-defined options to control reading,
1093 typically used to efficiently read only a subset of the dataset.
1094 collections : Any, optional
1095 Collections to be searched, overriding ``self.collections``.
1096 Can be any of the types supported by the ``collections`` argument
1097 to butler construction.
1098 storageClass : `StorageClass` or `str`, optional
1099 The storage class to be used to override the Python type
1100 returned by this method. By default the returned type matches
1101 the dataset type definition for this dataset. Specifying a
1102 read `StorageClass` can force a different type to be returned.
1103 This type must be compatible with the original type.
1104 **kwargs
1105 Additional keyword arguments used to augment or construct a
1106 `DataId`. See `DataId` parameters.
1108 Returns
1109 -------
1110 obj : `DeferredDatasetHandle`
1111 A handle which can be used to retrieve a dataset at a later time.
1113 Raises
1114 ------
1115 LookupError
1116 Raised if no matching dataset exists in the `Registry` or
1117 datastore.
1118 ValueError
1119 Raised if a resolved `DatasetRef` was passed as an input, but it
1120 differs from the one found in the registry.
1121 TypeError
1122 Raised if no collections were provided.
1123 """
1124 if isinstance(datasetRefOrType, DatasetRef):
1125 # Do the quick check first and if that fails, check for artifact
1126 # existence. This is necessary for datastores that are configured
1127 # in trust mode where there won't be a record but there will be
1128 # a file.
1129 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
1130 ref = datasetRefOrType
1131 else:
1132 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1133 else:
1134 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1135 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1137 def get(
1138 self,
1139 datasetRefOrType: DatasetRef | DatasetType | str,
1140 /,
1141 dataId: DataId | None = None,
1142 *,
1143 parameters: dict[str, Any] | None = None,
1144 collections: Any = None,
1145 storageClass: StorageClass | str | None = None,
1146 **kwargs: Any,
1147 ) -> Any:
1148 """Retrieve a stored dataset.
1150 Parameters
1151 ----------
1152 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1153 When `DatasetRef` the `dataId` should be `None`.
1154 Otherwise the `DatasetType` or name thereof.
1155 If a resolved `DatasetRef`, the associated dataset
1156 is returned directly without additional querying.
1157 dataId : `dict` or `DataCoordinate`
1158 A `dict` of `Dimension` link name, value pairs that label the
1159 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1160 should be provided as the first argument.
1161 parameters : `dict`
1162 Additional StorageClass-defined options to control reading,
1163 typically used to efficiently read only a subset of the dataset.
1164 collections : Any, optional
1165 Collections to be searched, overriding ``self.collections``.
1166 Can be any of the types supported by the ``collections`` argument
1167 to butler construction.
1168 storageClass : `StorageClass` or `str`, optional
1169 The storage class to be used to override the Python type
1170 returned by this method. By default the returned type matches
1171 the dataset type definition for this dataset. Specifying a
1172 read `StorageClass` can force a different type to be returned.
1173 This type must be compatible with the original type.
1174 **kwargs
1175 Additional keyword arguments used to augment or construct a
1176 `DataCoordinate`. See `DataCoordinate.standardize`
1177 parameters.
1179 Returns
1180 -------
1181 obj : `object`
1182 The dataset.
1184 Raises
1185 ------
1186 LookupError
1187 Raised if no matching dataset exists in the `Registry`.
1188 TypeError
1189 Raised if no collections were provided.
1191 Notes
1192 -----
1193 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1194 this method requires that the given data ID include temporal dimensions
1195 beyond the dimensions of the dataset type itself, in order to find the
1196 dataset with the appropriate validity range. For example, a "bias"
1197 dataset with native dimensions ``{instrument, detector}`` could be
1198 fetched with a ``{instrument, detector, exposure}`` data ID, because
1199 ``exposure`` is a temporal dimension.
1200 """
1201 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1202 ref = self._findDatasetRef(
1203 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs
1204 )
1205 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
1207 def getURIs(
1208 self,
1209 datasetRefOrType: DatasetRef | DatasetType | str,
1210 /,
1211 dataId: DataId | None = None,
1212 *,
1213 predict: bool = False,
1214 collections: Any = None,
1215 run: str | None = None,
1216 **kwargs: Any,
1217 ) -> DatasetRefURIs:
1218 """Return the URIs associated with the dataset.
1220 Parameters
1221 ----------
1222 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1223 When `DatasetRef` the `dataId` should be `None`.
1224 Otherwise the `DatasetType` or name thereof.
1225 dataId : `dict` or `DataCoordinate`
1226 A `dict` of `Dimension` link name, value pairs that label the
1227 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1228 should be provided as the first argument.
1229 predict : `bool`
1230 If `True`, allow URIs to be returned of datasets that have not
1231 been written.
1232 collections : Any, optional
1233 Collections to be searched, overriding ``self.collections``.
1234 Can be any of the types supported by the ``collections`` argument
1235 to butler construction.
1236 run : `str`, optional
1237 Run to use for predictions, overriding ``self.run``.
1238 **kwargs
1239 Additional keyword arguments used to augment or construct a
1240 `DataCoordinate`. See `DataCoordinate.standardize`
1241 parameters.
1243 Returns
1244 -------
1245 uris : `DatasetRefURIs`
1246 The URI to the primary artifact associated with this dataset (if
1247 the dataset was disassembled within the datastore this may be
1248 `None`), and the URIs to any components associated with the dataset
1249 artifact. (can be empty if there are no components).
1250 """
1251 ref = self._findDatasetRef(
1252 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1253 )
1254 return self._datastore.getURIs(ref, predict)
1256 def getURI(
1257 self,
1258 datasetRefOrType: DatasetRef | DatasetType | str,
1259 /,
1260 dataId: DataId | None = None,
1261 *,
1262 predict: bool = False,
1263 collections: Any = None,
1264 run: str | None = None,
1265 **kwargs: Any,
1266 ) -> ResourcePath:
1267 """Return the URI to the Dataset.
1269 Parameters
1270 ----------
1271 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1272 When `DatasetRef` the `dataId` should be `None`.
1273 Otherwise the `DatasetType` or name thereof.
1274 dataId : `dict` or `DataCoordinate`
1275 A `dict` of `Dimension` link name, value pairs that label the
1276 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1277 should be provided as the first argument.
1278 predict : `bool`
1279 If `True`, allow URIs to be returned of datasets that have not
1280 been written.
1281 collections : Any, optional
1282 Collections to be searched, overriding ``self.collections``.
1283 Can be any of the types supported by the ``collections`` argument
1284 to butler construction.
1285 run : `str`, optional
1286 Run to use for predictions, overriding ``self.run``.
1287 **kwargs
1288 Additional keyword arguments used to augment or construct a
1289 `DataCoordinate`. See `DataCoordinate.standardize`
1290 parameters.
1292 Returns
1293 -------
1294 uri : `lsst.resources.ResourcePath`
1295 URI pointing to the Dataset within the datastore. If the
1296 Dataset does not exist in the datastore, and if ``predict`` is
1297 `True`, the URI will be a prediction and will include a URI
1298 fragment "#predicted".
1299 If the datastore does not have entities that relate well
1300 to the concept of a URI the returned URI string will be
1301 descriptive. The returned URI is not guaranteed to be obtainable.
1303 Raises
1304 ------
1305 LookupError
1306 A URI has been requested for a dataset that does not exist and
1307 guessing is not allowed.
1308 ValueError
1309 Raised if a resolved `DatasetRef` was passed as an input, but it
1310 differs from the one found in the registry.
1311 TypeError
1312 Raised if no collections were provided.
1313 RuntimeError
1314 Raised if a URI is requested for a dataset that consists of
1315 multiple artifacts.
1316 """
1317 primary, components = self.getURIs(
1318 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1319 )
1321 if primary is None or components:
1322 raise RuntimeError(
1323 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1324 "Use Butler.getURIs() instead."
1325 )
1326 return primary
1328 def get_dataset_type(self, name: str) -> DatasetType:
1329 return self._registry.getDatasetType(name)
1331 def get_dataset(
1332 self,
1333 id: DatasetId,
1334 storage_class: str | StorageClass | None = None,
1335 dimension_records: bool = False,
1336 datastore_records: bool = False,
1337 ) -> DatasetRef | None:
1338 ref = self._registry.getDataset(id)
1339 if ref is not None:
1340 if dimension_records:
1341 ref = ref.expanded(
1342 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)
1343 )
1344 if storage_class:
1345 ref = ref.overrideStorageClass(storage_class)
1346 if datastore_records:
1347 ref = self._registry.get_datastore_records(ref)
1348 return ref
1350 def find_dataset(
1351 self,
1352 dataset_type: DatasetType | str,
1353 data_id: DataId | None = None,
1354 *,
1355 collections: str | Sequence[str] | None = None,
1356 timespan: Timespan | None = None,
1357 storage_class: str | StorageClass | None = None,
1358 dimension_records: bool = False,
1359 datastore_records: bool = False,
1360 **kwargs: Any,
1361 ) -> DatasetRef | None:
1362 # Handle any parts of the dataID that are not using primary dimension
1363 # keys.
1364 if isinstance(dataset_type, str):
1365 actual_type = self.get_dataset_type(dataset_type)
1366 else:
1367 actual_type = dataset_type
1368 data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs)
1370 ref = self._registry.findDataset(
1371 dataset_type,
1372 data_id,
1373 collections=collections,
1374 timespan=timespan,
1375 datastore_records=datastore_records,
1376 **kwargs,
1377 )
1378 if ref is not None and dimension_records:
1379 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions))
1380 if ref is not None and storage_class is not None:
1381 ref = ref.overrideStorageClass(storage_class)
1382 return ref
1384 def retrieveArtifacts(
1385 self,
1386 refs: Iterable[DatasetRef],
1387 destination: ResourcePathExpression,
1388 transfer: str = "auto",
1389 preserve_path: bool = True,
1390 overwrite: bool = False,
1391 ) -> list[ResourcePath]:
1392 # Docstring inherited.
1393 return self._datastore.retrieveArtifacts(
1394 refs,
1395 ResourcePath(destination),
1396 transfer=transfer,
1397 preserve_path=preserve_path,
1398 overwrite=overwrite,
1399 )
1401 def exists(
1402 self,
1403 dataset_ref_or_type: DatasetRef | DatasetType | str,
1404 /,
1405 data_id: DataId | None = None,
1406 *,
1407 full_check: bool = True,
1408 collections: Any = None,
1409 **kwargs: Any,
1410 ) -> DatasetExistence:
1411 # Docstring inherited.
1412 existence = DatasetExistence.UNRECOGNIZED
1414 if isinstance(dataset_ref_or_type, DatasetRef):
1415 if collections is not None:
1416 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1417 if data_id is not None:
1418 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1419 ref = dataset_ref_or_type
1420 registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
1421 if registry_ref is not None:
1422 existence |= DatasetExistence.RECORDED
1424 if dataset_ref_or_type != registry_ref:
1425 # This could mean that storage classes differ, so we should
1426 # check for that but use the registry ref for the rest of
1427 # the method.
1428 if registry_ref.is_compatible_with(dataset_ref_or_type):
1429 # Use the registry version from now on.
1430 ref = registry_ref
1431 else:
1432 raise ValueError(
1433 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1434 f"in registry but has different incompatible values ({registry_ref})."
1435 )
1436 else:
1437 try:
1438 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1439 except (LookupError, TypeError, NoDefaultCollectionError):
1440 return existence
1441 existence |= DatasetExistence.RECORDED
1443 if self._datastore.knows(ref):
1444 existence |= DatasetExistence.DATASTORE
1446 if full_check:
1447 if self._datastore.exists(ref):
1448 existence |= DatasetExistence._ARTIFACT
1449 elif existence.value != DatasetExistence.UNRECOGNIZED.value:
1450 # Do not add this flag if we have no other idea about a dataset.
1451 existence |= DatasetExistence(DatasetExistence._ASSUMED)
1453 return existence
1455 def _exists_many(
1456 self,
1457 refs: Iterable[DatasetRef],
1458 /,
1459 *,
1460 full_check: bool = True,
1461 ) -> dict[DatasetRef, DatasetExistence]:
1462 # Docstring inherited.
1463 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1465 # Registry does not have a bulk API to check for a ref.
1466 for ref in refs:
1467 registry_ref = self._registry.getDataset(ref.id)
1468 if registry_ref is not None:
1469 # It is possible, albeit unlikely, that the given ref does
1470 # not match the one in registry even though the UUID matches.
1471 # When checking a single ref we raise, but it's impolite to
1472 # do that when potentially hundreds of refs are being checked.
1473 # We could change the API to only accept UUIDs and that would
1474 # remove the ability to even check and remove the worry
1475 # about differing storage classes. Given the ongoing discussion
1476 # on refs vs UUIDs and whether to raise or have a new
1477 # private flag, treat this as a private API for now.
1478 existence[ref] |= DatasetExistence.RECORDED
1480 # Ask datastore if it knows about these refs.
1481 knows = self._datastore.knows_these(refs)
1482 for ref, known in knows.items():
1483 if known:
1484 existence[ref] |= DatasetExistence.DATASTORE
1486 if full_check:
1487 mexists = self._datastore.mexists(refs)
1488 for ref, exists in mexists.items():
1489 if exists:
1490 existence[ref] |= DatasetExistence._ARTIFACT
1491 else:
1492 # Do not set this flag if nothing is known about the dataset.
1493 for ref in existence:
1494 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1495 existence[ref] |= DatasetExistence._ASSUMED
1497 return existence
1499 # TODO: remove on DM-40079.
1500 @deprecated(
1501 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.",
1502 version="v26.0",
1503 category=FutureWarning,
1504 )
1505 def datasetExists(
1506 self,
1507 datasetRefOrType: DatasetRef | DatasetType | str,
1508 dataId: DataId | None = None,
1509 *,
1510 collections: Any = None,
1511 **kwargs: Any,
1512 ) -> bool:
1513 """Return True if the Dataset is actually present in the Datastore.
1515 Parameters
1516 ----------
1517 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1518 When `DatasetRef` the `dataId` should be `None`.
1519 Otherwise the `DatasetType` or name thereof.
1520 dataId : `dict` or `DataCoordinate`
1521 A `dict` of `Dimension` link name, value pairs that label the
1522 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1523 should be provided as the first argument.
1524 collections : Any, optional
1525 Collections to be searched, overriding ``self.collections``.
1526 Can be any of the types supported by the ``collections`` argument
1527 to butler construction.
1528 **kwargs
1529 Additional keyword arguments used to augment or construct a
1530 `DataCoordinate`. See `DataCoordinate.standardize`
1531 parameters.
1533 Raises
1534 ------
1535 LookupError
1536 Raised if the dataset is not even present in the Registry.
1537 ValueError
1538 Raised if a resolved `DatasetRef` was passed as an input, but it
1539 differs from the one found in the registry.
1540 NoDefaultCollectionError
1541 Raised if no collections were provided.
1542 """
1543 # A resolved ref may be given that is not known to this butler.
1544 if isinstance(datasetRefOrType, DatasetRef):
1545 ref = self._registry.getDataset(datasetRefOrType.id)
1546 if ref is None:
1547 raise LookupError(
1548 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
1549 )
1550 else:
1551 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1552 return self._datastore.exists(ref)
1554 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1555 # Docstring inherited.
1556 if not self.isWriteable():
1557 raise TypeError("Butler is read-only.")
1558 names = list(names)
1559 refs: list[DatasetRef] = []
1560 for name in names:
1561 collectionType = self._registry.getCollectionType(name)
1562 if collectionType is not CollectionType.RUN:
1563 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1564 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
1565 with self._datastore.transaction(), self._registry.transaction():
1566 if unstore:
1567 self._datastore.trash(refs)
1568 else:
1569 self._datastore.forget(refs)
1570 for name in names:
1571 self._registry.removeCollection(name)
1572 if unstore:
1573 # Point of no return for removing artifacts
1574 self._datastore.emptyTrash()
1576 def pruneDatasets(
1577 self,
1578 refs: Iterable[DatasetRef],
1579 *,
1580 disassociate: bool = True,
1581 unstore: bool = False,
1582 tags: Iterable[str] = (),
1583 purge: bool = False,
1584 ) -> None:
1585 # docstring inherited from LimitedButler
1587 if not self.isWriteable():
1588 raise TypeError("Butler is read-only.")
1589 if purge:
1590 if not disassociate:
1591 raise TypeError("Cannot pass purge=True without disassociate=True.")
1592 if not unstore:
1593 raise TypeError("Cannot pass purge=True without unstore=True.")
1594 elif disassociate:
1595 tags = tuple(tags)
1596 if not tags:
1597 raise TypeError("No tags provided but disassociate=True.")
1598 for tag in tags:
1599 collectionType = self._registry.getCollectionType(tag)
1600 if collectionType is not CollectionType.TAGGED:
1601 raise TypeError(
1602 f"Cannot disassociate from collection '{tag}' "
1603 f"of non-TAGGED type {collectionType.name}."
1604 )
1605 # Transform possibly-single-pass iterable into something we can iterate
1606 # over multiple times.
1607 refs = list(refs)
1608 # Pruning a component of a DatasetRef makes no sense since registry
1609 # doesn't know about components and datastore might not store
1610 # components in a separate file
1611 for ref in refs:
1612 if ref.datasetType.component():
1613 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1614 # We don't need an unreliable Datastore transaction for this, because
1615 # we've been extra careful to ensure that Datastore.trash only involves
1616 # mutating the Registry (it can _look_ at Datastore-specific things,
1617 # but shouldn't change them), and hence all operations here are
1618 # Registry operations.
1619 with self._datastore.transaction(), self._registry.transaction():
1620 if unstore:
1621 self._datastore.trash(refs)
1622 if purge:
1623 self._registry.removeDatasets(refs)
1624 elif disassociate:
1625 assert tags, "Guaranteed by earlier logic in this function."
1626 for tag in tags:
1627 self._registry.disassociate(tag, refs)
1628 # We've exited the Registry transaction, and apparently committed.
1629 # (if there was an exception, everything rolled back, and it's as if
1630 # nothing happened - and we never get here).
1631 # Datastore artifacts are not yet gone, but they're clearly marked
1632 # as trash, so if we fail to delete now because of (e.g.) filesystem
1633 # problems we can try again later, and if manual administrative
1634 # intervention is required, it's pretty clear what that should entail:
1635 # deleting everything on disk and in private Datastore tables that is
1636 # in the dataset_location_trash table.
1637 if unstore:
1638 # Point of no return for removing artifacts
1639 self._datastore.emptyTrash()
1641 @transactional
1642 def ingest(
1643 self,
1644 *datasets: FileDataset,
1645 transfer: str | None = "auto",
1646 run: str | None = None,
1647 idGenerationMode: DatasetIdGenEnum | None = None,
1648 record_validation_info: bool = True,
1649 ) -> None:
1650 # Docstring inherited.
1651 if not self.isWriteable():
1652 raise TypeError("Butler is read-only.")
1654 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1655 if not datasets:
1656 return
1658 if idGenerationMode is not None:
1659 warnings.warn(
1660 "The idGenerationMode parameter is no longer used and is ignored. "
1661 " Will be removed after v26.0",
1662 FutureWarning,
1663 stacklevel=2,
1664 )
1666 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1668 # We need to reorganize all the inputs so that they are grouped
1669 # by dataset type and run. Multiple refs in a single FileDataset
1670 # are required to share the run and dataset type.
1671 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]]
1672 groupedData: GroupedData = defaultdict(list)
1674 # Track DataIDs that are being ingested so we can spot issues early
1675 # with duplication. Retain previous FileDataset so we can report it.
1676 groupedDataIds: MutableMapping[
1677 tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
1678 ] = defaultdict(dict)
1680 used_run = False
1682 # And the nested loop that populates it:
1683 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1684 # Somewhere to store pre-existing refs if we have an
1685 # execution butler.
1686 existingRefs: list[DatasetRef] = []
1688 for ref in dataset.refs:
1689 assert ref.run is not None # For mypy
1690 group_key = (ref.datasetType, ref.run)
1692 if ref.dataId in groupedDataIds[group_key]:
1693 raise ConflictingDefinitionError(
1694 f"Ingest conflict. Dataset {dataset.path} has same"
1695 " DataId as other ingest dataset"
1696 f" {groupedDataIds[group_key][ref.dataId].path} "
1697 f" ({ref.dataId})"
1698 )
1700 groupedDataIds[group_key][ref.dataId] = dataset
1702 if existingRefs:
1703 if len(dataset.refs) != len(existingRefs):
1704 # Keeping track of partially pre-existing datasets is hard
1705 # and should generally never happen. For now don't allow
1706 # it.
1707 raise ConflictingDefinitionError(
1708 f"For dataset {dataset.path} some dataIds already exist"
1709 " in registry but others do not. This is not supported."
1710 )
1712 # Store expanded form in the original FileDataset.
1713 dataset.refs = existingRefs
1714 else:
1715 groupedData[group_key].append(dataset)
1717 if not used_run and run is not None:
1718 warnings.warn(
1719 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the "
1720 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.",
1721 category=FutureWarning,
1722 stacklevel=3, # Take into account the @transactional decorator.
1723 )
1725 # Now we can bulk-insert into Registry for each DatasetType.
1726 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1727 groupedData.items(), desc="Bulk-inserting datasets by type"
1728 ):
1729 refs_to_import = []
1730 for dataset in grouped_datasets:
1731 refs_to_import.extend(dataset.refs)
1733 n_refs = len(refs_to_import)
1734 _LOG.verbose(
1735 "Importing %d ref%s of dataset type %r into run %r",
1736 n_refs,
1737 "" if n_refs == 1 else "s",
1738 datasetType.name,
1739 this_run,
1740 )
1742 # Import the refs and expand the DataCoordinates since we can't
1743 # guarantee that they are expanded and Datastore will need
1744 # the records.
1745 imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
1746 assert set(imported_refs) == set(refs_to_import)
1748 # Replace all the refs in the FileDataset with expanded versions.
1749 # Pull them off in the order we put them on the list.
1750 for dataset in grouped_datasets:
1751 n_dataset_refs = len(dataset.refs)
1752 dataset.refs = imported_refs[:n_dataset_refs]
1753 del imported_refs[:n_dataset_refs]
1755 # Bulk-insert everything into Datastore.
1756 # We do not know if any of the registry entries already existed
1757 # (_importDatasets only complains if they exist but differ) so
1758 # we have to catch IntegrityError explicitly.
1759 try:
1760 self._datastore.ingest(
1761 *datasets, transfer=transfer, record_validation_info=record_validation_info
1762 )
1763 except IntegrityError as e:
1764 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
1766 @contextlib.contextmanager
1767 def export(
1768 self,
1769 *,
1770 directory: str | None = None,
1771 filename: str | None = None,
1772 format: str | None = None,
1773 transfer: str | None = None,
1774 ) -> Iterator[RepoExportContext]:
1775 # Docstring inherited.
1776 if directory is None and transfer is not None:
1777 raise TypeError("Cannot transfer without providing a directory.")
1778 if transfer == "move":
1779 raise TypeError("Transfer may not be 'move': export is read-only")
1780 if format is None:
1781 if filename is None:
1782 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1783 else:
1784 _, format = os.path.splitext(filename)
1785 if not format:
1786 raise ValueError("Please specify a file extension to determine export format.")
1787 format = format[1:] # Strip leading ".""
1788 elif filename is None:
1789 filename = f"export.{format}"
1790 if directory is not None:
1791 filename = os.path.join(directory, filename)
1792 formats = self._config["repo_transfer_formats"]
1793 if format not in formats:
1794 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
1795 BackendClass = get_class_of(formats[format, "export"])
1796 with open(filename, "w") as stream:
1797 backend = BackendClass(stream, universe=self.dimensions)
1798 try:
1799 helper = RepoExportContext(
1800 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
1801 )
1802 yield helper
1803 except BaseException:
1804 raise
1805 else:
1806 helper._finish()
1808 def import_(
1809 self,
1810 *,
1811 directory: ResourcePathExpression | None = None,
1812 filename: ResourcePathExpression | TextIO | None = None,
1813 format: str | None = None,
1814 transfer: str | None = None,
1815 skip_dimensions: set | None = None,
1816 ) -> None:
1817 # Docstring inherited.
1818 if not self.isWriteable():
1819 raise TypeError("Butler is read-only.")
1820 if format is None:
1821 if filename is None:
1822 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1823 else:
1824 _, format = os.path.splitext(filename) # type: ignore
1825 elif filename is None:
1826 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
1827 if directory is not None:
1828 directory = ResourcePath(directory, forceDirectory=True)
1829 # mypy doesn't think this will work but it does in python >= 3.10.
1830 if isinstance(filename, ResourcePathExpression): # type: ignore
1831 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
1832 if not filename.isabs() and directory is not None:
1833 potential = directory.join(filename)
1834 exists_in_cwd = filename.exists()
1835 exists_in_dir = potential.exists()
1836 if exists_in_cwd and exists_in_dir:
1837 _LOG.warning(
1838 "A relative path for filename was specified (%s) which exists relative to cwd. "
1839 "Additionally, the file exists relative to the given search directory (%s). "
1840 "Using the export file in the given directory.",
1841 filename,
1842 potential,
1843 )
1844 # Given they specified an explicit directory and that
1845 # directory has the export file in it, assume that that
1846 # is what was meant despite the file in cwd.
1847 filename = potential
1848 elif exists_in_dir:
1849 filename = potential
1850 elif not exists_in_cwd and not exists_in_dir:
1851 # Raise early.
1852 raise FileNotFoundError(
1853 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
1854 )
1855 BackendClass: type[RepoImportBackend] = get_class_of(
1856 self._config["repo_transfer_formats"][format]["import"]
1857 )
1859 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
1860 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
1861 backend.register()
1862 with self.transaction():
1863 backend.load(
1864 self._datastore,
1865 directory=directory,
1866 transfer=transfer,
1867 skip_dimensions=skip_dimensions,
1868 )
1870 if isinstance(filename, ResourcePath):
1871 # We can not use open() here at the moment because of
1872 # DM-38589 since yaml does stream.read(8192) in a loop.
1873 stream = io.StringIO(filename.read().decode())
1874 doImport(stream)
1875 else:
1876 doImport(filename) # type: ignore
1878 def transfer_from(
1879 self,
1880 source_butler: LimitedButler,
1881 source_refs: Iterable[DatasetRef],
1882 transfer: str = "auto",
1883 skip_missing: bool = True,
1884 register_dataset_types: bool = False,
1885 transfer_dimensions: bool = False,
1886 ) -> collections.abc.Collection[DatasetRef]:
1887 # Docstring inherited.
1888 if not self.isWriteable():
1889 raise TypeError("Butler is read-only.")
1890 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1892 # Will iterate through the refs multiple times so need to convert
1893 # to a list if this isn't a collection.
1894 if not isinstance(source_refs, collections.abc.Collection):
1895 source_refs = list(source_refs)
1897 original_count = len(source_refs)
1898 _LOG.info("Transferring %d datasets into %s", original_count, str(self))
1900 # In some situations the datastore artifact may be missing
1901 # and we do not want that registry entry to be imported.
1902 # Asking datastore is not sufficient, the records may have been
1903 # purged, we have to ask for the (predicted) URI and check
1904 # existence explicitly. Execution butler is set up exactly like
1905 # this with no datastore records.
1906 artifact_existence: dict[ResourcePath, bool] = {}
1907 if skip_missing:
1908 dataset_existence = source_butler._datastore.mexists(
1909 source_refs, artifact_existence=artifact_existence
1910 )
1911 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1912 filtered_count = len(source_refs)
1913 n_missing = original_count - filtered_count
1914 _LOG.verbose(
1915 "%d dataset%s removed because the artifact does not exist. Now have %d.",
1916 n_missing,
1917 "" if n_missing == 1 else "s",
1918 filtered_count,
1919 )
1921 # Importing requires that we group the refs by dataset type and run
1922 # before doing the import.
1923 source_dataset_types = set()
1924 grouped_refs = defaultdict(list)
1925 for ref in source_refs:
1926 grouped_refs[ref.datasetType, ref.run].append(ref)
1927 source_dataset_types.add(ref.datasetType)
1929 # Check to see if the dataset type in the source butler has
1930 # the same definition in the target butler and register missing
1931 # ones if requested. Registration must happen outside a transaction.
1932 newly_registered_dataset_types = set()
1933 for datasetType in source_dataset_types:
1934 if register_dataset_types:
1935 # Let this raise immediately if inconsistent. Continuing
1936 # on to find additional inconsistent dataset types
1937 # might result in additional unwanted dataset types being
1938 # registered.
1939 if self._registry.registerDatasetType(datasetType):
1940 newly_registered_dataset_types.add(datasetType)
1941 else:
1942 # If the dataset type is missing, let it fail immediately.
1943 target_dataset_type = self.get_dataset_type(datasetType.name)
1944 if target_dataset_type != datasetType:
1945 raise ConflictingDefinitionError(
1946 "Source butler dataset type differs from definition"
1947 f" in target butler: {datasetType} !="
1948 f" {target_dataset_type}"
1949 )
1950 if newly_registered_dataset_types:
1951 # We may have registered some even if there were inconsistencies
1952 # but should let people know (or else remove them again).
1953 _LOG.verbose(
1954 "Registered the following dataset types in the target Butler: %s",
1955 ", ".join(d.name for d in newly_registered_dataset_types),
1956 )
1957 else:
1958 _LOG.verbose("All required dataset types are known to the target Butler")
1960 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1961 if transfer_dimensions:
1962 # Collect all the dimension records for these refs.
1963 # All dimensions are to be copied but the list of valid dimensions
1964 # come from this butler's universe.
1965 elements = frozenset(
1966 element
1967 for element in self.dimensions.elements
1968 if element.hasTable() and element.viewOf is None
1969 )
1970 dataIds = {ref.dataId for ref in source_refs}
1971 # This logic comes from saveDataIds.
1972 for dataId in dataIds:
1973 # Need an expanded record, if not expanded that we need a full
1974 # butler with registry (allow mocks with registry too).
1975 if not dataId.hasRecords():
1976 if registry := getattr(source_butler, "registry", None):
1977 dataId = registry.expandDataId(dataId)
1978 else:
1979 raise TypeError("Input butler needs to be a full butler to expand DataId.")
1980 # If this butler doesn't know about a dimension in the source
1981 # butler things will break later.
1982 for element_name in dataId.dimensions.elements:
1983 record = dataId.records[element_name]
1984 if record is not None and record.definition in elements:
1985 dimension_records[record.definition].setdefault(record.dataId, record)
1987 handled_collections: set[str] = set()
1989 # Do all the importing in a single transaction.
1990 with self.transaction():
1991 if dimension_records:
1992 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
1993 for element, r in dimension_records.items():
1994 records = [r[dataId] for dataId in r]
1995 # Assume that if the record is already present that we can
1996 # use it without having to check that the record metadata
1997 # is consistent.
1998 self._registry.insertDimensionData(element, *records, skip_existing=True)
2000 n_imported = 0
2001 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
2002 grouped_refs.items(), desc="Importing to registry by run and dataset type"
2003 ):
2004 if run not in handled_collections:
2005 # May need to create output collection. If source butler
2006 # has a registry, ask for documentation string.
2007 run_doc = None
2008 if registry := getattr(source_butler, "registry", None):
2009 run_doc = registry.getCollectionDocumentation(run)
2010 registered = self._registry.registerRun(run, doc=run_doc)
2011 handled_collections.add(run)
2012 if registered:
2013 _LOG.verbose("Creating output run %s", run)
2015 n_refs = len(refs_to_import)
2016 _LOG.verbose(
2017 "Importing %d ref%s of dataset type %s into run %s",
2018 n_refs,
2019 "" if n_refs == 1 else "s",
2020 datasetType.name,
2021 run,
2022 )
2024 # Assume we are using UUIDs and the source refs will match
2025 # those imported.
2026 imported_refs = self._registry._importDatasets(refs_to_import)
2027 assert set(imported_refs) == set(refs_to_import)
2028 n_imported += len(imported_refs)
2030 assert len(source_refs) == n_imported
2031 _LOG.verbose("Imported %d datasets into destination butler", n_imported)
2033 # Ask the datastore to transfer. The datastore has to check that
2034 # the source datastore is compatible with the target datastore.
2035 accepted, rejected = self._datastore.transfer_from(
2036 source_butler._datastore,
2037 source_refs,
2038 transfer=transfer,
2039 artifact_existence=artifact_existence,
2040 )
2041 if rejected:
2042 # For now, accept the registry entries but not the files.
2043 _LOG.warning(
2044 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
2045 len(rejected),
2046 len(accepted),
2047 datasetType,
2048 run,
2049 )
2051 return source_refs
2053 def validateConfiguration(
2054 self,
2055 logFailures: bool = False,
2056 datasetTypeNames: Iterable[str] | None = None,
2057 ignore: Iterable[str] | None = None,
2058 ) -> None:
2059 # Docstring inherited.
2060 if datasetTypeNames:
2061 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames]
2062 else:
2063 datasetTypes = list(self._registry.queryDatasetTypes())
2065 # filter out anything from the ignore list
2066 if ignore:
2067 ignore = set(ignore)
2068 datasetTypes = [
2069 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2070 ]
2071 else:
2072 ignore = set()
2074 # For each datasetType that has an instrument dimension, create
2075 # a DatasetRef for each defined instrument
2076 datasetRefs = []
2078 # Find all the registered instruments (if "instrument" is in the
2079 # universe).
2080 if "instrument" in self.dimensions:
2081 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
2083 for datasetType in datasetTypes:
2084 if "instrument" in datasetType.dimensions:
2085 # In order to create a conforming dataset ref, create
2086 # fake DataCoordinate values for the non-instrument
2087 # dimensions. The type of the value does not matter here.
2088 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"}
2090 for instrument in instruments:
2091 datasetRef = DatasetRef(
2092 datasetType,
2093 DataCoordinate.standardize(
2094 dataId, instrument=instrument, dimensions=datasetType.dimensions
2095 ),
2096 run="validate",
2097 )
2098 datasetRefs.append(datasetRef)
2100 entities: list[DatasetType | DatasetRef] = []
2101 entities.extend(datasetTypes)
2102 entities.extend(datasetRefs)
2104 datastoreErrorStr = None
2105 try:
2106 self._datastore.validateConfiguration(entities, logFailures=logFailures)
2107 except ValidationError as e:
2108 datastoreErrorStr = str(e)
2110 # Also check that the LookupKeys used by the datastores match
2111 # registry and storage class definitions
2112 keys = self._datastore.getLookupKeys()
2114 failedNames = set()
2115 failedDataId = set()
2116 for key in keys:
2117 if key.name is not None:
2118 if key.name in ignore:
2119 continue
2121 # skip if specific datasetType names were requested and this
2122 # name does not match
2123 if datasetTypeNames and key.name not in datasetTypeNames:
2124 continue
2126 # See if it is a StorageClass or a DatasetType
2127 if key.name in self.storageClasses:
2128 pass
2129 else:
2130 try:
2131 self.get_dataset_type(key.name)
2132 except KeyError:
2133 if logFailures:
2134 _LOG.critical(
2135 "Key '%s' does not correspond to a DatasetType or StorageClass", key
2136 )
2137 failedNames.add(key)
2138 else:
2139 # Dimensions are checked for consistency when the Butler
2140 # is created and rendezvoused with a universe.
2141 pass
2143 # Check that the instrument is a valid instrument
2144 # Currently only support instrument so check for that
2145 if key.dataId:
2146 dataIdKeys = set(key.dataId)
2147 if {"instrument"} != dataIdKeys:
2148 if logFailures:
2149 _LOG.critical("Key '%s' has unsupported DataId override", key)
2150 failedDataId.add(key)
2151 elif key.dataId["instrument"] not in instruments:
2152 if logFailures:
2153 _LOG.critical("Key '%s' has unknown instrument", key)
2154 failedDataId.add(key)
2156 messages = []
2158 if datastoreErrorStr:
2159 messages.append(datastoreErrorStr)
2161 for failed, msg in (
2162 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2163 (failedDataId, "Keys with bad DataId entries: "),
2164 ):
2165 if failed:
2166 msg += ", ".join(str(k) for k in failed)
2167 messages.append(msg)
2169 if messages:
2170 raise ValidationError(";\n".join(messages))
2172 @property
2173 def collections(self) -> Sequence[str]:
2174 """The collections to search by default, in order
2175 (`~collections.abc.Sequence` [ `str` ]).
2177 This is an alias for ``self.registry.defaults.collections``. It cannot
2178 be set directly in isolation, but all defaults may be changed together
2179 by assigning a new `RegistryDefaults` instance to
2180 ``self.registry.defaults``.
2181 """
2182 return self._registry.defaults.collections
2184 @property
2185 def run(self) -> str | None:
2186 """Name of the run this butler writes outputs to by default (`str` or
2187 `None`).
2189 This is an alias for ``self.registry.defaults.run``. It cannot be set
2190 directly in isolation, but all defaults may be changed together by
2191 assigning a new `RegistryDefaults` instance to
2192 ``self.registry.defaults``.
2193 """
2194 return self._registry.defaults.run
2196 @property
2197 def registry(self) -> Registry:
2198 """The object that manages dataset metadata and relationships
2199 (`Registry`).
2201 Many operations that don't involve reading or writing butler datasets
2202 are accessible only via `Registry` methods. Eventually these methods
2203 will be replaced by equivalent `Butler` methods.
2204 """
2205 return self._registry_shim
2207 @property
2208 def dimensions(self) -> DimensionUniverse:
2209 # Docstring inherited.
2210 return self._registry.dimensions
2212 _registry: SqlRegistry
2213 """The object that manages dataset metadata and relationships
2214 (`SqlRegistry`).
2216 Most operations that don't involve reading or writing butler datasets are
2217 accessible only via `SqlRegistry` methods.
2218 """
2220 datastore: Datastore
2221 """The object that manages actual dataset storage (`Datastore`).
2223 Direct user access to the datastore should rarely be necessary; the primary
2224 exception is the case where a `Datastore` implementation provides extra
2225 functionality beyond what the base class defines.
2226 """
2228 storageClasses: StorageClassFactory
2229 """An object that maps known storage class names to objects that fully
2230 describe them (`StorageClassFactory`).
2231 """