Coverage for python/lsst/daf/butler/direct_butler.py: 11%
689 statements
coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Butler top level classes.
29"""
30from __future__ import annotations
32__all__ = (
33 "DirectButler",
34 "ButlerValidationError",
35)
37import collections.abc
38import contextlib
39import io
40import logging
41import numbers
42import os
43import warnings
44from collections import Counter, defaultdict
45from collections.abc import Iterable, Iterator, MutableMapping, Sequence
46from typing import TYPE_CHECKING, Any, ClassVar, TextIO
48from deprecated.sphinx import deprecated
49from lsst.resources import ResourcePath, ResourcePathExpression
50from lsst.utils.introspection import get_class_of
51from lsst.utils.logging import VERBOSE, getLogger
52from sqlalchemy.exc import IntegrityError
54from ._butler import Butler
55from ._butler_config import ButlerConfig
56from ._config import Config
57from ._dataset_existence import DatasetExistence
58from ._dataset_ref import DatasetIdGenEnum, DatasetRef
59from ._dataset_type import DatasetType
60from ._deferredDatasetHandle import DeferredDatasetHandle
61from ._exceptions import ValidationError
62from ._file_dataset import FileDataset
63from ._limited_butler import LimitedButler
64from ._registry_shim import RegistryShim
65from ._storage_class import StorageClass, StorageClassFactory
66from ._timespan import Timespan
67from .datastore import DatasetRefURIs, Datastore, NullDatastore
68from .dimensions import (
69 DataCoordinate,
70 DataId,
71 DataIdValue,
72 Dimension,
73 DimensionElement,
74 DimensionRecord,
75 DimensionUniverse,
76)
77from .progress import Progress
78from .registry import (
79 CollectionType,
80 ConflictingDefinitionError,
81 DataIdError,
82 MissingDatasetTypeError,
83 NoDefaultCollectionError,
84 Registry,
85 RegistryDefaults,
86 _RegistryFactory,
87)
88from .registry.sql_registry import SqlRegistry
89from .transfers import RepoExportContext
90from .utils import transactional
92if TYPE_CHECKING:
93 from lsst.resources import ResourceHandleProtocol
95 from .transfers import RepoImportBackend
97_LOG = getLogger(__name__)
100class ButlerValidationError(ValidationError):
101 """There is a problem with the Butler configuration."""
103 pass
106class DirectButler(Butler):
107 """Main entry point for the data access system.
109 Parameters
110 ----------
111 config : `ButlerConfig`, `Config` or `str`, optional
112 Configuration. Anything acceptable to the
113 `ButlerConfig` constructor. If a directory path
114 is given the configuration will be read from a ``butler.yaml`` file in
115 that location. If `None` is given default values will be used.
116 butler : `DirectButler`, optional
117 If provided, construct a new Butler that uses the same registry and
118 datastore as the given one, but with the given collection and run.
119 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
120 arguments.
121 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
122 An expression specifying the collections to be searched (in order) when
123 reading datasets.
124 This may be a `str` collection name or an iterable thereof.
125 See :ref:`daf_butler_collection_expressions` for more information.
126 These collections are not registered automatically and must be
127 registered manually before they are used by any method, though they may
128 be registered after the `Butler` is initialized.
129 run : `str`, optional
130 Name of the `~CollectionType.RUN` collection new datasets should be
131 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
132 ``collections`` will be set to ``[run]``. If not `None`, this
133 collection will automatically be registered. If this is not set (and
134 ``writeable`` is not set either), a read-only butler will be created.
135 searchPaths : `list` of `str`, optional
136 Directory paths to search when calculating the full Butler
137 configuration. Not used if the supplied config is already a
138 `ButlerConfig`.
139 writeable : `bool`, optional
140 Explicitly sets whether the butler supports write operations. If not
141 provided, a read-write butler is created if any of ``run``, ``tags``,
142 or ``chains`` is non-empty.
143 inferDefaults : `bool`, optional
144 If `True` (default) infer default data ID values from the values
145 present in the datasets in ``collections``: if all collections have the
146 same value (or no value) for a governor dimension, that value will be
147 the default for that dimension. Nonexistent collections are ignored.
148 If a default value is provided explicitly for a governor dimension via
149 ``**kwargs``, no default will be inferred for that dimension.
150 without_datastore : `bool`, optional
151 If `True` do not attach a datastore to this butler. Any attempts
152 to use a datastore will fail.
153 **kwargs : `str`
154 Default data ID key-value pairs. These may only identify "governor"
155 dimensions like ``instrument`` and ``skymap``.
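Examples
--------
A minimal construction sketch; the repository path, collection, and run
names below are illustrative placeholders, not real entities::

    butler = DirectButler(
        "/path/to/repo",
        collections=["HSC/defaults"],
        run="u/someone/processing",
    )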
156 """
158 def __init__(
159 self,
160 config: Config | ResourcePathExpression | None = None,
161 *,
162 butler: DirectButler | None = None,
163 collections: Any = None,
164 run: str | None = None,
165 searchPaths: Sequence[ResourcePathExpression] | None = None,
166 writeable: bool | None = None,
167 inferDefaults: bool = True,
168 without_datastore: bool = False,
169 **kwargs: str,
170 ):
171 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
172 # Load registry, datastore, etc. from config or existing butler.
173 if butler is not None:
174 if config is not None or searchPaths is not None or writeable is not None:
175 raise TypeError(
176 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
177 )
178 self._registry = butler._registry.copy(defaults)
179 self._datastore = butler._datastore
180 self.storageClasses = butler.storageClasses
181 self._config: ButlerConfig = butler._config
182 else:
183 self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore)
184 try:
185 butlerRoot = self._config.get("root", self._config.configDir)
186 if writeable is None:
187 writeable = run is not None
188 self._registry = _RegistryFactory(self._config).from_config(
189 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
190 )
191 if without_datastore:
192 self._datastore = NullDatastore(None, None)
193 else:
194 self._datastore = Datastore.fromConfig(
195 self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
196 )
197 # TODO: Once datastore drops dependency on registry we can
198 # construct datastore first and pass opaque tables to registry
199 # constructor.
200 self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions())
201 self.storageClasses = StorageClassFactory()
202 self.storageClasses.addFromConfig(self._config)
203 except Exception:
204 # Failures here usually mean that configuration is incomplete,
205 # just issue an error message which includes config file URI.
206 _LOG.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
207 raise
209 # For an execution butler the datastore needs a special
210 # dependency-inversion trick. This is not used by a regular butler,
211 # but we do not have a way to distinguish a regular butler from an
212 # execution butler.
213 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
215 if "run" in self._config or "collection" in self._config:
216 raise ValueError("Passing a run or collection via configuration is no longer supported.")
218 self._registry_shim = RegistryShim(self)
220 GENERATION: ClassVar[int] = 3
221 """This is a Generation 3 Butler.
223 This attribute may be removed in the future, once the Generation 2 Butler
224 interface has been fully retired; it should only be used in transitional
225 code.
226 """
228 def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
229 """Return DatasetType defined in registry given dataset type name."""
230 try:
231 return self._registry.getDatasetType(name)
232 except MissingDatasetTypeError:
233 return None
235 @classmethod
236 def _unpickle(
237 cls,
238 config: ButlerConfig,
239 collections: tuple[str, ...] | None,
240 run: str | None,
241 defaultDataId: dict[str, str],
242 writeable: bool,
243 ) -> DirectButler:
244 """Callable used to unpickle a Butler.
246 We prefer not to use ``Butler.__init__`` directly so we can force some
247 of its many arguments to be keyword-only (note that ``__reduce__``
248 can only invoke callables with positional arguments).
250 Parameters
251 ----------
252 config : `ButlerConfig`
253 Butler configuration, already coerced into a true `ButlerConfig`
254 instance (and hence after any search paths for overrides have been
255 utilized).
256 collections : `tuple` [ `str` ]
257 Names of the default collections to read from.
258 run : `str`, optional
259 Name of the default `~CollectionType.RUN` collection to write to.
260 defaultDataId : `dict` [ `str`, `str` ]
261 Default data ID values.
262 writeable : `bool`
263 Whether the Butler should support write operations.
265 Returns
266 -------
267 butler : `Butler`
268 A new `Butler` instance.
269 """
270 # MyPy doesn't recognize that the kwargs below are totally valid; it
271 # seems to think ``**defaultDataId`` is a positional argument!
272 return cls(
273 config=config,
274 collections=collections,
275 run=run,
276 writeable=writeable,
277 **defaultDataId, # type: ignore
278 )
280 def __reduce__(self) -> tuple:
281 """Support pickling."""
282 return (
283 DirectButler._unpickle,
284 (
285 self._config,
286 self.collections,
287 self.run,
288 self._registry.defaults.dataId.byName(),
289 self._registry.isWriteable(),
290 ),
291 )
293 def __str__(self) -> str:
294 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
295 self.collections, self.run, self._datastore, self._registry
296 )
298 def isWriteable(self) -> bool:
299 # Docstring inherited.
300 return self._registry.isWriteable()
302 @contextlib.contextmanager
303 def transaction(self) -> Iterator[None]:
304 """Context manager supporting `Butler` transactions.
306 Transactions can be nested.
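Examples
--------
A sketch of grouping several writes so they commit or roll back together;
the dataset type names and data ID values are illustrative::

    with butler.transaction():
        butler.put(catalog, "src", instrument="HSC", detector=10, visit=903334)
        butler.put(background, "calexpBackground", instrument="HSC", detector=10, visit=903334)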
307 """
308 with self._registry.transaction(), self._datastore.transaction():
309 yield
311 def _standardizeArgs(
312 self,
313 datasetRefOrType: DatasetRef | DatasetType | str,
314 dataId: DataId | None = None,
315 for_put: bool = True,
316 **kwargs: Any,
317 ) -> tuple[DatasetType, DataId | None]:
318 """Standardize the arguments passed to several Butler APIs.
320 Parameters
321 ----------
322 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
323 When `DatasetRef` the `dataId` should be `None`.
324 Otherwise the `DatasetType` or name thereof.
325 dataId : `dict` or `DataCoordinate`
326 A `dict` of `Dimension` link name, value pairs that label the
327 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
328 should be provided as the first argument.
329 for_put : `bool`, optional
330 If `True` this call is invoked as part of a `Butler.put()`.
331 Otherwise it is assumed to be part of a `Butler.get()`. This
332 parameter is only relevant if there is dataset type
333 inconsistency.
334 **kwargs
335 Additional keyword arguments used to augment or construct a
336 `DataCoordinate`. See `DataCoordinate.standardize`
337 parameters.
339 Returns
340 -------
341 datasetType : `DatasetType`
342 A `DatasetType` instance extracted from ``datasetRefOrType``.
343 dataId : `dict` or `DataId`, optional
344 Argument that can be used (along with ``kwargs``) to construct a
345 `DataId`.
347 Notes
348 -----
349 Butler APIs that conceptually need a DatasetRef also allow passing a
350 `DatasetType` (or the name of one) and a `DataId` (or a dict and
351 keyword arguments that can be used to construct one) separately. This
352 method accepts those arguments and always returns a true `DatasetType`
353 and a `DataId` or `dict`.
355 Standardization of `dict` vs `DataId` is best handled by passing the
356 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
357 generally similarly flexible.
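Examples
--------
A sketch of the two call styles this helper accepts; the dataset type
name and data ID values are illustrative::

    # Dataset type name (or DatasetType) plus a separate data ID.
    datasetType, dataId = self._standardizeArgs("calexp", {"instrument": "HSC", "visit": 903334})
    # A resolved DatasetRef, in which case ``dataId`` must be `None`.
    datasetType, dataId = self._standardizeArgs(ref)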
358 """
359 externalDatasetType: DatasetType | None = None
360 internalDatasetType: DatasetType | None = None
361 if isinstance(datasetRefOrType, DatasetRef):
362 if dataId is not None or kwargs:
363 raise ValueError("DatasetRef given, cannot use dataId as well")
364 externalDatasetType = datasetRefOrType.datasetType
365 dataId = datasetRefOrType.dataId
366 else:
367 # Don't check whether DataId is provided, because Registry APIs
368 # can usually construct a better error message when it wasn't.
369 if isinstance(datasetRefOrType, DatasetType):
370 externalDatasetType = datasetRefOrType
371 else:
372 internalDatasetType = self._registry.getDatasetType(datasetRefOrType)
374 # Check that they are self-consistent
375 if externalDatasetType is not None:
376 internalDatasetType = self._registry.getDatasetType(externalDatasetType.name)
377 if externalDatasetType != internalDatasetType:
378 # We can allow differences if they are compatible, depending
379 # on whether this is a get or a put. A get requires that
380 # the python type associated with the datastore can be
381 # converted to the user type. A put requires that the user
382 # supplied python type can be converted to the internal
383 # type expected by registry.
384 relevantDatasetType = internalDatasetType
385 if for_put:
386 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
387 else:
388 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
389 relevantDatasetType = externalDatasetType
390 if not is_compatible:
391 raise ValueError(
392 f"Supplied dataset type ({externalDatasetType}) inconsistent with "
393 f"registry definition ({internalDatasetType})"
394 )
395 # Override the internal definition.
396 internalDatasetType = relevantDatasetType
398 assert internalDatasetType is not None
399 return internalDatasetType, dataId
401 def _rewrite_data_id(
402 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
403 ) -> tuple[DataId | None, dict[str, Any]]:
404 """Rewrite a data ID taking into account dimension records.
406 Take a data ID and keyword args and rewrite them if necessary to
407 allow the user to specify dimension record values rather than dimension
408 primary key values.
410 This allows a user to include a dataId dict with keys of
411 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
412 the integer exposure ID. It also allows a string to be given
413 for a dimension value rather than the integer ID if that is more
414 convenient. For example, rather than having to specify the
415 detector with ``detector.full_name``, a string given for ``detector``
416 will be interpreted as the full name and converted to the integer
417 value.
419 Keyword arguments can also use strings for dimensions like detector
420 and exposure, but Python does not allow them to include ``.``, and
421 so the ``exposure.day_obs`` syntax cannot be used in a keyword
422 argument.
424 Parameters
425 ----------
426 dataId : `dict` or `DataCoordinate`
427 A `dict` of `Dimension` link name, value pairs that will label the
428 `DatasetRef` within a Collection.
429 datasetType : `DatasetType`
430 The dataset type associated with this dataId. Required to
431 determine the relevant dimensions.
432 **kwargs
433 Additional keyword arguments used to augment or construct a
434 `DataId`. See `DataId` parameters.
436 Returns
437 -------
438 dataId : `dict` or `DataCoordinate`
439 The dataId, possibly rewritten. If given a `DataCoordinate` and
440 no keyword arguments, the original dataId will be returned
441 unchanged.
442 **kwargs : `dict`
443 Any unused keyword arguments (normally an empty dict).
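Examples
--------
A sketch of the rewriting described above; the record values and the
resulting primary keys are purely illustrative::

    # {"detector": "R22_S11", "exposure.day_obs": 20210405, "exposure.seq_num": 160}
    # is rewritten, after registry record lookups, to something like
    # {"detector": 94, "exposure": 2021040500160}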
444 """
445 # Do nothing if we have a standalone DataCoordinate.
446 if isinstance(dataId, DataCoordinate) and not kwargs:
447 return dataId, kwargs
449 # Process dimension records that are using record information
450 # rather than ids
451 newDataId: dict[str, DataIdValue] = {}
452 byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
454 # If all of the dataId comes from keyword parameters we do not need
455 # to do anything here because the keys can't be of the form
456 # exposure.obs_id, since a "." is not allowed in a keyword parameter.
457 if dataId:
458 for k, v in dataId.items():
459 # If we have a Dimension we do not need to do anything
460 # because it cannot be a compound key.
461 if isinstance(k, str) and "." in k:
462 # Someone is using a more human-readable dataId
463 dimensionName, record = k.split(".", 1)
464 byRecord[dimensionName][record] = v
465 elif isinstance(k, Dimension):
466 newDataId[k.name] = v
467 else:
468 newDataId[k] = v
470 # Go through the updated dataId and check the type in case someone is
471 # using an alternate key. We have already filtered out the compound
472 # keys of the form dimension.record.
473 not_dimensions = {}
475 # Will need to look in the dataId and the keyword arguments
476 # and will remove them if they need to be fixed or are unrecognized.
477 for dataIdDict in (newDataId, kwargs):
478 # Use a list so we can adjust the dict safely in the loop
479 for dimensionName in list(dataIdDict):
480 value = dataIdDict[dimensionName]
481 try:
482 dimension = self.dimensions.getStaticDimensions()[dimensionName]
483 except KeyError:
484 # This is not a real dimension
485 not_dimensions[dimensionName] = value
486 del dataIdDict[dimensionName]
487 continue
489 # Convert an integral type to an explicit int to simplify
490 # comparisons here
491 if isinstance(value, numbers.Integral):
492 value = int(value)
494 if not isinstance(value, dimension.primaryKey.getPythonType()):
495 for alternate in dimension.alternateKeys:
496 if isinstance(value, alternate.getPythonType()):
497 byRecord[dimensionName][alternate.name] = value
498 del dataIdDict[dimensionName]
499 _LOG.debug(
500 "Converting dimension %s to %s.%s=%s",
501 dimensionName,
502 dimensionName,
503 alternate.name,
504 value,
505 )
506 break
507 else:
508 _LOG.warning(
509 "Type mismatch found for value '%r' provided for dimension %s. "
510 "Could not find matching alternative (primary key has type %s) "
511 "so attempting to use as-is.",
512 value,
513 dimensionName,
514 dimension.primaryKey.getPythonType(),
515 )
517 # By this point kwargs and newDataId should only include valid
518 # dimensions. Merge kwargs in to the new dataId and log if there
519 # are dimensions in both (rather than calling update).
520 for k, v in kwargs.items():
521 if k in newDataId and newDataId[k] != v:
522 _LOG.debug(
523 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
524 )
525 newDataId[k] = v
526 # No need to retain any values in kwargs now.
527 kwargs = {}
529 # If we have some unrecognized dimensions we have to try to connect
530 # them to records in other dimensions. This is made more complicated
531 # by some dimensions having records with clashing names. A mitigation
532 # is that we can tell by this point which dimensions are missing
533 # for the DatasetType but this does not work for calibrations
534 # where additional dimensions can be used to constrain the temporal
535 # axis.
536 if not_dimensions:
537 # Search for all dimensions even if we have been given a value
538 # explicitly. In some cases records are given as well as the
539 # actual dimension and this should not be an error if they
540 # match.
541 mandatoryDimensions = datasetType.dimensions.names # - provided
543 candidateDimensions: set[str] = set()
544 candidateDimensions.update(mandatoryDimensions)
546 # For calibrations we may well be needing temporal dimensions
547 # so rather than always including all dimensions in the scan
548 # restrict things a little. It is still possible for there
549 # to be confusion over day_obs in visit vs exposure for example.
550 # If we are not searching calibration collections things may
551 # fail but they are going to fail anyway because of the
552 # ambiguity of the dataId...
553 if datasetType.isCalibration():
554 for dim in self.dimensions.getStaticDimensions():
555 if dim.temporal:
556 candidateDimensions.add(str(dim))
558 # Look up table for the first association with a dimension
559 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
561 # Keep track of whether an item is associated with multiple
562 # dimensions.
563 counter: Counter[str] = Counter()
564 assigned: dict[str, set[str]] = defaultdict(set)
566 # Go through the missing dimensions and associate the
567 # given names with records within those dimensions
568 matched_dims = set()
569 for dimensionName in candidateDimensions:
570 dimension = self.dimensions.getStaticDimensions()[dimensionName]
571 fields = dimension.metadata.names | dimension.uniqueKeys.names
572 for field in not_dimensions:
573 if field in fields:
574 guessedAssociation[dimensionName][field] = not_dimensions[field]
575 counter[dimensionName] += 1
576 assigned[field].add(dimensionName)
577 matched_dims.add(field)
579 # Calculate the fields that matched nothing.
580 never_found = set(not_dimensions) - matched_dims
582 if never_found:
583 raise ValueError(f"Unrecognized keyword args given: {never_found}")
585 # There is a chance we have allocated a single dataId item
586 # to multiple dimensions. Need to decide which should be retained.
587 # For now assume that the most popular alternative wins.
588 # This means that day_obs with seq_num will result in
589 # exposure.day_obs and not visit.day_obs
590 # Also prefer an explicitly missing dimension over an inferred
591 # temporal dimension.
592 for fieldName, assignedDimensions in assigned.items():
593 if len(assignedDimensions) > 1:
594 # Pick the most popular (preferring mandatory dimensions)
595 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
596 if requiredButMissing:
597 candidateDimensions = requiredButMissing
598 else:
599 candidateDimensions = assignedDimensions
601 # If this is a choice between visit and exposure and
602 # neither was a required part of the dataset type,
603 # (hence in this branch) always prefer exposure over
604 # visit since exposures are always defined and visits
605 # are defined from exposures.
606 if candidateDimensions == {"exposure", "visit"}:
607 candidateDimensions = {"exposure"}
609 # Select the relevant items and get a new restricted
610 # counter.
611 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
612 duplicatesCounter: Counter[str] = Counter()
613 duplicatesCounter.update(theseCounts)
615 # Choose the most common. If they are equally common
616 # we will pick the one that was found first.
617 # (most_common returns a list of (item, count) tuples).
618 selected = duplicatesCounter.most_common(1)[0][0]
620 _LOG.debug(
621 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
622 " Removed ambiguity by choosing dimension %s.",
623 fieldName,
624 ", ".join(assignedDimensions),
625 selected,
626 )
628 for candidateDimension in assignedDimensions:
629 if candidateDimension != selected:
630 del guessedAssociation[candidateDimension][fieldName]
632 # Update the record look up dict with the new associations
633 for dimensionName, values in guessedAssociation.items():
634 if values: # A dict might now be empty
635 _LOG.debug(
636 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
637 )
638 byRecord[dimensionName].update(values)
640 if byRecord:
641 # Some record specifiers were found so we need to convert
642 # them to the ID form.
643 for dimensionName, values in byRecord.items():
644 if dimensionName in newDataId:
645 _LOG.debug(
646 "DataId specified explicit %s dimension value of %s in addition to"
647 " general record specifiers for it of %s. Ignoring record information.",
648 dimensionName,
649 newDataId[dimensionName],
650 str(values),
651 )
652 # Get the actual record and compare with these values.
653 try:
654 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
655 except DataIdError:
656 raise ValueError(
657 f"Could not find dimension '{dimensionName}'"
658 f" with dataId {newDataId} as part of comparing with"
659 f" record values {byRecord[dimensionName]}"
660 ) from None
661 if len(recs) == 1:
662 errmsg: list[str] = []
663 for k, v in values.items():
664 if (recval := getattr(recs[0], k)) != v:
665 errmsg.append(f"{k}({recval} != {v})")
666 if errmsg:
667 raise ValueError(
668 f"Dimension {dimensionName} in dataId has explicit value"
669 " inconsistent with records: " + ", ".join(errmsg)
670 )
671 else:
672 # Multiple matches for an explicit dimension
673 # should never happen but let downstream complain.
674 pass
675 continue
677 # Build up a WHERE expression
678 bind = dict(values.items())
679 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
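# As an illustration (the values here are assumed, not taken from the
# source): for dimensionName="exposure" and values={"day_obs": 20210405,
# "seq_num": 160} this builds the user expression
# "exposure.day_obs = day_obs AND exposure.seq_num = seq_num", with the
# actual values supplied separately through ``bind``.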
681 # Hopefully we get a single record that matches
682 records = set(
683 self._registry.queryDimensionRecords(
684 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
685 )
686 )
688 if len(records) != 1:
689 if len(records) > 1:
690 # visit can have an ambiguous answer without involving
691 # visit_system. The default visit_system is defined
692 # by the instrument.
693 if (
694 dimensionName == "visit"
695 and "visit_system_membership" in self.dimensions
696 and "visit_system" in self.dimensions["instrument"].metadata
697 ):
698 instrument_records = list(
699 self._registry.queryDimensionRecords(
700 "instrument",
701 dataId=newDataId,
702 **kwargs,
703 )
704 )
705 if len(instrument_records) == 1:
706 visit_system = instrument_records[0].visit_system
707 if visit_system is None:
708 # Set to a value that will never match.
709 visit_system = -1
711 # Look up each visit in the
712 # visit_system_membership records.
713 for rec in records:
714 membership = list(
715 self._registry.queryDimensionRecords(
716 # Use bind to allow zero results.
717 # This is a fully-specified query.
718 "visit_system_membership",
719 where="instrument = inst AND visit_system = system AND visit = v",
720 bind=dict(
721 inst=instrument_records[0].name, system=visit_system, v=rec.id
722 ),
723 )
724 )
725 if membership:
726 # This record is the right answer.
727 records = {rec}
728 break
730 # The ambiguity may have been resolved so check again.
731 if len(records) > 1:
732 _LOG.debug(
733 "Received %d records from constraints of %s", len(records), str(values)
734 )
735 for r in records:
736 _LOG.debug("- %s", str(r))
737 raise ValueError(
738 f"DataId specification for dimension {dimensionName} is not"
739 f" uniquely constrained to a single dataset by {values}."
740 f" Got {len(records)} results."
741 )
742 else:
743 raise ValueError(
744 f"DataId specification for dimension {dimensionName} matched no"
745 f" records when constrained by {values}"
746 )
748 # Get the primary key from the real dimension object
749 dimension = self.dimensions.getStaticDimensions()[dimensionName]
750 if not isinstance(dimension, Dimension):
751 raise RuntimeError(
752 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
753 )
754 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
756 return newDataId, kwargs
758 def _findDatasetRef(
759 self,
760 datasetRefOrType: DatasetRef | DatasetType | str,
761 dataId: DataId | None = None,
762 *,
763 collections: Any = None,
764 predict: bool = False,
765 run: str | None = None,
766 datastore_records: bool = False,
767 **kwargs: Any,
768 ) -> DatasetRef:
769 """Shared logic for methods that start with a search for a dataset in
770 the registry.
772 Parameters
773 ----------
774 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
775 When `DatasetRef` the `dataId` should be `None`.
776 Otherwise the `DatasetType` or name thereof.
777 dataId : `dict` or `DataCoordinate`, optional
778 A `dict` of `Dimension` link name, value pairs that label the
779 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
780 should be provided as the first argument.
781 collections : Any, optional
782 Collections to be searched, overriding ``self.collections``.
783 Can be any of the types supported by the ``collections`` argument
784 to butler construction.
785 predict : `bool`, optional
786 If `True`, return a newly created `DatasetRef` with a unique
787 dataset ID if finding a reference in the `Registry` fails.
788 Defaults to `False`.
789 run : `str`, optional
790 Run collection name to use for creating `DatasetRef` for predicted
791 datasets. Only used if ``predict`` is `True`.
792 datastore_records : `bool`, optional
793 If `True` add datastore records to returned `DatasetRef`.
794 **kwargs
795 Additional keyword arguments used to augment or construct a
796 `DataId`. See `DataId` parameters.
798 Returns
799 -------
800 ref : `DatasetRef`
801 A reference to the dataset identified by the given arguments.
802 This can be the same dataset reference as given if it was
803 resolved.
805 Raises
806 ------
807 LookupError
808 Raised if no matching dataset exists in the `Registry` (and
809 ``predict`` is `False`).
810 ValueError
811 Raised if a resolved `DatasetRef` was passed as an input, but it
812 differs from the one found in the registry.
813 TypeError
814 Raised if no collections were provided.
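Examples
--------
A sketch of a typical internal call; the dataset type name and data ID
values are illustrative::

    ref = self._findDatasetRef("bias", instrument="HSC", detector=10, exposure=903334)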
815 """
816 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
817 if isinstance(datasetRefOrType, DatasetRef):
818 if collections is not None:
819 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
820 # May need to retrieve datastore records if requested.
821 if datastore_records and datasetRefOrType._datastore_records is None:
822 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
823 return datasetRefOrType
824 timespan: Timespan | None = None
826 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
828 if datasetType.isCalibration():
829 # Because this is a calibration dataset, first try to
830 # standardize the data ID without restricting the dimensions to
831 # those of the dataset type requested, because there may be extra
832 # dimensions that provide temporal information for a validity-range
833 # lookup.
834 dataId = DataCoordinate.standardize(
835 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
836 )
837 if dataId.graph.temporal:
838 dataId = self._registry.expandDataId(dataId)
839 timespan = dataId.timespan
840 else:
841 # Standardize the data ID to just the dimensions of the dataset
842 # type instead of letting registry.findDataset do it, so we get the
843 # result even if no dataset is found.
844 dataId = DataCoordinate.standardize(
845 dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs
846 )
847 # Always lookup the DatasetRef, even if one is given, to ensure it is
848 # present in the current collection.
849 ref = self._registry.findDataset(
850 datasetType,
851 dataId,
852 collections=collections,
853 timespan=timespan,
854 datastore_records=datastore_records,
855 )
856 if ref is None:
857 if predict:
858 if run is None:
859 run = self.run
860 if run is None:
861 raise TypeError("Cannot predict dataset ID/location with run=None.")
862 return DatasetRef(datasetType, dataId, run=run)
863 else:
864 if collections is None:
865 collections = self._registry.defaults.collections
866 raise LookupError(
867 f"Dataset {datasetType.name} with data ID {dataId} "
868 f"could not be found in collections {collections}."
869 )
870 if datasetType != ref.datasetType:
871 # If they differ it is because the user explicitly specified
872 # a compatible dataset type to this call rather than using the
873 # registry definition. The DatasetRef must therefore be recreated
874 # using the user definition such that the expected type is
875 # returned.
876 ref = DatasetRef(
877 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
878 )
880 return ref
882 # TODO: remove on DM-40067.
883 @transactional
884 @deprecated(
885 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
886 " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
887 " were relying on the run parameter to determine the run."
888 " Will be removed after v26.0.",
889 version="v26.0",
890 category=FutureWarning,
891 )
892 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
893 # Docstring inherited.
894 return self.put(obj, ref)
896 @transactional
897 def put(
898 self,
899 obj: Any,
900 datasetRefOrType: DatasetRef | DatasetType | str,
901 /,
902 dataId: DataId | None = None,
903 *,
904 run: str | None = None,
905 **kwargs: Any,
906 ) -> DatasetRef:
907 """Store and register a dataset.
909 Parameters
910 ----------
911 obj : `object`
912 The dataset.
913 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
914 When `DatasetRef` is provided, ``dataId`` should be `None`.
915 Otherwise the `DatasetType` or name thereof. If a fully resolved
916 `DatasetRef` is given the run and ID are used directly.
917 dataId : `dict` or `DataCoordinate`
918 A `dict` of `Dimension` link name, value pairs that label the
919 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
920 should be provided as the second argument.
921 run : `str`, optional
922 The name of the run the dataset should be added to, overriding
923 ``self.run``. Not used if a resolved `DatasetRef` is provided.
924 **kwargs
925 Additional keyword arguments used to augment or construct a
926 `DataCoordinate`. See `DataCoordinate.standardize`
927 parameters. Not used if a resolved `DatasetRef` is provided.
929 Returns
930 -------
931 ref : `DatasetRef`
932 A reference to the stored dataset, updated with the correct id if
933 given.
935 Raises
936 ------
937 TypeError
938 Raised if the butler is read-only or if no run has been provided.
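Examples
--------
A sketch of the two calling conventions; the dataset type name, data ID
values, and ``predicted_ref`` are illustrative assumptions::

    # Dataset type name plus data ID keywords; stored in the default run.
    ref = butler.put(exposure, "calexp", instrument="HSC", detector=10, visit=903334)

    # A fully resolved DatasetRef (e.g. predicted elsewhere) fixes the run
    # and dataset ID directly; the ``run`` argument is not used in this form.
    butler.put(another_exposure, predicted_ref)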
939 """
940 if isinstance(datasetRefOrType, DatasetRef):
941 # This is a direct put of predefined DatasetRef.
942 _LOG.debug("Butler put direct: %s", datasetRefOrType)
943 if run is not None:
944 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
945 # If registry already has a dataset with the same dataset ID,
946 # dataset type and DataId, then _importDatasets will do nothing and
947 # just return the original ref. We have to raise in this case; the
948 # datastore check below handles that.
949 self._registry._importDatasets([datasetRefOrType], expand=True)
950 # Before trying to write to the datastore check that it does not
951 # know this dataset. This is prone to races, of course.
952 if self._datastore.knows(datasetRefOrType):
953 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
954 # Try to write dataset to the datastore, if it fails due to a race
955 # with another write, the content of stored data may be
956 # unpredictable.
957 try:
958 self._datastore.put(obj, datasetRefOrType)
959 except IntegrityError as e:
960 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
961 return datasetRefOrType
963 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
964 if not self.isWriteable():
965 raise TypeError("Butler is read-only.")
966 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
968 # Handle dimension records in dataId
969 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
971 # Add Registry Dataset entry.
972 dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
973 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
974 self._datastore.put(obj, ref)
976 return ref
978 # TODO: remove on DM-40067.
979 @deprecated(
980 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
981 " Please use Butler.get(). Will be removed after v26.0.",
982 version="v26.0",
983 category=FutureWarning,
984 )
985 def getDirect(
986 self,
987 ref: DatasetRef,
988 *,
989 parameters: dict[str, Any] | None = None,
990 storageClass: StorageClass | str | None = None,
991 ) -> Any:
992 """Retrieve a stored dataset.
994 Parameters
995 ----------
996 ref : `DatasetRef`
997 Resolved reference to an already stored dataset.
998 parameters : `dict`
999 Additional StorageClass-defined options to control reading,
1000 typically used to efficiently read only a subset of the dataset.
1001 storageClass : `StorageClass` or `str`, optional
1002 The storage class to be used to override the Python type
1003 returned by this method. By default the returned type matches
1004 the dataset type definition for this dataset. Specifying a
1005 read `StorageClass` can force a different type to be returned.
1006 This type must be compatible with the original type.
1008 Returns
1009 -------
1010 obj : `object`
1011 The dataset.
1012 """
1013 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
1015 # TODO: remove on DM-40067.
1016 @deprecated(
1017 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
1018 "Please use Butler.getDeferred(). Will be removed after v26.0.",
1019 version="v26.0",
1020 category=FutureWarning,
1021 )
1022 def getDirectDeferred(
1023 self,
1024 ref: DatasetRef,
1025 *,
1026 parameters: dict[str, Any] | None = None,
1027 storageClass: str | StorageClass | None = None,
1028 ) -> DeferredDatasetHandle:
1029 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
1030 from a resolved `DatasetRef`.
1032 Parameters
1033 ----------
1034 ref : `DatasetRef`
1035 Resolved reference to an already stored dataset.
1036 parameters : `dict`
1037 Additional StorageClass-defined options to control reading,
1038 typically used to efficiently read only a subset of the dataset.
1039 storageClass : `StorageClass` or `str`, optional
1040 The storage class to be used to override the Python type
1041 returned by this method. By default the returned type matches
1042 the dataset type definition for this dataset. Specifying a
1043 read `StorageClass` can force a different type to be returned.
1044 This type must be compatible with the original type.
1046 Returns
1047 -------
1048 obj : `DeferredDatasetHandle`
1049 A handle which can be used to retrieve a dataset at a later time.
1051 Raises
1052 ------
1053 LookupError
1054 Raised if no matching dataset exists in the `Registry`.
1055 """
1056 # Check that dataset is known to the datastore.
1057 if not self._datastore.knows(ref):
1058 raise LookupError(f"Dataset reference {ref} is not known to datastore.")
1059 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1061 def getDeferred(
1062 self,
1063 datasetRefOrType: DatasetRef | DatasetType | str,
1064 /,
1065 dataId: DataId | None = None,
1066 *,
1067 parameters: dict | None = None,
1068 collections: Any = None,
1069 storageClass: str | StorageClass | None = None,
1070 **kwargs: Any,
1071 ) -> DeferredDatasetHandle:
1072 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1073 after an immediate registry lookup.
1075 Parameters
1076 ----------
1077 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1078 When `DatasetRef` the `dataId` should be `None`.
1079 Otherwise the `DatasetType` or name thereof.
1080 dataId : `dict` or `DataCoordinate`, optional
1081 A `dict` of `Dimension` link name, value pairs that label the
1082 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1083 should be provided as the first argument.
1084 parameters : `dict`
1085 Additional StorageClass-defined options to control reading,
1086 typically used to efficiently read only a subset of the dataset.
1087 collections : Any, optional
1088 Collections to be searched, overriding ``self.collections``.
1089 Can be any of the types supported by the ``collections`` argument
1090 to butler construction.
1091 storageClass : `StorageClass` or `str`, optional
1092 The storage class to be used to override the Python type
1093 returned by this method. By default the returned type matches
1094 the dataset type definition for this dataset. Specifying a
1095 read `StorageClass` can force a different type to be returned.
1096 This type must be compatible with the original type.
1097 **kwargs
1098 Additional keyword arguments used to augment or construct a
1099 `DataId`. See `DataId` parameters.
1101 Returns
1102 -------
1103 obj : `DeferredDatasetHandle`
1104 A handle which can be used to retrieve a dataset at a later time.
1106 Raises
1107 ------
1108 LookupError
1109 Raised if no matching dataset exists in the `Registry` or
1110 datastore.
1111 ValueError
1112 Raised if a resolved `DatasetRef` was passed as an input, but it
1113 differs from the one found in the registry.
1114 TypeError
1115 Raised if no collections were provided.
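Examples
--------
A sketch of deferring the read; the dataset type name and data ID values
are illustrative, and the dataset is not read from the datastore until
``get()`` is called on the handle::

    handle = butler.getDeferred("calexp", instrument="HSC", detector=10, visit=903334)
    exposure = handle.get()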
1116 """
1117 if isinstance(datasetRefOrType, DatasetRef):
1118 # Do the quick check first and if that fails, check for artifact
1119 # existence. This is necessary for datastores that are configured
1120 # in trust mode where there won't be a record but there will be
1121 # a file.
1122 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
1123 ref = datasetRefOrType
1124 else:
1125 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
1126 else:
1127 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1128 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
1130 def get(
1131 self,
1132 datasetRefOrType: DatasetRef | DatasetType | str,
1133 /,
1134 dataId: DataId | None = None,
1135 *,
1136 parameters: dict[str, Any] | None = None,
1137 collections: Any = None,
1138 storageClass: StorageClass | str | None = None,
1139 **kwargs: Any,
1140 ) -> Any:
1141 """Retrieve a stored dataset.
1143 Parameters
1144 ----------
1145 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1146 When `DatasetRef` the `dataId` should be `None`.
1147 Otherwise the `DatasetType` or name thereof.
1148 If a resolved `DatasetRef`, the associated dataset
1149 is returned directly without additional querying.
1150 dataId : `dict` or `DataCoordinate`
1151 A `dict` of `Dimension` link name, value pairs that label the
1152 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1153 should be provided as the first argument.
1154 parameters : `dict`
1155 Additional StorageClass-defined options to control reading,
1156 typically used to efficiently read only a subset of the dataset.
1157 collections : Any, optional
1158 Collections to be searched, overriding ``self.collections``.
1159 Can be any of the types supported by the ``collections`` argument
1160 to butler construction.
1161 storageClass : `StorageClass` or `str`, optional
1162 The storage class to be used to override the Python type
1163 returned by this method. By default the returned type matches
1164 the dataset type definition for this dataset. Specifying a
1165 read `StorageClass` can force a different type to be returned.
1166 This type must be compatible with the original type.
1167 **kwargs
1168 Additional keyword arguments used to augment or construct a
1169 `DataCoordinate`. See `DataCoordinate.standardize`
1170 parameters.
1172 Returns
1173 -------
1174 obj : `object`
1175 The dataset.
1177 Raises
1178 ------
1179 LookupError
1180 Raised if no matching dataset exists in the `Registry`.
1181 TypeError
1182 Raised if no collections were provided.
1184 Notes
1185 -----
1186 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1187 this method requires that the given data ID include temporal dimensions
1188 beyond the dimensions of the dataset type itself, in order to find the
1189 dataset with the appropriate validity range. For example, a "bias"
1190 dataset with native dimensions ``{instrument, detector}`` could be
1191 fetched with a ``{instrument, detector, exposure}`` data ID, because
1192 ``exposure`` is a temporal dimension.
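Examples
--------
A sketch of the calibration lookup described above; the dataset type
name, data ID values, and collection are illustrative::

    bias = butler.get(
        "bias",
        instrument="HSC",
        detector=10,
        exposure=903334,
        collections="HSC/calib",
    )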
1193 """
1194 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1195 ref = self._findDatasetRef(
1196 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs
1197 )
1198 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
1200 def getURIs(
1201 self,
1202 datasetRefOrType: DatasetRef | DatasetType | str,
1203 /,
1204 dataId: DataId | None = None,
1205 *,
1206 predict: bool = False,
1207 collections: Any = None,
1208 run: str | None = None,
1209 **kwargs: Any,
1210 ) -> DatasetRefURIs:
1211 """Return the URIs associated with the dataset.
1213 Parameters
1214 ----------
1215 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1216 When `DatasetRef` the `dataId` should be `None`.
1217 Otherwise the `DatasetType` or name thereof.
1218 dataId : `dict` or `DataCoordinate`
1219 A `dict` of `Dimension` link name, value pairs that label the
1220 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1221 should be provided as the first argument.
1222 predict : `bool`
1223 If `True`, allow URIs to be returned of datasets that have not
1224 been written.
1225 collections : Any, optional
1226 Collections to be searched, overriding ``self.collections``.
1227 Can be any of the types supported by the ``collections`` argument
1228 to butler construction.
1229 run : `str`, optional
1230 Run to use for predictions, overriding ``self.run``.
1231 **kwargs
1232 Additional keyword arguments used to augment or construct a
1233 `DataCoordinate`. See `DataCoordinate.standardize`
1234 parameters.
1236 Returns
1237 -------
1238 uris : `DatasetRefURIs`
1239 The URI to the primary artifact associated with this dataset (if
1240 the dataset was disassembled within the datastore this may be
1241 `None`), and the URIs to any components associated with the dataset
1242 artifact (which can be empty if there are no components).
1243 """
1244 ref = self._findDatasetRef(
1245 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
1246 )
1247 return self._datastore.getURIs(ref, predict)
1249 def getURI(
1250 self,
1251 datasetRefOrType: DatasetRef | DatasetType | str,
1252 /,
1253 dataId: DataId | None = None,
1254 *,
1255 predict: bool = False,
1256 collections: Any = None,
1257 run: str | None = None,
1258 **kwargs: Any,
1259 ) -> ResourcePath:
1260 """Return the URI to the Dataset.
1262 Parameters
1263 ----------
1264 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1265 When `DatasetRef` the `dataId` should be `None`.
1266 Otherwise the `DatasetType` or name thereof.
1267 dataId : `dict` or `DataCoordinate`
1268 A `dict` of `Dimension` link name, value pairs that label the
1269 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1270 should be provided as the first argument.
1271 predict : `bool`
1272 If `True`, allow URIs to be returned of datasets that have not
1273 been written.
1274 collections : Any, optional
1275 Collections to be searched, overriding ``self.collections``.
1276 Can be any of the types supported by the ``collections`` argument
1277 to butler construction.
1278 run : `str`, optional
1279 Run to use for predictions, overriding ``self.run``.
1280 **kwargs
1281 Additional keyword arguments used to augment or construct a
1282 `DataCoordinate`. See `DataCoordinate.standardize`
1283 parameters.
1285 Returns
1286 -------
1287 uri : `lsst.resources.ResourcePath`
1288 URI pointing to the Dataset within the datastore. If the
1289 Dataset does not exist in the datastore, and if ``predict`` is
1290 `True`, the URI will be a prediction and will include a URI
1291 fragment "#predicted".
1292 If the datastore does not have entities that relate well
1293 to the concept of a URI, the returned URI string will be
1294 descriptive. The returned URI is not guaranteed to be obtainable.
1296 Raises
1297 ------
1298 LookupError
1299 Raised if a URI has been requested for a dataset that does not exist
1300 and guessing is not allowed.
1301 ValueError
1302 Raised if a resolved `DatasetRef` was passed as an input, but it
1303 differs from the one found in the registry.
1304 TypeError
1305 Raised if no collections were provided.
1306 RuntimeError
1307 Raised if a URI is requested for a dataset that consists of
1308 multiple artifacts.
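Examples
--------
A sketch of retrieving a single-artifact URI; the dataset type name and
data ID values are illustrative::

    uri = butler.getURI("calexp", instrument="HSC", detector=10, visit=903334)
    print(uri.geturl())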
1309 """
1310 primary, components = self.getURIs(
1311 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1312 )
1314 if primary is None or components:
1315 raise RuntimeError(
1316 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1317 "Use Butler.getURIs() instead."
1318 )
1319 return primary
1321 def retrieveArtifacts(
1322 self,
1323 refs: Iterable[DatasetRef],
1324 destination: ResourcePathExpression,
1325 transfer: str = "auto",
1326 preserve_path: bool = True,
1327 overwrite: bool = False,
1328 ) -> list[ResourcePath]:
1329 # Docstring inherited.
1330 return self._datastore.retrieveArtifacts(
1331 refs,
1332 ResourcePath(destination),
1333 transfer=transfer,
1334 preserve_path=preserve_path,
1335 overwrite=overwrite,
1336 )
1338 def exists(
1339 self,
1340 dataset_ref_or_type: DatasetRef | DatasetType | str,
1341 /,
1342 data_id: DataId | None = None,
1343 *,
1344 full_check: bool = True,
1345 collections: Any = None,
1346 **kwargs: Any,
1347 ) -> DatasetExistence:
1348 # Docstring inherited.
1349 existence = DatasetExistence.UNRECOGNIZED
1351 if isinstance(dataset_ref_or_type, DatasetRef):
1352 if collections is not None:
1353 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
1354 if data_id is not None:
1355 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
1356 ref = dataset_ref_or_type
1357 registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
1358 if registry_ref is not None:
1359 existence |= DatasetExistence.RECORDED
1361 if dataset_ref_or_type != registry_ref:
1362 # This could mean that storage classes differ, so we should
1363 # check for that but use the registry ref for the rest of
1364 # the method.
1365 if registry_ref.is_compatible_with(dataset_ref_or_type):
1366 # Use the registry version from now on.
1367 ref = registry_ref
1368 else:
1369 raise ValueError(
1370 f"The ref given to exists() ({ref}) has the same dataset ID as one "
1371 f"in registry but has different incompatible values ({registry_ref})."
1372 )
1373 else:
1374 try:
1375 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
1376 except (LookupError, TypeError, NoDefaultCollectionError):
1377 return existence
1378 existence |= DatasetExistence.RECORDED
1380 if self._datastore.knows(ref):
1381 existence |= DatasetExistence.DATASTORE
1383 if full_check:
1384 if self._datastore.exists(ref):
1385 existence |= DatasetExistence._ARTIFACT
1386 elif existence.value != DatasetExistence.UNRECOGNIZED.value:
1387 # Do not add this flag if we have no other idea about a dataset.
1388 existence |= DatasetExistence(DatasetExistence._ASSUMED)
1390 return existence
1392 def _exists_many(
1393 self,
1394 refs: Iterable[DatasetRef],
1395 /,
1396 *,
1397 full_check: bool = True,
1398 ) -> dict[DatasetRef, DatasetExistence]:
1399 # Docstring inherited.
1400 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
1402 # Registry does not have a bulk API to check for a ref.
1403 for ref in refs:
1404 registry_ref = self._registry.getDataset(ref.id)
1405 if registry_ref is not None:
1406 # It is possible, albeit unlikely, that the given ref does
1407 # not match the one in registry even though the UUID matches.
1408 # When checking a single ref we raise, but it's impolite to
1409 # do that when potentially hundreds of refs are being checked.
1410 # We could change the API to only accept UUIDs and that would
1411 # remove the ability to even check and remove the worry
1412 # about differing storage classes. Given the ongoing discussion
1413 # on refs vs UUIDs and whether to raise or have a new
1414 # private flag, treat this as a private API for now.
1415 existence[ref] |= DatasetExistence.RECORDED
1417 # Ask datastore if it knows about these refs.
1418 knows = self._datastore.knows_these(refs)
1419 for ref, known in knows.items():
1420 if known:
1421 existence[ref] |= DatasetExistence.DATASTORE
1423 if full_check:
1424 mexists = self._datastore.mexists(refs)
1425 for ref, exists in mexists.items():
1426 if exists:
1427 existence[ref] |= DatasetExistence._ARTIFACT
1428 else:
1429 # Do not set this flag if nothing is known about the dataset.
1430 for ref in existence:
1431 if existence[ref] != DatasetExistence.UNRECOGNIZED:
1432 existence[ref] |= DatasetExistence._ASSUMED
1434 return existence
1436 # TODO: remove on DM-40079.
1437 @deprecated(
1438 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.",
1439 version="v26.0",
1440 category=FutureWarning,
1441 )
1442 def datasetExists(
1443 self,
1444 datasetRefOrType: DatasetRef | DatasetType | str,
1445 dataId: DataId | None = None,
1446 *,
1447 collections: Any = None,
1448 **kwargs: Any,
1449 ) -> bool:
1450 """Return True if the Dataset is actually present in the Datastore.
1452 Parameters
1453 ----------
1454 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1455 When `DatasetRef` the `dataId` should be `None`.
1456 Otherwise the `DatasetType` or name thereof.
1457 dataId : `dict` or `DataCoordinate`
1458 A `dict` of `Dimension` link name, value pairs that label the
1459 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1460 should be provided as the first argument.
1461 collections : Any, optional
1462 Collections to be searched, overriding ``self.collections``.
1463 Can be any of the types supported by the ``collections`` argument
1464 to butler construction.
1465 **kwargs
1466 Additional keyword arguments used to augment or construct a
1467 `DataCoordinate`. See `DataCoordinate.standardize`
1468 parameters.
1470 Raises
1471 ------
1472 LookupError
1473 Raised if the dataset is not even present in the Registry.
1474 ValueError
1475 Raised if a resolved `DatasetRef` was passed as an input, but it
1476 differs from the one found in the registry.
1477 NoDefaultCollectionError
1478 Raised if no collections were provided.
1479 """
1480 # A resolved ref may be given that is not known to this butler.
1481 if isinstance(datasetRefOrType, DatasetRef):
1482 ref = self._registry.getDataset(datasetRefOrType.id)
1483 if ref is None:
1484 raise LookupError(
1485 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
1486 )
1487 else:
1488 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1489 return self._datastore.exists(ref)
1491 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1492 # Docstring inherited.
1493 if not self.isWriteable():
1494 raise TypeError("Butler is read-only.")
1495 names = list(names)
1496 refs: list[DatasetRef] = []
1497 for name in names:
1498 collectionType = self._registry.getCollectionType(name)
1499 if collectionType is not CollectionType.RUN:
1500 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1501 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
1502 with self._datastore.transaction(), self._registry.transaction():
1503 if unstore:
1504 self._datastore.trash(refs)
1505 else:
1506 self._datastore.forget(refs)
1507 for name in names:
1508 self._registry.removeCollection(name)
1509 if unstore:
1510 # Point of no return for removing artifacts
1511 self._datastore.emptyTrash()
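# A minimal removeRuns() sketch; the repository path and run name are
# assumptions.
#
#     >>> from lsst.daf.butler import Butler
#     >>> butler = Butler("/repo/example", writeable=True)
#     >>> # Remove the RUN collection and delete its datastore artifacts;
#     >>> # pass unstore=False to forget the artifacts without deleting them.
#     >>> butler.removeRuns(["u/alice/scratch"], unstore=True)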
1513 def pruneDatasets(
1514 self,
1515 refs: Iterable[DatasetRef],
1516 *,
1517 disassociate: bool = True,
1518 unstore: bool = False,
1519 tags: Iterable[str] = (),
1520 purge: bool = False,
1521 ) -> None:
1522 # docstring inherited from LimitedButler
1524 if not self.isWriteable():
1525 raise TypeError("Butler is read-only.")
1526 if purge:
1527 if not disassociate:
1528 raise TypeError("Cannot pass purge=True without disassociate=True.")
1529 if not unstore:
1530 raise TypeError("Cannot pass purge=True without unstore=True.")
1531 elif disassociate:
1532 tags = tuple(tags)
1533 if not tags:
1534 raise TypeError("No tags provided but disassociate=True.")
1535 for tag in tags:
1536 collectionType = self._registry.getCollectionType(tag)
1537 if collectionType is not CollectionType.TAGGED:
1538 raise TypeError(
1539 f"Cannot disassociate from collection '{tag}' "
1540 f"of non-TAGGED type {collectionType.name}."
1541 )
1542 # Transform possibly-single-pass iterable into something we can iterate
1543 # over multiple times.
1544 refs = list(refs)
1545 # Pruning a component of a DatasetRef makes no sense, since the registry
1546 # doesn't know about components and the datastore might not store
1547 # components in a separate file.
1548 for ref in refs:
1549 if ref.datasetType.component():
1550 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")
1551 # We don't need an unreliable Datastore transaction for this, because
1552 # we've been extra careful to ensure that Datastore.trash only involves
1553 # mutating the Registry (it can _look_ at Datastore-specific things,
1554 # but shouldn't change them), and hence all operations here are
1555 # Registry operations.
1556 with self._datastore.transaction(), self._registry.transaction():
1557 if unstore:
1558 self._datastore.trash(refs)
1559 if purge:
1560 self._registry.removeDatasets(refs)
1561 elif disassociate:
1562 assert tags, "Guaranteed by earlier logic in this function."
1563 for tag in tags:
1564 self._registry.disassociate(tag, refs)
1565 # We've exited the Registry transaction, and apparently committed.
1566 # (if there was an exception, everything rolled back, and it's as if
1567 # nothing happened - and we never get here).
1568 # Datastore artifacts are not yet gone, but they're clearly marked
1569 # as trash, so if we fail to delete now because of (e.g.) filesystem
1570 # problems we can try again later, and if manual administrative
1571 # intervention is required, it's pretty clear what that should entail:
1572 # deleting everything on disk and in private Datastore tables that is
1573 # in the dataset_location_trash table.
1574 if unstore:
1575 # Point of no return for removing artifacts
1576 self._datastore.emptyTrash()
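# Hedged sketches of the two pruneDatasets() modes enforced above; "refs" is
# assumed to be an iterable of resolved DatasetRef and the TAGGED collection
# name is an assumption.
#
#     >>> # Fully remove datasets: registry entries plus datastore artifacts.
#     >>> butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)
#     >>> # Only remove the associations from a TAGGED collection.
#     >>> butler.pruneDatasets(refs, disassociate=True, tags=["u/alice/tagged"])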
1578 @transactional
1579 def ingest(
1580 self,
1581 *datasets: FileDataset,
1582 transfer: str | None = "auto",
1583 run: str | None = None,
1584 idGenerationMode: DatasetIdGenEnum | None = None,
1585 record_validation_info: bool = True,
1586 ) -> None:
1587 # Docstring inherited.
1588 if not self.isWriteable():
1589 raise TypeError("Butler is read-only.")
1591 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
1592 if not datasets:
1593 return
1595 if idGenerationMode is not None:
1596 warnings.warn(
1597 "The idGenerationMode parameter is no longer used and is ignored. "
1598 " Will be removed after v26.0",
1599 FutureWarning,
1600 stacklevel=2,
1601 )
1603 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1605 # We need to reorganize all the inputs so that they are grouped
1606 # by dataset type and run. Multiple refs in a single FileDataset
1607 # are required to share the run and dataset type.
1608 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]]
1609 groupedData: GroupedData = defaultdict(list)
1611 # Track DataIDs that are being ingested so we can spot issues early
1612 # with duplication. Retain previous FileDataset so we can report it.
1613 groupedDataIds: MutableMapping[
1614 tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
1615 ] = defaultdict(dict)
1617 used_run = False
1619 # And the nested loop that populates it:
1620 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1621 # Somewhere to store pre-existing refs if we have an
1622 # execution butler.
1623 existingRefs: list[DatasetRef] = []
1625 for ref in dataset.refs:
1626 assert ref.run is not None # For mypy
1627 group_key = (ref.datasetType, ref.run)
1629 if ref.dataId in groupedDataIds[group_key]:
1630 raise ConflictingDefinitionError(
1631 f"Ingest conflict. Dataset {dataset.path} has same"
1632 " DataId as other ingest dataset"
1633 f" {groupedDataIds[group_key][ref.dataId].path} "
1634 f" ({ref.dataId})"
1635 )
1637 groupedDataIds[group_key][ref.dataId] = dataset
1639 if existingRefs:
1640 if len(dataset.refs) != len(existingRefs):
1641 # Keeping track of partially pre-existing datasets is hard,
1642 # and that situation should generally never arise. For now
1643 # don't allow it.
1644 raise ConflictingDefinitionError(
1645 f"For dataset {dataset.path} some dataIds already exist"
1646 " in registry but others do not. This is not supported."
1647 )
1649 # Store expanded form in the original FileDataset.
1650 dataset.refs = existingRefs
1651 else:
1652 groupedData[group_key].append(dataset)
1654 if not used_run and run is not None:
1655 warnings.warn(
1656 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the "
1657 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.",
1658 category=FutureWarning,
1659 stacklevel=3, # Take into account the @transactional decorator.
1660 )
1662 # Now we can bulk-insert into Registry for each DatasetType.
1663 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
1664 groupedData.items(), desc="Bulk-inserting datasets by type"
1665 ):
1666 refs_to_import = []
1667 for dataset in grouped_datasets:
1668 refs_to_import.extend(dataset.refs)
1670 n_refs = len(refs_to_import)
1671 _LOG.verbose(
1672 "Importing %d ref%s of dataset type %r into run %r",
1673 n_refs,
1674 "" if n_refs == 1 else "s",
1675 datasetType.name,
1676 this_run,
1677 )
1679 # Import the refs and expand the DataCoordinates since we can't
1680 # guarantee that they are expanded and Datastore will need
1681 # the records.
1682 imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
1683 assert set(imported_refs) == set(refs_to_import)
1685 # Replace all the refs in the FileDataset with expanded versions.
1686 # Pull them off in the order we put them on the list.
1687 for dataset in grouped_datasets:
1688 n_dataset_refs = len(dataset.refs)
1689 dataset.refs = imported_refs[:n_dataset_refs]
1690 del imported_refs[:n_dataset_refs]
1692 # Bulk-insert everything into Datastore.
1693 # We do not know if any of the registry entries already existed
1694 # (_importDatasets only complains if they exist but differ) so
1695 # we have to catch IntegrityError explicitly.
1696 try:
1697 self._datastore.ingest(
1698 *datasets, transfer=transfer, record_validation_info=record_validation_info
1699 )
1700 except IntegrityError as e:
1701 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
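# A minimal ingest() sketch; the repository path, file path, and "ref" (a
# resolved DatasetRef whose run and dataset type are already registered) are
# assumptions.
#
#     >>> from lsst.daf.butler import Butler, FileDataset
#     >>> butler = Butler("/repo/example", writeable=True)
#     >>> dataset = FileDataset(path="/data/raw_903334.fits", refs=[ref])
#     >>> butler.ingest(dataset, transfer="copy")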
1703 @contextlib.contextmanager
1704 def export(
1705 self,
1706 *,
1707 directory: str | None = None,
1708 filename: str | None = None,
1709 format: str | None = None,
1710 transfer: str | None = None,
1711 ) -> Iterator[RepoExportContext]:
1712 # Docstring inherited.
1713 if directory is None and transfer is not None:
1714 raise TypeError("Cannot transfer without providing a directory.")
1715 if transfer == "move":
1716 raise TypeError("Transfer may not be 'move': export is read-only")
1717 if format is None:
1718 if filename is None:
1719 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1720 else:
1721 _, format = os.path.splitext(filename)
1722 if not format:
1723 raise ValueError("Please specify a file extension to determine export format.")
1724 format = format[1:] # Strip leading "."
1725 elif filename is None:
1726 filename = f"export.{format}"
1727 if directory is not None:
1728 filename = os.path.join(directory, filename)
1729 formats = self._config["repo_transfer_formats"]
1730 if format not in formats:
1731 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
1732 BackendClass = get_class_of(formats[format, "export"])
1733 with open(filename, "w") as stream:
1734 backend = BackendClass(stream, universe=self.dimensions)
1735 try:
1736 helper = RepoExportContext(
1737 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
1738 )
1739 yield helper
1740 except BaseException:
1741 raise
1742 else:
1743 helper._finish()
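# A hedged export() sketch; the directory, filename, dataset type and
# collection are assumptions, and saveDatasets()/saveCollection() are assumed
# methods of the yielded RepoExportContext.
#
#     >>> with butler.export(directory="/tmp/export", filename="export.yaml",
#     ...                    transfer="copy") as export:
#     ...     export.saveDatasets(
#     ...         butler.registry.queryDatasets("calexp", collections="HSC/runs/RC2"))
#     ...     export.saveCollection("HSC/runs/RC2")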
1745 def import_(
1746 self,
1747 *,
1748 directory: ResourcePathExpression | None = None,
1749 filename: ResourcePathExpression | TextIO | None = None,
1750 format: str | None = None,
1751 transfer: str | None = None,
1752 skip_dimensions: set | None = None,
1753 ) -> None:
1754 # Docstring inherited.
1755 if not self.isWriteable():
1756 raise TypeError("Butler is read-only.")
1757 if format is None:
1758 if filename is None:
1759 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1760 else:
1761 _, format = os.path.splitext(filename) # type: ignore
1762 elif filename is None:
1763 filename = ResourcePath(f"export.{format}", forceAbsolute=False)
1764 if directory is not None:
1765 directory = ResourcePath(directory, forceDirectory=True)
1766 # mypy doesn't think this will work but it does in python >= 3.10.
1767 if isinstance(filename, ResourcePathExpression): # type: ignore
1768 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
1769 if not filename.isabs() and directory is not None:
1770 potential = directory.join(filename)
1771 exists_in_cwd = filename.exists()
1772 exists_in_dir = potential.exists()
1773 if exists_in_cwd and exists_in_dir:
1774 _LOG.warning(
1775 "A relative path for filename was specified (%s) which exists relative to cwd. "
1776 "Additionally, the file exists relative to the given search directory (%s). "
1777 "Using the export file in the given directory.",
1778 filename,
1779 potential,
1780 )
1781 # Given they specified an explicit directory and that
1782 # directory has the export file in it, assume that that
1783 # is what was meant despite the file in cwd.
1784 filename = potential
1785 elif exists_in_dir:
1786 filename = potential
1787 elif not exists_in_cwd and not exists_in_dir:
1788 # Raise early.
1789 raise FileNotFoundError(
1790 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
1791 )
1792 BackendClass: type[RepoImportBackend] = get_class_of(
1793 self._config["repo_transfer_formats"][format]["import"]
1794 )
1796 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
1797 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
1798 backend.register()
1799 with self.transaction():
1800 backend.load(
1801 self._datastore,
1802 directory=directory,
1803 transfer=transfer,
1804 skip_dimensions=skip_dimensions,
1805 )
1807 if isinstance(filename, ResourcePath):
1808 # We cannot use open() here at the moment because of
1809 # DM-38589, since yaml does stream.read(8192) in a loop.
1810 stream = io.StringIO(filename.read().decode())
1811 doImport(stream)
1812 else:
1813 doImport(filename) # type: ignore
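# A minimal import_() sketch mirroring the export example above; the
# repository path, directory and filename are assumptions.
#
#     >>> target = Butler("/repo/other", writeable=True)
#     >>> target.import_(directory="/tmp/export", filename="export.yaml",
#     ...                transfer="copy")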
1815 def transfer_from(
1816 self,
1817 source_butler: LimitedButler,
1818 source_refs: Iterable[DatasetRef],
1819 transfer: str = "auto",
1820 skip_missing: bool = True,
1821 register_dataset_types: bool = False,
1822 transfer_dimensions: bool = False,
1823 ) -> collections.abc.Collection[DatasetRef]:
1824 # Docstring inherited.
1825 if not self.isWriteable():
1826 raise TypeError("Butler is read-only.")
1827 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1829 # Will iterate through the refs multiple times so need to convert
1830 # to a list if this isn't a collection.
1831 if not isinstance(source_refs, collections.abc.Collection):
1832 source_refs = list(source_refs)
1834 original_count = len(source_refs)
1835 _LOG.info("Transferring %d datasets into %s", original_count, str(self))
1837 # In some situations the datastore artifact may be missing
1838 # and we do not want that registry entry to be imported.
1839 # Asking the datastore is not sufficient: the records may have been
1840 # purged, so we have to ask for the (predicted) URI and check
1841 # existence explicitly. Execution butler is set up exactly like
1842 # this with no datastore records.
1843 artifact_existence: dict[ResourcePath, bool] = {}
1844 if skip_missing:
1845 dataset_existence = source_butler._datastore.mexists(
1846 source_refs, artifact_existence=artifact_existence
1847 )
1848 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1849 filtered_count = len(source_refs)
1850 n_missing = original_count - filtered_count
1851 _LOG.verbose(
1852 "%d dataset%s removed because the artifact does not exist. Now have %d.",
1853 n_missing,
1854 "" if n_missing == 1 else "s",
1855 filtered_count,
1856 )
1858 # Importing requires that we group the refs by dataset type and run
1859 # before doing the import.
1860 source_dataset_types = set()
1861 grouped_refs = defaultdict(list)
1862 for ref in source_refs:
1863 grouped_refs[ref.datasetType, ref.run].append(ref)
1864 source_dataset_types.add(ref.datasetType)
1866 # Check to see if the dataset type in the source butler has
1867 # the same definition in the target butler and register missing
1868 # ones if requested. Registration must happen outside a transaction.
1869 newly_registered_dataset_types = set()
1870 for datasetType in source_dataset_types:
1871 if register_dataset_types:
1872 # Let this raise immediately if inconsistent. Continuing
1873 # on to find additional inconsistent dataset types
1874 # might result in additional unwanted dataset types being
1875 # registered.
1876 if self._registry.registerDatasetType(datasetType):
1877 newly_registered_dataset_types.add(datasetType)
1878 else:
1879 # If the dataset type is missing, let it fail immediately.
1880 target_dataset_type = self._registry.getDatasetType(datasetType.name)
1881 if target_dataset_type != datasetType:
1882 raise ConflictingDefinitionError(
1883 "Source butler dataset type differs from definition"
1884 f" in target butler: {datasetType} !="
1885 f" {target_dataset_type}"
1886 )
1887 if newly_registered_dataset_types:
1888 # We may have registered some even if there were inconsistencies
1889 # but should let people know (or else remove them again).
1890 _LOG.verbose(
1891 "Registered the following dataset types in the target Butler: %s",
1892 ", ".join(d.name for d in newly_registered_dataset_types),
1893 )
1894 else:
1895 _LOG.verbose("All required dataset types are known to the target Butler")
1897 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
1898 if transfer_dimensions:
1899 # Collect all the dimension records for these refs.
1900 # All dimensions are to be copied but the list of valid dimensions
1901 # comes from this butler's universe.
1902 elements = frozenset(
1903 element
1904 for element in self.dimensions.getStaticElements()
1905 if element.hasTable() and element.viewOf is None
1906 )
1907 dataIds = {ref.dataId for ref in source_refs}
1908 # This logic comes from saveDataIds.
1909 for dataId in dataIds:
1910 # Need an expanded record; if it is not expanded we need a full
1911 # butler with a registry (allow mocks with registry too).
1912 if not dataId.hasRecords():
1913 if registry := getattr(source_butler, "registry", None):
1914 dataId = registry.expandDataId(dataId)
1915 else:
1916 raise TypeError("Input butler needs to be a full butler to expand DataId.")
1917 # If this butler doesn't know about a dimension in the source
1918 # butler, things will break later.
1919 for record in dataId.records.values():
1920 if record is not None and record.definition in elements:
1921 dimension_records[record.definition].setdefault(record.dataId, record)
1923 handled_collections: set[str] = set()
1925 # Do all the importing in a single transaction.
1926 with self.transaction():
1927 if dimension_records:
1928 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
1929 for element, r in dimension_records.items():
1930 records = [r[dataId] for dataId in r]
1931 # Assume that if the record is already present that we can
1932 # use it without having to check that the record metadata
1933 # is consistent.
1934 self._registry.insertDimensionData(element, *records, skip_existing=True)
1936 n_imported = 0
1937 for (datasetType, run), refs_to_import in progress.iter_item_chunks(
1938 grouped_refs.items(), desc="Importing to registry by run and dataset type"
1939 ):
1940 if run not in handled_collections:
1941 # May need to create output collection. If source butler
1942 # has a registry, ask for documentation string.
1943 run_doc = None
1944 if registry := getattr(source_butler, "registry", None):
1945 run_doc = registry.getCollectionDocumentation(run)
1946 registered = self._registry.registerRun(run, doc=run_doc)
1947 handled_collections.add(run)
1948 if registered:
1949 _LOG.verbose("Creating output run %s", run)
1951 n_refs = len(refs_to_import)
1952 _LOG.verbose(
1953 "Importing %d ref%s of dataset type %s into run %s",
1954 n_refs,
1955 "" if n_refs == 1 else "s",
1956 datasetType.name,
1957 run,
1958 )
1960 # Assume we are using UUIDs and the source refs will match
1961 # those imported.
1962 imported_refs = self._registry._importDatasets(refs_to_import)
1963 assert set(imported_refs) == set(refs_to_import)
1964 n_imported += len(imported_refs)
1966 assert len(source_refs) == n_imported
1967 _LOG.verbose("Imported %d datasets into destination butler", n_imported)
1969 # Ask the datastore to transfer. The datastore has to check that
1970 # the source datastore is compatible with the target datastore.
1971 accepted, rejected = self._datastore.transfer_from(
1972 source_butler._datastore,
1973 source_refs,
1974 transfer=transfer,
1975 artifact_existence=artifact_existence,
1976 )
1977 if rejected:
1978 # For now, accept the registry entries but not the files.
1979 _LOG.warning(
1980 "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
1981 len(rejected),
1982 len(accepted),
1983 datasetType,
1984 run,
1985 )
1987 return source_refs
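# A hedged transfer_from() sketch; the repository paths, dataset type and
# collection name are assumptions.
#
#     >>> source = Butler("/repo/source")
#     >>> target = Butler("/repo/target", writeable=True)
#     >>> refs = source.registry.queryDatasets("calexp", collections="HSC/runs/RC2")
#     >>> transferred = target.transfer_from(
#     ...     source, refs, transfer="copy",
#     ...     register_dataset_types=True, transfer_dimensions=True)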
1989 def validateConfiguration(
1990 self,
1991 logFailures: bool = False,
1992 datasetTypeNames: Iterable[str] | None = None,
1993 ignore: Iterable[str] | None = None,
1994 ) -> None:
1995 # Docstring inherited.
1996 if datasetTypeNames:
1997 datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames]
1998 else:
1999 datasetTypes = list(self._registry.queryDatasetTypes())
2001 # filter out anything from the ignore list
2002 if ignore:
2003 ignore = set(ignore)
2004 datasetTypes = [
2005 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
2006 ]
2007 else:
2008 ignore = set()
2010 # For each datasetType that has an instrument dimension, create
2011 # a DatasetRef for each defined instrument
2012 datasetRefs = []
2014 # Find all the registered instruments (if "instrument" is in the
2015 # universe).
2016 if "instrument" in self.dimensions:
2017 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
2019 for datasetType in datasetTypes:
2020 if "instrument" in datasetType.dimensions:
2021 # In order to create a conforming dataset ref, create
2022 # fake DataCoordinate values for the non-instrument
2023 # dimensions. The type of the value does not matter here.
2024 dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"}
2026 for instrument in instruments:
2027 datasetRef = DatasetRef(
2028 datasetType,
2029 DataCoordinate.standardize(
2030 dataId, instrument=instrument, graph=datasetType.dimensions
2031 ),
2032 run="validate",
2033 )
2034 datasetRefs.append(datasetRef)
2036 entities: list[DatasetType | DatasetRef] = []
2037 entities.extend(datasetTypes)
2038 entities.extend(datasetRefs)
2040 datastoreErrorStr = None
2041 try:
2042 self._datastore.validateConfiguration(entities, logFailures=logFailures)
2043 except ValidationError as e:
2044 datastoreErrorStr = str(e)
2046 # Also check that the LookupKeys used by the datastores match
2047 # registry and storage class definitions
2048 keys = self._datastore.getLookupKeys()
2050 failedNames = set()
2051 failedDataId = set()
2052 for key in keys:
2053 if key.name is not None:
2054 if key.name in ignore:
2055 continue
2057 # skip if specific datasetType names were requested and this
2058 # name does not match
2059 if datasetTypeNames and key.name not in datasetTypeNames:
2060 continue
2062 # See if it is a StorageClass or a DatasetType
2063 if key.name in self.storageClasses:
2064 pass
2065 else:
2066 try:
2067 self._registry.getDatasetType(key.name)
2068 except KeyError:
2069 if logFailures:
2070 _LOG.critical(
2071 "Key '%s' does not correspond to a DatasetType or StorageClass", key
2072 )
2073 failedNames.add(key)
2074 else:
2075 # Dimensions are checked for consistency when the Butler
2076 # is created and rendezvoused with a universe.
2077 pass
2079 # Check that the instrument is a valid instrument.
2080 # Currently only the instrument dimension is supported, so check for that.
2081 if key.dataId:
2082 dataIdKeys = set(key.dataId)
2083 if {"instrument"} != dataIdKeys:
2084 if logFailures:
2085 _LOG.critical("Key '%s' has unsupported DataId override", key)
2086 failedDataId.add(key)
2087 elif key.dataId["instrument"] not in instruments:
2088 if logFailures:
2089 _LOG.critical("Key '%s' has unknown instrument", key)
2090 failedDataId.add(key)
2092 messages = []
2094 if datastoreErrorStr:
2095 messages.append(datastoreErrorStr)
2097 for failed, msg in (
2098 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2099 (failedDataId, "Keys with bad DataId entries: "),
2100 ):
2101 if failed:
2102 msg += ", ".join(str(k) for k in failed)
2103 messages.append(msg)
2105 if messages:
2106 raise ValidationError(";\n".join(messages))
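# A minimal validateConfiguration() sketch; the ignored dataset type name is
# an assumption. A ValidationError is raised if any check fails.
#
#     >>> butler.validateConfiguration(logFailures=True, ignore=["raw"])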
2108 @property
2109 def collections(self) -> Sequence[str]:
2110 """The collections to search by default, in order
2111 (`~collections.abc.Sequence` [ `str` ]).
2113 This is an alias for ``self.registry.defaults.collections``. It cannot
2114 be set directly in isolation, but all defaults may be changed together
2115 by assigning a new `RegistryDefaults` instance to
2116 ``self.registry.defaults``.
2117 """
2118 return self._registry.defaults.collections
2120 @property
2121 def run(self) -> str | None:
2122 """Name of the run this butler writes outputs to by default (`str` or
2123 `None`).
2125 This is an alias for ``self.registry.defaults.run``. It cannot be set
2126 directly in isolation, but all defaults may be changed together by
2127 assigning a new `RegistryDefaults` instance to
2128 ``self.registry.defaults``.
2129 """
2130 return self._registry.defaults.run
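# A hedged sketch of replacing the defaults behind the "collections" and "run"
# properties; the collection and run names are assumptions.
#
#     >>> from lsst.daf.butler.registry import RegistryDefaults
#     >>> butler.registry.defaults = RegistryDefaults(
#     ...     collections=["HSC/defaults"], run="u/alice/outputs")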
2132 @property
2133 def registry(self) -> Registry:
2134 """The object that manages dataset metadata and relationships
2135 (`Registry`).
2137 Many operations that don't involve reading or writing butler datasets
2138 are accessible only via `Registry` methods. Eventually these methods
2139 will be replaced by equivalent `Butler` methods.
2140 """
2141 return self._registry_shim
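# A brief sketch of a metadata-only operation reached through the registry
# property; the dataset type and collection are assumptions.
#
#     >>> for ref in butler.registry.queryDatasets("calexp",
#     ...                                          collections="HSC/runs/RC2"):
#     ...     print(ref.dataId)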
2143 @property
2144 def dimensions(self) -> DimensionUniverse:
2145 # Docstring inherited.
2146 return self._registry.dimensions
2148 _registry: SqlRegistry
2149 """The object that manages dataset metadata and relationships
2150 (`SqlRegistry`).
2152 Most operations that don't involve reading or writing butler datasets are
2153 accessible only via `SqlRegistry` methods.
2154 """
2156 datastore: Datastore
2157 """The object that manages actual dataset storage (`Datastore`).
2159 Direct user access to the datastore should rarely be necessary; the primary
2160 exception is the case where a `Datastore` implementation provides extra
2161 functionality beyond what the base class defines.
2162 """
2164 storageClasses: StorageClassFactory
2165 """An object that maps known storage class names to objects that fully
2166 describe them (`StorageClassFactory`).
2167 """