Coverage for python/lsst/daf/butler/_butler.py: 8%
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImportType
65from lsst.utils.introspection import get_class_of
66from lsst.utils.logging import getLogger, VERBOSE
67from .core import (
68 AmbiguousDatasetError,
69 ButlerURI,
70 Config,
71 ConfigSubset,
72 DataCoordinate,
73 DataId,
74 DataIdValue,
75 DatasetRef,
76 DatasetType,
77 Datastore,
78 Dimension,
79 DimensionConfig,
80 FileDataset,
81 Progress,
82 StorageClassFactory,
83 Timespan,
84 ValidationError,
85)
86from .core.repoRelocation import BUTLER_ROOT_TAG
87from .core.utils import transactional
88from ._deferredDatasetHandle import DeferredDatasetHandle
89from ._butlerConfig import ButlerConfig
90from .registry import (
91 Registry,
92 RegistryConfig,
93 RegistryDefaults,
94 CollectionSearch,
95 CollectionType,
96 ConflictingDefinitionError,
97 DatasetIdGenEnum,
98)
99from .transfers import RepoExportContext
101log = getLogger(__name__)
104class ButlerValidationError(ValidationError):
105 """There is a problem with the Butler configuration."""
106 pass
109class PruneCollectionsArgsError(TypeError):
110 """Base class for errors relating to Butler.pruneCollections input
111 arguments.
112 """
113 pass
116class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
117 """Raised when purge and unstore are both required to be True, and
118 purge is True but unstore is False.
119 """
121 def __init__(self) -> None:
122 super().__init__("Cannot pass purge=True without unstore=True.")
125class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
126 """Raised when pruning a RUN collection but purge is False."""
128 def __init__(self, collectionType: CollectionType):
129 self.collectionType = collectionType
130 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
133class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
134 """Raised when purge is True but is not supported for the given
135 collection."""
137 def __init__(self, collectionType: CollectionType):
138 self.collectionType = collectionType
139 super().__init__(
140 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
143class Butler:
144 """Main entry point for the data access system.
146 Parameters
147 ----------
148 config : `ButlerConfig`, `Config` or `str`, optional
149 Configuration. Anything acceptable to the
150 `ButlerConfig` constructor. If a directory path
151 is given, the configuration will be read from a ``butler.yaml`` file in
152 that location. If `None` is given, default values will be used.
153 butler : `Butler`, optional
154 If provided, construct a new Butler that uses the same registry and
155 datastore as the given one, but with the given collection and run.
156 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
157 arguments.
158 collections : `str` or `Iterable` [ `str` ], optional
159 An expression specifying the collections to be searched (in order) when
160 reading datasets.
161 This may be a `str` collection name or an iterable thereof.
162 See :ref:`daf_butler_collection_expressions` for more information.
163 These collections are not registered automatically and must be
164 registered manually before they are used by any method, though they may
165 be registered after the `Butler` is initialized.
166 run : `str`, optional
167 Name of the `~CollectionType.RUN` collection new datasets should be
168 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
169 ``collections`` will be set to ``[run]``. If not `None`, this
170 collection will automatically be registered. If this is not set (and
171 ``writeable`` is not set either), a read-only butler will be created.
172 searchPaths : `list` of `str`, optional
173 Directory paths to search when calculating the full Butler
174 configuration. Not used if the supplied config is already a
175 `ButlerConfig`.
176 writeable : `bool`, optional
177 Explicitly sets whether the butler supports write operations. If not
178 provided, a read-write butler is created if ``run`` is not `None`;
179 otherwise a read-only butler is created.
180 inferDefaults : `bool`, optional
181 If `True` (default) infer default data ID values from the values
182 present in the datasets in ``collections``: if all collections have the
183 same value (or no value) for a governor dimension, that value will be
184 the default for that dimension. Nonexistent collections are ignored.
185 If a default value is provided explicitly for a governor dimension via
186 ``**kwargs``, no default will be inferred for that dimension.
187 **kwargs : `str`
188 Default data ID key-value pairs. These may only identify "governor"
189 dimensions like ``instrument`` and ``skymap``.
191 Examples
192 --------
193 While there are many ways to control exactly how a `Butler` interacts with
194 the collections in its `Registry`, the most common cases are still simple.
196 For a read-only `Butler` that searches one collection, do::
198 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
200 For a read-write `Butler` that writes to and reads from a
201 `~CollectionType.RUN` collection::
203 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
205 The `Butler` passed to a ``PipelineTask`` is often much more complex,
206 because we want to write to one `~CollectionType.RUN` collection but read
207 from several others (as well)::
209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
210 collections=["u/alice/DM-50000/a",
211 "u/bob/DM-49998",
212 "HSC/defaults"])
214 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
215 Datasets will be read first from that run (since it appears first in the
216 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
218 Finally, one can always create a `Butler` with no collections::
220 butler = Butler("/path/to/repo", writeable=True)
222 This can be extremely useful when you just want to use ``butler.registry``,
223 e.g. for inserting dimension data or managing collections, or when the
224 collections you want to use with the butler are not consistent.
225 Passing ``writeable`` explicitly here is only necessary if you want to be
226 able to make changes to the repo; usually the value for ``writeable`` can
227 be guessed from the collection arguments provided, but it defaults to
228 `False` when there are no collection arguments.
229 """
230 def __init__(self, config: Union[Config, str, None] = None, *,
231 butler: Optional[Butler] = None,
232 collections: Any = None,
233 run: Optional[str] = None,
234 searchPaths: Optional[List[str]] = None,
235 writeable: Optional[bool] = None,
236 inferDefaults: bool = True,
237 **kwargs: str,
238 ):
239 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
240 # Load registry, datastore, etc. from config or existing butler.
241 if butler is not None:
242 if config is not None or searchPaths is not None or writeable is not None:
243 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
244 "arguments with 'butler' argument.")
245 self.registry = butler.registry.copy(defaults)
246 self.datastore = butler.datastore
247 self.storageClasses = butler.storageClasses
248 self._config: ButlerConfig = butler._config
249 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
250 else:
251 self._config = ButlerConfig(config, searchPaths=searchPaths)
252 try:
253 if "root" in self._config:
254 butlerRoot = self._config["root"]
255 else:
256 butlerRoot = self._config.configDir
257 if writeable is None:
258 writeable = run is not None
259 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
260 defaults=defaults)
261 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
262 butlerRoot=butlerRoot)
263 self.storageClasses = StorageClassFactory()
264 self.storageClasses.addFromConfig(self._config)
265 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset",
266 False)
267 except Exception:
268 # Failures here usually mean that configuration is incomplete,
269 # just issue an error message which includes config file URI.
270 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
271 raise
273 if "run" in self._config or "collection" in self._config:
274 raise ValueError("Passing a run or collection via configuration is no longer supported.")
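# Example (an illustrative sketch; the repository path, collection name, and
# instrument value are hypothetical): default data ID values for governor
# dimensions can be supplied as keyword arguments at construction time.
#
#     butler = Butler("/path/to/repo", collections="HSC/defaults",
#                     instrument="HSC")
#     # Later calls can then omit ``instrument`` from their data IDs.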
276 GENERATION: ClassVar[int] = 3
277 """This is a Generation 3 Butler.
279 This attribute may be removed in the future, once the Generation 2 Butler
280 interface has been fully retired; it should only be used in transitional
281 code.
282 """
284 @staticmethod
285 def makeRepo(root: str, config: Union[Config, str, None] = None,
286 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
287 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
288 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
289 """Create an empty data repository by adding a butler.yaml config
290 to a repository root directory.
292 Parameters
293 ----------
294 root : `str` or `ButlerURI`
295 Path or URI to the root location of the new repository. Will be
296 created if it does not exist.
297 config : `Config` or `str`, optional
298 Configuration to write to the repository, after setting any
299 root-dependent Registry or Datastore config options. Can not
300 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
301 configuration will be used. Root-dependent config options
302 specified in this config are overwritten if ``forceConfigRoot``
303 is `True`.
304 dimensionConfig : `Config` or `str`, optional
305 Configuration for dimensions; it will be used to initialize the
306 registry database.
307 standalone : `bool`
308 If True, write all expanded defaults, not just customized or
309 repository-specific settings.
310 This (mostly) decouples the repository from the default
311 configuration, insulating it from changes to the defaults (which
312 may be good or bad, depending on the nature of the changes).
313 Future *additions* to the defaults will still be picked up when
314 initializing `Butlers` to repos created with ``standalone=True``.
315 searchPaths : `list` of `str`, optional
316 Directory paths to search when calculating the full butler
317 configuration.
318 forceConfigRoot : `bool`, optional
319 If `False`, any values present in the supplied ``config`` that
320 would normally be reset are not overridden and will appear
321 directly in the output config. This allows non-standard overrides
322 of the root directory for a datastore or registry to be given.
323 If this parameter is `True` the values for ``root`` will be
324 forced into the resulting config if appropriate.
325 outfile : `str`, optional
326 If not `None`, the output configuration will be written to this
327 location rather than into the repository itself. Can be a URI
328 string. Can refer to a directory that will be used to write
329 ``butler.yaml``.
330 overwrite : `bool`, optional
331 Create a new configuration file even if one already exists
332 in the specified output location. Default is to raise
333 an exception.
335 Returns
336 -------
337 config : `Config`
338 The updated `Config` instance written to the repo.
340 Raises
341 ------
342 ValueError
343 Raised if a ButlerConfig or ConfigSubset is passed instead of a
344 regular Config (as these subclasses would make it impossible to
345 support ``standalone=False``).
346 FileExistsError
347 Raised if the output config file already exists.
348 os.error
349 Raised if the directory does not exist, exists but is not a
350 directory, or cannot be created.
352 Notes
353 -----
354 Note that when ``standalone=False`` (the default), the configuration
355 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
356 construct the repository should also be used to construct any Butlers
357 to avoid configuration inconsistencies.
358 """
359 if isinstance(config, (ButlerConfig, ConfigSubset)):
360 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
362 # Ensure that the root of the repository exists or can be made
363 uri = ButlerURI(root, forceDirectory=True)
364 uri.mkdir()
366 config = Config(config)
368 # If we are creating a new repo from scratch with relative roots,
369 # do not propagate an explicit root from the config file
370 if "root" in config:
371 del config["root"]
373 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
374 imported_class = doImportType(full["datastore", "cls"])
375 if not issubclass(imported_class, Datastore):
376 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
377 datastoreClass: Type[Datastore] = imported_class
378 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
380 # if key exists in given config, parse it, otherwise parse the defaults
381 # in the expanded config
382 if config.get(("registry", "db")):
383 registryConfig = RegistryConfig(config)
384 else:
385 registryConfig = RegistryConfig(full)
386 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
387 if defaultDatabaseUri is not None:
388 Config.updateParameters(RegistryConfig, config, full,
389 toUpdate={"db": defaultDatabaseUri},
390 overwrite=forceConfigRoot)
391 else:
392 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
393 overwrite=forceConfigRoot)
395 if standalone:
396 config.merge(full)
397 else:
398 # Always expand the registry.managers section into the per-repo
399 # config, because after the database schema is created, it's not
400 # allowed to change anymore. Note that in the standalone=True
401 # branch, _everything_ in the config is expanded, so there's no
402 # need to special case this.
403 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
404 configURI: Union[str, ButlerURI]
405 if outfile is not None:
406 # When writing to a separate location we must include
407 # the root of the butler repo in the config else it won't know
408 # where to look.
409 config["root"] = uri.geturl()
410 configURI = outfile
411 else:
412 configURI = uri
413 config.dumpToUri(configURI, overwrite=overwrite)
415 # Create Registry and populate tables
416 registryConfig = RegistryConfig(config.get("registry"))
417 dimensionConfig = DimensionConfig(dimensionConfig)
418 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
420 log.verbose("Wrote new Butler configuration file to %s", configURI)
422 return config
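# Example (a minimal sketch; the root path is hypothetical): create an empty
# repository and then construct a writeable Butler against it.
#
#     Butler.makeRepo("/path/to/new/repo")
#     butler = Butler("/path/to/new/repo", writeable=True)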
424 @classmethod
425 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
426 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
427 """Callable used to unpickle a Butler.
429 We prefer not to use ``Butler.__init__`` directly so we can force some
430 of its many arguments to be keyword-only (note that ``__reduce__``
431 can only invoke callables with positional arguments).
433 Parameters
434 ----------
435 config : `ButlerConfig`
436 Butler configuration, already coerced into a true `ButlerConfig`
437 instance (and hence after any search paths for overrides have been
438 utilized).
439 collections : `CollectionSearch`
440 Names of the default collections to read from.
441 run : `str`, optional
442 Name of the default `~CollectionType.RUN` collection to write to.
443 defaultDataId : `dict` [ `str`, `str` ]
444 Default data ID values.
445 writeable : `bool`
446 Whether the Butler should support write operations.
448 Returns
449 -------
450 butler : `Butler`
451 A new `Butler` instance.
452 """
453 # MyPy doesn't recognize that the kwargs below are totally valid; it
454 # seems to think ``**defaultDataId`` is a _positional_ argument!
455 return cls(config=config, collections=collections, run=run, writeable=writeable,
456 **defaultDataId) # type: ignore
458 def __reduce__(self) -> tuple:
459 """Support pickling.
460 """
461 return (Butler._unpickle, (self._config, self.collections, self.run,
462 self.registry.defaults.dataId.byName(),
463 self.registry.isWriteable()))
465 def __str__(self) -> str:
466 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
467 self.collections, self.run, self.datastore, self.registry)
469 def isWriteable(self) -> bool:
470 """Return `True` if this `Butler` supports write operations.
471 """
472 return self.registry.isWriteable()
474 @contextlib.contextmanager
475 def transaction(self) -> Iterator[None]:
476 """Context manager supporting `Butler` transactions.
478 Transactions can be nested.
479 """
480 with self.registry.transaction():
481 with self.datastore.transaction():
482 yield
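# Example (hypothetical objects, dataset types, and data ID values): group
# several writes so that they are committed or rolled back together.
#
#     with butler.transaction():
#         butler.put(exposure, "calexp", instrument="HSC", visit=903334, detector=16)
#         butler.put(catalog, "src", instrument="HSC", visit=903334, detector=16)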
484 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
485 dataId: Optional[DataId] = None, **kwargs: Any
486 ) -> Tuple[DatasetType, Optional[DataId]]:
487 """Standardize the arguments passed to several Butler APIs.
489 Parameters
490 ----------
491 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
492 When `DatasetRef` the `dataId` should be `None`.
493 Otherwise the `DatasetType` or name thereof.
494 dataId : `dict` or `DataCoordinate`
495 A `dict` of `Dimension` link name, value pairs that label the
496 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
497 should be provided as the second argument.
498 **kwargs
499 Additional keyword arguments used to augment or construct a
500 `DataCoordinate`. See `DataCoordinate.standardize`
501 parameters.
503 Returns
504 -------
505 datasetType : `DatasetType`
506 A `DatasetType` instance extracted from ``datasetRefOrType``.
507 dataId : `dict` or `DataId`, optional
508 Argument that can be used (along with ``kwargs``) to construct a
509 `DataId`.
511 Notes
512 -----
513 Butler APIs that conceptually need a DatasetRef also allow passing a
514 `DatasetType` (or the name of one) and a `DataId` (or a dict and
515 keyword arguments that can be used to construct one) separately. This
516 method accepts those arguments and always returns a true `DatasetType`
517 and a `DataId` or `dict`.
519 Standardization of `dict` vs `DataId` is best handled by passing the
520 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
521 generally similarly flexible.
522 """
523 externalDatasetType: Optional[DatasetType] = None
524 internalDatasetType: Optional[DatasetType] = None
525 if isinstance(datasetRefOrType, DatasetRef):
526 if dataId is not None or kwargs:
527 raise ValueError("DatasetRef given, cannot use dataId as well")
528 externalDatasetType = datasetRefOrType.datasetType
529 dataId = datasetRefOrType.dataId
530 else:
531 # Don't check whether DataId is provided, because Registry APIs
532 # can usually construct a better error message when it wasn't.
533 if isinstance(datasetRefOrType, DatasetType):
534 externalDatasetType = datasetRefOrType
535 else:
536 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
538 # Check that they are self-consistent
539 if externalDatasetType is not None:
540 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
541 if externalDatasetType != internalDatasetType:
542 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
543 f"registry definition ({internalDatasetType})")
545 assert internalDatasetType is not None
546 return internalDatasetType, dataId
548 def _rewrite_data_id(self, dataId: Optional[DataId], datasetType: DatasetType,
549 **kwargs: Any) -> Tuple[Optional[DataId], Dict[str, Any]]:
550 """Rewrite a data ID taking into account dimension records.
552 Take a Data ID and keyword args and rewrite it if necessary to
553 allow the user to specify dimension records rather than dimension
554 primary values.
556 This allows a user to include a dataId dict with keys of
557 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
558 the integer exposure ID. It also allows a string to be given
559 for a dimension value rather than the integer ID if that is more
560 convenient. For example, rather than having to specify the
561 detector with ``detector.full_name``, a string given for ``detector``
562 will be interpreted as the full name and converted to the integer
563 value.
565 Keyword arguments can also use strings for dimensions like detector
566 and exposure but python does not allow them to include ``.`` and
567 so the ``exposure.day_obs`` syntax can not be used in a keyword
568 argument.
570 Parameters
571 ----------
572 dataId : `dict` or `DataCoordinate`
573 A `dict` of `Dimension` link name, value pairs that will label the
574 `DatasetRef` within a Collection.
575 datasetType : `DatasetType`
576 The dataset type associated with this dataId. Required to
577 determine the relevant dimensions.
578 **kwargs
579 Additional keyword arguments used to augment or construct a
580 `DataId`. See `DataId` parameters.
582 Returns
583 -------
584 dataId : `dict` or `DataCoordinate`
585 The possibly-rewritten dataId. If given a `DataCoordinate` and
586 no keyword arguments, the original dataId will be returned
587 unchanged.
588 **kwargs : `dict`
589 Any unused keyword arguments.
590 """
591 # Do nothing if we have a standalone DataCoordinate.
592 if isinstance(dataId, DataCoordinate) and not kwargs:
593 return dataId, kwargs
595 # Process dimension records that are using record information
596 # rather than ids
597 newDataId: Dict[str, DataIdValue] = {}
598 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
600 # If all of the dataId comes from keyword parameters we do not need
601 # to do anything here because they can't be of the form
602 # exposure.obs_id (a "." is not allowed in a keyword parameter).
603 if dataId:
604 for k, v in dataId.items():
605 # If we have a Dimension we do not need to do anything
606 # because it cannot be a compound key.
607 if isinstance(k, str) and "." in k:
608 # Someone is using a more human-readable dataId
609 dimensionName, record = k.split(".", 1)
610 byRecord[dimensionName][record] = v
611 elif isinstance(k, Dimension):
612 newDataId[k.name] = v
613 else:
614 newDataId[k] = v
616 # Go through the updated dataId and check the type in case someone is
617 # using an alternate key. We have already filtered out the compound
618 # keys dimensions.record format.
619 not_dimensions = {}
621 # Will need to look in the dataId and the keyword arguments
622 # and will remove them if they need to be fixed or are unrecognized.
623 for dataIdDict in (newDataId, kwargs):
624 # Use a list so we can adjust the dict safely in the loop
625 for dimensionName in list(dataIdDict):
626 value = dataIdDict[dimensionName]
627 try:
628 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
629 except KeyError:
630 # This is not a real dimension
631 not_dimensions[dimensionName] = value
632 del dataIdDict[dimensionName]
633 continue
635 # Convert an integral type to an explicit int to simplify
636 # comparisons here
637 if isinstance(value, numbers.Integral):
638 value = int(value)
640 if not isinstance(value, dimension.primaryKey.getPythonType()):
641 for alternate in dimension.alternateKeys:
642 if isinstance(value, alternate.getPythonType()):
643 byRecord[dimensionName][alternate.name] = value
644 del dataIdDict[dimensionName]
645 log.debug("Converting dimension %s to %s.%s=%s",
646 dimensionName, dimensionName, alternate.name, value)
647 break
648 else:
649 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
650 "Could not find matching alternative (primary key has type %s) "
651 "so attempting to use as-is.",
652 value, dimensionName, dimension.primaryKey.getPythonType())
654 # If we have some unrecognized dimensions we have to try to connect
655 # them to records in other dimensions. This is made more complicated
656 # by some dimensions having records with clashing names. A mitigation
657 # is that we can tell by this point which dimensions are missing
658 # for the DatasetType but this does not work for calibrations
659 # where additional dimensions can be used to constrain the temporal
660 # axis.
661 if not_dimensions:
662 # Calculate missing dimensions
663 provided = set(newDataId) | set(kwargs) | set(byRecord)
664 missingDimensions = datasetType.dimensions.names - provided
666 # For calibrations we may well be needing temporal dimensions
667 # so rather than always including all dimensions in the scan
668 # restrict things a little. It is still possible for there
669 # to be confusion over day_obs in visit vs exposure for example.
670 # If we are not searching calibration collections things may
671 # fail but they are going to fail anyway because of the
672 # ambiguity of the dataId...
673 candidateDimensions: Set[str] = set()
674 candidateDimensions.update(missingDimensions)
675 if datasetType.isCalibration():
676 for dim in self.registry.dimensions.getStaticDimensions():
677 if dim.temporal:
678 candidateDimensions.add(str(dim))
680 # Look up table for the first association with a dimension
681 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
683 # Keep track of whether an item is associated with multiple
684 # dimensions.
685 counter: Counter[str] = Counter()
686 assigned: Dict[str, Set[str]] = defaultdict(set)
688 # Go through the missing dimensions and associate the
689 # given names with records within those dimensions
690 for dimensionName in candidateDimensions:
691 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
692 fields = dimension.metadata.names | dimension.uniqueKeys.names
693 for field in not_dimensions:
694 if field in fields:
695 guessedAssociation[dimensionName][field] = not_dimensions[field]
696 counter[dimensionName] += 1
697 assigned[field].add(dimensionName)
699 # There is a chance we have allocated a single dataId item
700 # to multiple dimensions. Need to decide which should be retained.
701 # For now assume that the most popular alternative wins.
702 # This means that day_obs with seq_num will result in
703 # exposure.day_obs and not visit.day_obs
704 # Also prefer an explicitly missing dimension over an inferred
705 # temporal dimension.
706 for fieldName, assignedDimensions in assigned.items():
707 if len(assignedDimensions) > 1:
708 # Pick the most popular (preferring mandatory dimensions)
709 requiredButMissing = assignedDimensions.intersection(missingDimensions)
710 if requiredButMissing:
711 candidateDimensions = requiredButMissing
712 else:
713 candidateDimensions = assignedDimensions
715 # Select the relevant items and get a new restricted
716 # counter.
717 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
718 duplicatesCounter: Counter[str] = Counter()
719 duplicatesCounter.update(theseCounts)
721 # Choose the most common. If they are equally common
722 # we will pick the one that was found first.
723 # Returns a list of tuples
724 selected = duplicatesCounter.most_common(1)[0][0]
726 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
727 " Removed ambiguity by choosing dimension %s.",
728 fieldName, ", ".join(assignedDimensions), selected)
730 for candidateDimension in assignedDimensions:
731 if candidateDimension != selected:
732 del guessedAssociation[candidateDimension][fieldName]
734 # Update the record look up dict with the new associations
735 for dimensionName, values in guessedAssociation.items():
736 if values: # A dict might now be empty
737 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
738 dimensionName, values)
739 byRecord[dimensionName].update(values)
741 if byRecord:
742 # Some record specifiers were found so we need to convert
743 # them to the Id form
744 for dimensionName, values in byRecord.items():
745 if dimensionName in newDataId:
746 log.warning("DataId specified explicit %s dimension value of %s in addition to"
747 " general record specifiers for it of %s. Ignoring record information.",
748 dimensionName, newDataId[dimensionName], str(values))
749 continue
751 # Build up a WHERE expression
752 bind = {k: v for k, v in values.items()}
753 where = " AND ".join(f"{dimensionName}.{k} = {k}"
754 for k in bind)
756 # Hopefully we get a single record that matches
757 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
758 where=where, bind=bind, **kwargs))
760 if len(records) != 1:
761 if len(records) > 1:
762 log.debug("Received %d records from constraints of %s", len(records), str(values))
763 for r in records:
764 log.debug("- %s", str(r))
765 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
766 f" uniquely constrained to a single dataset by {values}."
767 f" Got {len(records)} results.")
768 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
769 f" records when constrained by {values}")
771 # Get the primary key from the real dimension object
772 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
773 if not isinstance(dimension, Dimension):
774 raise RuntimeError(
775 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
776 )
777 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
779 # We have modified the dataId so need to switch to it
780 dataId = newDataId
782 return dataId, kwargs
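# Example (hypothetical dataset type and dimension values): the rewriting
# above is what lets callers use record fields or alternate keys instead of
# primary key values, e.g. when calling Butler.get:
#
#     butler.get("raw", {"exposure.day_obs": 20200101, "exposure.seq_num": 22},
#                instrument="LSSTCam", detector="R11_S11")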
784 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
785 dataId: Optional[DataId] = None, *,
786 collections: Any = None,
787 allowUnresolved: bool = False,
788 **kwargs: Any) -> DatasetRef:
789 """Shared logic for methods that start with a search for a dataset in
790 the registry.
792 Parameters
793 ----------
794 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
795 When `DatasetRef` the `dataId` should be `None`.
796 Otherwise the `DatasetType` or name thereof.
797 dataId : `dict` or `DataCoordinate`, optional
798 A `dict` of `Dimension` link name, value pairs that label the
799 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
800 should be provided as the first argument.
801 collections : Any, optional
802 Collections to be searched, overriding ``self.collections``.
803 Can be any of the types supported by the ``collections`` argument
804 to butler construction.
805 allowUnresolved : `bool`, optional
806 If `True`, return an unresolved `DatasetRef` if finding a resolved
807 one in the `Registry` fails. Defaults to `False`.
808 **kwargs
809 Additional keyword arguments used to augment or construct a
810 `DataId`. See `DataId` parameters.
812 Returns
813 -------
814 ref : `DatasetRef`
815 A reference to the dataset identified by the given arguments.
817 Raises
818 ------
819 LookupError
820 Raised if no matching dataset exists in the `Registry` (and
821 ``allowUnresolved is False``).
822 ValueError
823 Raised if a resolved `DatasetRef` was passed as an input, but it
824 differs from the one found in the registry.
825 TypeError
826 Raised if no collections were provided.
827 """
828 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
829 if isinstance(datasetRefOrType, DatasetRef):
830 idNumber = datasetRefOrType.id
831 else:
832 idNumber = None
833 timespan: Optional[Timespan] = None
835 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
837 if datasetType.isCalibration():
838 # Because this is a calibration dataset, first try to
839 # standardize the data ID without restricting the dimensions to
840 # those of the dataset type requested, because there may be extra
841 # dimensions that provide temporal information for a validity-range
842 # lookup.
843 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
844 defaults=self.registry.defaults.dataId, **kwargs)
845 if dataId.graph.temporal:
846 dataId = self.registry.expandDataId(dataId)
847 timespan = dataId.timespan
848 else:
849 # Standardize the data ID to just the dimensions of the dataset
850 # type instead of letting registry.findDataset do it, so we get the
851 # result even if no dataset is found.
852 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
853 defaults=self.registry.defaults.dataId, **kwargs)
854 # Always look up the DatasetRef, even if one is given, to ensure it is
855 # present in the current collection.
856 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
857 if ref is None:
858 if allowUnresolved:
859 return DatasetRef(datasetType, dataId)
860 else:
861 if collections is None:
862 collections = self.registry.defaults.collections
863 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
864 f"could not be found in collections {collections}.")
865 if idNumber is not None and idNumber != ref.id:
866 if collections is None:
867 collections = self.registry.defaults.collections
868 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
869 f"id ({ref.id}) in registry in collections {collections}.")
870 return ref
872 @transactional
873 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
874 dataId: Optional[DataId] = None, *,
875 run: Optional[str] = None,
876 **kwargs: Any) -> DatasetRef:
877 """Store and register a dataset.
879 Parameters
880 ----------
881 obj : `object`
882 The dataset.
883 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
884 When `DatasetRef` is provided, ``dataId`` should be `None`.
885 Otherwise the `DatasetType` or name thereof.
886 dataId : `dict` or `DataCoordinate`
887 A `dict` of `Dimension` link name, value pairs that label the
888 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
889 should be provided as the second argument.
890 run : `str`, optional
891 The name of the run the dataset should be added to, overriding
892 ``self.run``.
893 **kwargs
894 Additional keyword arguments used to augment or construct a
895 `DataCoordinate`. See `DataCoordinate.standardize`
896 parameters.
898 Returns
899 -------
900 ref : `DatasetRef`
901 A reference to the stored dataset, updated with the correct id if
902 given.
904 Raises
905 ------
906 TypeError
907 Raised if the butler is read-only or if no run has been provided.
908 """
909 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
910 if not self.isWriteable():
911 raise TypeError("Butler is read-only.")
912 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
913 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
914 raise ValueError("DatasetRef must not be in registry, must have None id")
916 # Handle dimension records in dataId
917 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
919 # Add Registry Dataset entry.
920 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
922 # For an execution butler the datasets will be pre-defined.
923 # If the butler is configured that way, datasets should only be inserted
924 # if they do not already exist in registry. Trying and catching
925 # ConflictingDefinitionError will not work because the transaction
926 # will be corrupted. Instead, in this mode always check first.
927 ref = None
928 ref_is_predefined = False
929 if self._allow_put_of_predefined_dataset:
930 # Get the matching ref for this run.
931 ref = self.registry.findDataset(datasetType, collections=run,
932 dataId=dataId)
934 if ref:
935 # Must be expanded form for datastore templating
936 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
937 ref = ref.expanded(dataId)
938 ref_is_predefined = True
940 if not ref:
941 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
943 # If the ref is predefined it is possible that the datastore also
944 # has the record. Asking datastore to put it again will result in
945 # the artifact being recreated, overwriting the previous one; the
946 # subsequent failure to write the record would then cause the artifact
947 # to be removed. Much safer to ask first before attempting to
948 # overwrite. Race conditions should not be an issue for the
949 # execution butler environment.
950 if ref_is_predefined:
951 if self.datastore.knows(ref):
952 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
954 self.datastore.put(obj, ref)
956 return ref
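# Example (hypothetical object, dataset type, and data ID values): store a
# dataset in the butler's default run.
#
#     ref = butler.put(exposure, "calexp",
#                      instrument="HSC", visit=903334, detector=16)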
958 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
959 """Retrieve a stored dataset.
961 Unlike `Butler.get`, this method allows datasets outside the Butler's
962 collection to be read as long as the `DatasetRef` that identifies them
963 can be obtained separately.
965 Parameters
966 ----------
967 ref : `DatasetRef`
968 Resolved reference to an already stored dataset.
969 parameters : `dict`
970 Additional StorageClass-defined options to control reading,
971 typically used to efficiently read only a subset of the dataset.
973 Returns
974 -------
975 obj : `object`
976 The dataset.
977 """
978 return self.datastore.get(ref, parameters=parameters)
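# Example (a sketch with a hypothetical collection name): resolved references
# returned by a registry query can be read directly, bypassing the butler's
# collection search.
#
#     for ref in butler.registry.queryDatasets("calexp", collections="HSC/runs/test"):
#         calexp = butler.getDirect(ref)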
980 def getDirectDeferred(self, ref: DatasetRef, *,
981 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
982 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
983 from a resolved `DatasetRef`.
985 Parameters
986 ----------
987 ref : `DatasetRef`
988 Resolved reference to an already stored dataset.
989 parameters : `dict`
990 Additional StorageClass-defined options to control reading,
991 typically used to efficiently read only a subset of the dataset.
993 Returns
994 -------
995 obj : `DeferredDatasetHandle`
996 A handle which can be used to retrieve a dataset at a later time.
998 Raises
999 ------
1000 AmbiguousDatasetError
1001 Raised if ``ref.id is None``, i.e. the reference is unresolved.
1002 """
1003 if ref.id is None:
1004 raise AmbiguousDatasetError(
1005 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
1006 )
1007 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1009 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1010 dataId: Optional[DataId] = None, *,
1011 parameters: Union[dict, None] = None,
1012 collections: Any = None,
1013 **kwargs: Any) -> DeferredDatasetHandle:
1014 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1015 after an immediate registry lookup.
1017 Parameters
1018 ----------
1019 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1020 When `DatasetRef` the `dataId` should be `None`.
1021 Otherwise the `DatasetType` or name thereof.
1022 dataId : `dict` or `DataCoordinate`, optional
1023 A `dict` of `Dimension` link name, value pairs that label the
1024 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1025 should be provided as the first argument.
1026 parameters : `dict`
1027 Additional StorageClass-defined options to control reading,
1028 typically used to efficiently read only a subset of the dataset.
1029 collections : Any, optional
1030 Collections to be searched, overriding ``self.collections``.
1031 Can be any of the types supported by the ``collections`` argument
1032 to butler construction.
1033 **kwargs
1034 Additional keyword arguments used to augment or construct a
1035 `DataId`. See `DataId` parameters.
1037 Returns
1038 -------
1039 obj : `DeferredDatasetHandle`
1040 A handle which can be used to retrieve a dataset at a later time.
1042 Raises
1043 ------
1044 LookupError
1045 Raised if no matching dataset exists in the `Registry` (and
1046 ``allowUnresolved is False``).
1047 ValueError
1048 Raised if a resolved `DatasetRef` was passed as an input, but it
1049 differs from the one found in the registry.
1050 TypeError
1051 Raised if no collections were provided.
1052 """
1053 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1054 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
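# Example (hypothetical dataset type and data ID values): defer the actual
# read until the handle is asked for the dataset.
#
#     handle = butler.getDeferred("calexp", instrument="HSC",
#                                 visit=903334, detector=16)
#     calexp = handle.get()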
1056 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1057 dataId: Optional[DataId] = None, *,
1058 parameters: Optional[Dict[str, Any]] = None,
1059 collections: Any = None,
1060 **kwargs: Any) -> Any:
1061 """Retrieve a stored dataset.
1063 Parameters
1064 ----------
1065 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1066 When `DatasetRef` the `dataId` should be `None`.
1067 Otherwise the `DatasetType` or name thereof.
1068 dataId : `dict` or `DataCoordinate`
1069 A `dict` of `Dimension` link name, value pairs that label the
1070 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1071 should be provided as the first argument.
1072 parameters : `dict`
1073 Additional StorageClass-defined options to control reading,
1074 typically used to efficiently read only a subset of the dataset.
1075 collections : Any, optional
1076 Collections to be searched, overriding ``self.collections``.
1077 Can be any of the types supported by the ``collections`` argument
1078 to butler construction.
1079 **kwargs
1080 Additional keyword arguments used to augment or construct a
1081 `DataCoordinate`. See `DataCoordinate.standardize`
1082 parameters.
1084 Returns
1085 -------
1086 obj : `object`
1087 The dataset.
1089 Raises
1090 ------
1091 ValueError
1092 Raised if a resolved `DatasetRef` was passed as an input, but it
1093 differs from the one found in the registry.
1094 LookupError
1095 Raised if no matching dataset exists in the `Registry`.
1096 TypeError
1097 Raised if no collections were provided.
1099 Notes
1100 -----
1101 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1102 this method requires that the given data ID include temporal dimensions
1103 beyond the dimensions of the dataset type itself, in order to find the
1104 dataset with the appropriate validity range. For example, a "bias"
1105 dataset with native dimensions ``{instrument, detector}`` could be
1106 fetched with a ``{instrument, detector, exposure}`` data ID, because
1107 ``exposure`` is a temporal dimension.
1108 """
1109 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1110 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1111 return self.getDirect(ref, parameters=parameters)
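# Example (hypothetical data ID values), following the Notes above: a "bias"
# in a CALIBRATION collection is found by adding a temporal dimension (here
# ``exposure``) to the dataset type's native {instrument, detector} data ID.
#
#     bias = butler.get("bias", instrument="HSC", detector=16, exposure=903334)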
1113 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1114 dataId: Optional[DataId] = None, *,
1115 predict: bool = False,
1116 collections: Any = None,
1117 run: Optional[str] = None,
1118 **kwargs: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1119 """Return the URIs associated with the dataset.
1121 Parameters
1122 ----------
1123 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1124 When `DatasetRef` the `dataId` should be `None`.
1125 Otherwise the `DatasetType` or name thereof.
1126 dataId : `dict` or `DataCoordinate`
1127 A `dict` of `Dimension` link name, value pairs that label the
1128 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1129 should be provided as the first argument.
1130 predict : `bool`
1131 If `True`, allow URIs to be returned of datasets that have not
1132 been written.
1133 collections : Any, optional
1134 Collections to be searched, overriding ``self.collections``.
1135 Can be any of the types supported by the ``collections`` argument
1136 to butler construction.
1137 run : `str`, optional
1138 Run to use for predictions, overriding ``self.run``.
1139 **kwargs
1140 Additional keyword arguments used to augment or construct a
1141 `DataCoordinate`. See `DataCoordinate.standardize`
1142 parameters.
1144 Returns
1145 -------
1146 primary : `ButlerURI`
1147 The URI to the primary artifact associated with this dataset.
1148 If the dataset was disassembled within the datastore this
1149 may be `None`.
1150 components : `dict`
1151 URIs to any components associated with the dataset artifact.
1152 Can be empty if there are no components.
1153 """
1154 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1155 collections=collections, **kwargs)
1156 if ref.id is None: # only possible if predict is True
1157 if run is None:
1158 run = self.run
1159 if run is None:
1160 raise TypeError("Cannot predict location with run=None.")
1161 # Lie about ID, because we can't guess it, and only
1162 # Datastore.getURIs() will ever see it (and it doesn't use it).
1163 ref = ref.resolved(id=0, run=run)
1164 return self.datastore.getURIs(ref, predict)
1166 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1167 dataId: Optional[DataId] = None, *,
1168 predict: bool = False,
1169 collections: Any = None,
1170 run: Optional[str] = None,
1171 **kwargs: Any) -> ButlerURI:
1172 """Return the URI to the Dataset.
1174 Parameters
1175 ----------
1176 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1177 When `DatasetRef` the `dataId` should be `None`.
1178 Otherwise the `DatasetType` or name thereof.
1179 dataId : `dict` or `DataCoordinate`
1180 A `dict` of `Dimension` link name, value pairs that label the
1181 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1182 should be provided as the first argument.
1183 predict : `bool`
1184 If `True`, allow URIs to be returned of datasets that have not
1185 been written.
1186 collections : Any, optional
1187 Collections to be searched, overriding ``self.collections``.
1188 Can be any of the types supported by the ``collections`` argument
1189 to butler construction.
1190 run : `str`, optional
1191 Run to use for predictions, overriding ``self.run``.
1192 **kwargs
1193 Additional keyword arguments used to augment or construct a
1194 `DataCoordinate`. See `DataCoordinate.standardize`
1195 parameters.
1197 Returns
1198 -------
1199 uri : `ButlerURI`
1200 URI pointing to the Dataset within the datastore. If the
1201 Dataset does not exist in the datastore, and if ``predict`` is
1202 `True`, the URI will be a prediction and will include a URI
1203 fragment "#predicted".
1204 If the datastore does not have entities that relate well
1205 to the concept of a URI the returned URI string will be
1206 descriptive. The returned URI is not guaranteed to be obtainable.
1208 Raises
1209 ------
1210 LookupError
1211 A URI has been requested for a dataset that does not exist and
1212 guessing is not allowed.
1213 ValueError
1214 Raised if a resolved `DatasetRef` was passed as an input, but it
1215 differs from the one found in the registry.
1216 TypeError
1217 Raised if no collections were provided.
1218 RuntimeError
1219 Raised if a URI is requested for a dataset that consists of
1220 multiple artifacts.
1221 """
1222 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1223 collections=collections, run=run, **kwargs)
1225 if primary is None or components:
1226 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1227 "Use Butler.getURIs() instead.")
1228 return primary
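# Example (hypothetical dataset type, data ID, and run): obtain the location
# of an existing dataset, or predict the location of one not yet written.
#
#     uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=16)
#     predicted = butler.getURI("calexp", instrument="HSC", visit=903334,
#                               detector=17, predict=True, run="u/alice/test")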
1230 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1231 destination: Union[str, ButlerURI], transfer: str = "auto",
1232 preserve_path: bool = True,
1233 overwrite: bool = False) -> List[ButlerURI]:
1234 """Retrieve the artifacts associated with the supplied refs.
1236 Parameters
1237 ----------
1238 refs : iterable of `DatasetRef`
1239 The datasets for which artifacts are to be retrieved.
1240 A single ref can result in multiple artifacts. The refs must
1241 be resolved.
1242 destination : `ButlerURI` or `str`
1243 Location to write the artifacts.
1244 transfer : `str`, optional
1245 Method to use to transfer the artifacts. Must be one of the options
1246 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1247 preserve_path : `bool`, optional
1248 If `True` the full path of the artifact within the datastore
1249 is preserved. If `False` the final file component of the path
1250 is used.
1251 overwrite : `bool`, optional
1252 If `True` allow transfers to overwrite existing files at the
1253 destination.
1255 Returns
1256 -------
1257 targets : `list` of `ButlerURI`
1258 URIs of file artifacts in destination location. Order is not
1259 preserved.
1261 Notes
1262 -----
1263 For non-file datastores the artifacts written to the destination
1264 may not match the representation inside the datastore. For example
1265 a hierarchical data structure in a NoSQL database may well be stored
1266 as a JSON file.
1267 """
1268 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer,
1269 preserve_path=preserve_path, overwrite=overwrite)
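# Example (hypothetical collection name and destination): copy the file
# artifacts for a set of datasets out of the datastore.
#
#     refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/test")
#     butler.retrieveArtifacts(refs, destination="/tmp/export", transfer="copy")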
1271 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1272 dataId: Optional[DataId] = None, *,
1273 collections: Any = None,
1274 **kwargs: Any) -> bool:
1275 """Return True if the Dataset is actually present in the Datastore.
1277 Parameters
1278 ----------
1279 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1280 When `DatasetRef` the `dataId` should be `None`.
1281 Otherwise the `DatasetType` or name thereof.
1282 dataId : `dict` or `DataCoordinate`
1283 A `dict` of `Dimension` link name, value pairs that label the
1284 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1285 should be provided as the first argument.
1286 collections : Any, optional
1287 Collections to be searched, overriding ``self.collections``.
1288 Can be any of the types supported by the ``collections`` argument
1289 to butler construction.
1290 **kwargs
1291 Additional keyword arguments used to augment or construct a
1292 `DataCoordinate`. See `DataCoordinate.standardize`
1293 parameters.
1295 Raises
1296 ------
1297 LookupError
1298 Raised if the dataset is not even present in the Registry.
1299 ValueError
1300 Raised if a resolved `DatasetRef` was passed as an input, but it
1301 differs from the one found in the registry.
1302 TypeError
1303 Raised if no collections were provided.
1304 """
1305 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1306 return self.datastore.exists(ref)
1308 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1309 """Remove one or more `~CollectionType.RUN` collections and the
1310 datasets within them.
1312 Parameters
1313 ----------
1314 names : `Iterable` [ `str` ]
1315 The names of the collections to remove.
1316 unstore : `bool`, optional
1317 If `True` (default), delete datasets from all datastores in which
1318 they are present, and attempt to roll back the registry deletions if
1319 datastore deletions fail (which may not always be possible). If
1320 `False`, datastore records for these datasets are still removed,
1321 but any artifacts (e.g. files) will not be.
1323 Raises
1324 ------
1325 TypeError
1326 Raised if one or more collections are not of type
1327 `~CollectionType.RUN`.
1328 """
1329 if not self.isWriteable():
1330 raise TypeError("Butler is read-only.")
1331 names = list(names)
1332 refs: List[DatasetRef] = []
1333 for name in names:
1334 collectionType = self.registry.getCollectionType(name)
1335 if collectionType is not CollectionType.RUN:
1336 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1337 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1338 with self.registry.transaction():
1339 if unstore:
1340 self.datastore.trash(refs)
1341 else:
1342 self.datastore.forget(refs)
1343 for name in names:
1344 self.registry.removeCollection(name)
1345 if unstore:
1346 # Point of no return for removing artifacts
1347 self.datastore.emptyTrash()
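# Example (hypothetical run name): remove a RUN collection together with the
# stored file artifacts of its datasets.
#
#     butler.removeRuns(["u/alice/scratch"], unstore=True)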
1349 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
1350 unlink: Optional[List[str]] = None) -> None:
1351 """Remove a collection and possibly prune datasets within it.
1353 Parameters
1354 ----------
1355 name : `str`
1356 Name of the collection to remove. If this is a
1357 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1358 datasets within the collection are not modified unless ``unstore``
1359 is `True`. If this is a `~CollectionType.RUN` collection,
1360 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1361 are fully removed from the data repository.
1362 purge : `bool`, optional
1363 If `True`, permit `~CollectionType.RUN` collections to be removed,
1364 fully removing datasets within them. Requires ``unstore=True`` as
1365 well as an added precaution against accidental deletion. Must be
1366 `False` (default) if the collection is not a ``RUN``.
1367 unstore : `bool`, optional
1368 If `True`, remove all datasets in the collection from all
1369 datastores in which they appear.
1370 unlink : `list` [`str`], optional
1371 Before removing the given collection, unlink it from these
1372 parent collections.
1374 Raises
1375 ------
1376 TypeError
1377 Raised if the butler is read-only or arguments are mutually
1378 inconsistent.
1379 """
1380 # See pruneDatasets comments for more information about the logic here;
1381 # the cases are almost the same, but here we can rely on Registry to
1382 # take care of everything but Datastore deletion when we remove the
1383 # collection.
1384 if not self.isWriteable():
1385 raise TypeError("Butler is read-only.")
1386 collectionType = self.registry.getCollectionType(name)
1387 if purge and not unstore:
1388 raise PurgeWithoutUnstorePruneCollectionsError()
1389 if collectionType is CollectionType.RUN and not purge:
1390 raise RunWithoutPurgePruneCollectionsError(collectionType)
1391 if collectionType is not CollectionType.RUN and purge:
1392 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1394 def remove(child: str, parent: str) -> None:
1395 """Remove a child collection from a parent collection."""
1396 # Remove child from parent.
1397 chain = list(self.registry.getCollectionChain(parent))
1398 try:
1399 chain.remove(child)
1400 except ValueError as e:
1401 raise RuntimeError(f"{child} is not a child of {parent}") from e
1402 self.registry.setCollectionChain(parent, chain)
1404 with self.registry.transaction():
1405 if unlink:
1406 for parent in unlink:
1407 remove(name, parent)
1408 if unstore:
1409 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1410 self.datastore.trash(refs)
1411 self.registry.removeCollection(name)
1413 if unstore:
1414 # Point of no return for removing artifacts
1415 self.datastore.emptyTrash()
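# Example (hypothetical collection names): fully remove a RUN collection and
# its datasets, or drop a TAGGED collection without touching the datasets.
#
#     butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)
#     butler.pruneCollection("u/alice/tagged-subset")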
1417 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1418 disassociate: bool = True,
1419 unstore: bool = False,
1420 tags: Iterable[str] = (),
1421 purge: bool = False,
1422 run: Optional[str] = None) -> None:
1423 """Remove one or more datasets from a collection and/or storage.
1425 Parameters
1426 ----------
1427 refs : `~collections.abc.Iterable` of `DatasetRef`
1428 Datasets to prune. These must be "resolved" references (not just
1429 a `DatasetType` and data ID).
1430 disassociate : `bool`, optional
1431 Disassociate pruned datasets from ``tags``, or from all collections
1432 if ``purge=True``.
1433 unstore : `bool`, optional
1434 If `True` (`False` is default) remove these datasets from all
1435 datastores known to this butler. Note that this will make it
1436 impossible to retrieve these datasets even via other collections.
1437 Datasets that are already not stored are ignored by this option.
1438 tags : `Iterable` [ `str` ], optional
1439 `~CollectionType.TAGGED` collections to disassociate the datasets
1440 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1441 `True`.
1442 purge : `bool`, optional
1443 If `True` (`False` is default), completely remove the dataset from
1444 the `Registry`. To prevent accidental deletions, ``purge`` may
1445 only be `True` if all of the following conditions are met:
1447 - All given datasets are in the given run;
1448 - ``disassociate`` is `True`;
1449 - ``unstore`` is `True`.
1451 This mode may remove provenance information from datasets other
1452 than those provided, and should be used with extreme care.
1454 Raises
1455 ------
1456 TypeError
1457 Raised if the butler is read-only, if no collection was provided,
1458 or the conditions for ``purge=True`` were not met.
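
Examples
--------
A minimal sketch; the collection and dataset type names are
illustrative::

    refs = list(butler.registry.queryDatasets("calexp", collections="my_run"))
    # Remove the datastore artifacts but keep the registry entries.
    butler.pruneDatasets(refs, disassociate=False, unstore=True)
    # Remove the datasets from the repository entirely.
    butler.pruneDatasets(refs, purge=True, unstore=True, run="my_run")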
1459 """
1460 if not self.isWriteable():
1461 raise TypeError("Butler is read-only.")
1462 if purge:
1463 if not disassociate:
1464 raise TypeError("Cannot pass purge=True without disassociate=True.")
1465 if not unstore:
1466 raise TypeError("Cannot pass purge=True without unstore=True.")
1467 elif disassociate:
1468 tags = tuple(tags)
1469 if not tags:
1470 raise TypeError("No tags provided but disassociate=True.")
1471 for tag in tags:
1472 collectionType = self.registry.getCollectionType(tag)
1473 if collectionType is not CollectionType.TAGGED:
1474 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1475 f"of non-TAGGED type {collectionType.name}.")
1476 # Transform possibly-single-pass iterable into something we can iterate
1477 # over multiple times.
1478 refs = list(refs)
1479 # Pruning a component of a DatasetRef makes no sense since registry
1480 # doesn't know about components and datastore might not store
1481 # components in a separate file
1482 for ref in refs:
1483 if ref.datasetType.component():
1484 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")
1485 # We don't need an unreliable Datastore transaction for this, because
1486 # we've been extra careful to ensure that Datastore.trash only involves
1487 # mutating the Registry (it can _look_ at Datastore-specific things,
1488 # but shouldn't change them), and hence all operations here are
1489 # Registry operations.
1490 with self.registry.transaction():
1491 if unstore:
1492 self.datastore.trash(refs)
1493 if purge:
1494 self.registry.removeDatasets(refs)
1495 elif disassociate:
1496 assert tags, "Guaranteed by earlier logic in this function."
1497 for tag in tags:
1498 self.registry.disassociate(tag, refs)
1499 # We've exited the Registry transaction, and apparently committed.
1500 # (if there was an exception, everything rolled back, and it's as if
1501 # nothing happened - and we never get here).
1502 # Datastore artifacts are not yet gone, but they're clearly marked
1503 # as trash, so if we fail to delete now because of (e.g.) filesystem
1504 # problems we can try again later, and if manual administrative
1505 # intervention is required, it's pretty clear what that should entail:
1506 # deleting everything on disk and in private Datastore tables that is
1507 # in the dataset_location_trash table.
1508 if unstore:
1509 # Point of no return for removing artifacts
1510 self.datastore.emptyTrash()
1512 @transactional
1513 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1514 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1515 ) -> None:
1516 """Store and register one or more datasets that already exist on disk.
1518 Parameters
1519 ----------
1520 datasets : `FileDataset`
1521 Each positional argument is a struct containing information about
1522 a file to be ingested, including its URI (either absolute or
1523 relative to the datastore root, if applicable), a `DatasetRef`,
1524 and optionally a formatter class or its fully-qualified string
1525 name. If a formatter is not provided, the formatter that would be
1526 used for `put` is assumed. On successful return, all
1527 `FileDataset.ref` attributes will have their `DatasetRef.id`
1528 attribute populated and all `FileDataset.formatter` attributes will
1529 be set to the formatter class used. `FileDataset.path` attributes
1530 may be modified to put paths in whatever the datastore considers a
1531 standardized form.
1532 transfer : `str`, optional
1533 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1534 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1535 transfer the file.
1536 run : `str`, optional
1537 The name of the run ingested datasets should be added to,
1538 overriding ``self.run``.
1539 idGenerationMode : `DatasetIdGenEnum`, optional
1540 Specifies option for generating dataset IDs. By default unique IDs
1541 are generated for each inserted dataset.
1543 Raises
1544 ------
1545 TypeError
1546 Raised if the butler is read-only or if no run was provided.
1547 NotImplementedError
1548 Raised if the `Datastore` does not support the given transfer mode.
1549 DatasetTypeNotSupportedError
1550 Raised if one or more files to be ingested have a dataset type that
1551 is not supported by the `Datastore`.
1552 FileNotFoundError
1553 Raised if one of the given files does not exist.
1554 FileExistsError
1555 Raised if transfer is not `None` but the (internal) location the
1556 file would be moved to is already occupied.
1558 Notes
1559 -----
1560 This operation is not fully exception safe: if a database operation
1561 fails, the given `FileDataset` instances may be only partially updated.
1563 It is atomic in terms of database operations (they will either all
1564 succeed or all fail), provided the database engine implements
1565 transactions correctly. It will attempt to be atomic in terms of
1566 filesystem operations as well, but this cannot be implemented
1567 rigorously for most datastores.
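
Examples
--------
A minimal sketch, assuming ``ref`` is a `DatasetRef` describing the
file to be ingested; the path and run name are illustrative::

    from lsst.daf.butler import FileDataset

    dataset = FileDataset(path="data/exposure.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy", run="my_run")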
1568 """
1569 if not self.isWriteable():
1570 raise TypeError("Butler is read-only.")
1571 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1572 # Reorganize the inputs so they're grouped by DatasetType and then
1573 # data ID. We also include a list of DatasetRefs for each FileDataset
1574 # to hold the resolved DatasetRefs returned by the Registry, before
1575 # it's safe to swap them into FileDataset.refs.
1576 # Some type annotation aliases to make that clearer:
1577 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1578 GroupedData = MutableMapping[DatasetType, GroupForType]
1579 # The actual data structure:
1580 groupedData: GroupedData = defaultdict(dict)
1581 # And the nested loop that populates it:
1582 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1583 # This list is intentionally shared across the inner loop, since it's
1584 # associated with `dataset`.
1585 resolvedRefs: List[DatasetRef] = []
1587 # Somewhere to store pre-existing refs if we have an
1588 # execution butler.
1589 existingRefs: List[DatasetRef] = []
1591 for ref in dataset.refs:
1592 if ref.dataId in groupedData[ref.datasetType]:
1593 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same"
1594 " DataId as another ingest dataset"
1595 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1596 f" ({ref.dataId})")
1597 if self._allow_put_of_predefined_dataset:
1598 existing_ref = self.registry.findDataset(ref.datasetType,
1599 dataId=ref.dataId,
1600 collections=run)
1601 if existing_ref:
1602 if self.datastore.knows(existing_ref):
1603 raise ConflictingDefinitionError(f"Dataset associated with path {dataset.path}"
1604 f" already exists as {existing_ref}.")
1605 # Store this ref elsewhere since it already exists
1606 # and we do not want to remake it but we do want
1607 # to store it in the datastore.
1608 existingRefs.append(existing_ref)
1610 # Nothing else to do until we have finished
1611 # iterating.
1612 continue
1614 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1616 if existingRefs:
1618 if len(dataset.refs) != len(existingRefs):
1619 # Keeping track of partially pre-existing datasets is hard
1620 # and should generally never happen. For now don't allow
1621 # it.
1622 raise ConflictingDefinitionError(f"For dataset {dataset.path} some dataIds already exist"
1623 " in registry but others do not. This is not supported.")
1625 # Attach the resolved refs if we found them.
1626 dataset.refs = existingRefs
1628 # Now we can bulk-insert into Registry for each DatasetType.
1629 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(),
1630 desc="Bulk-inserting datasets by type"):
1631 refs = self.registry.insertDatasets(
1632 datasetType,
1633 dataIds=groupForType.keys(),
1634 run=run,
1635 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1636 idGenerationMode=idGenerationMode,
1637 )
1638 # Append those resolved DatasetRefs to the new lists we set up for
1639 # them.
1640 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1641 resolvedRefs.append(ref)
1643 # Go back to the original FileDatasets to replace their refs with the
1644 # new resolved ones.
1645 for groupForType in progress.iter_chunks(groupedData.values(),
1646 desc="Reassociating resolved dataset refs with files"):
1647 for dataset, resolvedRefs in groupForType.values():
1648 dataset.refs = resolvedRefs
1650 # Bulk-insert everything into Datastore.
1651 self.datastore.ingest(*datasets, transfer=transfer)
1653 @contextlib.contextmanager
1654 def export(self, *, directory: Optional[str] = None,
1655 filename: Optional[str] = None,
1656 format: Optional[str] = None,
1657 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1658 """Export datasets from the repository represented by this `Butler`.
1660 This method is a context manager that returns a helper object
1661 (`RepoExportContext`) that is used to indicate what information from
1662 the repository should be exported.
1664 Parameters
1665 ----------
1666 directory : `str`, optional
1667 Directory dataset files should be written to if ``transfer`` is not
1668 `None`.
1669 filename : `str`, optional
1670 Name for the file that will include database information associated
1671 with the exported datasets. If this is not an absolute path and
1672 ``directory`` is not `None`, it will be written to ``directory``
1673 instead of the current working directory. Defaults to
1674 "export.{format}".
1675 format : `str`, optional
1676 File format for the database information file. If `None`, the
1677 extension of ``filename`` will be used.
1678 transfer : `str`, optional
1679 Transfer mode passed to `Datastore.export`.
1681 Raises
1682 ------
1683 TypeError
1684 Raised if the set of arguments passed is inconsistent.
1686 Examples
1687 --------
1688 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1689 methods are used to provide the iterables over data IDs and/or datasets
1690 to be exported::
1692 with butler.export(filename="exports.yaml") as export:
1693 # Export all flats, but none of the dimension element rows
1694 # (i.e. data ID information) associated with them.
1695 export.saveDatasets(butler.registry.queryDatasets("flat"),
1696 elements=())
1697 # Export all datasets that start with "deepCoadd_" and all of
1698 # their associated data ID information.
1699 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1700 """
1701 if directory is None and transfer is not None:
1702 raise TypeError("Cannot transfer without providing a directory.")
1703 if transfer == "move":
1704 raise TypeError("Transfer may not be 'move': export is read-only")
1705 if format is None:
1706 if filename is None:
1707 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1708 else:
1709 _, format = os.path.splitext(filename)
1710 elif filename is None:
1711 filename = f"export.{format}"
1712 if directory is not None:
1713 filename = os.path.join(directory, filename)
1714 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"])
1715 with open(filename, 'w') as stream:
1716 backend = BackendClass(stream)
1717 try:
1718 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1719 directory=directory, transfer=transfer)
1720 yield helper
1721 except BaseException:
1722 raise
1723 else:
1724 helper._finish()
1726 def import_(self, *, directory: Optional[str] = None,
1727 filename: Union[str, TextIO, None] = None,
1728 format: Optional[str] = None,
1729 transfer: Optional[str] = None,
1730 skip_dimensions: Optional[Set] = None,
1731 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1732 reuseIds: bool = False) -> None:
1733 """Import datasets into this repository that were exported from a
1734 different butler repository via `~lsst.daf.butler.Butler.export`.
1736 Parameters
1737 ----------
1738 directory : `str`, optional
1739 Directory containing dataset files to import from. If `None`,
1740 ``filename`` and all dataset file paths specified therein must
1741 be absolute.
1742 filename : `str` or `TextIO`, optional
1743 A stream or name of file that contains database information
1744 associated with the exported datasets, typically generated by
1745 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1746 is not an absolute path, does not exist in the current working
1747 directory, and ``directory`` is not `None`, it is assumed to be in
1748 ``directory``. Defaults to "export.{format}".
1749 format : `str`, optional
1750 File format for ``filename``. If `None`, the extension of
1751 ``filename`` will be used.
1752 transfer : `str`, optional
1753 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1754 skip_dimensions : `set`, optional
1755 Names of dimensions that should be skipped and not imported.
1756 idGenerationMode : `DatasetIdGenEnum`, optional
1757 Specifies option for generating dataset IDs when IDs are not
1758 provided or their type does not match backend type. By default
1759 unique IDs are generated for each inserted dataset.
1760 reuseIds : `bool`, optional
1761 If `True`, force re-use of imported dataset IDs for integer
1762 IDs, which are normally generated as auto-increments; an
1763 exception will be raised if imported IDs clash with existing ones.
1764 This option has no effect on globally-unique IDs, which are
1765 always re-used (or generated if integer IDs are being imported).
1767 Raises
1768 ------
1769 TypeError
1770 Raised if the set of arguments passed is inconsistent, or if the
1771 butler is read-only.
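
Examples
--------
Import a previously exported repository subset; the paths used here
are illustrative::

    butler.import_(directory="exports", filename="export.yaml",
                   transfer="auto")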
1772 """
1773 if not self.isWriteable():
1774 raise TypeError("Butler is read-only.")
1775 if format is None:
1776 if filename is None:
1777 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1778 else:
1779 _, format = os.path.splitext(filename) # type: ignore
1780 elif filename is None:
1781 filename = f"export.{format}"
1782 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1783 filename = os.path.join(directory, filename)
1784 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"])
1786 def doImport(importStream: TextIO) -> None:
1787 backend = BackendClass(importStream, self.registry)
1788 backend.register()
1789 with self.transaction():
1790 backend.load(self.datastore, directory=directory, transfer=transfer,
1791 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode,
1792 reuseIds=reuseIds)
1794 if isinstance(filename, str):
1795 with open(filename, "r") as stream:
1796 doImport(stream)
1797 else:
1798 doImport(filename)
1800 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef],
1801 transfer: str = "auto",
1802 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
1803 skip_missing: bool = True,
1804 register_dataset_types: bool = False) -> List[DatasetRef]:
1805 """Transfer datasets to this Butler from a run in another Butler.
1807 Parameters
1808 ----------
1809 source_butler : `Butler`
1810 Butler from which the datasets are to be transferred.
1811 source_refs : iterable of `DatasetRef`
1812 Datasets defined in the source butler that should be transferred to
1813 this butler.
1814 transfer : `str`, optional
1815 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1816 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
1817 A mapping of dataset type to ID generation mode. Only used if
1818 the source butler is using integer IDs. Should not be used
1819 if the receiving butler uses integer IDs. If not given, dataset
1820 import always uses `DatasetIdGenEnum.UNIQUE`.
1821 skip_missing : `bool`
1822 If `True`, datasets with no datastore artifact associated with
1823 them are not transferred. If `False` a registry entry will be
1824 created even if no datastore record is created (and so will
1825 look equivalent to the dataset being unstored).
1826 register_dataset_types : `bool`
1827 If `True` any missing dataset types are registered. Otherwise
1828 an exception is raised.
1830 Returns
1831 -------
1832 refs : `list` of `DatasetRef`
1833 The refs added to this Butler.
1835 Notes
1836 -----
1837 Requires that any dimension definitions are already present in the
1838 receiving Butler. The datastore artifact has to exist for a transfer
1839 to be made but non-existence is not an error.
1841 Datasets that already exist in this run will be skipped.
1843 The datasets are imported as part of a transaction, although
1844 dataset types are registered before the transaction is started.
1845 This means that it is possible for a dataset type to be registered
1846 even though transfer has failed.
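
Examples
--------
A minimal sketch; the repository path and dataset type name are
illustrative::

    source = Butler("/path/to/source/repo")
    refs = source.registry.queryDatasets("calexp", collections=...)
    butler.transfer_from(source, refs, transfer="copy",
                         register_dataset_types=True)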
1847 """
1848 if not self.isWriteable():
1849 raise TypeError("Butler is read-only.")
1850 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1852 # Will iterate through the refs multiple times so need to convert
1853 # to a list if this isn't a collection.
1854 if not isinstance(source_refs, collections.abc.Collection):
1855 source_refs = list(source_refs)
1857 original_count = len(source_refs)
1858 log.info("Transferring %d datasets into %s", original_count, str(self))
1860 if id_gen_map is None:
1861 id_gen_map = {}
1863 # In some situations the datastore artifact may be missing
1864 # and we do not want that registry entry to be imported.
1865 # Asking datastore is not sufficient, the records may have been
1866 # purged, we have to ask for the (predicted) URI and check
1867 # existence explicitly. Execution butler is set up exactly like
1868 # this with no datastore records.
1869 artifact_existence: Dict[ButlerURI, bool] = {}
1870 if skip_missing:
1871 dataset_existence = source_butler.datastore.mexists(source_refs,
1872 artifact_existence=artifact_existence)
1873 source_refs = [ref for ref, exists in dataset_existence.items() if exists]
1874 filtered_count = len(source_refs)
1875 log.verbose("%d datasets removed because the artifact does not exist. Now have %d.",
1876 original_count - filtered_count, filtered_count)
1878 # Importing requires that we group the refs by dataset type and run
1879 # before doing the import.
1880 source_dataset_types = set()
1881 grouped_refs = defaultdict(list)
1882 grouped_indices = defaultdict(list)
1883 for i, ref in enumerate(source_refs):
1884 grouped_refs[ref.datasetType, ref.run].append(ref)
1885 grouped_indices[ref.datasetType, ref.run].append(i)
1886 source_dataset_types.add(ref.datasetType)
1888 # Check to see if the dataset type in the source butler has
1889 # the same definition in the target butler and register missing
1890 # ones if requested. Registration must happen outside a transaction.
1891 newly_registered_dataset_types = set()
1892 for datasetType in source_dataset_types:
1893 if register_dataset_types:
1894 # Let this raise immediately if inconsistent. Continuing
1895 # on to find additional inconsistent dataset types
1896 # might result in additional unwanted dataset types being
1897 # registered.
1898 if self.registry.registerDatasetType(datasetType):
1899 newly_registered_dataset_types.add(datasetType)
1900 else:
1901 # If the dataset type is missing, let it fail immediately.
1902 target_dataset_type = self.registry.getDatasetType(datasetType.name)
1903 if target_dataset_type != datasetType:
1904 raise ConflictingDefinitionError("Source butler dataset type differs from definition"
1905 f" in target butler: {datasetType} !="
1906 f" {target_dataset_type}")
1907 if newly_registered_dataset_types:
1908 # We may have registered some even if there were inconsistencies
1909 # but should let people know (or else remove them again).
1910 log.log(VERBOSE, "Registered the following dataset types in the target Butler: %s",
1911 ", ".join(d.name for d in newly_registered_dataset_types))
1912 else:
1913 log.log(VERBOSE, "All required dataset types are known to the target Butler")
1915 # The returned refs should be identical for UUIDs.
1916 # For now must also support integers and so need to retain the
1917 # newly-created refs from this registry.
1918 # Pre-size it so we can assign refs into the correct slots
1919 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
1920 default_id_gen = DatasetIdGenEnum.UNIQUE
1922 handled_collections: Set[str] = set()
1924 # Do all the importing in a single transaction.
1925 with self.transaction():
1926 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(),
1927 desc="Importing to registry"
1928 " by run and dataset type"):
1929 if run not in handled_collections:
1930 run_doc = source_butler.registry.getCollectionDocumentation(run)
1931 registered = self.registry.registerRun(run, doc=run_doc)
1932 handled_collections.add(run)
1933 if registered:
1934 log.log(VERBOSE, "Creating output run %s", run)
1936 id_generation_mode = default_id_gen
1937 if isinstance(refs_to_import[0].id, int):
1938 # ID generation mode might need to be overridden when
1939 # targeting UUID
1940 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
1942 n_refs = len(refs_to_import)
1943 log.verbose("Importing %d ref%s of dataset type %s into run %s",
1944 n_refs, "" if n_refs == 1 else "s", datasetType.name, run)
1946 # No way to know if this butler's registry uses UUID.
1947 # We have to trust the caller on this. If it fails they will
1948 # have to change their approach. We can't catch the exception
1949 # and retry with unique because that will mess up the
1950 # transaction handling. We aren't allowed to ask the registry
1951 # manager what type of ID it is using.
1952 imported_refs = self.registry._importDatasets(refs_to_import,
1953 idGenerationMode=id_generation_mode,
1954 expand=False)
1956 # Map them into the correct slots to match the initial order
1957 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
1958 transferred_refs_tmp[i] = ref
1960 # Mypy insists that we might have None in here so we have to make
1961 # that explicit by assigning to a new variable and filtering out
1962 # something that won't be there.
1963 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
1965 # Check consistency
1966 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
1968 log.verbose("Imported %d datasets into destination butler", len(transferred_refs))
1970 # The transferred refs need to be reordered to match the original
1971 # ordering given by the caller. Without this the datastore transfer
1972 # will be broken.
1974 # Ask the datastore to transfer. The datastore has to check that
1975 # the source datastore is compatible with the target datastore.
1976 self.datastore.transfer_from(source_butler.datastore, source_refs,
1977 local_refs=transferred_refs, transfer=transfer,
1978 artifact_existence=artifact_existence)
1980 return transferred_refs
1982 def validateConfiguration(self, logFailures: bool = False,
1983 datasetTypeNames: Optional[Iterable[str]] = None,
1984 ignore: Optional[Iterable[str]] = None) -> None:
1985 """Validate butler configuration.
1987 Checks that each `DatasetType` can be stored in the `Datastore`.
1989 Parameters
1990 ----------
1991 logFailures : `bool`, optional
1992 If `True`, output a log message for every validation error
1993 detected.
1994 datasetTypeNames : iterable of `str`, optional
1995 The `DatasetType` names that should be checked. This allows
1996 only a subset to be selected.
1997 ignore : iterable of `str`, optional
1998 Names of DatasetTypes to skip over. This can be used to skip
1999 known problems. If a named `DatasetType` corresponds to a
2000 composite, all components of that `DatasetType` will also be
2001 ignored.
2003 Raises
2004 ------
2005 ButlerValidationError
2006 Raised if there is some inconsistency with how this Butler
2007 is configured.
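
Examples
--------
Validate every known dataset type, logging each problem found::

    butler.validateConfiguration(logFailures=True)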
2008 """
2009 if datasetTypeNames:
2010 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
2011 else:
2012 datasetTypes = list(self.registry.queryDatasetTypes())
2014 # filter out anything from the ignore list
2015 if ignore:
2016 ignore = set(ignore)
2017 datasetTypes = [e for e in datasetTypes
2018 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
2019 else:
2020 ignore = set()
2022 # Find all the registered instruments
2023 instruments = set(
2024 record.name for record in self.registry.queryDimensionRecords("instrument")
2025 )
2027 # For each datasetType that has an instrument dimension, create
2028 # a DatasetRef for each defined instrument
2029 datasetRefs = []
2031 for datasetType in datasetTypes:
2032 if "instrument" in datasetType.dimensions:
2033 for instrument in instruments:
2034 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
2035 conform=False)
2036 datasetRefs.append(datasetRef)
2038 entities: List[Union[DatasetType, DatasetRef]] = []
2039 entities.extend(datasetTypes)
2040 entities.extend(datasetRefs)
2042 datastoreErrorStr = None
2043 try:
2044 self.datastore.validateConfiguration(entities, logFailures=logFailures)
2045 except ValidationError as e:
2046 datastoreErrorStr = str(e)
2048 # Also check that the LookupKeys used by the datastores match
2049 # registry and storage class definitions
2050 keys = self.datastore.getLookupKeys()
2052 failedNames = set()
2053 failedDataId = set()
2054 for key in keys:
2055 if key.name is not None:
2056 if key.name in ignore:
2057 continue
2059 # skip if specific datasetType names were requested and this
2060 # name does not match
2061 if datasetTypeNames and key.name not in datasetTypeNames:
2062 continue
2064 # See if it is a StorageClass or a DatasetType
2065 if key.name in self.storageClasses:
2066 pass
2067 else:
2068 try:
2069 self.registry.getDatasetType(key.name)
2070 except KeyError:
2071 if logFailures:
2072 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2073 failedNames.add(key)
2074 else:
2075 # Dimensions are checked for consistency when the Butler
2076 # is created and rendezvoused with a universe.
2077 pass
2079 # Check that the instrument is a valid instrument
2080 # Currently only support instrument so check for that
2081 if key.dataId:
2082 dataIdKeys = set(key.dataId)
2083 if dataIdKeys != {"instrument"}:
2084 if logFailures:
2085 log.critical("Key '%s' has unsupported DataId override", key)
2086 failedDataId.add(key)
2087 elif key.dataId["instrument"] not in instruments:
2088 if logFailures:
2089 log.critical("Key '%s' has unknown instrument", key)
2090 failedDataId.add(key)
2092 messages = []
2094 if datastoreErrorStr:
2095 messages.append(datastoreErrorStr)
2097 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2098 (failedDataId, "Keys with bad DataId entries: ")):
2099 if failed:
2100 msg += ", ".join(str(k) for k in failed)
2101 messages.append(msg)
2103 if messages:
2104 raise ValidationError(";\n".join(messages))
2106 @property
2107 def collections(self) -> CollectionSearch:
2108 """The collections to search by default, in order (`CollectionSearch`).
2110 This is an alias for ``self.registry.defaults.collections``. It cannot
2111 be set directly in isolation, but all defaults may be changed together
2112 by assigning a new `RegistryDefaults` instance to
2113 ``self.registry.defaults``.
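
For example, to change the defaults (the collection and run names are
illustrative)::

    from lsst.daf.butler.registry import RegistryDefaults

    butler.registry.defaults = RegistryDefaults(collections=["my_collection"],
                                                run="my_run")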
2114 """
2115 return self.registry.defaults.collections
2117 @property
2118 def run(self) -> Optional[str]:
2119 """Name of the run this butler writes outputs to by default (`str` or
2120 `None`).
2122 This is an alias for ``self.registry.defaults.run``. It cannot be set
2123 directly in isolation, but all defaults may be changed together by
2124 assigning a new `RegistryDefaults` instance to
2125 ``self.registry.defaults``.
2126 """
2127 return self.registry.defaults.run
2129 registry: Registry
2130 """The object that manages dataset metadata and relationships (`Registry`).
2132 Most operations that don't involve reading or writing butler datasets are
2133 accessible only via `Registry` methods.
2134 """
2136 datastore: Datastore
2137 """The object that manages actual dataset storage (`Datastore`).
2139 Direct user access to the datastore should rarely be necessary; the primary
2140 exception is the case where a `Datastore` implementation provides extra
2141 functionality beyond what the base class defines.
2142 """
2144 storageClasses: StorageClassFactory
2145 """An object that maps known storage class names to objects that fully
2146 describe them (`StorageClassFactory`).
2147 """
2149 _allow_put_of_predefined_dataset: bool
2150 """Allow a put to succeed even if there is already a registry entry for it
2151 but not a datastore record. (`bool`)."""