Coverage for python/lsst/daf/butler/_butler.py : 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImport
65from .core import (
66 AmbiguousDatasetError,
67 ButlerURI,
68 Config,
69 ConfigSubset,
70 DataCoordinate,
71 DataId,
72 DataIdValue,
73 DatasetRef,
74 DatasetType,
75 Datastore,
76 Dimension,
77 DimensionConfig,
78 FileDataset,
79 Progress,
80 StorageClassFactory,
81 Timespan,
82 ValidationError,
83 VERBOSE,
84)
85from .core.repoRelocation import BUTLER_ROOT_TAG
86from .core.utils import transactional, getClassOf
87from ._deferredDatasetHandle import DeferredDatasetHandle
88from ._butlerConfig import ButlerConfig
89from .registry import (
90 Registry,
91 RegistryConfig,
92 RegistryDefaults,
93 CollectionSearch,
94 CollectionType,
95 ConflictingDefinitionError,
96 DatasetIdGenEnum,
97)
98from .transfers import RepoExportContext
100log = logging.getLogger(__name__)
103class ButlerValidationError(ValidationError):
104 """There is a problem with the Butler configuration."""
105 pass
108class PruneCollectionsArgsError(TypeError):
109 """Base class for errors relating to Butler.pruneCollections input
110 arguments.
111 """
112 pass
115class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
116 """Raised when purge and unstore are both required to be True, and
117 purge is True but unstore is False.
118 """
120 def __init__(self) -> None:
121 super().__init__("Cannot pass purge=True without unstore=True.")
124class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
125 """Raised when pruning a RUN collection but purge is False."""
127 def __init__(self, collectionType: CollectionType):
128 self.collectionType = collectionType
129 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
132class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
133 """Raised when purge is True but is not supported for the given
134 collection."""
136 def __init__(self, collectionType: CollectionType):
137 self.collectionType = collectionType
138 super().__init__(
139 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
142class Butler:
143 """Main entry point for the data access system.
145 Parameters
146 ----------
147 config : `ButlerConfig`, `Config` or `str`, optional
148 Configuration. Anything acceptable to the
149 `ButlerConfig` constructor. If a directory path
150 is given the configuration will be read from a ``butler.yaml`` file in
151 that location. If `None` is given default values will be used.
152 butler : `Butler`, optional
153 If provided, construct a new Butler that uses the same registry and
154 datastore as the given one, but with the given collection and run.
155 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
156 arguments.
157 collections : `str` or `Iterable` [ `str` ], optional
158 An expression specifying the collections to be searched (in order) when
159 reading datasets.
160 This may be a `str` collection name or an iterable thereof.
161 See :ref:`daf_butler_collection_expressions` for more information.
162 These collections are not registered automatically and must be
163 registered manually before they are used by any method, though the
164 registration may happen after the `Butler` is initialized.
165 run : `str`, optional
166 Name of the `~CollectionType.RUN` collection new datasets should be
167 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
168 ``collections`` will be set to ``[run]``. If not `None`, this
169 collection will automatically be registered. If this is not set (and
170 ``writeable`` is not set either), a read-only butler will be created.
171 searchPaths : `list` of `str`, optional
172 Directory paths to search when calculating the full Butler
173 configuration. Not used if the supplied config is already a
174 `ButlerConfig`.
175 writeable : `bool`, optional
176 Explicitly sets whether the butler supports write operations. If not
177 provided, a read-write butler is created if ``run`` is set, and a
178 read-only butler otherwise.
179 inferDefaults : `bool`, optional
180 If `True` (default) infer default data ID values from the values
181 present in the datasets in ``collections``: if all collections have the
182 same value (or no value) for a governor dimension, that value will be
183 the default for that dimension. Nonexistent collections are ignored.
184 If a default value is provided explicitly for a governor dimension via
185 ``**kwargs``, no default will be inferred for that dimension.
186 **kwargs : `str`
187 Default data ID key-value pairs. These may only identify "governor"
188 dimensions like ``instrument`` and ``skymap``.
190 Examples
191 --------
192 While there are many ways to control exactly how a `Butler` interacts with
193 the collections in its `Registry`, the most common cases are still simple.
195 For a read-only `Butler` that searches one collection, do::
197 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
199 For a read-write `Butler` that writes to and reads from a
200 `~CollectionType.RUN` collection::
202 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
204 The `Butler` passed to a ``PipelineTask`` is often much more complex,
205 because we want to write to one `~CollectionType.RUN` collection but read
206 from several others (as well)::
208 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
209 collections=["u/alice/DM-50000/a",
210 "u/bob/DM-49998",
211 "HSC/defaults"])
213 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
214 Datasets will be read first from that run (since it appears first in the
215 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
217 Finally, one can always create a `Butler` with no collections::
219 butler = Butler("/path/to/repo", writeable=True)
221 This can be extremely useful when you just want to use ``butler.registry``,
222 e.g. for inserting dimension data or managing collections, or when the
223 collections you want to use with the butler are not consistent.
224 Passing ``writeable`` explicitly here is only necessary if you want to be
225 able to make changes to the repo - usually the value for ``writeable`` can
226 be guessed from the collection arguments provided, but it defaults to
227 `False` when there are no collection arguments.
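Default data ID values for governor dimensions can also be supplied as
keyword arguments. A minimal sketch (the instrument name is
illustrative)::

    butler = Butler("/path/to/repo", collections=["HSC/defaults"],
                    instrument="HSC")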
228 """
229 def __init__(self, config: Union[Config, str, None] = None, *,
230 butler: Optional[Butler] = None,
231 collections: Any = None,
232 run: Optional[str] = None,
233 searchPaths: Optional[List[str]] = None,
234 writeable: Optional[bool] = None,
235 inferDefaults: bool = True,
236 **kwargs: str,
237 ):
238 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
239 # Load registry, datastore, etc. from config or existing butler.
240 if butler is not None:
241 if config is not None or searchPaths is not None or writeable is not None:
242 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
243 "arguments with 'butler' argument.")
244 self.registry = butler.registry.copy(defaults)
245 self.datastore = butler.datastore
246 self.storageClasses = butler.storageClasses
247 self._config: ButlerConfig = butler._config
248 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
249 else:
250 self._config = ButlerConfig(config, searchPaths=searchPaths)
251 if "root" in self._config:
252 butlerRoot = self._config["root"]
253 else:
254 butlerRoot = self._config.configDir
255 if writeable is None:
256 writeable = run is not None
257 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
258 defaults=defaults)
259 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
260 butlerRoot=butlerRoot)
261 self.storageClasses = StorageClassFactory()
262 self.storageClasses.addFromConfig(self._config)
263 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset", False)
264 if "run" in self._config or "collection" in self._config:
265 raise ValueError("Passing a run or collection via configuration is no longer supported.")
267 GENERATION: ClassVar[int] = 3
268 """This is a Generation 3 Butler.
270 This attribute may be removed in the future, once the Generation 2 Butler
271 interface has been fully retired; it should only be used in transitional
272 code.
273 """
275 @staticmethod
276 def makeRepo(root: str, config: Union[Config, str, None] = None,
277 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
278 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
279 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
280 """Create an empty data repository by adding a butler.yaml config
281 to a repository root directory.
283 Parameters
284 ----------
285 root : `str` or `ButlerURI`
286 Path or URI to the root location of the new repository. Will be
287 created if it does not exist.
288 config : `Config` or `str`, optional
289 Configuration to write to the repository, after setting any
290 root-dependent Registry or Datastore config options. Can not
291 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
292 configuration will be used. Root-dependent config options
293 specified in this config are overwritten if ``forceConfigRoot``
294 is `True`.
295 dimensionConfig : `Config` or `str`, optional
296 Configuration for dimensions, will be used to initialize registry
297 database.
298 standalone : `bool`
299 If True, write all expanded defaults, not just customized or
300 repository-specific settings.
301 This (mostly) decouples the repository from the default
302 configuration, insulating it from changes to the defaults (which
303 may be good or bad, depending on the nature of the changes).
304 Future *additions* to the defaults will still be picked up when
305 initializing a `Butler` for repos created with ``standalone=True``.
306 searchPaths : `list` of `str`, optional
307 Directory paths to search when calculating the full butler
308 configuration.
309 forceConfigRoot : `bool`, optional
310 If `False`, any values present in the supplied ``config`` that
311 would normally be reset are not overridden and will appear
312 directly in the output config. This allows non-standard overrides
313 of the root directory for a datastore or registry to be given.
314 If this parameter is `True` the values for ``root`` will be
315 forced into the resulting config if appropriate.
316 outfile : `str`, optional
317 If not-`None`, the output configuration will be written to this
318 location rather than into the repository itself. Can be a URI
319 string. Can refer to a directory that will be used to write
320 ``butler.yaml``.
321 overwrite : `bool`, optional
322 Create a new configuration file even if one already exists
323 in the specified output location. Default is to raise
324 an exception.
326 Returns
327 -------
328 config : `Config`
329 The updated `Config` instance written to the repo.
331 Raises
332 ------
333 ValueError
334 Raised if a ButlerConfig or ConfigSubset is passed instead of a
335 regular Config (as these subclasses would make it impossible to
336 support ``standalone=False``).
337 FileExistsError
338 Raised if the output config file already exists.
339 os.error
340 Raised if the directory does not exist, exists but is not a
341 directory, or cannot be created.
343 Notes
344 -----
345 Note that when ``standalone=False`` (the default), the configuration
346 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
347 construct the repository should also be used to construct any Butlers
348 to avoid configuration inconsistencies.
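As a minimal sketch (the path is illustrative), a new repository can be
created and then opened with::

    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)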
349 """
350 if isinstance(config, (ButlerConfig, ConfigSubset)):
351 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
353 # Ensure that the root of the repository exists or can be made
354 uri = ButlerURI(root, forceDirectory=True)
355 uri.mkdir()
357 config = Config(config)
359 # If we are creating a new repo from scratch with relative roots,
360 # do not propagate an explicit root from the config file
361 if "root" in config:
362 del config["root"]
364 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
365 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
366 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
368 # if key exists in given config, parse it, otherwise parse the defaults
369 # in the expanded config
370 if config.get(("registry", "db")):
371 registryConfig = RegistryConfig(config)
372 else:
373 registryConfig = RegistryConfig(full)
374 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
375 if defaultDatabaseUri is not None:
376 Config.updateParameters(RegistryConfig, config, full,
377 toUpdate={"db": defaultDatabaseUri},
378 overwrite=forceConfigRoot)
379 else:
380 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
381 overwrite=forceConfigRoot)
383 if standalone:
384 config.merge(full)
385 else:
386 # Always expand the registry.managers section into the per-repo
387 # config, because after the database schema is created, it's not
388 # allowed to change anymore. Note that in the standalone=True
389 # branch, _everything_ in the config is expanded, so there's no
390 # need to special case this.
391 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
392 configURI: Union[str, ButlerURI]
393 if outfile is not None:
394 # When writing to a separate location we must include
395 # the root of the butler repo in the config else it won't know
396 # where to look.
397 config["root"] = uri.geturl()
398 configURI = outfile
399 else:
400 configURI = uri
401 config.dumpToUri(configURI, overwrite=overwrite)
403 # Create Registry and populate tables
404 registryConfig = RegistryConfig(config.get("registry"))
405 dimensionConfig = DimensionConfig(dimensionConfig)
406 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
408 log.log(VERBOSE, "Wrote new Butler configuration file to %s", configURI)
410 return config
412 @classmethod
413 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
414 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
415 """Callable used to unpickle a Butler.
417 We prefer not to use ``Butler.__init__`` directly so we can force some
418 of its many arguments to be keyword-only (note that ``__reduce__``
419 can only invoke callables with positional arguments).
421 Parameters
422 ----------
423 config : `ButlerConfig`
424 Butler configuration, already coerced into a true `ButlerConfig`
425 instance (and hence after any search paths for overrides have been
426 utilized).
427 collections : `CollectionSearch`
428 Names of the default collections to read from.
429 run : `str`, optional
430 Name of the default `~CollectionType.RUN` collection to write to.
431 defaultDataId : `dict` [ `str`, `str` ]
432 Default data ID values.
433 writeable : `bool`
434 Whether the Butler should support write operations.
436 Returns
437 -------
438 butler : `Butler`
439 A new `Butler` instance.
440 """
441 # MyPy doesn't recognize that the kwargs below are totally valid; it
442 # seems to think ``**defaultDataId`` is a positional argument!
443 return cls(config=config, collections=collections, run=run, writeable=writeable,
444 **defaultDataId) # type: ignore
446 def __reduce__(self) -> tuple:
447 """Support pickling.
448 """
449 return (Butler._unpickle, (self._config, self.collections, self.run,
450 self.registry.defaults.dataId.byName(),
451 self.registry.isWriteable()))
453 def __str__(self) -> str:
454 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
455 self.collections, self.run, self.datastore, self.registry)
457 def isWriteable(self) -> bool:
458 """Return `True` if this `Butler` supports write operations.
459 """
460 return self.registry.isWriteable()
462 @contextlib.contextmanager
463 def transaction(self) -> Iterator[None]:
464 """Context manager supporting `Butler` transactions.
466 Transactions can be nested.
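For example, to make the registry insert and the datastore write of a
`put` atomic (the dataset type and data ID are illustrative)::

    with butler.transaction():
        butler.put(obj, "calexp", dataId)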
467 """
468 with self.registry.transaction():
469 with self.datastore.transaction():
470 yield
472 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
473 dataId: Optional[DataId] = None, **kwargs: Any
474 ) -> Tuple[DatasetType, Optional[DataId]]:
475 """Standardize the arguments passed to several Butler APIs.
477 Parameters
478 ----------
479 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
480 When `DatasetRef` the `dataId` should be `None`.
481 Otherwise the `DatasetType` or name thereof.
482 dataId : `dict` or `DataCoordinate`
483 A `dict` of `Dimension` link name, value pairs that label the
484 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
485 should be provided as the second argument.
486 **kwargs
487 Additional keyword arguments used to augment or construct a
488 `DataCoordinate`. See `DataCoordinate.standardize`
489 parameters.
491 Returns
492 -------
493 datasetType : `DatasetType`
494 A `DatasetType` instance extracted from ``datasetRefOrType``.
495 dataId : `dict` or `DataId`, optional
496 Argument that can be used (along with ``kwargs``) to construct a
497 `DataId`.
499 Notes
500 -----
501 Butler APIs that conceptually need a DatasetRef also allow passing a
502 `DatasetType` (or the name of one) and a `DataId` (or a dict and
503 keyword arguments that can be used to construct one) separately. This
504 method accepts those arguments and always returns a true `DatasetType`
505 and a `DataId` or `dict`.
507 Standardization of `dict` vs `DataId` is best handled by passing the
508 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
509 generally similarly flexible.
510 """
511 externalDatasetType: Optional[DatasetType] = None
512 internalDatasetType: Optional[DatasetType] = None
513 if isinstance(datasetRefOrType, DatasetRef):
514 if dataId is not None or kwargs:
515 raise ValueError("DatasetRef given, cannot use dataId as well")
516 externalDatasetType = datasetRefOrType.datasetType
517 dataId = datasetRefOrType.dataId
518 else:
519 # Don't check whether DataId is provided, because Registry APIs
520 # can usually construct a better error message when it isn't.
521 if isinstance(datasetRefOrType, DatasetType):
522 externalDatasetType = datasetRefOrType
523 else:
524 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
526 # Check that they are self-consistent
527 if externalDatasetType is not None:
528 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
529 if externalDatasetType != internalDatasetType:
530 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
531 f"registry definition ({internalDatasetType})")
533 assert internalDatasetType is not None
534 return internalDatasetType, dataId
536 def _rewrite_data_id(self, dataId: Optional[DataId], datasetType: DatasetType,
537 **kwargs: Any) -> Tuple[Optional[DataId], Dict[str, Any]]:
538 """Rewrite a data ID taking into account dimension records.
540 Take a Data ID and keyword args and rewrite it if necessary to
541 allow the user to specify dimension records rather than dimension
542 primary values.
544 This allows a user to include a dataId dict with keys of
545 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
546 the integer exposure ID. It also allows a string to be given
547 for a dimension value rather than the integer ID if that is more
548 convenient. For example, rather than having to specify the
549 detector with ``detector.full_name``, a string given for ``detector``
550 will be interpreted as the full name and converted to the integer
551 value.
553 Keyword arguments can also use strings for dimensions like detector
554 and exposure but python does not allow them to include ``.`` and
555 so the ``exposure.day_obs`` syntax can not be used in a keyword
556 argument.
558 Parameters
559 ----------
560 dataId : `dict` or `DataCoordinate`
561 A `dict` of `Dimension` link name, value pairs that will label the
562 `DatasetRef` within a Collection.
563 datasetType : `DatasetType`
564 The dataset type associated with this dataId. Required to
565 determine the relevant dimensions.
566 **kwargs
567 Additional keyword arguments used to augment or construct a
568 `DataId`. See `DataId` parameters.
570 Returns
571 -------
572 dataId : `dict` or `DataCoordinate`
573 The (possibly rewritten) dataId. If given a `DataCoordinate` and
574 no keyword arguments, the original dataId will be returned
575 unchanged.
576 **kwargs : `dict`
577 Any unused keyword arguments.
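As an illustrative sketch (dimension and record names are examples
only), a caller might pass::

    dataId = {"instrument": "HSC", "detector": "1_53",
              "exposure.day_obs": 20210405, "exposure.seq_num": 42}

and this method would rewrite it to use the integer ``exposure`` and
``detector`` IDs found from those records.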
578 """
579 # Do nothing if we have a standalone DataCoordinate.
580 if isinstance(dataId, DataCoordinate) and not kwargs:
581 return dataId, kwargs
583 # Process dimension records that are using record information
584 # rather than ids
585 newDataId: Dict[str, DataIdValue] = {}
586 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
588 # If all of the dataId came in via keyword parameters we do not need
589 # to do anything here, because keyword parameters cannot be of the
590 # form exposure.obs_id ("." is not allowed in a keyword parameter).
591 if dataId:
592 for k, v in dataId.items():
593 # If we have a Dimension we do not need to do anything
594 # because it cannot be a compound key.
595 if isinstance(k, str) and "." in k:
596 # Someone is using a more human-readable dataId
597 dimensionName, record = k.split(".", 1)
598 byRecord[dimensionName][record] = v
599 elif isinstance(k, Dimension):
600 newDataId[k.name] = v
601 else:
602 newDataId[k] = v
604 # Go through the updated dataId and check the type in case someone is
605 # using an alternate key. We have already filtered out the compound
606 # keys dimensions.record format.
607 not_dimensions = {}
609 # Will need to look in the dataId and the keyword arguments
610 # and will remove them if they need to be fixed or are unrecognized.
611 for dataIdDict in (newDataId, kwargs):
612 # Use a list so we can adjust the dict safely in the loop
613 for dimensionName in list(dataIdDict):
614 value = dataIdDict[dimensionName]
615 try:
616 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
617 except KeyError:
618 # This is not a real dimension
619 not_dimensions[dimensionName] = value
620 del dataIdDict[dimensionName]
621 continue
623 # Convert an integral type to an explicit int to simplify
624 # comparisons here
625 if isinstance(value, numbers.Integral):
626 value = int(value)
628 if not isinstance(value, dimension.primaryKey.getPythonType()):
629 for alternate in dimension.alternateKeys:
630 if isinstance(value, alternate.getPythonType()):
631 byRecord[dimensionName][alternate.name] = value
632 del dataIdDict[dimensionName]
633 log.debug("Converting dimension %s to %s.%s=%s",
634 dimensionName, dimensionName, alternate.name, value)
635 break
636 else:
637 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
638 "Could not find matching alternative (primary key has type %s) "
639 "so attempting to use as-is.",
640 value, dimensionName, dimension.primaryKey.getPythonType())
642 # If we have some unrecognized dimensions we have to try to connect
643 # them to records in other dimensions. This is made more complicated
644 # by some dimensions having records with clashing names. A mitigation
645 # is that we can tell by this point which dimensions are missing
646 # for the DatasetType but this does not work for calibrations
647 # where additional dimensions can be used to constrain the temporal
648 # axis.
649 if not_dimensions:
650 # Calculate missing dimensions
651 provided = set(newDataId) | set(kwargs) | set(byRecord)
652 missingDimensions = datasetType.dimensions.names - provided
654 # For calibrations we may well be needing temporal dimensions
655 # so rather than always including all dimensions in the scan
656 # restrict things a little. It is still possible for there
657 # to be confusion over day_obs in visit vs exposure for example.
658 # If we are not searching calibration collections things may
659 # fail but they are going to fail anyway because of the
660 # ambiguousness of the dataId...
661 candidateDimensions: Set[str] = set()
662 candidateDimensions.update(missingDimensions)
663 if datasetType.isCalibration():
664 for dim in self.registry.dimensions.getStaticDimensions():
665 if dim.temporal:
666 candidateDimensions.add(str(dim))
668 # Look up table for the first association with a dimension
669 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
671 # Keep track of whether an item is associated with multiple
672 # dimensions.
673 counter: Counter[str] = Counter()
674 assigned: Dict[str, Set[str]] = defaultdict(set)
676 # Go through the missing dimensions and associate the
677 # given names with records within those dimensions
678 for dimensionName in candidateDimensions:
679 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
680 fields = dimension.metadata.names | dimension.uniqueKeys.names
681 for field in not_dimensions:
682 if field in fields:
683 guessedAssociation[dimensionName][field] = not_dimensions[field]
684 counter[dimensionName] += 1
685 assigned[field].add(dimensionName)
687 # There is a chance we have allocated a single dataId item
688 # to multiple dimensions. Need to decide which should be retained.
689 # For now assume that the most popular alternative wins.
690 # This means that day_obs with seq_num will result in
691 # exposure.day_obs and not visit.day_obs
692 # Also prefer an explicitly missing dimension over an inferred
693 # temporal dimension.
694 for fieldName, assignedDimensions in assigned.items():
695 if len(assignedDimensions) > 1:
696 # Pick the most popular (preferring mandatory dimensions)
697 requiredButMissing = assignedDimensions.intersection(missingDimensions)
698 if requiredButMissing:
699 candidateDimensions = requiredButMissing
700 else:
701 candidateDimensions = assignedDimensions
703 # Select the relevant items and get a new restricted
704 # counter.
705 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
706 duplicatesCounter: Counter[str] = Counter()
707 duplicatesCounter.update(theseCounts)
709 # Choose the most common. If they are equally common
710 # we will pick the one that was found first.
711 # (most_common returns a list of (key, count) tuples).
712 selected = duplicatesCounter.most_common(1)[0][0]
714 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
715 " Removed ambiguity by choosing dimension %s.",
716 fieldName, ", ".join(assignedDimensions), selected)
718 for candidateDimension in assignedDimensions:
719 if candidateDimension != selected:
720 del guessedAssociation[candidateDimension][fieldName]
722 # Update the record look up dict with the new associations
723 for dimensionName, values in guessedAssociation.items():
724 if values: # A dict might now be empty
725 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
726 dimensionName, values)
727 byRecord[dimensionName].update(values)
729 if byRecord:
730 # Some record specifiers were found so we need to convert
731 # them to the Id form
732 for dimensionName, values in byRecord.items():
733 if dimensionName in newDataId:
734 log.warning("DataId specified explicit %s dimension value of %s in addition to"
735 " general record specifiers for it of %s. Ignoring record information.",
736 dimensionName, newDataId[dimensionName], str(values))
737 continue
739 # Build up a WHERE expression
740 bind = {k: v for k, v in values.items()}
741 where = " AND ".join(f"{dimensionName}.{k} = {k}"
742 for k in bind)
744 # Hopefully we get a single record that matches
745 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
746 where=where, bind=bind, **kwargs))
748 if len(records) != 1:
749 if len(records) > 1:
750 log.debug("Received %d records from constraints of %s", len(records), str(values))
751 for r in records:
752 log.debug("- %s", str(r))
753 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
754 f" uniquely constrained to a single dataset by {values}."
755 f" Got {len(records)} results.")
756 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
757 f" records when constrained by {values}")
759 # Get the primary key from the real dimension object
760 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
761 if not isinstance(dimension, Dimension):
762 raise RuntimeError(
763 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
764 )
765 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
767 # We have modified the dataId so need to switch to it
768 dataId = newDataId
770 return dataId, kwargs
772 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
773 dataId: Optional[DataId] = None, *,
774 collections: Any = None,
775 allowUnresolved: bool = False,
776 **kwargs: Any) -> DatasetRef:
777 """Shared logic for methods that start with a search for a dataset in
778 the registry.
780 Parameters
781 ----------
782 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
783 When `DatasetRef` the `dataId` should be `None`.
784 Otherwise the `DatasetType` or name thereof.
785 dataId : `dict` or `DataCoordinate`, optional
786 A `dict` of `Dimension` link name, value pairs that label the
787 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
788 should be provided as the first argument.
789 collections : Any, optional
790 Collections to be searched, overriding ``self.collections``.
791 Can be any of the types supported by the ``collections`` argument
792 to butler construction.
793 allowUnresolved : `bool`, optional
794 If `True`, return an unresolved `DatasetRef` if finding a resolved
795 one in the `Registry` fails. Defaults to `False`.
796 **kwargs
797 Additional keyword arguments used to augment or construct a
798 `DataId`. See `DataId` parameters.
800 Returns
801 -------
802 ref : `DatasetRef`
803 A reference to the dataset identified by the given arguments.
805 Raises
806 ------
807 LookupError
808 Raised if no matching dataset exists in the `Registry` (and
809 ``allowUnresolved is False``).
810 ValueError
811 Raised if a resolved `DatasetRef` was passed as an input, but it
812 differs from the one found in the registry.
813 TypeError
814 Raised if no collections were provided.
815 """
816 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
817 if isinstance(datasetRefOrType, DatasetRef):
818 idNumber = datasetRefOrType.id
819 else:
820 idNumber = None
821 timespan: Optional[Timespan] = None
823 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
825 if datasetType.isCalibration():
826 # Because this is a calibration dataset, first try to
827 # standardize the data ID without restricting the dimensions to
828 # those of the dataset type requested, because there may be extra
829 # dimensions that provide temporal information for a validity-range
830 # lookup.
831 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
832 defaults=self.registry.defaults.dataId, **kwargs)
833 if dataId.graph.temporal:
834 dataId = self.registry.expandDataId(dataId)
835 timespan = dataId.timespan
836 else:
837 # Standardize the data ID to just the dimensions of the dataset
838 # type instead of letting registry.findDataset do it, so we get the
839 # result even if no dataset is found.
840 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
841 defaults=self.registry.defaults.dataId, **kwargs)
842 # Always lookup the DatasetRef, even if one is given, to ensure it is
843 # present in the current collection.
844 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
845 if ref is None:
846 if allowUnresolved:
847 return DatasetRef(datasetType, dataId)
848 else:
849 if collections is None:
850 collections = self.registry.defaults.collections
851 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
852 f"could not be found in collections {collections}.")
853 if idNumber is not None and idNumber != ref.id:
854 if collections is None:
855 collections = self.registry.defaults.collections
856 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
857 f"id ({ref.id}) in registry in collections {collections}.")
858 return ref
860 @transactional
861 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
862 dataId: Optional[DataId] = None, *,
863 run: Optional[str] = None,
864 **kwargs: Any) -> DatasetRef:
865 """Store and register a dataset.
867 Parameters
868 ----------
869 obj : `object`
870 The dataset.
871 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
872 When `DatasetRef` is provided, ``dataId`` should be `None`.
873 Otherwise the `DatasetType` or name thereof.
874 dataId : `dict` or `DataCoordinate`
875 A `dict` of `Dimension` link name, value pairs that label the
876 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
877 should be provided as the second argument.
878 run : `str`, optional
879 The name of the run the dataset should be added to, overriding
880 ``self.run``.
881 **kwargs
882 Additional keyword arguments used to augment or construct a
883 `DataCoordinate`. See `DataCoordinate.standardize`
884 parameters.
886 Returns
887 -------
888 ref : `DatasetRef`
889 A reference to the stored dataset, updated with the correct id if
890 given.
892 Raises
893 ------
894 TypeError
895 Raised if the butler is read-only or if no run has been provided.
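For example (the dataset type and data ID keys are illustrative)::

    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(exposure, "calexp",
                     instrument="HSC", visit=903334, detector=42)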
896 """
897 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
898 if not self.isWriteable():
899 raise TypeError("Butler is read-only.")
900 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
901 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
902 raise ValueError("DatasetRef must not be in registry, must have None id")
904 # Add Registry Dataset entry.
905 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
907 # For an execution butler the datasets will be pre-defined.
908 # If the butler is configured that way, datasets should only be inserted
909 # if they do not already exist in registry. Trying and catching
910 # ConflictingDefinitionError will not work because the transaction
911 # will be corrupted. Instead, in this mode always check first.
912 ref = None
913 ref_is_predefined = False
914 if self._allow_put_of_predefined_dataset:
915 # Get the matching ref for this run.
916 ref = self.registry.findDataset(datasetType, collections=run,
917 dataId=dataId)
919 if ref:
920 # Must be expanded form for datastore templating
921 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
922 ref = ref.expanded(dataId)
923 ref_is_predefined = True
925 if not ref:
926 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
928 # If the ref is predefined it is possible that the datastore also
929 # has the record. Asking datastore to put it again will result in
930 # the artifact being recreated, overwriting the previous one; the
931 # subsequent failure to write the record would then cause the artifact
932 # to be removed. Much safer to ask first before attempting to
933 # overwrite. Race conditions should not be an issue for the
934 # execution butler environment.
935 if ref_is_predefined:
936 if self.datastore.knows(ref):
937 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
939 self.datastore.put(obj, ref)
941 return ref
943 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
944 """Retrieve a stored dataset.
946 Unlike `Butler.get`, this method allows datasets outside the Butler's
947 collection to be read as long as the `DatasetRef` that identifies them
948 can be obtained separately.
950 Parameters
951 ----------
952 ref : `DatasetRef`
953 Resolved reference to an already stored dataset.
954 parameters : `dict`
955 Additional StorageClass-defined options to control reading,
956 typically used to efficiently read only a subset of the dataset.
958 Returns
959 -------
960 obj : `object`
961 The dataset.
962 """
963 return self.datastore.get(ref, parameters=parameters)
965 def getDirectDeferred(self, ref: DatasetRef, *,
966 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
967 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
968 from a resolved `DatasetRef`.
970 Parameters
971 ----------
972 ref : `DatasetRef`
973 Resolved reference to an already stored dataset.
974 parameters : `dict`
975 Additional StorageClass-defined options to control reading,
976 typically used to efficiently read only a subset of the dataset.
978 Returns
979 -------
980 obj : `DeferredDatasetHandle`
981 A handle which can be used to retrieve a dataset at a later time.
983 Raises
984 ------
985 AmbiguousDatasetError
986 Raised if ``ref.id is None``, i.e. the reference is unresolved.
987 """
988 if ref.id is None:
989 raise AmbiguousDatasetError(
990 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
991 )
992 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
994 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
995 dataId: Optional[DataId] = None, *,
996 parameters: Union[dict, None] = None,
997 collections: Any = None,
998 **kwargs: Any) -> DeferredDatasetHandle:
999 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
1000 after an immediate registry lookup.
1002 Parameters
1003 ----------
1004 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1005 When `DatasetRef` the `dataId` should be `None`.
1006 Otherwise the `DatasetType` or name thereof.
1007 dataId : `dict` or `DataCoordinate`, optional
1008 A `dict` of `Dimension` link name, value pairs that label the
1009 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1010 should be provided as the first argument.
1011 parameters : `dict`
1012 Additional StorageClass-defined options to control reading,
1013 typically used to efficiently read only a subset of the dataset.
1014 collections : Any, optional
1015 Collections to be searched, overriding ``self.collections``.
1016 Can be any of the types supported by the ``collections`` argument
1017 to butler construction.
1018 **kwargs
1019 Additional keyword arguments used to augment or construct a
1020 `DataId`. See `DataId` parameters.
1022 Returns
1023 -------
1024 obj : `DeferredDatasetHandle`
1025 A handle which can be used to retrieve a dataset at a later time.
1027 Raises
1028 ------
1029 LookupError
1030 Raised if no matching dataset exists in the `Registry` (and
1031 ``allowUnresolved is False``).
1032 ValueError
1033 Raised if a resolved `DatasetRef` was passed as an input, but it
1034 differs from the one found in the registry.
1035 TypeError
1036 Raised if no collections were provided.
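For example (the dataset type and data ID are illustrative)::

    handle = butler.getDeferred("calexp", dataId)
    # Later, when the dataset is actually needed:
    exposure = handle.get()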
1037 """
1038 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1039 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1041 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1042 dataId: Optional[DataId] = None, *,
1043 parameters: Optional[Dict[str, Any]] = None,
1044 collections: Any = None,
1045 **kwargs: Any) -> Any:
1046 """Retrieve a stored dataset.
1048 Parameters
1049 ----------
1050 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1051 When `DatasetRef` the `dataId` should be `None`.
1052 Otherwise the `DatasetType` or name thereof.
1053 dataId : `dict` or `DataCoordinate`
1054 A `dict` of `Dimension` link name, value pairs that label the
1055 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1056 should be provided as the first argument.
1057 parameters : `dict`
1058 Additional StorageClass-defined options to control reading,
1059 typically used to efficiently read only a subset of the dataset.
1060 collections : Any, optional
1061 Collections to be searched, overriding ``self.collections``.
1062 Can be any of the types supported by the ``collections`` argument
1063 to butler construction.
1064 **kwargs
1065 Additional keyword arguments used to augment or construct a
1066 `DataCoordinate`. See `DataCoordinate.standardize`
1067 parameters.
1069 Returns
1070 -------
1071 obj : `object`
1072 The dataset.
1074 Raises
1075 ------
1076 ValueError
1077 Raised if a resolved `DatasetRef` was passed as an input, but it
1078 differs from the one found in the registry.
1079 LookupError
1080 Raised if no matching dataset exists in the `Registry`.
1081 TypeError
1082 Raised if no collections were provided.
1084 Notes
1085 -----
1086 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1087 this method requires that the given data ID include temporal dimensions
1088 beyond the dimensions of the dataset type itself, in order to find the
1089 dataset with the appropriate validity range. For example, a "bias"
1090 dataset with native dimensions ``{instrument, detector}`` could be
1091 fetched with a ``{instrument, detector, exposure}`` data ID, because
1092 ``exposure`` is a temporal dimension.
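Following that example (the instrument and ID values are illustrative)::

    bias = butler.get("bias", instrument="HSC", detector=42,
                      exposure=903334)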
1093 """
1094 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1095 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1096 return self.getDirect(ref, parameters=parameters)
1098 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1099 dataId: Optional[DataId] = None, *,
1100 predict: bool = False,
1101 collections: Any = None,
1102 run: Optional[str] = None,
1103 **kwargs: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1104 """Return the URIs associated with the dataset.
1106 Parameters
1107 ----------
1108 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1109 When `DatasetRef` the `dataId` should be `None`.
1110 Otherwise the `DatasetType` or name thereof.
1111 dataId : `dict` or `DataCoordinate`
1112 A `dict` of `Dimension` link name, value pairs that label the
1113 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1114 should be provided as the first argument.
1115 predict : `bool`
1116 If `True`, allow URIs to be returned of datasets that have not
1117 been written.
1118 collections : Any, optional
1119 Collections to be searched, overriding ``self.collections``.
1120 Can be any of the types supported by the ``collections`` argument
1121 to butler construction.
1122 run : `str`, optional
1123 Run to use for predictions, overriding ``self.run``.
1124 **kwargs
1125 Additional keyword arguments used to augment or construct a
1126 `DataCoordinate`. See `DataCoordinate.standardize`
1127 parameters.
1129 Returns
1130 -------
1131 primary : `ButlerURI`
1132 The URI to the primary artifact associated with this dataset.
1133 If the dataset was disassembled within the datastore this
1134 may be `None`.
1135 components : `dict`
1136 URIs to any components associated with the dataset artifact.
1137 Can be empty if there are no components.
1138 """
1139 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1140 collections=collections, **kwargs)
1141 if ref.id is None: # only possible if predict is True
1142 if run is None:
1143 run = self.run
1144 if run is None:
1145 raise TypeError("Cannot predict location with run=None.")
1146 # Lie about ID, because we can't guess it, and only
1147 # Datastore.getURIs() will ever see it (and it doesn't use it).
1148 ref = ref.resolved(id=0, run=run)
1149 return self.datastore.getURIs(ref, predict)
1151 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1152 dataId: Optional[DataId] = None, *,
1153 predict: bool = False,
1154 collections: Any = None,
1155 run: Optional[str] = None,
1156 **kwargs: Any) -> ButlerURI:
1157 """Return the URI to the Dataset.
1159 Parameters
1160 ----------
1161 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1162 When `DatasetRef` the `dataId` should be `None`.
1163 Otherwise the `DatasetType` or name thereof.
1164 dataId : `dict` or `DataCoordinate`
1165 A `dict` of `Dimension` link name, value pairs that label the
1166 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1167 should be provided as the first argument.
1168 predict : `bool`
1169 If `True`, allow URIs to be returned of datasets that have not
1170 been written.
1171 collections : Any, optional
1172 Collections to be searched, overriding ``self.collections``.
1173 Can be any of the types supported by the ``collections`` argument
1174 to butler construction.
1175 run : `str`, optional
1176 Run to use for predictions, overriding ``self.run``.
1177 **kwargs
1178 Additional keyword arguments used to augment or construct a
1179 `DataCoordinate`. See `DataCoordinate.standardize`
1180 parameters.
1182 Returns
1183 -------
1184 uri : `ButlerURI`
1185 URI pointing to the Dataset within the datastore. If the
1186 Dataset does not exist in the datastore, and if ``predict`` is
1187 `True`, the URI will be a prediction and will include a URI
1188 fragment "#predicted".
1189 If the datastore does not have entities that relate well
1190 to the concept of a URI, the returned URI string will be
1191 descriptive. The returned URI is not guaranteed to be obtainable.
1193 Raises
1194 ------
1195 LookupError
1196 Raised if a URI has been requested for a dataset that does not exist
1197 guessing is not allowed.
1198 ValueError
1199 Raised if a resolved `DatasetRef` was passed as an input, but it
1200 differs from the one found in the registry.
1201 TypeError
1202 Raised if no collections were provided.
1203 RuntimeError
1204 Raised if a URI is requested for a dataset that consists of
1205 multiple artifacts.
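For example (the dataset type, data ID, and run are illustrative)::

    uri = butler.getURI("calexp", dataId)
    predicted = butler.getURI("calexp", dataId, predict=True,
                              run="u/alice/DM-50000/a")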
1206 """
1207 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1208 collections=collections, run=run, **kwargs)
1210 if primary is None or components:
1211 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1212 "Use Butler.getURIs() instead.")
1213 return primary
1215 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1216 destination: Union[str, ButlerURI], transfer: str = "auto",
1217 preserve_path: bool = True,
1218 overwrite: bool = False) -> List[ButlerURI]:
1219 """Retrieve the artifacts associated with the supplied refs.
1221 Parameters
1222 ----------
1223 refs : iterable of `DatasetRef`
1224 The datasets for which artifacts are to be retrieved.
1225 A single ref can result in multiple artifacts. The refs must
1226 be resolved.
1227 destination : `ButlerURI` or `str`
1228 Location to write the artifacts.
1229 transfer : `str`, optional
1230 Method to use to transfer the artifacts. Must be one of the options
1231 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1232 preserve_path : `bool`, optional
1233 If `True` the full path of the artifact within the datastore
1234 is preserved. If `False` the final file component of the path
1235 is used.
1236 overwrite : `bool`, optional
1237 If `True` allow transfers to overwrite existing files at the
1238 destination.
1240 Returns
1241 -------
1242 targets : `list` of `ButlerURI`
1243 URIs of file artifacts in destination location. Order is not
1244 preserved.
1246 Notes
1247 -----
1248 For non-file datastores the artifacts written to the destination
1249 may not match the representation inside the datastore. For example
1250 a hierarchical data structure in a NoSQL database may well be stored
1251 as a JSON file.
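For example, to copy the file artifacts for a set of resolved refs to a
scratch directory (the query and destination are illustrative)::

    refs = butler.registry.queryDatasets("calexp",
                                         collections="u/alice/DM-50000")
    butler.retrieveArtifacts(refs, "/tmp/scratch", transfer="copy")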
1252 """
1253 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer,
1254 preserve_path=preserve_path, overwrite=overwrite)
1256 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1257 dataId: Optional[DataId] = None, *,
1258 collections: Any = None,
1259 **kwargs: Any) -> bool:
1260 """Return True if the Dataset is actually present in the Datastore.
1262 Parameters
1263 ----------
1264 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1265 When `DatasetRef` the `dataId` should be `None`.
1266 Otherwise the `DatasetType` or name thereof.
1267 dataId : `dict` or `DataCoordinate`
1268 A `dict` of `Dimension` link name, value pairs that label the
1269 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1270 should be provided as the first argument.
1271 collections : Any, optional
1272 Collections to be searched, overriding ``self.collections``.
1273 Can be any of the types supported by the ``collections`` argument
1274 to butler construction.
1275 **kwargs
1276 Additional keyword arguments used to augment or construct a
1277 `DataCoordinate`. See `DataCoordinate.standardize`
1278 parameters.
1280 Raises
1281 ------
1282 LookupError
1283 Raised if the dataset is not even present in the Registry.
1284 ValueError
1285 Raised if a resolved `DatasetRef` was passed as an input, but it
1286 differs from the one found in the registry.
1287 TypeError
1288 Raised if no collections were provided.
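For example (the dataset type and data ID are illustrative)::

    if butler.datasetExists("calexp", dataId):
        exposure = butler.get("calexp", dataId)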
1289 """
1290 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
1291 return self.datastore.exists(ref)
1293 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1294 """Remove one or more `~CollectionType.RUN` collections and the
1295 datasets within them.
1297 Parameters
1298 ----------
1299 names : `Iterable` [ `str` ]
1300 The names of the collections to remove.
1301 unstore : `bool`, optional
1302 If `True` (default), delete datasets from all datastores in which
1303 they are present, and attempt to roll back the registry deletions if
1304 datastore deletions fail (which may not always be possible). If
1305 `False`, datastore records for these datasets are still removed,
1306 but any artifacts (e.g. files) will not be.
1308 Raises
1309 ------
1310 TypeError
1311 Raised if one or more collections are not of type
1312 `~CollectionType.RUN`.
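For example, to delete two RUN collections together with their file
artifacts (the collection names are illustrative)::

    butler.removeRuns(["u/alice/scratch1", "u/alice/scratch2"],
                      unstore=True)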
1313 """
1314 if not self.isWriteable():
1315 raise TypeError("Butler is read-only.")
1316 names = list(names)
1317 refs: List[DatasetRef] = []
1318 for name in names:
1319 collectionType = self.registry.getCollectionType(name)
1320 if collectionType is not CollectionType.RUN:
1321 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1322 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1323 with self.registry.transaction():
1324 if unstore:
1325 self.datastore.trash(refs)
1326 else:
1327 self.datastore.forget(refs)
1328 for name in names:
1329 self.registry.removeCollection(name)
1330 if unstore:
1331 # Point of no return for removing artifacts
1332 self.datastore.emptyTrash()
1334 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
1335 unlink: Optional[List[str]] = None) -> None:
1336 """Remove a collection and possibly prune datasets within it.
1338 Parameters
1339 ----------
1340 name : `str`
1341 Name of the collection to remove. If this is a
1342 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1343 datasets within the collection are not modified unless ``unstore``
1344 is `True`. If this is a `~CollectionType.RUN` collection,
1345 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1346 are fully removed from the data repository.
1347 purge : `bool`, optional
1348 If `True`, permit `~CollectionType.RUN` collections to be removed,
1349 fully removing datasets within them. Requires ``unstore=True`` as
1350 well as an added precaution against accidental deletion. Must be
1351 `False` (default) if the collection is not a ``RUN``.
1352 unstore : `bool`, optional
1353 If `True`, remove all datasets in the collection from all
1354 datastores in which they appear.
1355 unlink : `list` [ `str` ], optional
1356 Before removing the given collection, unlink it from these
1357 parent collections.
1359 Raises
1360 ------
1361 TypeError
1362 Raised if the butler is read-only or arguments are mutually
1363 inconsistent.
1364 """
1365 # See pruneDatasets comments for more information about the logic here;
1366 # the cases are almost the same, but here we can rely on Registry to
1367 # take care of everything but Datastore deletion when we remove the
1368 # collection.
1369 if not self.isWriteable():
1370 raise TypeError("Butler is read-only.")
1371 collectionType = self.registry.getCollectionType(name)
1372 if purge and not unstore:
1373 raise PurgeWithoutUnstorePruneCollectionsError()
1374 if collectionType is CollectionType.RUN and not purge:
1375 raise RunWithoutPurgePruneCollectionsError(collectionType)
1376 if collectionType is not CollectionType.RUN and purge:
1377 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1379 def remove(child: str, parent: str) -> None:
1380 """Remove a child collection from a parent collection."""
1381 # Remove child from parent.
1382 chain = list(self.registry.getCollectionChain(parent))
1383 try:
1384 chain.remove(name)
1385 except ValueError as e:
1386 raise RuntimeError(f"{name} is not a child of {parent}") from e
1387 self.registry.setCollectionChain(parent, chain)
1389 with self.registry.transaction():
1390 if unlink:
1391 for parent in unlink:
1392 remove(name, parent)
1393 if unstore:
1394 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1395 self.datastore.trash(refs)
1396 self.registry.removeCollection(name)
1398 if unstore:
1399 # Point of no return for removing artifacts
1400 self.datastore.emptyTrash()
1402 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1403 disassociate: bool = True,
1404 unstore: bool = False,
1405 tags: Iterable[str] = (),
1406 purge: bool = False,
1407 run: Optional[str] = None) -> None:
1408 """Remove one or more datasets from a collection and/or storage.
1410 Parameters
1411 ----------
1412 refs : `~collections.abc.Iterable` of `DatasetRef`
1413 Datasets to prune. These must be "resolved" references (not just
1414 a `DatasetType` and data ID).
1415 disassociate : `bool`, optional
1416 Disassociate pruned datasets from ``tags``, or from all collections
1417 if ``purge=True``.
1418 unstore : `bool`, optional
1419 If `True` (`False` is default) remove these datasets from all
1420 datastores known to this butler. Note that this will make it
1421 impossible to retrieve these datasets even via other collections.
1422 Datasets that are already not stored are ignored by this option.
1423 tags : `Iterable` [ `str` ], optional
1424 `~CollectionType.TAGGED` collections to disassociate the datasets
1425 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1426 `True`.
1427 purge : `bool`, optional
1428 If `True` (`False` is default), completely remove the dataset from
1429 the `Registry`. To prevent accidental deletions, ``purge`` may
1430 only be `True` if all of the following conditions are met:
1432 - all given datasets are in the given run;
1433 - ``disassociate`` is `True`;
1434 - ``unstore`` is `True`.
1436 This mode may remove provenance information from datasets other
1437 than those provided, and should be used with extreme care.
1439 Raises
1440 ------
1441 TypeError
1442 Raised if the butler is read-only, if no tags were provided when
1443 ``disassociate=True``, or if the conditions for ``purge=True`` were not met.
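Examples
--------
A minimal usage sketch, assuming ``butler`` is a writeable `Butler`;
the dataset type name and TAGGED collection name are placeholders::

    # Remove some datasets from a TAGGED collection and delete their
    # stored artifacts, without purging them from the Registry.
    refs = butler.registry.queryDatasets("flat", collections="my/tagged")
    butler.pruneDatasets(refs, disassociate=True, tags=["my/tagged"],
                         unstore=True)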
1444 """
1445 if not self.isWriteable():
1446 raise TypeError("Butler is read-only.")
1447 if purge:
1448 if not disassociate:
1449 raise TypeError("Cannot pass purge=True without disassociate=True.")
1450 if not unstore:
1451 raise TypeError("Cannot pass purge=True without unstore=True.")
1452 elif disassociate:
1453 tags = tuple(tags)
1454 if not tags:
1455 raise TypeError("No tags provided but disassociate=True.")
1456 for tag in tags:
1457 collectionType = self.registry.getCollectionType(tag)
1458 if collectionType is not CollectionType.TAGGED:
1459 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1460 f"of non-TAGGED type {collectionType.name}.")
1461 # Transform possibly-single-pass iterable into something we can iterate
1462 # over multiple times.
1463 refs = list(refs)
1464 # Pruning a component of a DatasetRef makes no sense since registry
1465 # doesn't know about components and datastore might not store
1466 # components in a separate file
1467 for ref in refs:
1468 if ref.datasetType.component():
1469 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1470 # We don't need an unreliable Datastore transaction for this, because
1471 # we've been extra careful to ensure that Datastore.trash only involves
1472 # mutating the Registry (it can _look_ at Datastore-specific things,
1473 # but shouldn't change them), and hence all operations here are
1474 # Registry operations.
1475 with self.registry.transaction():
1476 if unstore:
1477 self.datastore.trash(refs)
1478 if purge:
1479 self.registry.removeDatasets(refs)
1480 elif disassociate:
1481 assert tags, "Guaranteed by earlier logic in this function."
1482 for tag in tags:
1483 self.registry.disassociate(tag, refs)
1484 # We've exited the Registry transaction, and apparently committed.
1485 # (if there was an exception, everything rolled back, and it's as if
1486 # nothing happened - and we never get here).
1487 # Datastore artifacts are not yet gone, but they're clearly marked
1488 # as trash, so if we fail to delete now because of (e.g.) filesystem
1489 # problems we can try again later, and if manual administrative
1490 # intervention is required, it's pretty clear what that should entail:
1491 # deleting everything on disk and in private Datastore tables that is
1492 # in the dataset_location_trash table.
1493 if unstore:
1494 # Point of no return for removing artifacts
1495 self.datastore.emptyTrash()
1497 @transactional
1498 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1499 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1500 ) -> None:
1501 """Store and register one or more datasets that already exist on disk.
1503 Parameters
1504 ----------
1505 datasets : `FileDataset`
1506 Each positional argument is a struct containing information about
1507 a file to be ingested, including its URI (either absolute or
1508 relative to the datastore root, if applicable), a `DatasetRef`,
1509 and optionally a formatter class or its fully-qualified string
1510 name. If a formatter is not provided, the formatter that would be
1511 used for `put` is assumed. On successful return, all
1512 `FileDataset.ref` attributes will have their `DatasetRef.id`
1513 attribute populated and all `FileDataset.formatter` attributes will
1514 be set to the formatter class used. `FileDataset.path` attributes
1515 may be modified to put paths in whatever the datastore considers a
1516 standardized form.
1517 transfer : `str`, optional
1518 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1519 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1520 transfer the file.
1521 run : `str`, optional
1522 The name of the run ingested datasets should be added to,
1523 overriding ``self.run``.
1524 idGenerationMode : `DatasetIdGenEnum`, optional
1525 Specifies option for generating dataset IDs. By default unique IDs
1526 are generated for each inserted dataset.
1528 Raises
1529 ------
1530 TypeError
1531 Raised if the butler is read-only or if no run was provided.
1532 NotImplementedError
1533 Raised if the `Datastore` does not support the given transfer mode.
1534 DatasetTypeNotSupportedError
1535 Raised if one or more files to be ingested have a dataset type that
1536 is not supported by the `Datastore`.
1537 FileNotFoundError
1538 Raised if one of the given files does not exist.
1539 FileExistsError
1540 Raised if transfer is not `None` but the (internal) location the
1541 file would be moved to is already occupied.
1543 Notes
1544 -----
1545 This operation is not fully exception safe: if a database operation
1546 fails, the given `FileDataset` instances may be only partially updated.
1548 It is atomic in terms of database operations (they will either all
1549 succeed or all fail), provided the database engine implements
1550 transactions correctly. It will attempt to be atomic in terms of
1551 filesystem operations as well, but this cannot be implemented
1552 rigorously for most datastores.
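Examples
--------
A minimal sketch, assuming ``butler`` is writeable and that the dataset
type, data ID values, file path and run name are placeholders::

    from lsst.daf.butler import DatasetRef, FileDataset

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType,
                     {"instrument": "HSC", "exposure": 12345, "detector": 0})
    # ``run`` must name an existing (or previously registered) RUN
    # collection.
    butler.ingest(FileDataset(path="/data/raw-12345-0.fits", refs=[ref]),
                  transfer="symlink", run="HSC/raw/all")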
1553 """
1554 if not self.isWriteable():
1555 raise TypeError("Butler is read-only.")
1556 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1557 # Reorganize the inputs so they're grouped by DatasetType and then
1558 # data ID. We also include a list of DatasetRefs for each FileDataset
1559 # to hold the resolved DatasetRefs returned by the Registry, before
1560 # it's safe to swap them into FileDataset.refs.
1561 # Some type annotation aliases to make that clearer:
1562 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1563 GroupedData = MutableMapping[DatasetType, GroupForType]
1564 # The actual data structure:
1565 groupedData: GroupedData = defaultdict(dict)
1566 # And the nested loop that populates it:
1567 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1568 # This list is intentionally shared across the inner loop, since it's
1569 # associated with `dataset`.
1570 resolvedRefs: List[DatasetRef] = []
1572 # Somewhere to store pre-existing refs if we have an
1573 # execution butler.
1574 existingRefs: List[DatasetRef] = []
1576 for ref in dataset.refs:
1577 if ref.dataId in groupedData[ref.datasetType]:
1578 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has same"
1579 " DataId as other ingest dataset"
1580 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1581 f" ({ref.dataId})")
1582 if self._allow_put_of_predefined_dataset:
1583 existing_ref = self.registry.findDataset(ref.datasetType,
1584 dataId=ref.dataId,
1585 collections=run)
1586 if existing_ref:
1587 if self.datastore.knows(existing_ref):
1588 raise ConflictingDefinitionError(f"Dataset associated with path {dataset.path}"
1589 f" already exists as {existing_ref}.")
1590 # Store this ref elsewhere since it already exists
1591 # and we do not want to remake it but we do want
1592 # to store it in the datastore.
1593 existingRefs.append(existing_ref)
1595 # Nothing else to do until we have finished
1596 # iterating.
1597 continue
1599 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1601 if existingRefs:
1603 if len(dataset.refs) != len(existingRefs):
1604 # Keeping track of partially pre-existing datasets is hard
1605 # and should generally never happen. For now don't allow
1606 # it.
1607 raise ConflictingDefinitionError(f"For dataset {dataset.path} some dataIds already exist"
1608 " in registry but others do not. This is not supported.")
1610 # Attach the resolved refs if we found them.
1611 dataset.refs = existingRefs
1613 # Now we can bulk-insert into Registry for each DatasetType.
1614 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(),
1615 desc="Bulk-inserting datasets by type"):
1616 refs = self.registry.insertDatasets(
1617 datasetType,
1618 dataIds=groupForType.keys(),
1619 run=run,
1620 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1621 idGenerationMode=idGenerationMode,
1622 )
1623 # Append those resolved DatasetRefs to the new lists we set up for
1624 # them.
1625 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1626 resolvedRefs.append(ref)
1628 # Go back to the original FileDatasets to replace their refs with the
1629 # new resolved ones.
1630 for groupForType in progress.iter_chunks(groupedData.values(),
1631 desc="Reassociating resolved dataset refs with files"):
1632 for dataset, resolvedRefs in groupForType.values():
1633 dataset.refs = resolvedRefs
1635 # Bulk-insert everything into Datastore.
1636 self.datastore.ingest(*datasets, transfer=transfer)
1638 @contextlib.contextmanager
1639 def export(self, *, directory: Optional[str] = None,
1640 filename: Optional[str] = None,
1641 format: Optional[str] = None,
1642 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1643 """Export datasets from the repository represented by this `Butler`.
1645 This method is a context manager that returns a helper object
1646 (`RepoExportContext`) that is used to indicate what information from
1647 the repository should be exported.
1649 Parameters
1650 ----------
1651 directory : `str`, optional
1652 Directory dataset files should be written to if ``transfer`` is not
1653 `None`.
1654 filename : `str`, optional
1655 Name for the file that will include database information associated
1656 with the exported datasets. If this is not an absolute path and
1657 ``directory`` is not `None`, it will be written to ``directory``
1658 instead of the current working directory. Defaults to
1659 "export.{format}".
1660 format : `str`, optional
1661 File format for the database information file. If `None`, the
1662 extension of ``filename`` will be used.
1663 transfer : `str`, optional
1664 Transfer mode passed to `Datastore.export`.
1666 Raises
1667 ------
1668 TypeError
1669 Raised if the set of arguments passed is inconsistent.
1671 Examples
1672 --------
1673 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1674 methods are used to provide the iterables over data IDs and/or datasets
1675 to be exported::
1677 with butler.export(filename="exports.yaml") as export:
1678 # Export all flats, but none of the dimension element rows
1679 # (i.e. data ID information) associated with them.
1680 export.saveDatasets(butler.registry.queryDatasets("flat"),
1681 elements=())
1682 # Export all datasets that start with "deepCoadd_" and all of
1683 # their associated data ID information.
1684 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1685 """
1686 if directory is None and transfer is not None:
1687 raise TypeError("Cannot transfer without providing a directory.")
1688 if transfer == "move":
1689 raise TypeError("Transfer may not be 'move': export is read-only")
1690 if format is None:
1691 if filename is None:
1692 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1693 else:
1694 _, format = os.path.splitext(filename)
1695 elif filename is None:
1696 filename = f"export.{format}"
1697 if directory is not None:
1698 filename = os.path.join(directory, filename)
1699 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1700 with open(filename, 'w') as stream:
1701 backend = BackendClass(stream)
1702 try:
1703 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1704 directory=directory, transfer=transfer)
1705 yield helper
1706 except BaseException:
1707 raise
1708 else:
1709 helper._finish()
1711 def import_(self, *, directory: Optional[str] = None,
1712 filename: Union[str, TextIO, None] = None,
1713 format: Optional[str] = None,
1714 transfer: Optional[str] = None,
1715 skip_dimensions: Optional[Set] = None,
1716 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1717 reuseIds: bool = False) -> None:
1718 """Import datasets into this repository that were exported from a
1719 different butler repository via `~lsst.daf.butler.Butler.export`.
1721 Parameters
1722 ----------
1723 directory : `str`, optional
1724 Directory containing dataset files to import from. If `None`,
1725 ``filename`` and all dataset file paths specified therein must
1726 be absolute.
1727 filename : `str` or `TextIO`, optional
1728 A stream or name of file that contains database information
1729 associated with the exported datasets, typically generated by
1730 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1731 is not an absolute path, does not exist in the current working
1732 directory, and ``directory`` is not `None`, it is assumed to be in
1733 ``directory``. Defaults to "export.{format}".
1734 format : `str`, optional
1735 File format for ``filename``. If `None`, the extension of
1736 ``filename`` will be used.
1737 transfer : `str`, optional
1738 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1739 skip_dimensions : `set`, optional
1740 Names of dimensions that should be skipped and not imported.
1741 idGenerationMode : `DatasetIdGenEnum`, optional
1742 Specifies option for generating dataset IDs when IDs are not
1743 provided or their type does not match backend type. By default
1744 unique IDs are generated for each inserted dataset.
1745 reuseIds : `bool`, optional
1746 If `True`, force re-use of imported dataset IDs for integer IDs,
1747 which are normally generated as auto-incremented; an exception
1748 will be raised if imported IDs clash with existing ones. This
1749 option has no effect on globally-unique IDs, which are
1750 always re-used (or generated if integer IDs are being imported).
1752 Raises
1753 ------
1754 TypeError
1755 Raised if the set of arguments passed is inconsistent, or if the
1756 butler is read-only.
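Examples
--------
A minimal sketch; the directory and file names are placeholders for an
export previously produced by `~lsst.daf.butler.Butler.export`::

    butler.import_(directory="/path/to/exported",
                   filename="export.yaml",
                   transfer="symlink")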
1757 """
1758 if not self.isWriteable():
1759 raise TypeError("Butler is read-only.")
1760 if format is None:
1761 if filename is None:
1762 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1763 else:
1764 _, format = os.path.splitext(filename) # type: ignore
1765 elif filename is None:
1766 filename = f"export.{format}"
1767 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1768 filename = os.path.join(directory, filename)
1769 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1771 def doImport(importStream: TextIO) -> None:
1772 backend = BackendClass(importStream, self.registry)
1773 backend.register()
1774 with self.transaction():
1775 backend.load(self.datastore, directory=directory, transfer=transfer,
1776 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode,
1777 reuseIds=reuseIds)
1779 if isinstance(filename, str):
1780 with open(filename, "r") as stream:
1781 doImport(stream)
1782 else:
1783 doImport(filename)
1785 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef],
1786 transfer: str = "auto",
1787 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,
1788 skip_missing: bool = True) -> List[DatasetRef]:
1789 """Transfer datasets to this Butler from a run in another Butler.
1791 Parameters
1792 ----------
1793 source_butler : `Butler`
1794 Butler from which the datasets are to be transferred.
1795 source_refs : iterable of `DatasetRef`
1796 Datasets defined in the source butler that should be transferred to
1797 this butler.
1798 transfer : `str`, optional
1799 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1800 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional
1801 A mapping of dataset type to ID generation mode. Only used if
1802 the source butler is using integer IDs. Should not be used
1803 if the receiving butler uses integer IDs. If not given, dataset
1804 import always uses `DatasetIdGenEnum.UNIQUE`.
1805 skip_missing : `bool`, optional
1806 If `True`, datasets with no datastore artifact associated with
1807 them are not transferred.
1809 Returns
1810 -------
1811 refs : `list` of `DatasetRef`
1812 The refs added to this Butler.
1814 Notes
1815 -----
1816 Requires that any dimension definitions are already present in the
1817 receiving Butler. The datastore artifact has to exist for a transfer
1818 to be made, but non-existence is not an error.
1820 Datasets that already exist in this run will be skipped.
1822 The datasets are imported as part of a transaction, although
1823 dataset types are registered before the transaction is started.
1824 This means that it is possible for a dataset type to be registered
1825 even though transfer has failed.
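Examples
--------
A minimal sketch; the source repository path, dataset type and
collection names are placeholders::

    source_butler = Butler("/path/to/source/repo")
    refs = source_butler.registry.queryDatasets("calexp",
                                                collections="HSC/runs/RC2")
    transferred = butler.transfer_from(source_butler, refs,
                                       transfer="copy")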
1826 """
1827 if not self.isWriteable():
1828 raise TypeError("Butler is read-only.")
1829 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1831 # Will iterate through the refs multiple times so need to convert
1832 # to a list if this isn't a collection.
1833 if not isinstance(source_refs, collections.abc.Collection):
1834 source_refs = list(source_refs)
1836 log.info("Transferring %d datasets into %s", len(source_refs), str(self))
1838 if id_gen_map is None:
1839 id_gen_map = {}
1841 # In some situations the datastore artifact may be missing
1842 # and we do not want that registry entry to be imported.
1843 # Asking datastore is not sufficient, the records may have been
1844 # purged, we have to ask for the (predicted) URI and check
1845 # existence explicitly. Execution butler is set up exactly like
1846 # this with no datastore records.
1847 if skip_missing:
1848 source_refs = [ref for ref in source_refs if source_butler.datastore.exists(ref)]
1850 # Importing requires that we group the refs by dataset type and run
1851 # before doing the import.
1852 grouped_refs = defaultdict(list)
1853 grouped_indices = defaultdict(list)
1854 for i, ref in enumerate(source_refs):
1855 grouped_refs[ref.datasetType, ref.run].append(ref)
1856 grouped_indices[ref.datasetType, ref.run].append(i)
1858 # Register any dataset types we need. This has to be done outside
1859 # of a transaction and so will not be rolled back on failure.
1860 for datasetType, _ in grouped_refs:
1861 self.registry.registerDatasetType(datasetType)
1863 # The returned refs should be identical for UUIDs.
1864 # For now must also support integers and so need to retain the
1865 # newly-created refs from this registry.
1866 # Pre-size it so we can assign refs into the correct slots
1867 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
1868 default_id_gen = DatasetIdGenEnum.UNIQUE
1870 # Do all the importing in a single transaction.
1871 with self.transaction():
1872 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(),
1873 desc="Importing to registry"
1874 " by run and dataset type"):
1875 run_doc = source_butler.registry.getCollectionDocumentation(run)
1876 self.registry.registerCollection(run, CollectionType.RUN, doc=run_doc)
1878 id_generation_mode = default_id_gen
1879 if isinstance(refs_to_import[0].id, int):
1880 # ID generation mode might need to be overridden when
1881 # targeting UUID.
1882 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
1884 n_refs = len(refs_to_import)
1885 log.log(VERBOSE, "Importing %d ref%s of dataset type %s into run %s",
1886 n_refs, "" if n_refs == 1 else "s", datasetType.name, run)
1888 # No way to know if this butler's registry uses UUID.
1889 # We have to trust the caller on this. If it fails they will
1890 # have to change their approach. We can't catch the exception
1891 # and retry with unique because that will mess up the
1892 # transaction handling. We aren't allowed to ask the registry
1893 # manager what type of ID it is using.
1894 imported_refs = self.registry._importDatasets(refs_to_import,
1895 idGenerationMode=id_generation_mode,
1896 expand=False)
1898 # Map them into the correct slots to match the initial order
1899 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
1900 transferred_refs_tmp[i] = ref
1902 # Mypy insists that we might have None in here so we have to make
1903 # that explicit by assigning to a new variable and filtering out
1904 # something that won't be there.
1905 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
1907 # Check consistency
1908 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
1910 log.log(VERBOSE, "Imported %d datasets into destination butler", len(transferred_refs))
1912 # The transferred refs need to be reordered to match the original
1913 # ordering given by the caller. Without this the datastore transfer
1914 # will be broken.
1916 # Ask the datastore to transfer. The datastore has to check that
1917 # the source datastore is compatible with the target datastore.
1918 self.datastore.transfer_from(source_butler.datastore, source_refs,
1919 local_refs=transferred_refs, transfer=transfer)
1921 return transferred_refs
1923 def validateConfiguration(self, logFailures: bool = False,
1924 datasetTypeNames: Optional[Iterable[str]] = None,
1925 ignore: Optional[Iterable[str]] = None) -> None:
1926 """Validate butler configuration.
1928 Checks that each `DatasetType` can be stored in the `Datastore`.
1930 Parameters
1931 ----------
1932 logFailures : `bool`, optional
1933 If `True`, output a log message for every validation error
1934 detected.
1935 datasetTypeNames : iterable of `str`, optional
1936 The `DatasetType` names that should be checked. This allows
1937 only a subset to be selected.
1938 ignore : iterable of `str`, optional
1939 Names of DatasetTypes to skip over. This can be used to skip
1940 known problems. If a named `DatasetType` corresponds to a
1941 composite, all components of that `DatasetType` will also be
1942 ignored.
1944 Raises
1945 ------
1946 ButlerValidationError
1947 Raised if there is some inconsistency with how this Butler
1948 is configured.
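Examples
--------
A minimal sketch; the ignored dataset type name is a placeholder::

    butler.validateConfiguration(logFailures=True, ignore=["raw"])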
1949 """
1950 if datasetTypeNames:
1951 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1952 else:
1953 datasetTypes = list(self.registry.queryDatasetTypes())
1955 # filter out anything from the ignore list
1956 if ignore:
1957 ignore = set(ignore)
1958 datasetTypes = [e for e in datasetTypes
1959 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1960 else:
1961 ignore = set()
1963 # Find all the registered instruments
1964 instruments = set(
1965 record.name for record in self.registry.queryDimensionRecords("instrument")
1966 )
1968 # For each datasetType that has an instrument dimension, create
1969 # a DatasetRef for each defined instrument
1970 datasetRefs = []
1972 for datasetType in datasetTypes:
1973 if "instrument" in datasetType.dimensions:
1974 for instrument in instruments:
1975 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1976 conform=False)
1977 datasetRefs.append(datasetRef)
1979 entities: List[Union[DatasetType, DatasetRef]] = []
1980 entities.extend(datasetTypes)
1981 entities.extend(datasetRefs)
1983 datastoreErrorStr = None
1984 try:
1985 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1986 except ValidationError as e:
1987 datastoreErrorStr = str(e)
1989 # Also check that the LookupKeys used by the datastores match
1990 # registry and storage class definitions
1991 keys = self.datastore.getLookupKeys()
1993 failedNames = set()
1994 failedDataId = set()
1995 for key in keys:
1996 if key.name is not None:
1997 if key.name in ignore:
1998 continue
2000 # skip if specific datasetType names were requested and this
2001 # name does not match
2002 if datasetTypeNames and key.name not in datasetTypeNames:
2003 continue
2005 # See if it is a StorageClass or a DatasetType
2006 if key.name in self.storageClasses:
2007 pass
2008 else:
2009 try:
2010 self.registry.getDatasetType(key.name)
2011 except KeyError:
2012 if logFailures:
2013 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
2014 failedNames.add(key)
2015 else:
2016 # Dimensions are checked for consistency when the Butler
2017 # is created and rendezvoused with a universe.
2018 pass
2020 # Check that the instrument is a valid instrument
2021 # Currently we only support the instrument dimension, so check for that
2022 if key.dataId:
2023 dataIdKeys = set(key.dataId)
2024 if set(["instrument"]) != dataIdKeys:
2025 if logFailures:
2026 log.critical("Key '%s' has unsupported DataId override", key)
2027 failedDataId.add(key)
2028 elif key.dataId["instrument"] not in instruments:
2029 if logFailures:
2030 log.critical("Key '%s' has unknown instrument", key)
2031 failedDataId.add(key)
2033 messages = []
2035 if datastoreErrorStr:
2036 messages.append(datastoreErrorStr)
2038 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
2039 (failedDataId, "Keys with bad DataId entries: ")):
2040 if failed:
2041 msg += ", ".join(str(k) for k in failed)
2042 messages.append(msg)
2044 if messages:
2045 raise ValidationError(";\n".join(messages))
2047 @property
2048 def collections(self) -> CollectionSearch:
2049 """The collections to search by default, in order (`CollectionSearch`).
2051 This is an alias for ``self.registry.defaults.collections``. It cannot
2052 be set directly in isolation, but all defaults may be changed together
2053 by assigning a new `RegistryDefaults` instance to
2054 ``self.registry.defaults``.
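For example, to replace the defaults (a sketch; the collection and run
names are placeholders)::

    from lsst.daf.butler.registry import RegistryDefaults

    butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
                                                run="u/someone/run")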
2055 """
2056 return self.registry.defaults.collections
2058 @property
2059 def run(self) -> Optional[str]:
2060 """Name of the run this butler writes outputs to by default (`str` or
2061 `None`).
2063 This is an alias for ``self.registry.defaults.run``. It cannot be set
2064 directly in isolation, but all defaults may be changed together by
2065 assigning a new `RegistryDefaults` instance to
2066 ``self.registry.defaults``.
2067 """
2068 return self.registry.defaults.run
2070 registry: Registry
2071 """The object that manages dataset metadata and relationships (`Registry`).
2073 Most operations that don't involve reading or writing butler datasets are
2074 accessible only via `Registry` methods.
2075 """
2077 datastore: Datastore
2078 """The object that manages actual dataset storage (`Datastore`).
2080 Direct user access to the datastore should rarely be necessary; the primary
2081 exception is the case where a `Datastore` implementation provides extra
2082 functionality beyond what the base class defines.
2083 """
2085 storageClasses: StorageClassFactory
2086 """An object that maps known storage class names to objects that fully
2087 describe them (`StorageClassFactory`).
2088 """
2090 _allow_put_of_predefined_dataset: bool
2091 """Allow a put to succeed even if there is already a registry entry for it
2092 but not a datastore record. (`bool`)."""