Coverage for python/lsst/daf/butler/_butler.py: 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
36import collections.abc
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImport
65from .core import (
66 AmbiguousDatasetError,
67 ButlerURI,
68 Config,
69 ConfigSubset,
70 DataCoordinate,
71 DataId,
72 DataIdValue,
73 DatasetRef,
74 DatasetType,
75 Datastore,
76 Dimension,
77 DimensionConfig,
78 FileDataset,
79 Progress,
80 StorageClassFactory,
81 Timespan,
82 ValidationError,
83 VERBOSE,
84)
85from .core.repoRelocation import BUTLER_ROOT_TAG
86from .core.utils import transactional, getClassOf
87from ._deferredDatasetHandle import DeferredDatasetHandle
88from ._butlerConfig import ButlerConfig
89from .registry import (
90 Registry,
91 RegistryConfig,
92 RegistryDefaults,
93 CollectionSearch,
94 CollectionType,
95 ConflictingDefinitionError,
96 DatasetIdGenEnum,
97)
98from .transfers import RepoExportContext
100log = logging.getLogger(__name__)
103class ButlerValidationError(ValidationError):
104 """There is a problem with the Butler configuration."""
105 pass
108class PruneCollectionsArgsError(TypeError):
109 """Base class for errors relating to Butler.pruneCollections input
110 arguments.
111 """
112 pass
115class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
116 """Raised when purge and unstore are both required to be True, and
117 purge is True but unstore is False.
118 """
120 def __init__(self) -> None:
121 super().__init__("Cannot pass purge=True without unstore=True.")
124class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
125 """Raised when pruning a RUN collection but purge is False."""
127 def __init__(self, collectionType: CollectionType):
128 self.collectionType = collectionType
129 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
132class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
133 """Raised when purge is True but is not supported for the given
134 collection."""
136 def __init__(self, collectionType: CollectionType):
137 self.collectionType = collectionType
138 super().__init__(
139 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
142class Butler:
143 """Main entry point for the data access system.
145 Parameters
146 ----------
147 config : `ButlerConfig`, `Config` or `str`, optional
148 Configuration. Anything acceptable to the
149 `ButlerConfig` constructor. If a directory path
150 is given the configuration will be read from a ``butler.yaml`` file in
151 that location. If `None` is given default values will be used.
152 butler : `Butler`, optional
153 If provided, construct a new Butler that uses the same registry and
154 datastore as the given one, but with the given collection and run.
155 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
156 arguments.
157 collections : `str` or `Iterable` [ `str` ], optional
158 An expression specifying the collections to be searched (in order) when
159 reading datasets.
160 This may be a `str` collection name or an iterable thereof.
161 See :ref:`daf_butler_collection_expressions` for more information.
162 These collections are not registered automatically and must be
163 manually registered before they are used by any method, but they may be
164 manually registered after the `Butler` is initialized.
165 run : `str`, optional
166 Name of the `~CollectionType.RUN` collection new datasets should be
167 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
168 ``collections`` will be set to ``[run]``. If not `None`, this
169 collection will automatically be registered. If this is not set (and
170 ``writeable`` is not set either), a read-only butler will be created.
171 searchPaths : `list` of `str`, optional
172 Directory paths to search when calculating the full Butler
173 configuration. Not used if the supplied config is already a
174 `ButlerConfig`.
175 writeable : `bool`, optional
176 Explicitly sets whether the butler supports write operations. If not
177 provided, a read-write butler is created if any of ``run``, ``tags``,
178 or ``chains`` is non-empty.
179 inferDefaults : `bool`, optional
180 If `True` (default) infer default data ID values from the values
181 present in the datasets in ``collections``: if all collections have the
182 same value (or no value) for a governor dimension, that value will be
183 the default for that dimension. Nonexistent collections are ignored.
184 If a default value is provided explicitly for a governor dimension via
185 ``**kwargs``, no default will be inferred for that dimension.
186 **kwargs : `str`
187 Default data ID key-value pairs. These may only identify "governor"
188 dimensions like ``instrument`` and ``skymap``.
190 Examples
191 --------
192 While there are many ways to control exactly how a `Butler` interacts with
193 the collections in its `Registry`, the most common cases are still simple.
195 For a read-only `Butler` that searches one collection, do::
197 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
199 For a read-write `Butler` that writes to and reads from a
200 `~CollectionType.RUN` collection::
202 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
204 The `Butler` passed to a ``PipelineTask`` is often much more complex,
205 because we want to write to one `~CollectionType.RUN` collection but read
206 from several others (as well)::
208 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
209 collections=["u/alice/DM-50000/a",
210 "u/bob/DM-49998",
211 "HSC/defaults"])
213 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
214 Datasets will be read first from that run (since it appears first in the
215 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
217 Finally, one can always create a `Butler` with no collections::
219 butler = Butler("/path/to/repo", writeable=True)
221 This can be extremely useful when you just want to use ``butler.registry``,
222 e.g. for inserting dimension data or managing collections, or when the
223 collections you want to use with the butler are not consistent.
224 Passing ``writeable`` explicitly here is only necessary if you want to be
225 able to make changes to the repo - usually the value for ``writeable`` can
226 be guessed from the collection arguments provided, but it defaults to
227 `False` when no collection arguments are given.
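If most work is done within a single instrument, a default data ID value
can also be supplied via ``**kwargs`` so it does not need to be repeated in
every call (the instrument name here is purely illustrative)::

    butler = Butler("/path/to/repo", collections="HSC/defaults", instrument="HSC")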
228 """
229 def __init__(self, config: Union[Config, str, None] = None, *,
230 butler: Optional[Butler] = None,
231 collections: Any = None,
232 run: Optional[str] = None,
233 searchPaths: Optional[List[str]] = None,
234 writeable: Optional[bool] = None,
235 inferDefaults: bool = True,
236 **kwargs: str,
237 ):
238 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
239 # Load registry, datastore, etc. from config or existing butler.
240 if butler is not None:
241 if config is not None or searchPaths is not None or writeable is not None:
242 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
243 "arguments with 'butler' argument.")
244 self.registry = butler.registry.copy(defaults)
245 self.datastore = butler.datastore
246 self.storageClasses = butler.storageClasses
247 self._config: ButlerConfig = butler._config
248 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
249 else:
250 self._config = ButlerConfig(config, searchPaths=searchPaths)
251 if "root" in self._config:
252 butlerRoot = self._config["root"]
253 else:
254 butlerRoot = self._config.configDir
255 if writeable is None:
256 writeable = run is not None
257 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
258 defaults=defaults)
259 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
260 butlerRoot=butlerRoot)
261 self.storageClasses = StorageClassFactory()
262 self.storageClasses.addFromConfig(self._config)
263 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset", False)
264 if "run" in self._config or "collection" in self._config:
265 raise ValueError("Passing a run or collection via configuration is no longer supported.")
267 GENERATION: ClassVar[int] = 3
268 """This is a Generation 3 Butler.
270 This attribute may be removed in the future, once the Generation 2 Butler
271 interface has been fully retired; it should only be used in transitional
272 code.
273 """
275 @staticmethod
276 def makeRepo(root: str, config: Union[Config, str, None] = None,
277 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
278 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
279 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
280 """Create an empty data repository by adding a butler.yaml config
281 to a repository root directory.
283 Parameters
284 ----------
285 root : `str` or `ButlerURI`
286 Path or URI to the root location of the new repository. Will be
287 created if it does not exist.
288 config : `Config` or `str`, optional
289 Configuration to write to the repository, after setting any
290 root-dependent Registry or Datastore config options. Can not
291 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
292 configuration will be used. Root-dependent config options
293 specified in this config are overwritten if ``forceConfigRoot``
294 is `True`.
295 dimensionConfig : `Config` or `str`, optional
296 Configuration for dimensions, will be used to initialize registry
297 database.
298 standalone : `bool`
299 If True, write all expanded defaults, not just customized or
300 repository-specific settings.
301 This (mostly) decouples the repository from the default
302 configuration, insulating it from changes to the defaults (which
303 may be good or bad, depending on the nature of the changes).
304 Future *additions* to the defaults will still be picked up when
305 initializing `Butlers` to repos created with ``standalone=True``.
306 searchPaths : `list` of `str`, optional
307 Directory paths to search when calculating the full butler
308 configuration.
309 forceConfigRoot : `bool`, optional
310 If `False`, any values present in the supplied ``config`` that
311 would normally be reset are not overridden and will appear
312 directly in the output config. This allows non-standard overrides
313 of the root directory for a datastore or registry to be given.
314 If this parameter is `True` the values for ``root`` will be
315 forced into the resulting config if appropriate.
316 outfile : `str`, optional
317 If not-`None`, the output configuration will be written to this
318 location rather than into the repository itself. Can be a URI
319 string. Can refer to a directory that will be used to write
320 ``butler.yaml``.
321 overwrite : `bool`, optional
322 Create a new configuration file even if one already exists
323 in the specified output location. Default is to raise
324 an exception.
326 Returns
327 -------
328 config : `Config`
329 The updated `Config` instance written to the repo.
331 Raises
332 ------
333 ValueError
334 Raised if a ButlerConfig or ConfigSubset is passed instead of a
335 regular Config (as these subclasses would make it impossible to
336 support ``standalone=False``).
337 FileExistsError
338 Raised if the output config file already exists.
339 os.error
340 Raised if the directory does not exist, exists but is not a
341 directory, or cannot be created.
343 Notes
344 -----
345 Note that when ``standalone=False`` (the default), the configuration
346 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
347 construct the repository should also be used to construct any Butlers
348 to avoid configuration inconsistencies.
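Examples
--------
A minimal sketch of creating a new repository with the default
configuration and then constructing a writeable butler for it (the path is
illustrative)::

    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)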
349 """
350 if isinstance(config, (ButlerConfig, ConfigSubset)):
351 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
353 # Ensure that the root of the repository exists or can be made
354 uri = ButlerURI(root, forceDirectory=True)
355 uri.mkdir()
357 config = Config(config)
359 # If we are creating a new repo from scratch with relative roots,
360 # do not propagate an explicit root from the config file
361 if "root" in config:
362 del config["root"]
364 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
365 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
366 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
368 # if key exists in given config, parse it, otherwise parse the defaults
369 # in the expanded config
370 if config.get(("registry", "db")):
371 registryConfig = RegistryConfig(config)
372 else:
373 registryConfig = RegistryConfig(full)
374 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
375 if defaultDatabaseUri is not None:
376 Config.updateParameters(RegistryConfig, config, full,
377 toUpdate={"db": defaultDatabaseUri},
378 overwrite=forceConfigRoot)
379 else:
380 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
381 overwrite=forceConfigRoot)
383 if standalone:
384 config.merge(full)
385 else:
386 # Always expand the registry.managers section into the per-repo
387 # config, because after the database schema is created, it's not
388 # allowed to change anymore. Note that in the standalone=True
389 # branch, _everything_ in the config is expanded, so there's no
390 # need to special case this.
391 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
392 configURI: Union[str, ButlerURI]
393 if outfile is not None:
394 # When writing to a separate location we must include
395 # the root of the butler repo in the config else it won't know
396 # where to look.
397 config["root"] = uri.geturl()
398 configURI = outfile
399 else:
400 configURI = uri
401 config.dumpToUri(configURI, overwrite=overwrite)
403 # Create Registry and populate tables
404 registryConfig = RegistryConfig(config.get("registry"))
405 dimensionConfig = DimensionConfig(dimensionConfig)
406 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
408 log.log(VERBOSE, "Wrote new Butler configuration file to %s", configURI)
410 return config
412 @classmethod
413 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
414 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
415 """Callable used to unpickle a Butler.
417 We prefer not to use ``Butler.__init__`` directly so we can force some
418 of its many arguments to be keyword-only (note that ``__reduce__``
419 can only invoke callables with positional arguments).
421 Parameters
422 ----------
423 config : `ButlerConfig`
424 Butler configuration, already coerced into a true `ButlerConfig`
425 instance (and hence after any search paths for overrides have been
426 utilized).
427 collections : `CollectionSearch`
428 Names of the default collections to read from.
429 run : `str`, optional
430 Name of the default `~CollectionType.RUN` collection to write to.
431 defaultDataId : `dict` [ `str`, `str` ]
432 Default data ID values.
433 writeable : `bool`
434 Whether the Butler should support write operations.
436 Returns
437 -------
438 butler : `Butler`
439 A new `Butler` instance.
440 """
441 # MyPy doesn't recognize that the kwargs below are totally valid; it
442 # seems to think ``**defaultDataId`` is a _positional_ argument!
443 return cls(config=config, collections=collections, run=run, writeable=writeable,
444 **defaultDataId) # type: ignore
446 def __reduce__(self) -> tuple:
447 """Support pickling.
448 """
449 return (Butler._unpickle, (self._config, self.collections, self.run,
450 self.registry.defaults.dataId.byName(),
451 self.registry.isWriteable()))
453 def __str__(self) -> str:
454 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
455 self.collections, self.run, self.datastore, self.registry)
457 def isWriteable(self) -> bool:
458 """Return `True` if this `Butler` supports write operations.
459 """
460 return self.registry.isWriteable()
462 @contextlib.contextmanager
463 def transaction(self) -> Iterator[None]:
464 """Context manager supporting `Butler` transactions.
466 Transactions can be nested.
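For example, to make two related writes atomic so that neither dataset is
registered if either `put` fails (the dataset type names and data ID values
are illustrative)::

    with butler.transaction():
        butler.put(catalog, "src", instrument="HSC", visit=903334, detector=20)
        butler.put(background, "calexpBackground", instrument="HSC",
                   visit=903334, detector=20)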
467 """
468 with self.registry.transaction():
469 with self.datastore.transaction():
470 yield
472 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
473 dataId: Optional[DataId] = None, **kwds: Any
474 ) -> Tuple[DatasetType, Optional[DataId]]:
475 """Standardize the arguments passed to several Butler APIs.
477 Parameters
478 ----------
479 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
480 When `DatasetRef` the `dataId` should be `None`.
481 Otherwise the `DatasetType` or name thereof.
482 dataId : `dict` or `DataCoordinate`
483 A `dict` of `Dimension` link name, value pairs that label the
484 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
485 should be provided as the second argument.
486 kwds
487 Additional keyword arguments used to augment or construct a
488 `DataCoordinate`. See `DataCoordinate.standardize`
489 parameters.
491 Returns
492 -------
493 datasetType : `DatasetType`
494 A `DatasetType` instance extracted from ``datasetRefOrType``.
495 dataId : `dict` or `DataId`, optional
496 Argument that can be used (along with ``kwds``) to construct a
497 `DataId`.
499 Notes
500 -----
501 Butler APIs that conceptually need a DatasetRef also allow passing a
502 `DatasetType` (or the name of one) and a `DataId` (or a dict and
503 keyword arguments that can be used to construct one) separately. This
504 method accepts those arguments and always returns a true `DatasetType`
505 and a `DataId` or `dict`.
507 Standardization of `dict` vs `DataId` is best handled by passing the
508 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
509 generally similarly flexible.
510 """
511 externalDatasetType: Optional[DatasetType] = None
512 internalDatasetType: Optional[DatasetType] = None
513 if isinstance(datasetRefOrType, DatasetRef):
514 if dataId is not None or kwds:
515 raise ValueError("DatasetRef given, cannot use dataId as well")
516 externalDatasetType = datasetRefOrType.datasetType
517 dataId = datasetRefOrType.dataId
518 else:
519 # Don't check whether DataId is provided, because Registry APIs
520 # can usually construct a better error message when it wasn't.
521 if isinstance(datasetRefOrType, DatasetType):
522 externalDatasetType = datasetRefOrType
523 else:
524 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
526 # Check that they are self-consistent
527 if externalDatasetType is not None:
528 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
529 if externalDatasetType != internalDatasetType:
530 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
531 f"registry definition ({internalDatasetType})")
533 assert internalDatasetType is not None
534 return internalDatasetType, dataId
536 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
537 dataId: Optional[DataId] = None, *,
538 collections: Any = None,
539 allowUnresolved: bool = False,
540 **kwds: Any) -> DatasetRef:
541 """Shared logic for methods that start with a search for a dataset in
542 the registry.
544 Parameters
545 ----------
546 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
547 When `DatasetRef` the `dataId` should be `None`.
548 Otherwise the `DatasetType` or name thereof.
549 dataId : `dict` or `DataCoordinate`, optional
550 A `dict` of `Dimension` link name, value pairs that label the
551 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
552 should be provided as the first argument.
553 collections : Any, optional
554 Collections to be searched, overriding ``self.collections``.
555 Can be any of the types supported by the ``collections`` argument
556 to butler construction.
557 allowUnresolved : `bool`, optional
558 If `True`, return an unresolved `DatasetRef` if finding a resolved
559 one in the `Registry` fails. Defaults to `False`.
560 kwds
561 Additional keyword arguments used to augment or construct a
562 `DataId`. See `DataId` parameters.
564 Returns
565 -------
566 ref : `DatasetRef`
567 A reference to the dataset identified by the given arguments.
569 Raises
570 ------
571 LookupError
572 Raised if no matching dataset exists in the `Registry` (and
573 ``allowUnresolved is False``).
574 ValueError
575 Raised if a resolved `DatasetRef` was passed as an input, but it
576 differs from the one found in the registry.
577 TypeError
578 Raised if no collections were provided.
579 """
580 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
581 if isinstance(datasetRefOrType, DatasetRef):
582 idNumber = datasetRefOrType.id
583 else:
584 idNumber = None
585 timespan: Optional[Timespan] = None
587 # Process dimension records that are using record information
588 # rather than ids
589 newDataId: Dict[str, DataIdValue] = {}
590 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
592 # If all of the dataId comes from keyword parameters we do not need
593 # to do anything here: keys of the form exposure.obs_id are impossible
594 # because a "." is not allowed in a keyword parameter name.
595 if dataId:
596 for k, v in dataId.items():
597 # If we have a Dimension we do not need to do anything
598 # because it cannot be a compound key.
599 if isinstance(k, str) and "." in k:
600 # Someone is using a more human-readable dataId
601 dimensionName, record = k.split(".", 1)
602 byRecord[dimensionName][record] = v
603 elif isinstance(k, Dimension):
604 newDataId[k.name] = v
605 else:
606 newDataId[k] = v
608 # Go through the updated dataId and check the type in case someone is
609 # using an alternate key. We have already filtered out the compound
610 # keys dimensions.record format.
611 not_dimensions = {}
613 # Will need to look in the dataId and the keyword arguments
614 # and will remove them if they need to be fixed or are unrecognized.
615 for dataIdDict in (newDataId, kwds):
616 # Use a list so we can adjust the dict safely in the loop
617 for dimensionName in list(dataIdDict):
618 value = dataIdDict[dimensionName]
619 try:
620 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
621 except KeyError:
622 # This is not a real dimension
623 not_dimensions[dimensionName] = value
624 del dataIdDict[dimensionName]
625 continue
627 # Convert an integral type to an explicit int to simplify
628 # comparisons here
629 if isinstance(value, numbers.Integral):
630 value = int(value)
632 if not isinstance(value, dimension.primaryKey.getPythonType()):
633 for alternate in dimension.alternateKeys:
634 if isinstance(value, alternate.getPythonType()):
635 byRecord[dimensionName][alternate.name] = value
636 del dataIdDict[dimensionName]
637 log.debug("Converting dimension %s to %s.%s=%s",
638 dimensionName, dimensionName, alternate.name, value)
639 break
640 else:
641 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
642 "Could not find matching alternative (primary key has type %s) "
643 "so attempting to use as-is.",
644 value, dimensionName, dimension.primaryKey.getPythonType())
646 # If we have some unrecognized dimensions we have to try to connect
647 # them to records in other dimensions. This is made more complicated
648 # by some dimensions having records with clashing names. A mitigation
649 # is that we can tell by this point which dimensions are missing
650 # for the DatasetType but this does not work for calibrations
651 # where additional dimensions can be used to constrain the temporal
652 # axis.
653 if not_dimensions:
654 # Calculate missing dimensions
655 provided = set(newDataId) | set(kwds) | set(byRecord)
656 missingDimensions = datasetType.dimensions.names - provided
658 # For calibrations we may well be needing temporal dimensions
659 # so rather than always including all dimensions in the scan
660 # restrict things a little. It is still possible for there
661 # to be confusion over day_obs in visit vs exposure for example.
662 # If we are not searching calibration collections things may
663 # fail but they are going to fail anyway because of the
664 # ambiguousness of the dataId...
665 candidateDimensions: Set[str] = set()
666 candidateDimensions.update(missingDimensions)
667 if datasetType.isCalibration():
668 for dim in self.registry.dimensions.getStaticDimensions():
669 if dim.temporal:
670 candidateDimensions.add(str(dim))
672 # Look up table for the first association with a dimension
673 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
675 # Keep track of whether an item is associated with multiple
676 # dimensions.
677 counter: Counter[str] = Counter()
678 assigned: Dict[str, Set[str]] = defaultdict(set)
680 # Go through the missing dimensions and associate the
681 # given names with records within those dimensions
682 for dimensionName in candidateDimensions:
683 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
684 fields = dimension.metadata.names | dimension.uniqueKeys.names
685 for field in not_dimensions:
686 if field in fields:
687 guessedAssociation[dimensionName][field] = not_dimensions[field]
688 counter[dimensionName] += 1
689 assigned[field].add(dimensionName)
691 # There is a chance we have allocated a single dataId item
692 # to multiple dimensions. Need to decide which should be retained.
693 # For now assume that the most popular alternative wins.
694 # This means that day_obs with seq_num will result in
695 # exposure.day_obs and not visit.day_obs
696 # Also prefer an explicitly missing dimension over an inferred
697 # temporal dimension.
698 for fieldName, assignedDimensions in assigned.items():
699 if len(assignedDimensions) > 1:
700 # Pick the most popular (preferring mandatory dimensions)
701 requiredButMissing = assignedDimensions.intersection(missingDimensions)
702 if requiredButMissing:
703 candidateDimensions = requiredButMissing
704 else:
705 candidateDimensions = assignedDimensions
707 # Select the relevant items and get a new restricted
708 # counter.
709 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
710 duplicatesCounter: Counter[str] = Counter()
711 duplicatesCounter.update(theseCounts)
713 # Choose the most common. If they are equally common
714 # we will pick the one that was found first.
715 # Returns a list of tuples
716 selected = duplicatesCounter.most_common(1)[0][0]
718 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
719 " Removed ambiguity by choosing dimension %s.",
720 fieldName, ", ".join(assignedDimensions), selected)
722 for candidateDimension in assignedDimensions:
723 if candidateDimension != selected:
724 del guessedAssociation[candidateDimension][fieldName]
726 # Update the record look up dict with the new associations
727 for dimensionName, values in guessedAssociation.items():
728 if values: # A dict might now be empty
729 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
730 dimensionName, values)
731 byRecord[dimensionName].update(values)
733 if byRecord:
734 # Some record specifiers were found so we need to convert
735 # them to the Id form
736 for dimensionName, values in byRecord.items():
737 if dimensionName in newDataId:
738 log.warning("DataId specified explicit %s dimension value of %s in addition to"
739 " general record specifiers for it of %s. Ignoring record information.",
740 dimensionName, newDataId[dimensionName], str(values))
741 continue
743 # Build up a WHERE expression -- use single quotes
744 def quote(s: Any) -> str:
745 if isinstance(s, str):
746 return f"'{s}'"
747 else:
748 return s
750 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
751 for k, v in values.items())
753 # Hopefully we get a single record that matches
754 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
755 where=where, **kwds))
757 if len(records) != 1:
758 if len(records) > 1:
759 log.debug("Received %d records from constraints of %s", len(records), str(values))
760 for r in records:
761 log.debug("- %s", str(r))
762 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
763 f" uniquely constrained to a single dataset by {values}."
764 f" Got {len(records)} results.")
765 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
766 f" records when constrained by {values}")
768 # Get the primary key from the real dimension object
769 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
770 if not isinstance(dimension, Dimension):
771 raise RuntimeError(
772 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
773 )
774 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
776 # We have modified the dataId so need to switch to it
777 dataId = newDataId
779 if datasetType.isCalibration():
780 # Because this is a calibration dataset, first try to
781 # standardize the data ID without restricting the dimensions to
782 # those of the dataset type requested, because there may be extra
783 # dimensions that provide temporal information for a validity-range
784 # lookup.
785 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
786 defaults=self.registry.defaults.dataId, **kwds)
787 if dataId.graph.temporal:
788 dataId = self.registry.expandDataId(dataId)
789 timespan = dataId.timespan
790 else:
791 # Standardize the data ID to just the dimensions of the dataset
792 # type instead of letting registry.findDataset do it, so we get the
793 # result even if no dataset is found.
794 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
795 defaults=self.registry.defaults.dataId, **kwds)
796 # Always lookup the DatasetRef, even if one is given, to ensure it is
797 # present in the current collection.
798 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
799 if ref is None:
800 if allowUnresolved:
801 return DatasetRef(datasetType, dataId)
802 else:
803 if collections is None:
804 collections = self.registry.defaults.collections
805 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
806 f"could not be found in collections {collections}.")
807 if idNumber is not None and idNumber != ref.id:
808 if collections is None:
809 collections = self.registry.defaults.collections
810 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
811 f"id ({ref.id}) in registry in collections {collections}.")
812 return ref
814 @transactional
815 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
816 dataId: Optional[DataId] = None, *,
817 run: Optional[str] = None,
818 **kwds: Any) -> DatasetRef:
819 """Store and register a dataset.
821 Parameters
822 ----------
823 obj : `object`
824 The dataset.
825 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
826 When `DatasetRef` is provided, ``dataId`` should be `None`.
827 Otherwise the `DatasetType` or name thereof.
828 dataId : `dict` or `DataCoordinate`
829 A `dict` of `Dimension` link name, value pairs that label the
830 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
831 should be provided as the second argument.
832 run : `str`, optional
833 The name of the run the dataset should be added to, overriding
834 ``self.run``.
835 kwds
836 Additional keyword arguments used to augment or construct a
837 `DataCoordinate`. See `DataCoordinate.standardize`
838 parameters.
840 Returns
841 -------
842 ref : `DatasetRef`
843 A reference to the stored dataset, updated with the correct id if
844 given.
846 Raises
847 ------
848 TypeError
849 Raised if the butler is read-only or if no run has been provided.
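Examples
--------
A sketch of a typical call, assuming a writeable butler with a default run
and an already-registered dataset type (all names and data ID values are
illustrative)::

    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(image, "calexp", instrument="HSC", visit=903334,
                     detector=20)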
850 """
851 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
852 if not self.isWriteable():
853 raise TypeError("Butler is read-only.")
854 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
855 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
856 raise ValueError("DatasetRef must not be in registry, must have None id")
858 # Add Registry Dataset entry.
859 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
861 # For an execution butler the datasets will be pre-defined.
862 # If the butler is configured that way datasets should only be inserted
863 # if they do not already exist in registry. Trying and catching
864 # ConflictingDefinitionError will not work because the transaction
865 # will be corrupted. Instead, in this mode always check first.
866 ref = None
867 ref_is_predefined = False
868 if self._allow_put_of_predefined_dataset:
869 # Get the matching ref for this run.
870 ref = self.registry.findDataset(datasetType, collections=run,
871 dataId=dataId)
873 if ref:
874 # Must be expanded form for datastore templating
875 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
876 ref = ref.expanded(dataId)
877 ref_is_predefined = True
879 if not ref:
880 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
882 # If the ref is predefined it is possible that the datastore also
883 # has the record. Asking the datastore to put it again would recreate
884 # the artifact, overwriting the previous one; the subsequent failure to
885 # write the duplicate record would then cause that artifact to be
886 # removed. It is much safer to ask first before attempting to
887 # overwrite. Race conditions should not be an issue for the
888 # execution butler environment.
889 if ref_is_predefined:
890 if self.datastore.knows(ref):
891 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")
893 self.datastore.put(obj, ref)
895 return ref
897 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
898 """Retrieve a stored dataset.
900 Unlike `Butler.get`, this method allows datasets outside the Butler's
901 collection to be read as long as the `DatasetRef` that identifies them
902 can be obtained separately.
904 Parameters
905 ----------
906 ref : `DatasetRef`
907 Resolved reference to an already stored dataset.
908 parameters : `dict`
909 Additional StorageClass-defined options to control reading,
910 typically used to efficiently read only a subset of the dataset.
912 Returns
913 -------
914 obj : `object`
915 The dataset.
916 """
917 return self.datastore.get(ref, parameters=parameters)
919 def getDirectDeferred(self, ref: DatasetRef, *,
920 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
921 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
922 from a resolved `DatasetRef`.
924 Parameters
925 ----------
926 ref : `DatasetRef`
927 Resolved reference to an already stored dataset.
928 parameters : `dict`
929 Additional StorageClass-defined options to control reading,
930 typically used to efficiently read only a subset of the dataset.
932 Returns
933 -------
934 obj : `DeferredDatasetHandle`
935 A handle which can be used to retrieve a dataset at a later time.
937 Raises
938 ------
939 AmbiguousDatasetError
940 Raised if ``ref.id is None``, i.e. the reference is unresolved.
941 """
942 if ref.id is None:
943 raise AmbiguousDatasetError(
944 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
945 )
946 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
948 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
949 dataId: Optional[DataId] = None, *,
950 parameters: Union[dict, None] = None,
951 collections: Any = None,
952 **kwds: Any) -> DeferredDatasetHandle:
953 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
954 after an immediate registry lookup.
956 Parameters
957 ----------
958 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
959 When `DatasetRef` the `dataId` should be `None`.
960 Otherwise the `DatasetType` or name thereof.
961 dataId : `dict` or `DataCoordinate`, optional
962 A `dict` of `Dimension` link name, value pairs that label the
963 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
964 should be provided as the first argument.
965 parameters : `dict`
966 Additional StorageClass-defined options to control reading,
967 typically used to efficiently read only a subset of the dataset.
968 collections : Any, optional
969 Collections to be searched, overriding ``self.collections``.
970 Can be any of the types supported by the ``collections`` argument
971 to butler construction.
972 kwds
973 Additional keyword arguments used to augment or construct a
974 `DataId`. See `DataId` parameters.
976 Returns
977 -------
978 obj : `DeferredDatasetHandle`
979 A handle which can be used to retrieve a dataset at a later time.
981 Raises
982 ------
983 LookupError
984 Raised if no matching dataset exists in the `Registry`.
986 ValueError
987 Raised if a resolved `DatasetRef` was passed as an input, but it
988 differs from the one found in the registry.
989 TypeError
990 Raised if no collections were provided.
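Examples
--------
A sketch of deferring the read until the handle is used; the dataset type
and data ID values are illustrative, and the handle's ``get`` method is
assumed to perform the actual retrieval::

    handle = butler.getDeferred("calexp", instrument="HSC", visit=903334,
                                detector=20)
    calexp = handle.get()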
991 """
992 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
993 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
995 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
996 dataId: Optional[DataId] = None, *,
997 parameters: Optional[Dict[str, Any]] = None,
998 collections: Any = None,
999 **kwds: Any) -> Any:
1000 """Retrieve a stored dataset.
1002 Parameters
1003 ----------
1004 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1005 When `DatasetRef` the `dataId` should be `None`.
1006 Otherwise the `DatasetType` or name thereof.
1007 dataId : `dict` or `DataCoordinate`
1008 A `dict` of `Dimension` link name, value pairs that label the
1009 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1010 should be provided as the first argument.
1011 parameters : `dict`
1012 Additional StorageClass-defined options to control reading,
1013 typically used to efficiently read only a subset of the dataset.
1014 collections : Any, optional
1015 Collections to be searched, overriding ``self.collections``.
1016 Can be any of the types supported by the ``collections`` argument
1017 to butler construction.
1018 kwds
1019 Additional keyword arguments used to augment or construct a
1020 `DataCoordinate`. See `DataCoordinate.standardize`
1021 parameters.
1023 Returns
1024 -------
1025 obj : `object`
1026 The dataset.
1028 Raises
1029 ------
1030 ValueError
1031 Raised if a resolved `DatasetRef` was passed as an input, but it
1032 differs from the one found in the registry.
1033 LookupError
1034 Raised if no matching dataset exists in the `Registry`.
1035 TypeError
1036 Raised if no collections were provided.
1038 Notes
1039 -----
1040 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1041 this method requires that the given data ID include temporal dimensions
1042 beyond the dimensions of the dataset type itself, in order to find the
1043 dataset with the appropriate validity range. For example, a "bias"
1044 dataset with native dimensions ``{instrument, detector}`` could be
1045 fetched with a ``{instrument, detector, exposure}`` data ID, because
1046 ``exposure`` is a temporal dimension.
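Examples
--------
A sketch of a typical retrieval, assuming the collection exists and
contains the dataset (all names and data ID values are illustrative)::

    butler = Butler("/path/to/repo", collections="HSC/defaults")
    calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=20)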
1047 """
1048 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1049 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1050 return self.getDirect(ref, parameters=parameters)
1052 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1053 dataId: Optional[DataId] = None, *,
1054 predict: bool = False,
1055 collections: Any = None,
1056 run: Optional[str] = None,
1057 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1058 """Returns the URIs associated with the dataset.
1060 Parameters
1061 ----------
1062 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1063 When `DatasetRef` the `dataId` should be `None`.
1064 Otherwise the `DatasetType` or name thereof.
1065 dataId : `dict` or `DataCoordinate`
1066 A `dict` of `Dimension` link name, value pairs that label the
1067 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1068 should be provided as the first argument.
1069 predict : `bool`
1070 If `True`, allow URIs to be returned for datasets that have not
1071 yet been written.
1072 collections : Any, optional
1073 Collections to be searched, overriding ``self.collections``.
1074 Can be any of the types supported by the ``collections`` argument
1075 to butler construction.
1076 run : `str`, optional
1077 Run to use for predictions, overriding ``self.run``.
1078 kwds
1079 Additional keyword arguments used to augment or construct a
1080 `DataCoordinate`. See `DataCoordinate.standardize`
1081 parameters.
1083 Returns
1084 -------
1085 primary : `ButlerURI`
1086 The URI to the primary artifact associated with this dataset.
1087 If the dataset was disassembled within the datastore this
1088 may be `None`.
1089 components : `dict`
1090 URIs to any components associated with the dataset artifact.
1091 Can be empty if there are no components.
1092 """
1093 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1094 collections=collections, **kwds)
1095 if ref.id is None: # only possible if predict is True
1096 if run is None:
1097 run = self.run
1098 if run is None:
1099 raise TypeError("Cannot predict location with run=None.")
1100 # Lie about ID, because we can't guess it, and only
1101 # Datastore.getURIs() will ever see it (and it doesn't use it).
1102 ref = ref.resolved(id=0, run=run)
1103 return self.datastore.getURIs(ref, predict)
1105 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1106 dataId: Optional[DataId] = None, *,
1107 predict: bool = False,
1108 collections: Any = None,
1109 run: Optional[str] = None,
1110 **kwds: Any) -> ButlerURI:
1111 """Return the URI to the Dataset.
1113 Parameters
1114 ----------
1115 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1116 When `DatasetRef` the `dataId` should be `None`.
1117 Otherwise the `DatasetType` or name thereof.
1118 dataId : `dict` or `DataCoordinate`
1119 A `dict` of `Dimension` link name, value pairs that label the
1120 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1121 should be provided as the first argument.
1122 predict : `bool`
1123 If `True`, allow URIs to be returned for datasets that have not
1124 yet been written.
1125 collections : Any, optional
1126 Collections to be searched, overriding ``self.collections``.
1127 Can be any of the types supported by the ``collections`` argument
1128 to butler construction.
1129 run : `str`, optional
1130 Run to use for predictions, overriding ``self.run``.
1131 kwds
1132 Additional keyword arguments used to augment or construct a
1133 `DataCoordinate`. See `DataCoordinate.standardize`
1134 parameters.
1136 Returns
1137 -------
1138 uri : `ButlerURI`
1139 URI pointing to the Dataset within the datastore. If the
1140 Dataset does not exist in the datastore, and if ``predict`` is
1141 `True`, the URI will be a prediction and will include a URI
1142 fragment "#predicted".
1143 If the datastore does not have entities that relate well
1144 to the concept of a URI the returned URI string will be
1145 descriptive. The returned URI is not guaranteed to be obtainable.
1147 Raises
1148 ------
1149 LookupError
1150 A URI has been requested for a dataset that does not exist and
1151 guessing is not allowed.
1152 ValueError
1153 Raised if a resolved `DatasetRef` was passed as an input, but it
1154 differs from the one found in the registry.
1155 TypeError
1156 Raised if no collections were provided.
1157 RuntimeError
1158 Raised if a URI is requested for a dataset that consists of
1159 multiple artifacts.
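Examples
--------
A sketch of looking up the location of a single-artifact dataset (the
dataset type and data ID values are illustrative)::

    uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=20)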
1160 """
1161 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1162 collections=collections, run=run, **kwds)
1164 if primary is None or components:
1165 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1166 "Use Butler.getURIs() instead.")
1167 return primary
1169 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1170 destination: Union[str, ButlerURI], transfer: str = "auto",
1171 preserve_path: bool = True,
1172 overwrite: bool = False) -> List[ButlerURI]:
1173 """Retrieve the artifacts associated with the supplied refs.
1175 Parameters
1176 ----------
1177 refs : iterable of `DatasetRef`
1178 The datasets for which artifacts are to be retrieved.
1179 A single ref can result in multiple artifacts. The refs must
1180 be resolved.
1181 destination : `ButlerURI` or `str`
1182 Location to write the artifacts.
1183 transfer : `str`, optional
1184 Method to use to transfer the artifacts. Must be one of the options
1185 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1186 preserve_path : `bool`, optional
1187 If `True` the full path of the artifact within the datastore
1188 is preserved. If `False` the final file component of the path
1189 is used.
1190 overwrite : `bool`, optional
1191 If `True` allow transfers to overwrite existing files at the
1192 destination.
1194 Returns
1195 -------
1196 targets : `list` of `ButlerURI`
1197 URIs of file artifacts in destination location. Order is not
1198 preserved.
1200 Notes
1201 -----
1202 For non-file datastores the artifacts written to the destination
1203 may not match the representation inside the datastore. For example
1204 a hierarchical data structure in a NoSQL database may well be stored
1205 as a JSON file.
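Examples
--------
A sketch of copying the artifacts for a set of datasets out of the
datastore (the dataset type, collection, and destination are illustrative)::

    refs = butler.registry.queryDatasets("calexp", collections="HSC/defaults")
    paths = butler.retrieveArtifacts(refs, destination="/tmp/calexps",
                                     transfer="copy")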
1206 """
1207 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer,
1208 preserve_path=preserve_path, overwrite=overwrite)
1210 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1211 dataId: Optional[DataId] = None, *,
1212 collections: Any = None,
1213 **kwds: Any) -> bool:
1214 """Return True if the Dataset is actually present in the Datastore.
1216 Parameters
1217 ----------
1218 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1219 When `DatasetRef` the `dataId` should be `None`.
1220 Otherwise the `DatasetType` or name thereof.
1221 dataId : `dict` or `DataCoordinate`
1222 A `dict` of `Dimension` link name, value pairs that label the
1223 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1224 should be provided as the first argument.
1225 collections : Any, optional
1226 Collections to be searched, overriding ``self.collections``.
1227 Can be any of the types supported by the ``collections`` argument
1228 to butler construction.
1229 kwds
1230 Additional keyword arguments used to augment or construct a
1231 `DataCoordinate`. See `DataCoordinate.standardize`
1232 parameters.
1234 Raises
1235 ------
1236 LookupError
1237 Raised if the dataset is not even present in the Registry.
1238 ValueError
1239 Raised if a resolved `DatasetRef` was passed as an input, but it
1240 differs from the one found in the registry.
1241 TypeError
1242 Raised if no collections were provided.
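Examples
--------
A sketch of checking that an artifact is actually present in the datastore
before attempting to read it (names and data ID values are illustrative)::

    if butler.datasetExists("calexp", instrument="HSC", visit=903334,
                            detector=20):
        calexp = butler.get("calexp", instrument="HSC", visit=903334,
                            detector=20)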
1243 """
1244 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1245 return self.datastore.exists(ref)
1247 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1248 """Remove one or more `~CollectionType.RUN` collections and the
1249 datasets within them.
1251 Parameters
1252 ----------
1253 names : `Iterable` [ `str` ]
1254 The names of the collections to remove.
1255 unstore : `bool`, optional
1256 If `True` (default), delete datasets from all datastores in which
1257 they are present, and attempt to roll back the registry deletions if
1258 datastore deletions fail (which may not always be possible). If
1259 `False`, datastore records for these datasets are still removed,
1260 but any artifacts (e.g. files) will not be.
1262 Raises
1263 ------
1264 TypeError
1265 Raised if one or more collections are not of type
1266 `~CollectionType.RUN`.
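Examples
--------
A sketch of deleting a scratch `~CollectionType.RUN` collection together
with its stored artifacts (the collection name is illustrative)::

    butler.removeRuns(["u/alice/DM-50000/scratch"], unstore=True)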
1267 """
1268 if not self.isWriteable():
1269 raise TypeError("Butler is read-only.")
1270 names = list(names)
1271 refs: List[DatasetRef] = []
1272 for name in names:
1273 collectionType = self.registry.getCollectionType(name)
1274 if collectionType is not CollectionType.RUN:
1275 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1276 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1277 with self.registry.transaction():
1278 if unstore:
1279 self.datastore.trash(refs)
1280 else:
1281 self.datastore.forget(refs)
1282 for name in names:
1283 self.registry.removeCollection(name)
1284 if unstore:
1285 # Point of no return for removing artifacts
1286 self.datastore.emptyTrash()
1288 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
1289 unlink: Optional[List[str]] = None) -> None:
1290 """Remove a collection and possibly prune datasets within it.
1292 Parameters
1293 ----------
1294 name : `str`
1295 Name of the collection to remove. If this is a
1296 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1297 datasets within the collection are not modified unless ``unstore``
1298 is `True`. If this is a `~CollectionType.RUN` collection,
1299 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1300 are fully removed from the data repository.
1301 purge : `bool`, optional
1302 If `True`, permit `~CollectionType.RUN` collections to be removed,
1303 fully removing datasets within them. Requires ``unstore=True`` as
1304 well, as an added precaution against accidental deletion. Must be
1305 `False` (default) if the collection is not a ``RUN``.
1306 unstore : `bool`, optional
1307 If `True`, remove all datasets in the collection from all
1308 datastores in which they appear.
1309 unlink : `list` [ `str` ], optional
1310 Before removing the given collection, unlink it from these
1311 parent collections.
1313 Raises
1314 ------
1315 TypeError
1316 Raised if the butler is read-only or arguments are mutually
1317 inconsistent.
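Examples
--------
A sketch of fully deleting a `~CollectionType.RUN` collection and the
datasets within it, which requires both ``purge`` and ``unstore`` (the
collection name is illustrative)::

    butler.pruneCollection("u/alice/DM-50000/a", purge=True, unstore=True)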
1318 """
1319 # See pruneDatasets comments for more information about the logic here;
1320 # the cases are almost the same, but here we can rely on Registry to
1321 take care of everything but Datastore deletion when we remove the
1322 # collection.
1323 if not self.isWriteable():
1324 raise TypeError("Butler is read-only.")
1325 collectionType = self.registry.getCollectionType(name)
1326 if purge and not unstore:
1327 raise PurgeWithoutUnstorePruneCollectionsError()
1328 if collectionType is CollectionType.RUN and not purge:
1329 raise RunWithoutPurgePruneCollectionsError(collectionType)
1330 if collectionType is not CollectionType.RUN and purge:
1331 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1333 def remove(child: str, parent: str) -> None:
1334 """Remove a child collection from a parent collection."""
1335 # Remove child from parent.
1336 chain = list(self.registry.getCollectionChain(parent))
1337 try:
1338 chain.remove(child)
1339 except ValueError as e:
1340 raise RuntimeError(f"{child} is not a child of {parent}") from e
1341 self.registry.setCollectionChain(parent, chain)
1343 with self.registry.transaction():
1344 if unlink:
1345 for parent in unlink:
1346 remove(name, parent)
1347 if unstore:
1348 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1349 self.datastore.trash(refs)
1350 self.registry.removeCollection(name)
1352 if unstore:
1353 # Point of no return for removing artifacts
1354 self.datastore.emptyTrash()
1356 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1357 disassociate: bool = True,
1358 unstore: bool = False,
1359 tags: Iterable[str] = (),
1360 purge: bool = False,
1361 run: Optional[str] = None) -> None:
1362 """Remove one or more datasets from a collection and/or storage.
1364 Parameters
1365 ----------
1366 refs : `~collections.abc.Iterable` of `DatasetRef`
1367 Datasets to prune. These must be "resolved" references (not just
1368 a `DatasetType` and data ID).
1369 disassociate : `bool`, optional
1370 Disassociate pruned datasets from ``tags``, or from all collections
1371 if ``purge=True``.
1372 unstore : `bool`, optional
1373 If `True` (`False` is default) remove these datasets from all
1374 datastores known to this butler. Note that this will make it
1375 impossible to retrieve these datasets even via other collections.
1376 Datasets that are already not stored are ignored by this option.
1377 tags : `Iterable` [ `str` ], optional
1378 `~CollectionType.TAGGED` collections to disassociate the datasets
1379 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1380 `True`.
1381 purge : `bool`, optional
1382 If `True` (`False` is default), completely remove the dataset from
1383 the `Registry`. To prevent accidental deletions, ``purge`` may
1384 only be `True` if all of the following conditions are met:
1386 - All given datasets are in the given run;
1387 - ``disassociate`` is `True`;
1388 - ``unstore`` is `True`.
1390 This mode may remove provenance information from datasets other
1391 than those provided, and should be used with extreme care.
1393 Raises
1394 ------
1395 TypeError
1396 Raised if the butler is read-only, if no collection was provided,
1397 or the conditions for ``purge=True`` were not met.
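Examples
--------
A sketch of completely removing the datasets of one dataset type from a
`~CollectionType.RUN` collection (the dataset type and collection name are
illustrative)::

    refs = list(butler.registry.queryDatasets("calexp",
                                              collections="u/alice/DM-50000/a"))
    butler.pruneDatasets(refs, unstore=True, purge=True,
                         run="u/alice/DM-50000/a")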
1398 """
1399 if not self.isWriteable():
1400 raise TypeError("Butler is read-only.")
1401 if purge:
1402 if not disassociate:
1403 raise TypeError("Cannot pass purge=True without disassociate=True.")
1404 if not unstore:
1405 raise TypeError("Cannot pass purge=True without unstore=True.")
1406 elif disassociate:
1407 tags = tuple(tags)
1408 if not tags:
1409 raise TypeError("No tags provided but disassociate=True.")
1410 for tag in tags:
1411 collectionType = self.registry.getCollectionType(tag)
1412 if collectionType is not CollectionType.TAGGED:
1413 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1414 f"of non-TAGGED type {collectionType.name}.")
1415 # Transform possibly-single-pass iterable into something we can iterate
1416 # over multiple times.
1417 refs = list(refs)
1418 # Pruning a component of a DatasetRef makes no sense since registry
1419 # doesn't know about components and datastore might not store
1420 # components in a separate file
1421 for ref in refs:
1422 if ref.datasetType.component():
1423 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")
1424 # We don't need an unreliable Datastore transaction for this, because
1425 # we've been extra careful to ensure that Datastore.trash only involves
1426 # mutating the Registry (it can _look_ at Datastore-specific things,
1427 # but shouldn't change them), and hence all operations here are
1428 # Registry operations.
1429 with self.registry.transaction():
1430 if unstore:
1431 self.datastore.trash(refs)
1432 if purge:
1433 self.registry.removeDatasets(refs)
1434 elif disassociate:
1435 assert tags, "Guaranteed by earlier logic in this function."
1436 for tag in tags:
1437 self.registry.disassociate(tag, refs)
1438 # We've exited the Registry transaction, and apparently committed.
1439 # (if there was an exception, everything rolled back, and it's as if
1440 # nothing happened - and we never get here).
1441 # Datastore artifacts are not yet gone, but they're clearly marked
1442 # as trash, so if we fail to delete now because of (e.g.) filesystem
1443 # problems we can try again later, and if manual administrative
1444 # intervention is required, it's pretty clear what that should entail:
1445 # deleting everything on disk and in private Datastore tables that is
1446 # in the dataset_location_trash table.
1447 if unstore:
1448 # Point of no return for removing artifacts
1449 self.datastore.emptyTrash()
1451 @transactional
1452 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1453 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1454 ) -> None:
1455 """Store and register one or more datasets that already exist on disk.
1457 Parameters
1458 ----------
1459 datasets : `FileDataset`
1460 Each positional argument is a struct containing information about
1461 a file to be ingested, including its path (either absolute or
1462 relative to the datastore root, if applicable), a `DatasetRef`,
1463 and optionally a formatter class or its fully-qualified string
1464 name. If a formatter is not provided, the formatter that would be
1465 used for `put` is assumed. On successful return, all
1466 `FileDataset.ref` attributes will have their `DatasetRef.id`
1467 attribute populated and all `FileDataset.formatter` attributes will
1468 be set to the formatter class used. `FileDataset.path` attributes
1469 may be modified to put paths in whatever the datastore considers a
1470 standardized form.
1471 transfer : `str`, optional
1472 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1473 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer
1474 the file.
1475 run : `str`, optional
1476 The name of the run ingested datasets should be added to,
1477 overriding ``self.run``.
1478 idGenerationMode : `DatasetIdGenEnum`, optional
1479 Specifies option for generating dataset IDs. By default unique IDs
1480 are generated for each inserted dataset.
1482 Raises
1483 ------
1484 TypeError
1485 Raised if the butler is read-only or if no run was provided.
1486 NotImplementedError
1487 Raised if the `Datastore` does not support the given transfer mode.
1488 DatasetTypeNotSupportedError
1489 Raised if one or more files to be ingested have a dataset type that
1490 is not supported by the `Datastore`.
1491 FileNotFoundError
1492 Raised if one of the given files does not exist.
1493 FileExistsError
1494 Raised if transfer is not `None` but the (internal) location the
1495 file would be moved to is already occupied.
1497 Notes
1498 -----
1499 This operation is not fully exception safe: if a database operation
1500 fails, the given `FileDataset` instances may be only partially updated.
1502 It is atomic in terms of database operations (they will either all
1503 succeed or all fail) providing the database engine implements
1504 transactions correctly. It will attempt to be atomic in terms of
1505 filesystem operations as well, but this cannot be implemented
1506 rigorously for most datastores.
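Examples
--------
A minimal sketch, assuming ``raw_type`` is an already-registered
`DatasetType`; the path, data ID, and run name are illustrative::

    from lsst.daf.butler import DatasetRef, FileDataset

    ref = DatasetRef(raw_type, {"instrument": "HSC", "exposure": 903334,
                                "detector": 42})
    dataset = FileDataset(path="/data/staging/HSC-903334-42.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy", run="HSC/raw/all")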
1507 """
1508 if not self.isWriteable():
1509 raise TypeError("Butler is read-only.")
1510 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1511 # Reorganize the inputs so they're grouped by DatasetType and then
1512 # data ID. We also include a list of DatasetRefs for each FileDataset
1513 # to hold the resolved DatasetRefs returned by the Registry, before
1514 # it's safe to swap them into FileDataset.refs.
1515 # Some type annotation aliases to make that clearer:
1516 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1517 GroupedData = MutableMapping[DatasetType, GroupForType]
1518 # The actual data structure:
1519 groupedData: GroupedData = defaultdict(dict)
1520 # And the nested loop that populates it:
1521 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1522 # This list intentionally shared across the inner loop, since it's
1523 # associated with `dataset`.
1524 resolvedRefs: List[DatasetRef] = []
1525 for ref in dataset.refs:
1526 if ref.dataId in groupedData[ref.datasetType]:
1527 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same"
1528 " data ID as other ingest dataset"
1529 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1530 f" ({ref.dataId})")
1531 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1533 # Now we can bulk-insert into Registry for each DatasetType.
1534 allResolvedRefs: List[DatasetRef] = []
1535 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(),
1536 desc="Bulk-inserting datasets by type"):
1537 refs = self.registry.insertDatasets(
1538 datasetType,
1539 dataIds=groupForType.keys(),
1540 run=run,
1541 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1542 idGenerationMode=idGenerationMode,
1543 )
1544 # Append those resolved DatasetRefs to the new lists we set up for
1545 # them.
1546 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1547 resolvedRefs.append(ref)
1549 # Go back to the original FileDatasets to replace their refs with the
1550 # new resolved ones, and also build a big list of all refs.
1551 allResolvedRefs = []
1552 for groupForType in progress.iter_chunks(groupedData.values(),
1553 desc="Reassociating resolved dataset refs with files"):
1554 for dataset, resolvedRefs in groupForType.values():
1555 dataset.refs = resolvedRefs
1556 allResolvedRefs.extend(resolvedRefs)
1558 # Bulk-insert everything into Datastore.
1559 self.datastore.ingest(*datasets, transfer=transfer)
1561 @contextlib.contextmanager
1562 def export(self, *, directory: Optional[str] = None,
1563 filename: Optional[str] = None,
1564 format: Optional[str] = None,
1565 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1566 """Export datasets from the repository represented by this `Butler`.
1568 This method is a context manager that returns a helper object
1569 (`RepoExportContext`) that is used to indicate what information from
1570 the repository should be exported.
1572 Parameters
1573 ----------
1574 directory : `str`, optional
1575 Directory dataset files should be written to if ``transfer`` is not
1576 `None`.
1577 filename : `str`, optional
1578 Name for the file that will include database information associated
1579 with the exported datasets. If this is not an absolute path and
1580 ``directory`` is not `None`, it will be written to ``directory``
1581 instead of the current working directory. Defaults to
1582 "export.{format}".
1583 format : `str`, optional
1584 File format for the database information file. If `None`, the
1585 extension of ``filename`` will be used.
1586 transfer : `str`, optional
1587 Transfer mode passed to `Datastore.export`.
1589 Raises
1590 ------
1591 TypeError
1592 Raised if the set of arguments passed is inconsistent.
1594 Examples
1595 --------
1596 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1597 methods are used to provide the iterables over data IDs and/or datasets
1598 to be exported::
1600 with butler.export(filename="exports.yaml") as export:
1601 # Export all flats, but none of the dimension element rows
1602 # (i.e. data ID information) associated with them.
1603 export.saveDatasets(butler.registry.queryDatasets("flat"),
1604 elements=())
1605 # Export all datasets that start with "deepCoadd_" and all of
1606 # their associated data ID information.
1607 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1608 """
1609 if directory is None and transfer is not None:
1610 raise TypeError("Cannot transfer without providing a directory.")
1611 if transfer == "move":
1612 raise TypeError("Transfer may not be 'move': export is read-only")
1613 if format is None:
1614 if filename is None:
1615 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1616 else:
1617 _, format = os.path.splitext(filename)
1618 elif filename is None:
1619 filename = f"export.{format}"
1620 if directory is not None:
1621 filename = os.path.join(directory, filename)
1622 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1623 with open(filename, 'w') as stream:
1624 backend = BackendClass(stream)
1625 try:
1626 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1627 directory=directory, transfer=transfer)
1628 yield helper
1629 except BaseException:
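# Re-raise so the caller sees the original error; the export file is
# simply left incomplete, because ``_finish`` only runs on success.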
1630 raise
1631 else:
1632 helper._finish()
1634 def import_(self, *, directory: Optional[str] = None,
1635 filename: Union[str, TextIO, None] = None,
1636 format: Optional[str] = None,
1637 transfer: Optional[str] = None,
1638 skip_dimensions: Optional[Set] = None,
1639 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1640 reuseIds: bool = False) -> None:
1641 """Import datasets into this repository that were exported from a
1642 different butler repository via `~lsst.daf.butler.Butler.export`.
1644 Parameters
1645 ----------
1646 directory : `str`, optional
1647 Directory containing dataset files to import from. If `None`,
1648 ``filename`` and all dataset file paths specified therein must
1649 be absolute.
1650 filename : `str` or `TextIO`, optional
1651 A stream or name of file that contains database information
1652 associated with the exported datasets, typically generated by
1653 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1654 is not an absolute path, does not exist in the current working
1655 directory, and ``directory`` is not `None`, it is assumed to be in
1656 ``directory``. Defaults to "export.{format}".
1657 format : `str`, optional
1658 File format for ``filename``. If `None`, the extension of
1659 ``filename`` will be used.
1660 transfer : `str`, optional
1661 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1662 skip_dimensions : `set`, optional
1663 Names of dimensions that should be skipped and not imported.
1664 idGenerationMode : `DatasetIdGenEnum`, optional
1665 Specifies option for generating dataset IDs when IDs are not
1666 provided or their type does not match backend type. By default
1667 unique IDs are generated for each inserted dataset.
1668 reuseIds : `bool`, optional
1669 If `True`, force re-use of imported dataset IDs for integer IDs,
1670 which are normally generated as auto-incremented; an exception
1671 will be raised if imported IDs clash with existing ones. This
1672 option has no effect on globally unique IDs, which are always
1673 re-used (or generated if integer IDs are being imported).
1675 Raises
1676 ------
1677 TypeError
1678 Raised if the set of arguments passed is inconsistent, or if the
1679 butler is read-only.
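Examples
--------
A minimal sketch that loads an export file previously written by
`~lsst.daf.butler.Butler.export`; the directory and file names are
illustrative::

    butler.import_(directory="/path/to/exported/data",
                   filename="export.yaml", transfer="copy")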
1680 """
1681 if not self.isWriteable():
1682 raise TypeError("Butler is read-only.")
1683 if format is None:
1684 if filename is None:
1685 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1686 else:
1687 _, format = os.path.splitext(filename) # type: ignore
1688 elif filename is None:
1689 filename = f"export.{format}"
1690 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1691 filename = os.path.join(directory, filename)
1692 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1694 def doImport(importStream: TextIO) -> None:
1695 backend = BackendClass(importStream, self.registry)
1696 backend.register()
1697 with self.transaction():
1698 backend.load(self.datastore, directory=directory, transfer=transfer,
1699 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode,
1700 reuseIds=reuseIds)
1702 if isinstance(filename, str):
1703 with open(filename, "r") as stream:
1704 doImport(stream)
1705 else:
1706 doImport(filename)
1708 @transactional
1709 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef],
1710 transfer: str = "auto",
1711 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None) -> List[DatasetRef]:
1712 """Transfer datasets to this Butler from a run in another Butler.
1714 Parameters
1715 ----------
1716 source_butler : `Butler`
1717 Butler from which the datasets are to be transferred.
1718 source_refs : iterable of `DatasetRef`
1719 Datasets defined in the source butler that should be transferred to
1720 this butler.
1721 transfer : `str`, optional
1722 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1723 id_gen_map : `dict` [`str`, `DatasetIdGenEnum`], optional
1724 A mapping of dataset type name to ID generation mode. Only used if
1725 the source butler is using integer IDs. Should not be used if this
1726 receiving butler uses integer IDs. Without this, dataset import
1727 always uses `DatasetIdGenEnum.UNIQUE`.
1729 Returns
1730 -------
1731 refs : `list` of `DatasetRef`
1732 The refs added to this Butler.
1734 Notes
1735 -----
1736 Requires that any dimension definitions are already present in the
1737 receiving Butler. A datastore artifact has to exist for it to be
1738 transferred, but a missing artifact is not treated as an error.
1740 Datasets that already exist in this run will be skipped.
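Examples
--------
A minimal sketch, assuming ``source_butler`` is a `Butler` for the source
repository; the dataset type and collection names are illustrative::

    refs = source_butler.registry.queryDatasets("calexp",
                                                collections="HSC/runs/RC2")
    butler.transfer_from(source_butler, refs, transfer="copy")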
1741 """
1742 if not self.isWriteable():
1743 raise TypeError("Butler is read-only.")
1744 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
1746 # Will iterate through the refs multiple times so need to convert
1747 # to a list if this isn't a collection.
1748 if not isinstance(source_refs, collections.abc.Collection):
1749 source_refs = list(source_refs)
1751 log.info("Transferring %d datasets into %s", len(source_refs), str(self))
1753 if id_gen_map is None:
1754 id_gen_map = {}
1756 # Importing requires that we group the refs by dataset type and run
1757 # before doing the import.
1758 grouped_refs = defaultdict(list)
1759 grouped_indices = defaultdict(list)
1760 for i, ref in enumerate(source_refs):
1761 grouped_refs[ref.datasetType, ref.run].append(ref)
1762 grouped_indices[ref.datasetType, ref.run].append(i)
1764 # The returned refs should be identical for UUIDs.
1765 # For now must also support integers and so need to retain the
1766 # newly-created refs from this registry.
1767 # Pre-size it so we can assign refs into the correct slots
1768 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs)
1769 default_id_gen = DatasetIdGenEnum.UNIQUE
1771 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(),
1772 desc="Importing to registry by "
1773 "run and dataset type"):
1774 run_doc = source_butler.registry.getCollectionDocumentation(run)
1775 self.registry.registerCollection(run, CollectionType.RUN, doc=run_doc)
1777 id_generation_mode = default_id_gen
1778 if isinstance(refs_to_import[0].id, int):
1779 # ID generation mode might need to be overridden when
1780 # targeting UUID
1781 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen)
1783 n_refs = len(refs_to_import)
1784 log.log(VERBOSE, "Importing %d ref%s of dataset type %s into run %s",
1785 n_refs, "" if n_refs == 1 else "s", datasetType.name, run)
1787 # No way to know if this butler's registry uses UUID.
1788 # We have to trust the caller on this. If it fails they will have
1789 # to change their approach. We can't catch the exception and
1790 # retry with unique because that will mess up the transaction
1791 # handling. We aren't allowed to ask the registry manager what
1792 # type of ID it is using.
1793 imported_refs = self.registry._importDatasets(refs_to_import,
1794 idGenerationMode=id_generation_mode,
1795 expand=False)
1797 # Map them into the correct slots to match the initial order
1798 for i, ref in zip(grouped_indices[datasetType, run], imported_refs):
1799 transferred_refs_tmp[i] = ref
1801 # Mypy insists that we might have None in here so we have to make
1802 # that explicit by assigning to a new variable and filtering out
1803 # something that won't be there.
1804 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None]
1806 # Check consistency
1807 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given"
1809 log.log(VERBOSE, "Imported %d datasets into destination butler", len(transferred_refs))
1811 # The transferred refs were assigned back into their original slots
1812 # above so their ordering matches the refs given by the caller; the
1813 # datastore transfer below relies on that ordering.
1815 # Ask the datastore to transfer. The datastore has to check that
1816 # the source datastore is compatible with the target datastore.
1817 self.datastore.transfer_from(source_butler.datastore, source_refs,
1818 local_refs=transferred_refs, transfer=transfer)
1820 return transferred_refs
1822 def validateConfiguration(self, logFailures: bool = False,
1823 datasetTypeNames: Optional[Iterable[str]] = None,
1824 ignore: Optional[Iterable[str]] = None) -> None:
1825 """Validate butler configuration.
1827 Checks that each `DatasetType` can be stored in the `Datastore`.
1829 Parameters
1830 ----------
1831 logFailures : `bool`, optional
1832 If `True`, output a log message for every validation error
1833 detected.
1834 datasetTypeNames : iterable of `str`, optional
1835 The `DatasetType` names that should be checked. This allows
1836 only a subset to be selected.
1837 ignore : iterable of `str`, optional
1838 Names of DatasetTypes to skip over. This can be used to skip
1839 known problems. If a named `DatasetType` corresponds to a
1840 composite, all components of that `DatasetType` will also be
1841 ignored.
1843 Raises
1844 ------
1845 ButlerValidationError
1846 Raised if there is some inconsistency with how this Butler
1847 is configured.
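Examples
--------
A minimal sketch that logs every problem found while checking only a couple
of (illustrative) dataset types::

    butler.validateConfiguration(logFailures=True,
                                 datasetTypeNames=["calexp", "src"])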
1848 """
1849 if datasetTypeNames:
1850 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1851 else:
1852 datasetTypes = list(self.registry.queryDatasetTypes())
1854 # filter out anything from the ignore list
1855 if ignore:
1856 ignore = set(ignore)
1857 datasetTypes = [e for e in datasetTypes
1858 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1859 else:
1860 ignore = set()
1862 # Find all the registered instruments
1863 instruments = set(
1864 record.name for record in self.registry.queryDimensionRecords("instrument")
1865 )
1867 # For each datasetType that has an instrument dimension, create
1868 # a DatasetRef for each defined instrument
1869 datasetRefs = []
1871 for datasetType in datasetTypes:
1872 if "instrument" in datasetType.dimensions:
1873 for instrument in instruments:
1874 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1875 conform=False)
1876 datasetRefs.append(datasetRef)
1878 entities: List[Union[DatasetType, DatasetRef]] = []
1879 entities.extend(datasetTypes)
1880 entities.extend(datasetRefs)
1882 datastoreErrorStr = None
1883 try:
1884 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1885 except ValidationError as e:
1886 datastoreErrorStr = str(e)
1888 # Also check that the LookupKeys used by the datastores match
1889 # registry and storage class definitions
1890 keys = self.datastore.getLookupKeys()
1892 failedNames = set()
1893 failedDataId = set()
1894 for key in keys:
1895 if key.name is not None:
1896 if key.name in ignore:
1897 continue
1899 # skip if specific datasetType names were requested and this
1900 # name does not match
1901 if datasetTypeNames and key.name not in datasetTypeNames:
1902 continue
1904 # See if it is a StorageClass or a DatasetType
1905 if key.name in self.storageClasses:
1906 pass
1907 else:
1908 try:
1909 self.registry.getDatasetType(key.name)
1910 except KeyError:
1911 if logFailures:
1912 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1913 failedNames.add(key)
1914 else:
1915 # Dimensions are checked for consistency when the Butler
1916 # is created and rendezvoused with a universe.
1917 pass
1919 # Check that the instrument is a valid instrument
1920 # Currently only support instrument so check for that
1921 if key.dataId:
1922 dataIdKeys = set(key.dataId)
1923 if set(["instrument"]) != dataIdKeys:
1924 if logFailures:
1925 log.critical("Key '%s' has unsupported DataId override", key)
1926 failedDataId.add(key)
1927 elif key.dataId["instrument"] not in instruments:
1928 if logFailures:
1929 log.critical("Key '%s' has unknown instrument", key)
1930 failedDataId.add(key)
1932 messages = []
1934 if datastoreErrorStr:
1935 messages.append(datastoreErrorStr)
1937 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1938 (failedDataId, "Keys with bad DataId entries: ")):
1939 if failed:
1940 msg += ", ".join(str(k) for k in failed)
1941 messages.append(msg)
1943 if messages:
1944 raise ValidationError(";\n".join(messages))
1946 @property
1947 def collections(self) -> CollectionSearch:
1948 """The collections to search by default, in order (`CollectionSearch`).
1950 This is an alias for ``self.registry.defaults.collections``. It cannot
1951 be set directly in isolation, but all defaults may be changed together
1952 by assigning a new `RegistryDefaults` instance to
1953 ``self.registry.defaults``.
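For example, all defaults can be replaced at once (the collection and run
names are illustrative)::

    butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
                                                run="u/someone/scratch")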
1954 """
1955 return self.registry.defaults.collections
1957 @property
1958 def run(self) -> Optional[str]:
1959 """Name of the run this butler writes outputs to by default (`str` or
1960 `None`).
1962 This is an alias for ``self.registry.defaults.run``. It cannot be set
1963 directly in isolation, but all defaults may be changed together by
1964 assigning a new `RegistryDefaults` instance to
1965 ``self.registry.defaults``.
1966 """
1967 return self.registry.defaults.run
1969 registry: Registry
1970 """The object that manages dataset metadata and relationships (`Registry`).
1972 Most operations that don't involve reading or writing butler datasets are
1973 accessible only via `Registry` methods.
1974 """
1976 datastore: Datastore
1977 """The object that manages actual dataset storage (`Datastore`).
1979 Direct user access to the datastore should rarely be necessary; the primary
1980 exception is the case where a `Datastore` implementation provides extra
1981 functionality beyond what the base class defines.
1982 """
1984 storageClasses: StorageClassFactory
1985 """An object that maps known storage class names to objects that fully
1986 describe them (`StorageClassFactory`).
1987 """
1989 _allow_put_of_predefined_dataset: bool
1990 """Allow a put to succeed even if there is already a registry entry for it
1991 but not a datastore record. (`bool`)."""