Coverage for python/lsst/daf/butler/_butler.py : 10%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImport
65from .core import (
66 AmbiguousDatasetError,
67 ButlerURI,
68 Config,
69 ConfigSubset,
70 DataCoordinate,
71 DataId,
72 DataIdValue,
73 DatasetRef,
74 DatasetType,
75 Datastore,
76 Dimension,
77 DimensionConfig,
78 FileDataset,
79 StorageClassFactory,
80 Timespan,
81 ValidationError,
82)
83from .core.repoRelocation import BUTLER_ROOT_TAG
84from .core.utils import transactional, getClassOf
85from ._deferredDatasetHandle import DeferredDatasetHandle
86from ._butlerConfig import ButlerConfig
87from .registry import Registry, RegistryConfig, RegistryDefaults, CollectionType
88from .registry.wildcards import CollectionSearch
89from .transfers import RepoExportContext
91log = logging.getLogger(__name__)
94class ButlerValidationError(ValidationError):
95 """There is a problem with the Butler configuration."""
96 pass
99class PruneCollectionsArgsError(TypeError):
100 """Base class for errors relating to Butler.pruneCollections input
101 arguments.
102 """
103 pass
106class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
107 """Raised when purge and unstore are both required to be True, and
108 purge is True but unstore is False.
109 """
111 def __init__(self) -> None:
112 super().__init__("Cannot pass purge=True without unstore=True.")
115class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
116 """Raised when pruning a RUN collection but purge is False."""
118 def __init__(self, collectionType: CollectionType):
119 self.collectionType = collectionType
120 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
123class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
124 """Raised when purge is True but is not supported for the given
125 collection."""
127 def __init__(self, collectionType: CollectionType):
128 self.collectionType = collectionType
129 super().__init__(
130 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
133class Butler:
134 """Main entry point for the data access system.
136 Parameters
137 ----------
138 config : `ButlerConfig`, `Config` or `str`, optional.
139 Configuration. Anything acceptable to the
140 `ButlerConfig` constructor. If a directory path
141 is given the configuration will be read from a ``butler.yaml`` file in
142 that location. If `None` is given default values will be used.
143 butler : `Butler`, optional.
144 If provided, construct a new Butler that uses the same registry and
145 datastore as the given one, but with the given collection and run.
146 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
147 arguments.
148 collections : `str` or `Iterable` [ `str` ], optional
149 An expression specifying the collections to be searched (in order) when
150 reading datasets.
151 This may be a `str` collection name or an iterable thereof.
152 See :ref:`daf_butler_collection_expressions` for more information.
153 These collections are not registered automatically and must be
154 manually registered before they are used by any method, but they may be
155 manually registered after the `Butler` is initialized.
156 run : `str`, optional
157 Name of the `~CollectionType.RUN` collection new datasets should be
158 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
159 ``collections`` will be set to ``[run]``. If not `None`, this
160 collection will automatically be registered. If this is not set (and
161 ``writeable`` is not set either), a read-only butler will be created.
162 searchPaths : `list` of `str`, optional
163 Directory paths to search when calculating the full Butler
164 configuration. Not used if the supplied config is already a
165 `ButlerConfig`.
166 writeable : `bool`, optional
167 Explicitly sets whether the butler supports write operations. If not
168 provided, a read-write butler is created if ``run`` is
169 not `None`.
170 inferDefaults : `bool`, optional
171 If `True` (default) infer default data ID values from the values
172 present in the datasets in ``collections``: if all collections have the
173 same value (or no value) for a governor dimension, that value will be
174 the default for that dimension. Nonexistent collections are ignored.
175 If a default value is provided explicitly for a governor dimension via
176 ``**kwargs``, no default will be inferred for that dimension.
177 **kwargs : `str`
178 Default data ID key-value pairs. These may only identify "governor"
179 dimensions like ``instrument`` and ``skymap``.
181 Examples
182 --------
183 While there are many ways to control exactly how a `Butler` interacts with
184 the collections in its `Registry`, the most common cases are still simple.
186 For a read-only `Butler` that searches one collection, do::
188 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
190 For a read-write `Butler` that writes to and reads from a
191 `~CollectionType.RUN` collection::
193 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
195 The `Butler` passed to a ``PipelineTask`` is often much more complex,
196 because we want to write to one `~CollectionType.RUN` collection but read
197 from several others (as well)::
199 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
200 collections=["u/alice/DM-50000/a",
201 "u/bob/DM-49998",
202 "HSC/defaults"])
204 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
205 Datasets will be read first from that run (since it appears first in the
206 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
208 Finally, one can always create a `Butler` with no collections::
210 butler = Butler("/path/to/repo", writeable=True)
212 This can be extremely useful when you just want to use ``butler.registry``,
213 e.g. for inserting dimension data or managing collections, or when the
214 collections you want to use with the butler are not consistent.
215 Passing ``writeable`` explicitly here is only necessary if you want to be
216 able to make changes to the repo; usually the value for ``writeable`` can
217 be guessed from the collection arguments provided, but it defaults to
218 `False` when there are no collection arguments.
219 """
220 def __init__(self, config: Union[Config, str, None] = None, *,
221 butler: Optional[Butler] = None,
222 collections: Any = None,
223 run: Optional[str] = None,
224 searchPaths: Optional[List[str]] = None,
225 writeable: Optional[bool] = None,
226 inferDefaults: bool = True,
227 **kwargs: str,
228 ):
229 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
230 # Load registry, datastore, etc. from config or existing butler.
231 if butler is not None:
232 if config is not None or searchPaths is not None or writeable is not None:
233 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
234 "arguments with 'butler' argument.")
235 self.registry = butler.registry.copy(defaults)
236 self.datastore = butler.datastore
237 self.storageClasses = butler.storageClasses
238 self._config: ButlerConfig = butler._config
239 else:
240 self._config = ButlerConfig(config, searchPaths=searchPaths)
241 if "root" in self._config:
242 butlerRoot = self._config["root"]
243 else:
244 butlerRoot = self._config.configDir
245 if writeable is None:
246 writeable = run is not None
247 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
248 defaults=defaults)
249 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
250 butlerRoot=butlerRoot)
251 self.storageClasses = StorageClassFactory()
252 self.storageClasses.addFromConfig(self._config)
253 if "run" in self._config or "collection" in self._config:
254 raise ValueError("Passing a run or collection via configuration is no longer supported.")
256 GENERATION: ClassVar[int] = 3
257 """This is a Generation 3 Butler.
259 This attribute may be removed in the future, once the Generation 2 Butler
260 interface has been fully retired; it should only be used in transitional
261 code.
262 """
264 @staticmethod
265 def makeRepo(root: str, config: Union[Config, str, None] = None,
266 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
267 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
268 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
269 """Create an empty data repository by adding a butler.yaml config
270 to a repository root directory.
272 Parameters
273 ----------
274 root : `str` or `ButlerURI`
275 Path or URI to the root location of the new repository. Will be
276 created if it does not exist.
277 config : `Config` or `str`, optional
278 Configuration to write to the repository, after setting any
279 root-dependent Registry or Datastore config options. Can not
280 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
281 configuration will be used. Root-dependent config options
282 specified in this config are overwritten if ``forceConfigRoot``
283 is `True`.
284 dimensionConfig : `Config` or `str`, optional
285 Configuration for dimensions, will be used to initialize registry
286 database.
287 standalone : `bool`
288 If True, write all expanded defaults, not just customized or
289 repository-specific settings.
290 This (mostly) decouples the repository from the default
291 configuration, insulating it from changes to the defaults (which
292 may be good or bad, depending on the nature of the changes).
293 Future *additions* to the defaults will still be picked up when
294 initializing `Butlers` to repos created with ``standalone=True``.
295 searchPaths : `list` of `str`, optional
296 Directory paths to search when calculating the full butler
297 configuration.
298 forceConfigRoot : `bool`, optional
299 If `False`, any values present in the supplied ``config`` that
300 would normally be reset are not overridden and will appear
301 directly in the output config. This allows non-standard overrides
302 of the root directory for a datastore or registry to be given.
303 If this parameter is `True` the values for ``root`` will be
304 forced into the resulting config if appropriate.
305 outfile : `str`, optional
306 If not-`None`, the output configuration will be written to this
307 location rather than into the repository itself. Can be a URI
308 string. Can refer to a directory that will be used to write
309 ``butler.yaml``.
310 overwrite : `bool`, optional
311 Create a new configuration file even if one already exists
312 in the specified output location. Default is to raise
313 an exception.
315 Returns
316 -------
317 config : `Config`
318 The updated `Config` instance written to the repo.
320 Raises
321 ------
322 ValueError
323 Raised if a ButlerConfig or ConfigSubset is passed instead of a
324 regular Config (as these subclasses would make it impossible to
325 support ``standalone=False``).
326 FileExistsError
327 Raised if the output config file already exists.
328 os.error
329 Raised if the directory does not exist, exists but is not a
330 directory, or cannot be created.
332 Notes
333 -----
334 Note that when ``standalone=False`` (the default), the configuration
335 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
336 construct the repository should also be used to construct any Butlers
337 to avoid configuration inconsistencies.
338 """
339 if isinstance(config, (ButlerConfig, ConfigSubset)):
340 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
342 # Ensure that the root of the repository exists or can be made
343 uri = ButlerURI(root, forceDirectory=True)
344 uri.mkdir()
346 config = Config(config)
348 # If we are creating a new repo from scratch with relative roots,
349 # do not propagate an explicit root from the config file
350 if "root" in config:
351 del config["root"]
353 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
354 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
355 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
357 # if key exists in given config, parse it, otherwise parse the defaults
358 # in the expanded config
359 if config.get(("registry", "db")):
360 registryConfig = RegistryConfig(config)
361 else:
362 registryConfig = RegistryConfig(full)
363 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
364 if defaultDatabaseUri is not None:
365 Config.updateParameters(RegistryConfig, config, full,
366 toUpdate={"db": defaultDatabaseUri},
367 overwrite=forceConfigRoot)
368 else:
369 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
370 overwrite=forceConfigRoot)
372 if standalone:
373 config.merge(full)
374 else:
375 # Always expand the registry.managers section into the per-repo
376 # config, because after the database schema is created, it's not
377 # allowed to change anymore. Note that in the standalone=True
378 # branch, _everything_ in the config is expanded, so there's no
379 # need to special case this.
380 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
381 configURI: Union[str, ButlerURI]
382 if outfile is not None:
383 # When writing to a separate location we must include
384 # the root of the butler repo in the config else it won't know
385 # where to look.
386 config["root"] = uri.geturl()
387 configURI = outfile
388 else:
389 configURI = uri
390 config.dumpToUri(configURI, overwrite=overwrite)
392 # Create Registry and populate tables
393 registryConfig = RegistryConfig(config.get("registry"))
394 dimensionConfig = DimensionConfig(dimensionConfig)
395 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
397 return config
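# Usage sketch (hypothetical path): create an empty repository with the
# default configuration and dimension universe, then open it writeable.
Butler.makeRepo("/path/to/new_repo")
butler = Butler("/path/to/new_repo", writeable=True)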
399 @classmethod
400 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
401 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
402 """Callable used to unpickle a Butler.
404 We prefer not to use ``Butler.__init__`` directly so we can force some
405 of its many arguments to be keyword-only (note that ``__reduce__``
406 can only invoke callables with positional arguments).
408 Parameters
409 ----------
410 config : `ButlerConfig`
411 Butler configuration, already coerced into a true `ButlerConfig`
412 instance (and hence after any search paths for overrides have been
413 utilized).
414 collections : `CollectionSearch`
415 Names of the default collections to read from.
416 run : `str`, optional
417 Name of the default `~CollectionType.RUN` collection to write to.
418 defaultDataId : `dict` [ `str`, `str` ]
419 Default data ID values.
420 writeable : `bool`
421 Whether the Butler should support write operations.
423 Returns
424 -------
425 butler : `Butler`
426 A new `Butler` instance.
427 """
428 # MyPy doesn't recognize that the kwargs below are totally valid; it
429 # seems to think ``**defaultDataId`` is a _positional_ argument!
430 return cls(config=config, collections=collections, run=run, writeable=writeable,
431 **defaultDataId) # type: ignore
433 def __reduce__(self) -> tuple:
434 """Support pickling.
435 """
436 return (Butler._unpickle, (self._config, self.collections, self.run,
437 self.registry.defaults.dataId.byName(),
438 self.registry.isWriteable()))
440 def __str__(self) -> str:
441 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
442 self.collections, self.run, self.datastore, self.registry)
444 def isWriteable(self) -> bool:
445 """Return `True` if this `Butler` supports write operations.
446 """
447 return self.registry.isWriteable()
449 @contextlib.contextmanager
450 def transaction(self) -> Iterator[None]:
451 """Context manager supporting `Butler` transactions.
453 Transactions can be nested.
454 """
455 with self.registry.transaction():
456 with self.datastore.transaction():
457 yield
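# Usage sketch (hypothetical dataset type names and data ID values, assuming
# in-memory objects ``catalog`` and ``stats``): group two puts so that a
# failure in either rolls back both registry and datastore state.
with butler.transaction():
    butler.put(catalog, "objectTable", tract=9813, patch=42, skymap="hsc_rings_v1")
    butler.put(stats, "objectTable_stats", tract=9813, patch=42, skymap="hsc_rings_v1")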
459 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
460 dataId: Optional[DataId] = None, **kwds: Any
461 ) -> Tuple[DatasetType, Optional[DataId]]:
462 """Standardize the arguments passed to several Butler APIs.
464 Parameters
465 ----------
466 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
467 When `DatasetRef` the `dataId` should be `None`.
468 Otherwise the `DatasetType` or name thereof.
469 dataId : `dict` or `DataCoordinate`
470 A `dict` of `Dimension` link name, value pairs that label the
471 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
472 should be provided as the first argument.
473 kwds
474 Additional keyword arguments used to augment or construct a
475 `DataCoordinate`. See `DataCoordinate.standardize`
476 parameters.
478 Returns
479 -------
480 datasetType : `DatasetType`
481 A `DatasetType` instance extracted from ``datasetRefOrType``.
482 dataId : `dict` or `DataId`, optional
483 Argument that can be used (along with ``kwds``) to construct a
484 `DataId`.
486 Notes
487 -----
488 Butler APIs that conceptually need a DatasetRef also allow passing a
489 `DatasetType` (or the name of one) and a `DataId` (or a dict and
490 keyword arguments that can be used to construct one) separately. This
491 method accepts those arguments and always returns a true `DatasetType`
492 and a `DataId` or `dict`.
494 Standardization of `dict` vs `DataId` is best handled by passing the
495 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
496 generally similarly flexible.
497 """
498 externalDatasetType: Optional[DatasetType] = None
499 internalDatasetType: Optional[DatasetType] = None
500 if isinstance(datasetRefOrType, DatasetRef):
501 if dataId is not None or kwds:
502 raise ValueError("DatasetRef given, cannot use dataId as well")
503 externalDatasetType = datasetRefOrType.datasetType
504 dataId = datasetRefOrType.dataId
505 else:
506 # Don't check whether DataId is provided, because Registry APIs
507 # can usually construct a better error message when it wasn't.
508 if isinstance(datasetRefOrType, DatasetType):
509 externalDatasetType = datasetRefOrType
510 else:
511 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
513 # Check that they are self-consistent
514 if externalDatasetType is not None:
515 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
516 if externalDatasetType != internalDatasetType:
517 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
518 f"registry definition ({internalDatasetType})")
520 assert internalDatasetType is not None
521 return internalDatasetType, dataId
523 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
524 dataId: Optional[DataId] = None, *,
525 collections: Any = None,
526 allowUnresolved: bool = False,
527 **kwds: Any) -> DatasetRef:
528 """Shared logic for methods that start with a search for a dataset in
529 the registry.
531 Parameters
532 ----------
533 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
534 When `DatasetRef` the `dataId` should be `None`.
535 Otherwise the `DatasetType` or name thereof.
536 dataId : `dict` or `DataCoordinate`, optional
537 A `dict` of `Dimension` link name, value pairs that label the
538 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
539 should be provided as the first argument.
540 collections : Any, optional
541 Collections to be searched, overriding ``self.collections``.
542 Can be any of the types supported by the ``collections`` argument
543 to butler construction.
544 allowUnresolved : `bool`, optional
545 If `True`, return an unresolved `DatasetRef` if finding a resolved
546 one in the `Registry` fails. Defaults to `False`.
547 kwds
548 Additional keyword arguments used to augment or construct a
549 `DataId`. See `DataId` parameters.
551 Returns
552 -------
553 ref : `DatasetRef`
554 A reference to the dataset identified by the given arguments.
556 Raises
557 ------
558 LookupError
559 Raised if no matching dataset exists in the `Registry` (and
560 ``allowUnresolved is False``).
561 ValueError
562 Raised if a resolved `DatasetRef` was passed as an input, but it
563 differs from the one found in the registry.
564 TypeError
565 Raised if no collections were provided.
566 """
567 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
568 if isinstance(datasetRefOrType, DatasetRef):
569 idNumber = datasetRefOrType.id
570 else:
571 idNumber = None
572 timespan: Optional[Timespan] = None
574 # Process dimension records that are using record information
575 # rather than ids
576 newDataId: Dict[str, DataIdValue] = {}
577 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
579 # If the entire dataId comes from keyword parameters we do not need
580 # to do anything here, because keys of the form exposure.obs_id are
581 # impossible (a "." is not allowed in a keyword parameter name).
582 if dataId:
583 for k, v in dataId.items():
584 # If we have a Dimension we do not need to do anything
585 # because it cannot be a compound key.
586 if isinstance(k, str) and "." in k:
587 # Someone is using a more human-readable dataId
588 dimensionName, record = k.split(".", 1)
589 byRecord[dimensionName][record] = v
590 elif isinstance(k, Dimension):
591 newDataId[k.name] = v
592 else:
593 newDataId[k] = v
595 # Go through the updated dataId and check the type in case someone is
596 # using an alternate key. We have already filtered out the compound
597 # keys dimensions.record format.
598 not_dimensions = {}
600 # Will need to look in the dataId and the keyword arguments
601 # and will remove them if they need to be fixed or are unrecognized.
602 for dataIdDict in (newDataId, kwds):
603 # Use a list so we can adjust the dict safely in the loop
604 for dimensionName in list(dataIdDict):
605 value = dataIdDict[dimensionName]
606 try:
607 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
608 except KeyError:
609 # This is not a real dimension
610 not_dimensions[dimensionName] = value
611 del dataIdDict[dimensionName]
612 continue
614 # Convert an integral type to an explicit int to simplify
615 # comparisons here
616 if isinstance(value, numbers.Integral):
617 value = int(value)
619 if not isinstance(value, dimension.primaryKey.getPythonType()):
620 for alternate in dimension.alternateKeys:
621 if isinstance(value, alternate.getPythonType()):
622 byRecord[dimensionName][alternate.name] = value
623 del dataIdDict[dimensionName]
624 log.debug("Converting dimension %s to %s.%s=%s",
625 dimensionName, dimensionName, alternate.name, value)
626 break
627 else:
628 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
629 "Could not find matching alternative (primary key has type %s) "
630 "so attempting to use as-is.",
631 value, dimensionName, dimension.primaryKey.getPythonType())
633 # If we have some unrecognized dimensions we have to try to connect
634 # them to records in other dimensions. This is made more complicated
635 # by some dimensions having records with clashing names. A mitigation
636 # is that we can tell by this point which dimensions are missing
637 # for the DatasetType but this does not work for calibrations
638 # where additional dimensions can be used to constrain the temporal
639 # axis.
640 if not_dimensions:
641 # Calculate missing dimensions
642 provided = set(newDataId) | set(kwds) | set(byRecord)
643 missingDimensions = datasetType.dimensions.names - provided
645 # For calibrations we may well be needing temporal dimensions
646 # so rather than always including all dimensions in the scan
647 # restrict things a little. It is still possible for there
648 # to be confusion over day_obs in visit vs exposure for example.
649 # If we are not searching calibration collections things may
650 # fail but they are going to fail anyway because of the
651 # ambiguousness of the dataId...
652 candidateDimensions: Set[str] = set()
653 candidateDimensions.update(missingDimensions)
654 if datasetType.isCalibration():
655 for dim in self.registry.dimensions.getStaticDimensions():
656 if dim.temporal:
657 candidateDimensions.add(str(dim))
659 # Look up table for the first association with a dimension
660 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
662 # Keep track of whether an item is associated with multiple
663 # dimensions.
664 counter: Counter[str] = Counter()
665 assigned: Dict[str, Set[str]] = defaultdict(set)
667 # Go through the missing dimensions and associate the
668 # given names with records within those dimensions
669 for dimensionName in candidateDimensions:
670 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
671 fields = dimension.metadata.names | dimension.uniqueKeys.names
672 for field in not_dimensions:
673 if field in fields:
674 guessedAssociation[dimensionName][field] = not_dimensions[field]
675 counter[dimensionName] += 1
676 assigned[field].add(dimensionName)
678 # There is a chance we have allocated a single dataId item
679 # to multiple dimensions. Need to decide which should be retained.
680 # For now assume that the most popular alternative wins.
681 # This means that day_obs with seq_num will result in
682 # exposure.day_obs and not visit.day_obs
683 # Also prefer an explicitly missing dimension over an inferred
684 # temporal dimension.
685 for fieldName, assignedDimensions in assigned.items():
686 if len(assignedDimensions) > 1:
687 # Pick the most popular (preferring mandatory dimensions)
688 requiredButMissing = assignedDimensions.intersection(missingDimensions)
689 if requiredButMissing:
690 candidateDimensions = requiredButMissing
691 else:
692 candidateDimensions = assignedDimensions
694 # Select the relevant items and get a new restricted
695 # counter.
696 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
697 duplicatesCounter: Counter[str] = Counter()
698 duplicatesCounter.update(theseCounts)
700 # Choose the most common. If they are equally common
701 # we will pick the one that was found first.
702 # Returns a list of tuples
703 selected = duplicatesCounter.most_common(1)[0][0]
705 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
706 " Removed ambiguity by choosing dimension %s.",
707 fieldName, ", ".join(assignedDimensions), selected)
709 for candidateDimension in assignedDimensions:
710 if candidateDimension != selected:
711 del guessedAssociation[candidateDimension][fieldName]
713 # Update the record look up dict with the new associations
714 for dimensionName, values in guessedAssociation.items():
715 if values: # A dict might now be empty
716 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
717 dimensionName, values)
718 byRecord[dimensionName].update(values)
720 if byRecord:
721 # Some record specifiers were found so we need to convert
722 # them to the Id form
723 for dimensionName, values in byRecord.items():
724 if dimensionName in newDataId:
725 log.warning("DataId specified explicit %s dimension value of %s in addition to"
726 " general record specifiers for it of %s. Ignoring record information.",
727 dimensionName, newDataId[dimensionName], str(values))
728 continue
730 # Build up a WHERE expression -- use single quotes
731 def quote(s: Any) -> str:
732 if isinstance(s, str):
733 return f"'{s}'"
734 else:
735 return s
737 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
738 for k, v in values.items())
740 # Hopefully we get a single record that matches
741 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
742 where=where, **kwds))
744 if len(records) != 1:
745 if len(records) > 1:
746 log.debug("Received %d records from constraints of %s", len(records), str(values))
747 for r in records:
748 log.debug("- %s", str(r))
749 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
750 f" uniquely constrained to a single dataset by {values}."
751 f" Got {len(records)} results.")
752 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
753 f" records when constrained by {values}")
755 # Get the primary key from the real dimension object
756 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
757 if not isinstance(dimension, Dimension):
758 raise RuntimeError(
759 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
760 )
761 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
763 # We have modified the dataId so need to switch to it
764 dataId = newDataId
766 if datasetType.isCalibration():
767 # Because this is a calibration dataset, first try to
768 # standardize the data ID without restricting the dimensions to
769 # those of the dataset type requested, because there may be extra
770 # dimensions that provide temporal information for a validity-range
771 # lookup.
772 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
773 defaults=self.registry.defaults.dataId, **kwds)
774 if dataId.graph.temporal:
775 dataId = self.registry.expandDataId(dataId)
776 timespan = dataId.timespan
777 else:
778 # Standardize the data ID to just the dimensions of the dataset
779 # type instead of letting registry.findDataset do it, so we get the
780 # result even if no dataset is found.
781 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
782 defaults=self.registry.defaults.dataId, **kwds)
783 # Always lookup the DatasetRef, even if one is given, to ensure it is
784 # present in the current collection.
785 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
786 if ref is None:
787 if allowUnresolved:
788 return DatasetRef(datasetType, dataId)
789 else:
790 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
791 f"could not be found in collections {collections}.")
792 if idNumber is not None and idNumber != ref.id:
793 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
794 f"id ({ref.id}) in registry in collections {collections}.")
795 return ref
797 @transactional
798 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
799 dataId: Optional[DataId] = None, *,
800 run: Optional[str] = None,
801 **kwds: Any) -> DatasetRef:
802 """Store and register a dataset.
804 Parameters
805 ----------
806 obj : `object`
807 The dataset.
808 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
809 When `DatasetRef` is provided, ``dataId`` should be `None`.
810 Otherwise the `DatasetType` or name thereof.
811 dataId : `dict` or `DataCoordinate`
812 A `dict` of `Dimension` link name, value pairs that label the
813 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
814 should be provided as the second argument.
815 run : `str`, optional
816 The name of the run the dataset should be added to, overriding
817 ``self.run``.
818 kwds
819 Additional keyword arguments used to augment or construct a
820 `DataCoordinate`. See `DataCoordinate.standardize`
821 parameters.
823 Returns
824 -------
825 ref : `DatasetRef`
826 A reference to the stored dataset, updated with the correct id if
827 given.
829 Raises
830 ------
831 TypeError
832 Raised if the butler is read-only or if no run has been provided.
833 """
834 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
835 if not self.isWriteable():
836 raise TypeError("Butler is read-only.")
837 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
838 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
839 raise ValueError("DatasetRef must not be in registry, must have None id")
841 # Add Registry Dataset entry.
842 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
843 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
845 # Add Datastore entry.
846 self.datastore.put(obj, ref)
848 return ref
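# Usage sketch (hypothetical names; ``exposure`` is an in-memory object of
# the dataset type's storage class): write a dataset to the run collection.
butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
ref = butler.put(exposure, "calexp", instrument="HSC", visit=903334, detector=20)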
850 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
851 """Retrieve a stored dataset.
853 Unlike `Butler.get`, this method allows datasets outside the Butler's
854 collection to be read as long as the `DatasetRef` that identifies them
855 can be obtained separately.
857 Parameters
858 ----------
859 ref : `DatasetRef`
860 Resolved reference to an already stored dataset.
861 parameters : `dict`
862 Additional StorageClass-defined options to control reading,
863 typically used to efficiently read only a subset of the dataset.
865 Returns
866 -------
867 obj : `object`
868 The dataset.
869 """
870 return self.datastore.get(ref, parameters=parameters)
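# Usage sketch (hypothetical names): read datasets through resolved refs
# returned by a registry query, bypassing the default collection search.
for ref in butler.registry.queryDatasets("calexp", collections=["u/alice/DM-50000/a"]):
    exposure = butler.getDirect(ref)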
872 def getDirectDeferred(self, ref: DatasetRef, *,
873 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
874 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
875 from a resolved `DatasetRef`.
877 Parameters
878 ----------
879 ref : `DatasetRef`
880 Resolved reference to an already stored dataset.
881 parameters : `dict`
882 Additional StorageClass-defined options to control reading,
883 typically used to efficiently read only a subset of the dataset.
885 Returns
886 -------
887 obj : `DeferredDatasetHandle`
888 A handle which can be used to retrieve a dataset at a later time.
890 Raises
891 ------
892 AmbiguousDatasetError
893 Raised if ``ref.id is None``, i.e. the reference is unresolved.
894 """
895 if ref.id is None:
896 raise AmbiguousDatasetError(
897 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
898 )
899 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
901 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
902 dataId: Optional[DataId] = None, *,
903 parameters: Union[dict, None] = None,
904 collections: Any = None,
905 **kwds: Any) -> DeferredDatasetHandle:
906 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
907 after an immediate registry lookup.
909 Parameters
910 ----------
911 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
912 When `DatasetRef` the `dataId` should be `None`.
913 Otherwise the `DatasetType` or name thereof.
914 dataId : `dict` or `DataCoordinate`, optional
915 A `dict` of `Dimension` link name, value pairs that label the
916 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
917 should be provided as the first argument.
918 parameters : `dict`
919 Additional StorageClass-defined options to control reading,
920 typically used to efficiently read only a subset of the dataset.
921 collections : Any, optional
922 Collections to be searched, overriding ``self.collections``.
923 Can be any of the types supported by the ``collections`` argument
924 to butler construction.
925 kwds
926 Additional keyword arguments used to augment or construct a
927 `DataId`. See `DataId` parameters.
929 Returns
930 -------
931 obj : `DeferredDatasetHandle`
932 A handle which can be used to retrieve a dataset at a later time.
934 Raises
935 ------
936 LookupError
937 Raised if no matching dataset exists in the `Registry` (and
938 ``allowUnresolved is False``).
939 ValueError
940 Raised if a resolved `DatasetRef` was passed as an input, but it
941 differs from the one found in the registry.
942 TypeError
943 Raised if no collections were provided.
944 """
945 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
946 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
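# Usage sketch (hypothetical names): resolve the dataset now, but defer the
# actual datastore read until the handle is used.
handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=20)
exposure = handle.get()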
948 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
949 dataId: Optional[DataId] = None, *,
950 parameters: Optional[Dict[str, Any]] = None,
951 collections: Any = None,
952 **kwds: Any) -> Any:
953 """Retrieve a stored dataset.
955 Parameters
956 ----------
957 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
958 When `DatasetRef` the `dataId` should be `None`.
959 Otherwise the `DatasetType` or name thereof.
960 dataId : `dict` or `DataCoordinate`
961 A `dict` of `Dimension` link name, value pairs that label the
962 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
963 should be provided as the first argument.
964 parameters : `dict`
965 Additional StorageClass-defined options to control reading,
966 typically used to efficiently read only a subset of the dataset.
967 collections : Any, optional
968 Collections to be searched, overriding ``self.collections``.
969 Can be any of the types supported by the ``collections`` argument
970 to butler construction.
971 kwds
972 Additional keyword arguments used to augment or construct a
973 `DataCoordinate`. See `DataCoordinate.standardize`
974 parameters.
976 Returns
977 -------
978 obj : `object`
979 The dataset.
981 Raises
982 ------
983 ValueError
984 Raised if a resolved `DatasetRef` was passed as an input, but it
985 differs from the one found in the registry.
986 LookupError
987 Raised if no matching dataset exists in the `Registry`.
988 TypeError
989 Raised if no collections were provided.
991 Notes
992 -----
993 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
994 this method requires that the given data ID include temporal dimensions
995 beyond the dimensions of the dataset type itself, in order to find the
996 dataset with the appropriate validity range. For example, a "bias"
997 dataset with native dimensions ``{instrument, detector}`` could be
998 fetched with a ``{instrument, detector, exposure}`` data ID, because
999 ``exposure`` is a temporal dimension.
1000 """
1001 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1002 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1003 return self.getDirect(ref, parameters=parameters)
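# Usage sketch (hypothetical names): fetch a dataset by dataset type name
# and data ID keywords; the butler's default collections are searched in order.
exposure = butler.get("calexp", instrument="HSC", visit=903334, detector=20)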
1005 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1006 dataId: Optional[DataId] = None, *,
1007 predict: bool = False,
1008 collections: Any = None,
1009 run: Optional[str] = None,
1010 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1011 """Returns the URIs associated with the dataset.
1013 Parameters
1014 ----------
1015 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1016 When `DatasetRef` the `dataId` should be `None`.
1017 Otherwise the `DatasetType` or name thereof.
1018 dataId : `dict` or `DataCoordinate`
1019 A `dict` of `Dimension` link name, value pairs that label the
1020 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1021 should be provided as the first argument.
1022 predict : `bool`
1023 If `True`, allow URIs to be returned of datasets that have not
1024 been written.
1025 collections : Any, optional
1026 Collections to be searched, overriding ``self.collections``.
1027 Can be any of the types supported by the ``collections`` argument
1028 to butler construction.
1029 run : `str`, optional
1030 Run to use for predictions, overriding ``self.run``.
1031 kwds
1032 Additional keyword arguments used to augment or construct a
1033 `DataCoordinate`. See `DataCoordinate.standardize`
1034 parameters.
1036 Returns
1037 -------
1038 primary : `ButlerURI`
1039 The URI to the primary artifact associated with this dataset.
1040 If the dataset was disassembled within the datastore this
1041 may be `None`.
1042 components : `dict`
1043 URIs to any components associated with the dataset artifact.
1044 Can be empty if there are no components.
1045 """
1046 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1047 collections=collections, **kwds)
1048 if ref.id is None: # only possible if predict is True
1049 if run is None:
1050 run = self.run
1051 if run is None:
1052 raise TypeError("Cannot predict location with run=None.")
1053 # Lie about ID, because we can't guess it, and only
1054 # Datastore.getURIs() will ever see it (and it doesn't use it).
1055 ref = ref.resolved(id=0, run=run)
1056 return self.datastore.getURIs(ref, predict)
1058 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1059 dataId: Optional[DataId] = None, *,
1060 predict: bool = False,
1061 collections: Any = None,
1062 run: Optional[str] = None,
1063 **kwds: Any) -> ButlerURI:
1064 """Return the URI to the Dataset.
1066 Parameters
1067 ----------
1068 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1069 When `DatasetRef` the `dataId` should be `None`.
1070 Otherwise the `DatasetType` or name thereof.
1071 dataId : `dict` or `DataCoordinate`
1072 A `dict` of `Dimension` link name, value pairs that label the
1073 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1074 should be provided as the first argument.
1075 predict : `bool`
1076 If `True`, allow URIs to be returned of datasets that have not
1077 been written.
1078 collections : Any, optional
1079 Collections to be searched, overriding ``self.collections``.
1080 Can be any of the types supported by the ``collections`` argument
1081 to butler construction.
1082 run : `str`, optional
1083 Run to use for predictions, overriding ``self.run``.
1084 kwds
1085 Additional keyword arguments used to augment or construct a
1086 `DataCoordinate`. See `DataCoordinate.standardize`
1087 parameters.
1089 Returns
1090 -------
1091 uri : `ButlerURI`
1092 URI pointing to the Dataset within the datastore. If the
1093 Dataset does not exist in the datastore, and if ``predict`` is
1094 `True`, the URI will be a prediction and will include a URI
1095 fragment "#predicted".
1096 If the datastore does not have entities that relate well
1097 to the concept of a URI the returned URI string will be
1098 descriptive. The returned URI is not guaranteed to be obtainable.
1100 Raises
1101 ------
1102 LookupError
1103 A URI has been requested for a dataset that does not exist and
1104 guessing is not allowed.
1105 ValueError
1106 Raised if a resolved `DatasetRef` was passed as an input, but it
1107 differs from the one found in the registry.
1108 TypeError
1109 Raised if no collections were provided.
1110 RuntimeError
1111 Raised if a URI is requested for a dataset that consists of
1112 multiple artifacts.
1113 """
1114 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1115 collections=collections, run=run, **kwds)
1117 if primary is None or components:
1118 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1119 "Use Butler.getURIs() instead.")
1120 return primary
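# Usage sketch (hypothetical names): predict where a not-yet-written
# dataset will land; the returned URI carries a "#predicted" fragment.
uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=20,
                    predict=True, run="u/alice/DM-50000/a")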
1122 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1123 dataId: Optional[DataId] = None, *,
1124 collections: Any = None,
1125 **kwds: Any) -> bool:
1126 """Return True if the Dataset is actually present in the Datastore.
1128 Parameters
1129 ----------
1130 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1131 When `DatasetRef` the `dataId` should be `None`.
1132 Otherwise the `DatasetType` or name thereof.
1133 dataId : `dict` or `DataCoordinate`
1134 A `dict` of `Dimension` link name, value pairs that label the
1135 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1136 should be provided as the first argument.
1137 collections : Any, optional
1138 Collections to be searched, overriding ``self.collections``.
1139 Can be any of the types supported by the ``collections`` argument
1140 to butler construction.
1141 kwds
1142 Additional keyword arguments used to augment or construct a
1143 `DataCoordinate`. See `DataCoordinate.standardize`
1144 parameters.
1146 Raises
1147 ------
1148 LookupError
1149 Raised if the dataset is not even present in the Registry.
1150 ValueError
1151 Raised if a resolved `DatasetRef` was passed as an input, but it
1152 differs from the one found in the registry.
1153 TypeError
1154 Raised if no collections were provided.
1155 """
1156 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1157 return self.datastore.exists(ref)
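# Usage sketch (hypothetical names): confirm the artifact is really present
# in the datastore before retrieving it.
if butler.datasetExists("calexp", instrument="HSC", visit=903334, detector=20):
    exposure = butler.get("calexp", instrument="HSC", visit=903334, detector=20)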
1159 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False) -> None:
1160 """Remove a collection and possibly prune datasets within it.
1162 Parameters
1163 ----------
1164 name : `str`
1165 Name of the collection to remove. If this is a
1166 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1167 datasets within the collection are not modified unless ``unstore``
1168 is `True`. If this is a `~CollectionType.RUN` collection,
1169 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1170 are fully removed from the data repository.
1171 purge : `bool`, optional
1172 If `True`, permit `~CollectionType.RUN` collections to be removed,
1173 fully removing datasets within them. Requires ``unstore=True`` as
1174 well as an added precaution against accidental deletion. Must be
1175 `False` (default) if the collection is not a ``RUN``.
1176 unstore : `bool`, optional
1177 If `True`, remove all datasets in the collection from all
1178 datastores in which they appear.
1180 Raises
1181 ------
1182 TypeError
1183 Raised if the butler is read-only or arguments are mutually
1184 inconsistent.
1185 """
1187 # See pruneDatasets comments for more information about the logic here;
1188 # the cases are almost the same, but here we can rely on Registry to
1189 # take care of everything but Datastore deletion when we remove the
1190 # collection.
1191 if not self.isWriteable():
1192 raise TypeError("Butler is read-only.")
1193 collectionType = self.registry.getCollectionType(name)
1194 if purge and not unstore:
1195 raise PurgeWithoutUnstorePruneCollectionsError()
1196 if collectionType is CollectionType.RUN and not purge:
1197 raise RunWithoutPurgePruneCollectionsError(collectionType)
1198 if collectionType is not CollectionType.RUN and purge:
1199 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1201 with self.registry.transaction():
1202 if unstore:
1203 for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
1204 if self.datastore.exists(ref):
1205 self.datastore.trash(ref)
1206 self.registry.removeCollection(name)
1207 if unstore:
1208 # Point of no return for removing artifacts
1209 self.datastore.emptyTrash()
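# Usage sketch (hypothetical collection name): fully delete a RUN
# collection, removing its datasets from both registry and datastore.
butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)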
1211 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1212 disassociate: bool = True,
1213 unstore: bool = False,
1214 tags: Iterable[str] = (),
1215 purge: bool = False,
1216 run: Optional[str] = None) -> None:
1217 """Remove one or more datasets from a collection and/or storage.
1219 Parameters
1220 ----------
1221 refs : `~collections.abc.Iterable` of `DatasetRef`
1222 Datasets to prune. These must be "resolved" references (not just
1223 a `DatasetType` and data ID).
1224 disassociate : `bool`, optional
1225 Disassociate pruned datasets from ``tags``, or from all collections
1226 if ``purge=True``.
1227 unstore : `bool`, optional
1228 If `True` (`False` is default) remove these datasets from all
1229 datastores known to this butler. Note that this will make it
1230 impossible to retrieve these datasets even via other collections.
1231 Datasets that are already not stored are ignored by this option.
1232 tags : `Iterable` [ `str` ], optional
1233 `~CollectionType.TAGGED` collections to disassociate the datasets
1234 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1235 `True`.
1236 purge : `bool`, optional
1237 If `True` (`False` is default), completely remove the dataset from
1238 the `Registry`. To prevent accidental deletions, ``purge`` may
1239 only be `True` if all of the following conditions are met:
1241 - All given datasets are in the given run;
1242 - ``disassociate`` is `True`;
1243 - ``unstore`` is `True`.
1245 This mode may remove provenance information from datasets other
1246 than those provided, and should be used with extreme care.
1248 Raises
1249 ------
1250 TypeError
1251 Raised if the butler is read-only, if no collection was provided,
1252 or the conditions for ``purge=True`` were not met.
1253 """
1254 if not self.isWriteable():
1255 raise TypeError("Butler is read-only.")
1256 if purge:
1257 if not disassociate:
1258 raise TypeError("Cannot pass purge=True without disassociate=True.")
1259 if not unstore:
1260 raise TypeError("Cannot pass purge=True without unstore=True.")
1261 elif disassociate:
1262 tags = tuple(tags)
1263 if not tags:
1264 raise TypeError("No tags provided but disassociate=True.")
1265 for tag in tags:
1266 collectionType = self.registry.getCollectionType(tag)
1267 if collectionType is not CollectionType.TAGGED:
1268 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1269 f"of non-TAGGED type {collectionType.name}.")
1270 # Transform possibly-single-pass iterable into something we can iterate
1271 # over multiple times.
1272 refs = list(refs)
1273 # Pruning a component of a DatasetRef makes no sense since registry
1274 # doesn't know about components and datastore might not store
1275 # components in a separate file
1276 for ref in refs:
1277 if ref.datasetType.component():
1278 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1279 # We don't need an unreliable Datastore transaction for this, because
1280 # we've been extra careful to ensure that Datastore.trash only involves
1281 # mutating the Registry (it can _look_ at Datastore-specific things,
1282 # but shouldn't change them), and hence all operations here are
1283 # Registry operations.
1284 with self.registry.transaction():
1285 if unstore:
1286 for ref in refs:
1287 # There is a difference between a concrete composite
1288 # and virtual composite. In a virtual composite the
1289 # datastore is never given the top level DatasetRef. In
1290 # the concrete composite the datastore knows all the
1291 # refs and will clean up itself if asked to remove the
1292 # parent ref. We can not check configuration for this
1293 # since we can not trust that the configuration is the
1294 # same. We therefore have to ask if the ref exists or
1295 # not. This is consistent with the fact that we want
1296 # to ignore already-removed-from-datastore datasets
1297 # anyway.
1298 if self.datastore.exists(ref):
1299 self.datastore.trash(ref)
1300 if purge:
1301 self.registry.removeDatasets(refs)
1302 elif disassociate:
1303 assert tags, "Guaranteed by earlier logic in this function."
1304 for tag in tags:
1305 self.registry.disassociate(tag, refs)
1306 # We've exited the Registry transaction, and apparently committed.
1307 # (if there was an exception, everything rolled back, and it's as if
1308 # nothing happened - and we never get here).
1309 # Datastore artifacts are not yet gone, but they're clearly marked
1310 # as trash, so if we fail to delete now because of (e.g.) filesystem
1311 # problems we can try again later, and if manual administrative
1312 # intervention is required, it's pretty clear what that should entail:
1313 # deleting everything on disk and in private Datastore tables that is
1314 # in the dataset_location_trash table.
1315 if unstore:
1316 # Point of no return for removing artifacts
1317 self.datastore.emptyTrash()
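# Usage sketch (hypothetical names): drop datastore artifacts for a set of
# datasets while keeping their registry records and collection membership.
refs = list(butler.registry.queryDatasets("calexp", collections=["u/alice/scratch"]))
butler.pruneDatasets(refs, disassociate=False, unstore=True)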
1319 @transactional
1320 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1321 ) -> None:
1322 """Store and register one or more datasets that already exist on disk.
1324 Parameters
1325 ----------
1326 datasets : `FileDataset`
1327 Each positional argument is a struct containing information about
1328 a file to be ingested, including its path (either absolute or
1329 relative to the datastore root, if applicable), a `DatasetRef`,
1330 and optionally a formatter class or its fully-qualified string
1331 name. If a formatter is not provided, the formatter that would be
1332 used for `put` is assumed. On successful return, all
1333 `FileDataset.ref` attributes will have their `DatasetRef.id`
1334 attribute populated and all `FileDataset.formatter` attributes will
1335 be set to the formatter class used. `FileDataset.path` attributes
1336 may be modified to put paths in whatever the datastore considers a
1337 standardized form.
1338 transfer : `str`, optional
1339 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1340 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer
1341 the file.
1342 run : `str`, optional
1343 The name of the run ingested datasets should be added to,
1344 overriding ``self.run``.
1346 Raises
1347 ------
1348 TypeError
1349 Raised if the butler is read-only or if no run was provided.
1350 NotImplementedError
1351 Raised if the `Datastore` does not support the given transfer mode.
1352 DatasetTypeNotSupportedError
1353 Raised if one or more files to be ingested have a dataset type that
1354 is not supported by the `Datastore`.
1355 FileNotFoundError
1356 Raised if one of the given files does not exist.
1357 FileExistsError
1358 Raised if transfer is not `None` but the (internal) location the
1359 file would be moved to is already occupied.
1361 Notes
1362 -----
1363 This operation is not fully exception safe: if a database operation
1364 fails, the given `FileDataset` instances may be only partially updated.
1366 It is atomic in terms of database operations (they will either all
1367 succeed or all fail) providing the database engine implements
1368 transactions correctly. It will attempt to be atomic in terms of
1369 filesystem operations as well, but this cannot be implemented
1370 rigorously for most datastores.
1371 """
1372 if not self.isWriteable():
1373 raise TypeError("Butler is read-only.")
1374 # Reorganize the inputs so they're grouped by DatasetType and then
1375 # data ID. We also include a list of DatasetRefs for each FileDataset
1376 # to hold the resolved DatasetRefs returned by the Registry, before
1377 # it's safe to swap them into FileDataset.refs.
1378 # Some type annotation aliases to make that clearer:
1379 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1380 GroupedData = MutableMapping[DatasetType, GroupForType]
1381 # The actual data structure:
1382 groupedData: GroupedData = defaultdict(dict)
1383 # And the nested loop that populates it:
1384 for dataset in datasets:
1385 # This list intentionally shared across the inner loop, since it's
1386 # associated with `dataset`.
1387 resolvedRefs: List[DatasetRef] = []
1388 for ref in dataset.refs:
1389 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1391 # Now we can bulk-insert into Registry for each DatasetType.
1392 allResolvedRefs: List[DatasetRef] = []
1393 for datasetType, groupForType in groupedData.items():
1394 refs = self.registry.insertDatasets(datasetType,
1395 dataIds=groupForType.keys(),
1396 run=run)
1397 # Append those resolved DatasetRefs to the new lists we set up for
1398 # them.
1399 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1400 resolvedRefs.append(ref)
1402 # Go back to the original FileDatasets to replace their refs with the
1403 # new resolved ones, and also build a big list of all refs.
1404 allResolvedRefs = []
1405 for groupForType in groupedData.values():
1406 for dataset, resolvedRefs in groupForType.values():
1407 dataset.refs = resolvedRefs
1408 allResolvedRefs.extend(resolvedRefs)
1410 # Bulk-insert everything into Datastore.
1411 self.datastore.ingest(*datasets, transfer=transfer)
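# Editor's sketch (not part of the original source): a minimal, hypothetical
# use of `Butler.ingest`. The repository path, run name, dataset type name
# "raw", data ID values, and file path are illustrative placeholders, not
# values defined in this module.
#
#     from lsst.daf.butler import Butler, DatasetRef, FileDataset
#
#     butler = Butler("/path/to/repo", run="ingest/example")
#     rawType = butler.registry.getDatasetType("raw")
#     ref = DatasetRef(rawType, {"instrument": "HSC", "exposure": 903334,
#                                "detector": 42})
#     dataset = FileDataset(path="/data/HSC-903334-042.fits", refs=[ref])
#     # Leave the original file in place and symlink it into the datastore.
#     butler.ingest(dataset, transfer="symlink")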
1413 @contextlib.contextmanager
1414 def export(self, *, directory: Optional[str] = None,
1415 filename: Optional[str] = None,
1416 format: Optional[str] = None,
1417 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1418 """Export datasets from the repository represented by this `Butler`.
1420 This method is a context manager that returns a helper object
1421 (`RepoExportContext`) that is used to indicate what information from
1422 the repository should be exported.
1424 Parameters
1425 ----------
1426 directory : `str`, optional
1427 Directory dataset files should be written to if ``transfer`` is not
1428 `None`.
1429 filename : `str`, optional
1430 Name for the file that will include database information associated
1431 with the exported datasets. If this is not an absolute path and
1432 ``directory`` is not `None`, it will be written to ``directory``
1433 instead of the current working directory. Defaults to
1434 "export.{format}".
1435 format : `str`, optional
1436 File format for the database information file. If `None`, the
1437 extension of ``filename`` will be used.
1438 transfer : `str`, optional
1439 Transfer mode passed to `Datastore.export`.
1441 Raises
1442 ------
1443 TypeError
1444 Raised if the set of arguments passed is inconsistent.
1446 Examples
1447 --------
1448 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1449 methods are used to provide the iterables over data IDs and/or datasets
1450 to be exported::
1452 with butler.export(filename="exports.yaml") as export:
1453 # Export all flats, but none of the dimension element rows
1454 # (i.e. data ID information) associated with them.
1455 export.saveDatasets(butler.registry.queryDatasets("flat"),
1456 elements=())
1457 # Export all datasets that start with "deepCoadd_" and all of
1458 # their associated data ID information.
1459 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1460 """
1461 if directory is None and transfer is not None:
1462 raise TypeError("Cannot transfer without providing a directory.")
1463 if transfer == "move":
1464 raise TypeError("Transfer may not be 'move': export is read-only")
1465 if format is None:
1466 if filename is None:
1467 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1468 else:
1469 _, format = os.path.splitext(filename)
1470 elif filename is None:
1471 filename = f"export.{format}"
1472 if directory is not None:
1473 filename = os.path.join(directory, filename)
1474 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1475 with open(filename, 'w') as stream:
1476 backend = BackendClass(stream)
1477 try:
1478 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1479 directory=directory, transfer=transfer)
1480 yield helper
1481 except BaseException:
1482 raise
1483 else:
1484 helper._finish()
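# Editor's sketch (not part of the original source): exporting dataset files
# along with their database records by also passing ``directory`` and
# ``transfer``; this complements the metadata-only example in the docstring.
# The output directory, dataset type, and collection names are hypothetical.
#
#     with butler.export(directory="/tmp/export", filename="export.yaml",
#                        transfer="copy") as export:
#         export.saveDatasets(
#             butler.registry.queryDatasets("calexp",
#                                           collections="HSC/runs/example"))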
1486 def import_(self, *, directory: Optional[str] = None,
1487 filename: Union[str, TextIO, None] = None,
1488 format: Optional[str] = None,
1489 transfer: Optional[str] = None,
1490 skip_dimensions: Optional[Set] = None) -> None:
1491 """Import datasets into this repository that were exported from a
1492 different butler repository via `~lsst.daf.butler.Butler.export`.
1494 Parameters
1495 ----------
1496 directory : `str`, optional
1497 Directory containing dataset files to import from. If `None`,
1498 ``filename`` and all dataset file paths specified therein must
1499 be absolute.
1500 filename : `str` or `TextIO`, optional
1501 A stream or name of file that contains database information
1502 associated with the exported datasets, typically generated by
1503 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1504 is not an absolute path, does not exist in the current working
1505 directory, and ``directory`` is not `None`, it is assumed to be in
1506 ``directory``. Defaults to "export.{format}".
1507 format : `str`, optional
1508 File format for ``filename``. If `None`, the extension of
1509 ``filename`` will be used.
1510 transfer : `str`, optional
1511 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1512 skip_dimensions : `set`, optional
1513 Names of dimensions that should be skipped and not imported.
1515 Raises
1516 ------
1517 TypeError
1518 Raised if the set of arguments passed is inconsistent, or if the
1519 butler is read-only.
1520 """
1521 if not self.isWriteable():
1522 raise TypeError("Butler is read-only.")
1523 if format is None:
1524 if filename is None:
1525 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1526 else:
1527 _, format = os.path.splitext(filename) # type: ignore
1528 elif filename is None:
1529 filename = f"export.{format}"
1530 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1531 filename = os.path.join(directory, filename)
1532 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1534 def doImport(importStream: TextIO) -> None:
1535 backend = BackendClass(importStream, self.registry)
1536 backend.register()
1537 with self.transaction():
1538 backend.load(self.datastore, directory=directory, transfer=transfer,
1539 skip_dimensions=skip_dimensions)
1541 if isinstance(filename, str):
1542 with open(filename, "r") as stream:
1543 doImport(stream)
1544 else:
1545 doImport(filename)
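# Editor's sketch (not part of the original source): loading the export
# produced above into a second, writeable repository. The repository and
# directory paths are hypothetical.
#
#     target = Butler("/path/to/other/repo", writeable=True)
#     target.import_(directory="/tmp/export", filename="export.yaml",
#                    transfer="copy")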
1547 def validateConfiguration(self, logFailures: bool = False,
1548 datasetTypeNames: Optional[Iterable[str]] = None,
1549 ignore: Optional[Iterable[str]] = None) -> None:
1550 """Validate butler configuration.
1552 Checks that each `DatasetType` can be stored in the `Datastore`.
1554 Parameters
1555 ----------
1556 logFailures : `bool`, optional
1557 If `True`, output a log message for every validation error
1558 detected.
1559 datasetTypeNames : iterable of `str`, optional
1560 The `DatasetType` names that should be checked. This allows
1561 only a subset to be selected.
1562 ignore : iterable of `str`, optional
1563 Names of DatasetTypes to skip over. This can be used to skip
1564 known problems. If a named `DatasetType` corresponds to a
1565 composite, all components of that `DatasetType` will also be
1566 ignored.
1568 Raises
1569 ------
1570 ButlerValidationError
1571 Raised if there is some inconsistency with how this Butler
1572 is configured.
1573 """
1574 if datasetTypeNames:
1575 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1576 else:
1577 datasetTypes = list(self.registry.queryDatasetTypes())
1579 # filter out anything from the ignore list
1580 if ignore:
1581 ignore = set(ignore)
1582 datasetTypes = [e for e in datasetTypes
1583 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1584 else:
1585 ignore = set()
1587 # Find all the registered instruments
1588 instruments = set(
1589 record.name for record in self.registry.queryDimensionRecords("instrument")
1590 )
1592 # For each datasetType that has an instrument dimension, create
1593 # a DatasetRef for each defined instrument
1594 datasetRefs = []
1596 for datasetType in datasetTypes:
1597 if "instrument" in datasetType.dimensions:
1598 for instrument in instruments:
1599 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1600 conform=False)
1601 datasetRefs.append(datasetRef)
1603 entities: List[Union[DatasetType, DatasetRef]] = []
1604 entities.extend(datasetTypes)
1605 entities.extend(datasetRefs)
1607 datastoreErrorStr = None
1608 try:
1609 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1610 except ValidationError as e:
1611 datastoreErrorStr = str(e)
1613 # Also check that the LookupKeys used by the datastores match
1614 # registry and storage class definitions
1615 keys = self.datastore.getLookupKeys()
1617 failedNames = set()
1618 failedDataId = set()
1619 for key in keys:
1620 if key.name is not None:
1621 if key.name in ignore:
1622 continue
1624 # skip if specific datasetType names were requested and this
1625 # name does not match
1626 if datasetTypeNames and key.name not in datasetTypeNames:
1627 continue
1629 # See if it is a StorageClass or a DatasetType
1630 if key.name in self.storageClasses:
1631 pass
1632 else:
1633 try:
1634 self.registry.getDatasetType(key.name)
1635 except KeyError:
1636 if logFailures:
1637 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1638 failedNames.add(key)
1639 else:
1640 # Dimensions are checked for consistency when the Butler
1641 # is created and rendezvoused with a universe.
1642 pass
1644 # Check that the instrument is a valid instrument.
1645 # Currently only the instrument dimension is supported, so check for that.
1646 if key.dataId:
1647 dataIdKeys = set(key.dataId)
1648 if set(["instrument"]) != dataIdKeys:
1649 if logFailures:
1650 log.critical("Key '%s' has unsupported DataId override", key)
1651 failedDataId.add(key)
1652 elif key.dataId["instrument"] not in instruments:
1653 if logFailures:
1654 log.critical("Key '%s' has unknown instrument", key)
1655 failedDataId.add(key)
1657 messages = []
1659 if datastoreErrorStr:
1660 messages.append(datastoreErrorStr)
1662 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1663 (failedDataId, "Keys with bad DataId entries: ")):
1664 if failed:
1665 msg += ", ".join(str(k) for k in failed)
1666 messages.append(msg)
1668 if messages:
1669 raise ValidationError(";\n".join(messages))
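# Editor's sketch (not part of the original source): invoking the check while
# skipping a dataset type that is known to be unconfigured; a validation
# error is raised if any remaining entity cannot be handled. The ignored name
# is a hypothetical placeholder.
#
#     butler.validateConfiguration(logFailures=True,
#                                  ignore=["unconfiguredDatasetType"])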
1671 @property
1672 def collections(self) -> CollectionSearch:
1673 """The collections to search by default, in order (`CollectionSearch`).
1675 This is an alias for ``self.registry.defaults.collections``. It cannot
1676 be set directly in isolation, but all defaults may be changed together
1677 by assigning a new `RegistryDefaults` instance to
1678 ``self.registry.defaults``.
1679 """
1680 return self.registry.defaults.collections
1682 @property
1683 def run(self) -> Optional[str]:
1684 """Name of the run this butler writes outputs to by default (`str` or
1685 `None`).
1687 This is an alias for ``self.registry.defaults.run``. It cannot be set
1688 directly in isolation, but all defaults may be changed together by
1689 assigning a new `RegistryDefaults` instance to
1690 ``self.registry.defaults``.
1691 """
1692 return self.registry.defaults.run
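# Editor's sketch (not part of the original source): as the property
# docstrings above note, ``collections`` and ``run`` are changed together by
# assigning a new `RegistryDefaults`. The collection and run names are
# hypothetical, and the keyword arguments shown are an assumption about the
# `RegistryDefaults` constructor rather than something documented here.
#
#     from lsst.daf.butler.registry import RegistryDefaults
#
#     butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
#                                                 run="u/someone/example")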
1694 registry: Registry
1695 """The object that manages dataset metadata and relationships (`Registry`).
1697 Most operations that don't involve reading or writing butler datasets are
1698 accessible only via `Registry` methods.
1699 """
1701 datastore: Datastore
1702 """The object that manages actual dataset storage (`Datastore`).
1704 Direct user access to the datastore should rarely be necessary; the primary
1705 exception is the case where a `Datastore` implementation provides extra
1706 functionality beyond what the base class defines.
1707 """
1709 storageClasses: StorageClassFactory
1710 """An object that maps known storage class names to objects that fully
1711 describe them (`StorageClassFactory`).
1712 """