Coverage for python/lsst/daf/butler/_butler.py : 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImport
65from .core import (
66 AmbiguousDatasetError,
67 ButlerURI,
68 Config,
69 ConfigSubset,
70 DataCoordinate,
71 DataId,
72 DataIdValue,
73 DatasetRef,
74 DatasetType,
75 Datastore,
76 Dimension,
77 DimensionConfig,
78 FileDataset,
79 Progress,
80 StorageClassFactory,
81 Timespan,
82 ValidationError,
83)
84from .core.repoRelocation import BUTLER_ROOT_TAG
85from .core.utils import transactional, getClassOf
86from ._deferredDatasetHandle import DeferredDatasetHandle
87from ._butlerConfig import ButlerConfig
88from .registry import Registry, RegistryConfig, RegistryDefaults, CollectionType, ConflictingDefinitionError
89from .registry.wildcards import CollectionSearch
90from .transfers import RepoExportContext
92log = logging.getLogger(__name__)
95class ButlerValidationError(ValidationError):
96 """There is a problem with the Butler configuration."""
97 pass
100class PruneCollectionsArgsError(TypeError):
101 """Base class for errors relating to Butler.pruneCollections input
102 arguments.
103 """
104 pass
107class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
108 """Raised when purge and unstore are both required to be True, and
109 purge is True but unstore is False.
110 """
112 def __init__(self) -> None:
113 super().__init__("Cannot pass purge=True without unstore=True.")
116class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
117 """Raised when pruning a RUN collection but purge is False."""
119 def __init__(self, collectionType: CollectionType):
120 self.collectionType = collectionType
121 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
124class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
125 """Raised when purge is True but is not supported for the given
126 collection."""
128 def __init__(self, collectionType: CollectionType):
129 self.collectionType = collectionType
130 super().__init__(
131 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
134class Butler:
135 """Main entry point for the data access system.
137 Parameters
138 ----------
139 config : `ButlerConfig`, `Config` or `str`, optional.
140 Configuration. Anything acceptable to the
141 `ButlerConfig` constructor. If a directory path
142 is given the configuration will be read from a ``butler.yaml`` file in
143 that location. If `None` is given default values will be used.
144 butler : `Butler`, optional.
145 If provided, construct a new Butler that uses the same registry and
146 datastore as the given one, but with the given collection and run.
147 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
148 arguments.
149 collections : `str` or `Iterable` [ `str` ], optional
150 An expression specifying the collections to be searched (in order) when
151 reading datasets.
152 This may be a `str` collection name or an iterable thereof.
153 See :ref:`daf_butler_collection_expressions` for more information.
154 These collections are not registered automatically and must be
155 manually registered before they are used by any method, but they may be
156 manually registered after the `Butler` is initialized.
157 run : `str`, optional
158 Name of the `~CollectionType.RUN` collection new datasets should be
159 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
160 ``collections`` will be set to ``[run]``. If not `None`, this
161 collection will automatically be registered. If this is not set (and
162 ``writeable`` is not set either), a read-only butler will be created.
163 searchPaths : `list` of `str`, optional
164 Directory paths to search when calculating the full Butler
165 configuration. Not used if the supplied config is already a
166 `ButlerConfig`.
167 writeable : `bool`, optional
 168 Explicitly sets whether the butler supports write operations. If not
 169 provided, a read-write butler is created if ``run`` is not `None`;
 170 otherwise the butler is read-only.
171 inferDefaults : `bool`, optional
172 If `True` (default) infer default data ID values from the values
173 present in the datasets in ``collections``: if all collections have the
174 same value (or no value) for a governor dimension, that value will be
175 the default for that dimension. Nonexistent collections are ignored.
176 If a default value is provided explicitly for a governor dimension via
177 ``**kwargs``, no default will be inferred for that dimension.
178 **kwargs : `str`
179 Default data ID key-value pairs. These may only identify "governor"
180 dimensions like ``instrument`` and ``skymap``.
182 Examples
183 --------
184 While there are many ways to control exactly how a `Butler` interacts with
185 the collections in its `Registry`, the most common cases are still simple.
187 For a read-only `Butler` that searches one collection, do::
189 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
191 For a read-write `Butler` that writes to and reads from a
192 `~CollectionType.RUN` collection::
194 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
196 The `Butler` passed to a ``PipelineTask`` is often much more complex,
197 because we want to write to one `~CollectionType.RUN` collection but read
198 from several others (as well)::
200 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
201 collections=["u/alice/DM-50000/a",
202 "u/bob/DM-49998",
203 "HSC/defaults"])
205 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
206 Datasets will be read first from that run (since it appears first in the
207 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
209 Finally, one can always create a `Butler` with no collections::
211 butler = Butler("/path/to/repo", writeable=True)
213 This can be extremely useful when you just want to use ``butler.registry``,
214 e.g. for inserting dimension data or managing collections, or when the
215 collections you want to use with the butler are not consistent.
216 Passing ``writeable`` explicitly here is only necessary if you want to be
 217 able to make changes to the repo; usually the value for ``writeable`` is
 218 inferred from whether ``run`` is provided, but it defaults to `False`
 219 when no ``run`` is given.
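 As a further illustrative sketch (the collection, dataset type, and data ID
 values here are hypothetical), default data ID values for governor dimensions
 can be supplied as keyword arguments and then omitted from later calls::

     butler = Butler("/path/to/repo", collections="HSC/defaults",
                     instrument="HSC")
     raw = butler.get("raw", exposure=12345, detector=42)  # instrument defaulted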
220 """
221 def __init__(self, config: Union[Config, str, None] = None, *,
222 butler: Optional[Butler] = None,
223 collections: Any = None,
224 run: Optional[str] = None,
225 searchPaths: Optional[List[str]] = None,
226 writeable: Optional[bool] = None,
227 inferDefaults: bool = True,
228 **kwargs: str,
229 ):
230 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
231 # Load registry, datastore, etc. from config or existing butler.
232 if butler is not None:
233 if config is not None or searchPaths is not None or writeable is not None:
234 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
235 "arguments with 'butler' argument.")
236 self.registry = butler.registry.copy(defaults)
237 self.datastore = butler.datastore
238 self.storageClasses = butler.storageClasses
239 self._config: ButlerConfig = butler._config
240 else:
241 self._config = ButlerConfig(config, searchPaths=searchPaths)
242 if "root" in self._config:
243 butlerRoot = self._config["root"]
244 else:
245 butlerRoot = self._config.configDir
246 if writeable is None:
247 writeable = run is not None
248 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
249 defaults=defaults)
250 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
251 butlerRoot=butlerRoot)
252 self.storageClasses = StorageClassFactory()
253 self.storageClasses.addFromConfig(self._config)
254 if "run" in self._config or "collection" in self._config:
255 raise ValueError("Passing a run or collection via configuration is no longer supported.")
257 GENERATION: ClassVar[int] = 3
258 """This is a Generation 3 Butler.
260 This attribute may be removed in the future, once the Generation 2 Butler
261 interface has been fully retired; it should only be used in transitional
262 code.
263 """
265 @staticmethod
266 def makeRepo(root: str, config: Union[Config, str, None] = None,
267 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
268 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
269 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
270 """Create an empty data repository by adding a butler.yaml config
271 to a repository root directory.
273 Parameters
274 ----------
275 root : `str` or `ButlerURI`
276 Path or URI to the root location of the new repository. Will be
277 created if it does not exist.
278 config : `Config` or `str`, optional
279 Configuration to write to the repository, after setting any
280 root-dependent Registry or Datastore config options. Can not
281 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
282 configuration will be used. Root-dependent config options
283 specified in this config are overwritten if ``forceConfigRoot``
284 is `True`.
285 dimensionConfig : `Config` or `str`, optional
286 Configuration for dimensions, will be used to initialize registry
287 database.
288 standalone : `bool`
289 If True, write all expanded defaults, not just customized or
290 repository-specific settings.
291 This (mostly) decouples the repository from the default
292 configuration, insulating it from changes to the defaults (which
293 may be good or bad, depending on the nature of the changes).
294 Future *additions* to the defaults will still be picked up when
 295 initializing `Butler` instances for repos created with ``standalone=True``.
296 searchPaths : `list` of `str`, optional
297 Directory paths to search when calculating the full butler
298 configuration.
299 forceConfigRoot : `bool`, optional
300 If `False`, any values present in the supplied ``config`` that
301 would normally be reset are not overridden and will appear
302 directly in the output config. This allows non-standard overrides
303 of the root directory for a datastore or registry to be given.
304 If this parameter is `True` the values for ``root`` will be
305 forced into the resulting config if appropriate.
306 outfile : `str`, optional
307 If not-`None`, the output configuration will be written to this
308 location rather than into the repository itself. Can be a URI
309 string. Can refer to a directory that will be used to write
310 ``butler.yaml``.
311 overwrite : `bool`, optional
312 Create a new configuration file even if one already exists
313 in the specified output location. Default is to raise
314 an exception.
316 Returns
317 -------
318 config : `Config`
319 The updated `Config` instance written to the repo.
321 Raises
322 ------
323 ValueError
324 Raised if a ButlerConfig or ConfigSubset is passed instead of a
325 regular Config (as these subclasses would make it impossible to
326 support ``standalone=False``).
327 FileExistsError
328 Raised if the output config file already exists.
329 os.error
330 Raised if the directory does not exist, exists but is not a
331 directory, or cannot be created.
333 Notes
334 -----
335 Note that when ``standalone=False`` (the default), the configuration
336 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
337 construct the repository should also be used to construct any Butlers
338 to avoid configuration inconsistencies.
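
 Examples
 --------
 A minimal sketch (the repository path and run name are hypothetical)::

     from lsst.daf.butler import Butler

     config = Butler.makeRepo("/path/to/new/repo")
     butler = Butler("/path/to/new/repo", run="u/alice/ingest")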
339 """
340 if isinstance(config, (ButlerConfig, ConfigSubset)):
341 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
343 # Ensure that the root of the repository exists or can be made
344 uri = ButlerURI(root, forceDirectory=True)
345 uri.mkdir()
347 config = Config(config)
349 # If we are creating a new repo from scratch with relative roots,
350 # do not propagate an explicit root from the config file
351 if "root" in config:
352 del config["root"]
354 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
355 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
356 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
358 # if key exists in given config, parse it, otherwise parse the defaults
359 # in the expanded config
360 if config.get(("registry", "db")):
361 registryConfig = RegistryConfig(config)
362 else:
363 registryConfig = RegistryConfig(full)
364 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
365 if defaultDatabaseUri is not None:
366 Config.updateParameters(RegistryConfig, config, full,
367 toUpdate={"db": defaultDatabaseUri},
368 overwrite=forceConfigRoot)
369 else:
370 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
371 overwrite=forceConfigRoot)
373 if standalone:
374 config.merge(full)
375 else:
376 # Always expand the registry.managers section into the per-repo
377 # config, because after the database schema is created, it's not
378 # allowed to change anymore. Note that in the standalone=True
379 # branch, _everything_ in the config is expanded, so there's no
380 # need to special case this.
381 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
382 configURI: Union[str, ButlerURI]
383 if outfile is not None:
384 # When writing to a separate location we must include
385 # the root of the butler repo in the config else it won't know
386 # where to look.
387 config["root"] = uri.geturl()
388 configURI = outfile
389 else:
390 configURI = uri
391 config.dumpToUri(configURI, overwrite=overwrite)
393 # Create Registry and populate tables
394 registryConfig = RegistryConfig(config.get("registry"))
395 dimensionConfig = DimensionConfig(dimensionConfig)
396 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
398 return config
400 @classmethod
401 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
402 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
403 """Callable used to unpickle a Butler.
405 We prefer not to use ``Butler.__init__`` directly so we can force some
406 of its many arguments to be keyword-only (note that ``__reduce__``
407 can only invoke callables with positional arguments).
409 Parameters
410 ----------
411 config : `ButlerConfig`
412 Butler configuration, already coerced into a true `ButlerConfig`
413 instance (and hence after any search paths for overrides have been
414 utilized).
415 collections : `CollectionSearch`
416 Names of the default collections to read from.
417 run : `str`, optional
418 Name of the default `~CollectionType.RUN` collection to write to.
419 defaultDataId : `dict` [ `str`, `str` ]
420 Default data ID values.
421 writeable : `bool`
422 Whether the Butler should support write operations.
424 Returns
425 -------
426 butler : `Butler`
427 A new `Butler` instance.
428 """
429 # MyPy doesn't recognize that the kwargs below are totally valid; it
 430 # seems to think ``**defaultDataId`` is a _positional_ argument!
431 return cls(config=config, collections=collections, run=run, writeable=writeable,
432 **defaultDataId) # type: ignore
434 def __reduce__(self) -> tuple:
435 """Support pickling.
436 """
437 return (Butler._unpickle, (self._config, self.collections, self.run,
438 self.registry.defaults.dataId.byName(),
439 self.registry.isWriteable()))
441 def __str__(self) -> str:
442 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
443 self.collections, self.run, self.datastore, self.registry)
445 def isWriteable(self) -> bool:
446 """Return `True` if this `Butler` supports write operations.
447 """
448 return self.registry.isWriteable()
450 @contextlib.contextmanager
451 def transaction(self) -> Iterator[None]:
452 """Context manager supporting `Butler` transactions.
454 Transactions can be nested.
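
 A hedged sketch (``butler``, ``obj``, and the data ID values are assumed to
 exist and are hypothetical)::

     with butler.transaction():
         butler.put(obj, "someDatasetType", instrument="HSC", visit=123,
                    detector=0)
         # An exception raised before the block exits rolls back both the
         # registry insert and the datastore write.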
455 """
456 with self.registry.transaction():
457 with self.datastore.transaction():
458 yield
460 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
461 dataId: Optional[DataId] = None, **kwds: Any
462 ) -> Tuple[DatasetType, Optional[DataId]]:
463 """Standardize the arguments passed to several Butler APIs.
465 Parameters
466 ----------
467 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
 468 When `DatasetRef` is provided, ``dataId`` should be `None`.
469 Otherwise the `DatasetType` or name thereof.
470 dataId : `dict` or `DataCoordinate`
471 A `dict` of `Dimension` link name, value pairs that label the
472 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
 473 should be provided as the first argument.
474 kwds
475 Additional keyword arguments used to augment or construct a
476 `DataCoordinate`. See `DataCoordinate.standardize`
477 parameters.
479 Returns
480 -------
481 datasetType : `DatasetType`
482 A `DatasetType` instance extracted from ``datasetRefOrType``.
483 dataId : `dict` or `DataId`, optional
484 Argument that can be used (along with ``kwds``) to construct a
485 `DataId`.
487 Notes
488 -----
489 Butler APIs that conceptually need a DatasetRef also allow passing a
490 `DatasetType` (or the name of one) and a `DataId` (or a dict and
491 keyword arguments that can be used to construct one) separately. This
492 method accepts those arguments and always returns a true `DatasetType`
493 and a `DataId` or `dict`.
495 Standardization of `dict` vs `DataId` is best handled by passing the
496 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
497 generally similarly flexible.
498 """
499 externalDatasetType: Optional[DatasetType] = None
500 internalDatasetType: Optional[DatasetType] = None
501 if isinstance(datasetRefOrType, DatasetRef):
502 if dataId is not None or kwds:
503 raise ValueError("DatasetRef given, cannot use dataId as well")
504 externalDatasetType = datasetRefOrType.datasetType
505 dataId = datasetRefOrType.dataId
506 else:
507 # Don't check whether DataId is provided, because Registry APIs
508 # can usually construct a better error message when it wasn't.
509 if isinstance(datasetRefOrType, DatasetType):
510 externalDatasetType = datasetRefOrType
511 else:
512 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
514 # Check that they are self-consistent
515 if externalDatasetType is not None:
516 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
517 if externalDatasetType != internalDatasetType:
518 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
519 f"registry definition ({internalDatasetType})")
521 assert internalDatasetType is not None
522 return internalDatasetType, dataId
524 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
525 dataId: Optional[DataId] = None, *,
526 collections: Any = None,
527 allowUnresolved: bool = False,
528 **kwds: Any) -> DatasetRef:
529 """Shared logic for methods that start with a search for a dataset in
530 the registry.
532 Parameters
533 ----------
534 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
 535 When `DatasetRef` is provided, ``dataId`` should be `None`.
536 Otherwise the `DatasetType` or name thereof.
537 dataId : `dict` or `DataCoordinate`, optional
538 A `dict` of `Dimension` link name, value pairs that label the
539 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
540 should be provided as the first argument.
541 collections : Any, optional
542 Collections to be searched, overriding ``self.collections``.
543 Can be any of the types supported by the ``collections`` argument
544 to butler construction.
545 allowUnresolved : `bool`, optional
546 If `True`, return an unresolved `DatasetRef` if finding a resolved
547 one in the `Registry` fails. Defaults to `False`.
548 kwds
549 Additional keyword arguments used to augment or construct a
550 `DataId`. See `DataId` parameters.
552 Returns
553 -------
554 ref : `DatasetRef`
555 A reference to the dataset identified by the given arguments.
557 Raises
558 ------
559 LookupError
560 Raised if no matching dataset exists in the `Registry` (and
561 ``allowUnresolved is False``).
562 ValueError
563 Raised if a resolved `DatasetRef` was passed as an input, but it
564 differs from the one found in the registry.
565 TypeError
566 Raised if no collections were provided.
567 """
568 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
569 if isinstance(datasetRefOrType, DatasetRef):
570 idNumber = datasetRefOrType.id
571 else:
572 idNumber = None
573 timespan: Optional[Timespan] = None
575 # Process dimension records that are using record information
576 # rather than ids
577 newDataId: Dict[str, DataIdValue] = {}
578 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
 580 # If the entire dataId comes from keyword parameters we do not need
581 # to do anything here because they can't be of the form
582 # exposure.obs_id because a "." is not allowed in a keyword parameter.
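# Illustrative (hypothetical) example of the record-based form handled below:
# a caller might pass
#     butler.get("raw", dataId={"exposure.obs_id": "HSCA90402512"},
#                instrument="HSC", detector=50)
# and "exposure.obs_id" is then split into the dimension name ("exposure")
# and the record field ("obs_id") so it can be resolved to an exposure id.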
583 if dataId:
584 for k, v in dataId.items():
585 # If we have a Dimension we do not need to do anything
586 # because it cannot be a compound key.
587 if isinstance(k, str) and "." in k:
588 # Someone is using a more human-readable dataId
589 dimensionName, record = k.split(".", 1)
590 byRecord[dimensionName][record] = v
591 elif isinstance(k, Dimension):
592 newDataId[k.name] = v
593 else:
594 newDataId[k] = v
596 # Go through the updated dataId and check the type in case someone is
597 # using an alternate key. We have already filtered out the compound
598 # keys dimensions.record format.
599 not_dimensions = {}
601 # Will need to look in the dataId and the keyword arguments
602 # and will remove them if they need to be fixed or are unrecognized.
603 for dataIdDict in (newDataId, kwds):
604 # Use a list so we can adjust the dict safely in the loop
605 for dimensionName in list(dataIdDict):
606 value = dataIdDict[dimensionName]
607 try:
608 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
609 except KeyError:
610 # This is not a real dimension
611 not_dimensions[dimensionName] = value
612 del dataIdDict[dimensionName]
613 continue
615 # Convert an integral type to an explicit int to simplify
616 # comparisons here
617 if isinstance(value, numbers.Integral):
618 value = int(value)
620 if not isinstance(value, dimension.primaryKey.getPythonType()):
621 for alternate in dimension.alternateKeys:
622 if isinstance(value, alternate.getPythonType()):
623 byRecord[dimensionName][alternate.name] = value
624 del dataIdDict[dimensionName]
625 log.debug("Converting dimension %s to %s.%s=%s",
626 dimensionName, dimensionName, alternate.name, value)
627 break
628 else:
629 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
630 "Could not find matching alternative (primary key has type %s) "
631 "so attempting to use as-is.",
632 value, dimensionName, dimension.primaryKey.getPythonType())
634 # If we have some unrecognized dimensions we have to try to connect
635 # them to records in other dimensions. This is made more complicated
636 # by some dimensions having records with clashing names. A mitigation
637 # is that we can tell by this point which dimensions are missing
638 # for the DatasetType but this does not work for calibrations
639 # where additional dimensions can be used to constrain the temporal
640 # axis.
641 if not_dimensions:
642 # Calculate missing dimensions
643 provided = set(newDataId) | set(kwds) | set(byRecord)
644 missingDimensions = datasetType.dimensions.names - provided
646 # For calibrations we may well be needing temporal dimensions
647 # so rather than always including all dimensions in the scan
648 # restrict things a little. It is still possible for there
649 # to be confusion over day_obs in visit vs exposure for example.
650 # If we are not searching calibration collections things may
651 # fail but they are going to fail anyway because of the
652 # ambiguousness of the dataId...
653 candidateDimensions: Set[str] = set()
654 candidateDimensions.update(missingDimensions)
655 if datasetType.isCalibration():
656 for dim in self.registry.dimensions.getStaticDimensions():
657 if dim.temporal:
658 candidateDimensions.add(str(dim))
660 # Look up table for the first association with a dimension
661 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
663 # Keep track of whether an item is associated with multiple
664 # dimensions.
665 counter: Counter[str] = Counter()
666 assigned: Dict[str, Set[str]] = defaultdict(set)
668 # Go through the missing dimensions and associate the
669 # given names with records within those dimensions
670 for dimensionName in candidateDimensions:
671 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
672 fields = dimension.metadata.names | dimension.uniqueKeys.names
673 for field in not_dimensions:
674 if field in fields:
675 guessedAssociation[dimensionName][field] = not_dimensions[field]
676 counter[dimensionName] += 1
677 assigned[field].add(dimensionName)
679 # There is a chance we have allocated a single dataId item
680 # to multiple dimensions. Need to decide which should be retained.
681 # For now assume that the most popular alternative wins.
682 # This means that day_obs with seq_num will result in
683 # exposure.day_obs and not visit.day_obs
684 # Also prefer an explicitly missing dimension over an inferred
685 # temporal dimension.
686 for fieldName, assignedDimensions in assigned.items():
687 if len(assignedDimensions) > 1:
688 # Pick the most popular (preferring mandatory dimensions)
689 requiredButMissing = assignedDimensions.intersection(missingDimensions)
690 if requiredButMissing:
691 candidateDimensions = requiredButMissing
692 else:
693 candidateDimensions = assignedDimensions
695 # Select the relevant items and get a new restricted
696 # counter.
697 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
698 duplicatesCounter: Counter[str] = Counter()
699 duplicatesCounter.update(theseCounts)
701 # Choose the most common. If they are equally common
702 # we will pick the one that was found first.
703 # Returns a list of tuples
704 selected = duplicatesCounter.most_common(1)[0][0]
706 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
707 " Removed ambiguity by choosing dimension %s.",
708 fieldName, ", ".join(assignedDimensions), selected)
710 for candidateDimension in assignedDimensions:
711 if candidateDimension != selected:
712 del guessedAssociation[candidateDimension][fieldName]
714 # Update the record look up dict with the new associations
715 for dimensionName, values in guessedAssociation.items():
716 if values: # A dict might now be empty
717 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
718 dimensionName, values)
719 byRecord[dimensionName].update(values)
721 if byRecord:
722 # Some record specifiers were found so we need to convert
723 # them to the Id form
724 for dimensionName, values in byRecord.items():
725 if dimensionName in newDataId:
726 log.warning("DataId specified explicit %s dimension value of %s in addition to"
727 " general record specifiers for it of %s. Ignoring record information.",
728 dimensionName, newDataId[dimensionName], str(values))
729 continue
731 # Build up a WHERE expression -- use single quotes
732 def quote(s: Any) -> str:
733 if isinstance(s, str):
734 return f"'{s}'"
735 else:
736 return s
738 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
739 for k, v in values.items())
741 # Hopefully we get a single record that matches
742 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
743 where=where, **kwds))
745 if len(records) != 1:
746 if len(records) > 1:
747 log.debug("Received %d records from constraints of %s", len(records), str(values))
748 for r in records:
749 log.debug("- %s", str(r))
750 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
751 f" uniquely constrained to a single dataset by {values}."
752 f" Got {len(records)} results.")
753 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
754 f" records when constrained by {values}")
756 # Get the primary key from the real dimension object
757 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
758 if not isinstance(dimension, Dimension):
759 raise RuntimeError(
760 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
761 )
762 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
764 # We have modified the dataId so need to switch to it
765 dataId = newDataId
767 if datasetType.isCalibration():
 768 # Because this is a calibration dataset, first try to standardize the
 769 # data ID without restricting the dimensions to
770 # those of the dataset type requested, because there may be extra
771 # dimensions that provide temporal information for a validity-range
772 # lookup.
773 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
774 defaults=self.registry.defaults.dataId, **kwds)
775 if dataId.graph.temporal:
776 dataId = self.registry.expandDataId(dataId)
777 timespan = dataId.timespan
778 else:
779 # Standardize the data ID to just the dimensions of the dataset
780 # type instead of letting registry.findDataset do it, so we get the
781 # result even if no dataset is found.
782 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
783 defaults=self.registry.defaults.dataId, **kwds)
784 # Always lookup the DatasetRef, even if one is given, to ensure it is
785 # present in the current collection.
786 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
787 if ref is None:
788 if allowUnresolved:
789 return DatasetRef(datasetType, dataId)
790 else:
791 if collections is None:
792 collections = self.registry.defaults.collections
793 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
794 f"could not be found in collections {collections}.")
795 if idNumber is not None and idNumber != ref.id:
796 if collections is None:
797 collections = self.registry.defaults.collections
798 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
799 f"id ({ref.id}) in registry in collections {collections}.")
800 return ref
802 @transactional
803 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
804 dataId: Optional[DataId] = None, *,
805 run: Optional[str] = None,
806 **kwds: Any) -> DatasetRef:
807 """Store and register a dataset.
809 Parameters
810 ----------
811 obj : `object`
812 The dataset.
813 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
814 When `DatasetRef` is provided, ``dataId`` should be `None`.
815 Otherwise the `DatasetType` or name thereof.
816 dataId : `dict` or `DataCoordinate`
817 A `dict` of `Dimension` link name, value pairs that label the
818 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
819 should be provided as the second argument.
820 run : `str`, optional
821 The name of the run the dataset should be added to, overriding
822 ``self.run``.
823 kwds
824 Additional keyword arguments used to augment or construct a
825 `DataCoordinate`. See `DataCoordinate.standardize`
826 parameters.
828 Returns
829 -------
830 ref : `DatasetRef`
831 A reference to the stored dataset, updated with the correct id if
832 given.
834 Raises
835 ------
836 TypeError
837 Raised if the butler is read-only or if no run has been provided.
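
 Examples
 --------
 A minimal sketch (``image`` is an in-memory object matching the dataset
 type's storage class; names and values are hypothetical)::

     ref = butler.put(image, "calexp", instrument="HSC", visit=12345,
                      detector=42, run="u/alice/processing")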
838 """
839 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
840 if not self.isWriteable():
841 raise TypeError("Butler is read-only.")
842 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
843 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
844 raise ValueError("DatasetRef must not be in registry, must have None id")
846 # Add Registry Dataset entry.
847 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
848 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
850 # Add Datastore entry.
851 self.datastore.put(obj, ref)
853 return ref
855 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
856 """Retrieve a stored dataset.
858 Unlike `Butler.get`, this method allows datasets outside the Butler's
859 collection to be read as long as the `DatasetRef` that identifies them
860 can be obtained separately.
862 Parameters
863 ----------
864 ref : `DatasetRef`
865 Resolved reference to an already stored dataset.
866 parameters : `dict`
867 Additional StorageClass-defined options to control reading,
868 typically used to efficiently read only a subset of the dataset.
870 Returns
871 -------
872 obj : `object`
873 The dataset.
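
 Examples
 --------
 A hedged sketch, assuming the resolved reference comes from a registry query
 (dataset type and collection name are hypothetical)::

     ref = next(iter(butler.registry.queryDatasets("calexp",
                                                   collections="u/alice/run")))
     calexp = butler.getDirect(ref)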
874 """
875 return self.datastore.get(ref, parameters=parameters)
877 def getDirectDeferred(self, ref: DatasetRef, *,
878 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
879 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
880 from a resolved `DatasetRef`.
882 Parameters
883 ----------
884 ref : `DatasetRef`
885 Resolved reference to an already stored dataset.
886 parameters : `dict`
887 Additional StorageClass-defined options to control reading,
888 typically used to efficiently read only a subset of the dataset.
890 Returns
891 -------
892 obj : `DeferredDatasetHandle`
893 A handle which can be used to retrieve a dataset at a later time.
895 Raises
896 ------
897 AmbiguousDatasetError
898 Raised if ``ref.id is None``, i.e. the reference is unresolved.
899 """
900 if ref.id is None:
901 raise AmbiguousDatasetError(
902 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
903 )
904 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
906 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
907 dataId: Optional[DataId] = None, *,
908 parameters: Union[dict, None] = None,
909 collections: Any = None,
910 **kwds: Any) -> DeferredDatasetHandle:
911 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
912 after an immediate registry lookup.
914 Parameters
915 ----------
916 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
 917 When `DatasetRef` is provided, ``dataId`` should be `None`.
918 Otherwise the `DatasetType` or name thereof.
919 dataId : `dict` or `DataCoordinate`, optional
920 A `dict` of `Dimension` link name, value pairs that label the
921 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
922 should be provided as the first argument.
923 parameters : `dict`
924 Additional StorageClass-defined options to control reading,
925 typically used to efficiently read only a subset of the dataset.
926 collections : Any, optional
927 Collections to be searched, overriding ``self.collections``.
928 Can be any of the types supported by the ``collections`` argument
929 to butler construction.
930 kwds
931 Additional keyword arguments used to augment or construct a
932 `DataId`. See `DataId` parameters.
934 Returns
935 -------
936 obj : `DeferredDatasetHandle`
937 A handle which can be used to retrieve a dataset at a later time.
939 Raises
940 ------
941 LookupError
 942 Raised if no matching dataset exists in the `Registry`.
944 ValueError
945 Raised if a resolved `DatasetRef` was passed as an input, but it
946 differs from the one found in the registry.
947 TypeError
948 Raised if no collections were provided.
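
 Examples
 --------
 A minimal sketch (dataset type and data ID values are hypothetical)::

     handle = butler.getDeferred("calexp", instrument="HSC", visit=12345,
                                 detector=42)
     calexp = handle.get()  # the actual read happens here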
949 """
950 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
951 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
953 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
954 dataId: Optional[DataId] = None, *,
955 parameters: Optional[Dict[str, Any]] = None,
956 collections: Any = None,
957 **kwds: Any) -> Any:
958 """Retrieve a stored dataset.
960 Parameters
961 ----------
962 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
 963 When `DatasetRef` is provided, ``dataId`` should be `None`.
964 Otherwise the `DatasetType` or name thereof.
965 dataId : `dict` or `DataCoordinate`
966 A `dict` of `Dimension` link name, value pairs that label the
967 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
968 should be provided as the first argument.
969 parameters : `dict`
970 Additional StorageClass-defined options to control reading,
971 typically used to efficiently read only a subset of the dataset.
972 collections : Any, optional
973 Collections to be searched, overriding ``self.collections``.
974 Can be any of the types supported by the ``collections`` argument
975 to butler construction.
976 kwds
977 Additional keyword arguments used to augment or construct a
978 `DataCoordinate`. See `DataCoordinate.standardize`
979 parameters.
981 Returns
982 -------
983 obj : `object`
984 The dataset.
986 Raises
987 ------
988 ValueError
989 Raised if a resolved `DatasetRef` was passed as an input, but it
990 differs from the one found in the registry.
991 LookupError
992 Raised if no matching dataset exists in the `Registry`.
993 TypeError
994 Raised if no collections were provided.
996 Notes
997 -----
998 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
999 this method requires that the given data ID include temporal dimensions
1000 beyond the dimensions of the dataset type itself, in order to find the
1001 dataset with the appropriate validity range. For example, a "bias"
1002 dataset with native dimensions ``{instrument, detector}`` could be
1003 fetched with a ``{instrument, detector, exposure}`` data ID, because
1004 ``exposure`` is a temporal dimension.
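
 For illustration, a hedged sketch of both the ordinary and the calibration
 case (all names and values are hypothetical)::

     calexp = butler.get("calexp", instrument="HSC", visit=12345, detector=42)
     bias = butler.get("bias", instrument="HSC", exposure=12345, detector=42,
                       collections="HSC/calib")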
1005 """
1006 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1007 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1008 return self.getDirect(ref, parameters=parameters)
1010 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1011 dataId: Optional[DataId] = None, *,
1012 predict: bool = False,
1013 collections: Any = None,
1014 run: Optional[str] = None,
1015 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1016 """Returns the URIs associated with the dataset.
1018 Parameters
1019 ----------
1020 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
 1021 When `DatasetRef` is provided, ``dataId`` should be `None`.
1022 Otherwise the `DatasetType` or name thereof.
1023 dataId : `dict` or `DataCoordinate`
1024 A `dict` of `Dimension` link name, value pairs that label the
1025 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1026 should be provided as the first argument.
1027 predict : `bool`
1028 If `True`, allow URIs to be returned of datasets that have not
1029 been written.
1030 collections : Any, optional
1031 Collections to be searched, overriding ``self.collections``.
1032 Can be any of the types supported by the ``collections`` argument
1033 to butler construction.
1034 run : `str`, optional
1035 Run to use for predictions, overriding ``self.run``.
1036 kwds
1037 Additional keyword arguments used to augment or construct a
1038 `DataCoordinate`. See `DataCoordinate.standardize`
1039 parameters.
1041 Returns
1042 -------
1043 primary : `ButlerURI`
1044 The URI to the primary artifact associated with this dataset.
1045 If the dataset was disassembled within the datastore this
1046 may be `None`.
1047 components : `dict`
1048 URIs to any components associated with the dataset artifact.
1049 Can be empty if there are no components.
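
 Examples
 --------
 A hedged sketch for a dataset that the datastore may have disassembled
 (dataset type and data ID values are hypothetical)::

     primary, components = butler.getURIs("calexp", instrument="HSC",
                                          visit=12345, detector=42)
     for name, uri in components.items():
         print(name, uri.geturl())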
1050 """
1051 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1052 collections=collections, **kwds)
1053 if ref.id is None: # only possible if predict is True
1054 if run is None:
1055 run = self.run
1056 if run is None:
1057 raise TypeError("Cannot predict location with run=None.")
1058 # Lie about ID, because we can't guess it, and only
1059 # Datastore.getURIs() will ever see it (and it doesn't use it).
1060 ref = ref.resolved(id=0, run=run)
1061 return self.datastore.getURIs(ref, predict)
1063 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1064 dataId: Optional[DataId] = None, *,
1065 predict: bool = False,
1066 collections: Any = None,
1067 run: Optional[str] = None,
1068 **kwds: Any) -> ButlerURI:
1069 """Return the URI to the Dataset.
1071 Parameters
1072 ----------
1073 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
 1074 When `DatasetRef` is provided, ``dataId`` should be `None`.
1075 Otherwise the `DatasetType` or name thereof.
1076 dataId : `dict` or `DataCoordinate`
1077 A `dict` of `Dimension` link name, value pairs that label the
1078 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1079 should be provided as the first argument.
1080 predict : `bool`
1081 If `True`, allow URIs to be returned of datasets that have not
1082 been written.
1083 collections : Any, optional
1084 Collections to be searched, overriding ``self.collections``.
1085 Can be any of the types supported by the ``collections`` argument
1086 to butler construction.
1087 run : `str`, optional
1088 Run to use for predictions, overriding ``self.run``.
1089 kwds
1090 Additional keyword arguments used to augment or construct a
1091 `DataCoordinate`. See `DataCoordinate.standardize`
1092 parameters.
1094 Returns
1095 -------
1096 uri : `ButlerURI`
1097 URI pointing to the Dataset within the datastore. If the
1098 Dataset does not exist in the datastore, and if ``predict`` is
1099 `True`, the URI will be a prediction and will include a URI
1100 fragment "#predicted".
1101 If the datastore does not have entities that relate well
1102 to the concept of a URI the returned URI string will be
1103 descriptive. The returned URI is not guaranteed to be obtainable.
1105 Raises
1106 ------
1107 LookupError
1108 A URI has been requested for a dataset that does not exist and
1109 guessing is not allowed.
1110 ValueError
1111 Raised if a resolved `DatasetRef` was passed as an input, but it
1112 differs from the one found in the registry.
1113 TypeError
1114 Raised if no collections were provided.
1115 RuntimeError
1116 Raised if a URI is requested for a dataset that consists of
1117 multiple artifacts.
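
 Examples
 --------
 A minimal sketch (dataset type and data ID values are hypothetical)::

     uri = butler.getURI("calexp", instrument="HSC", visit=12345, detector=42)
     print(uri.geturl())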
1118 """
1119 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1120 collections=collections, run=run, **kwds)
1122 if primary is None or components:
1123 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1124 "Use Butler.getURIs() instead.")
1125 return primary
1127 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1128 dataId: Optional[DataId] = None, *,
1129 collections: Any = None,
1130 **kwds: Any) -> bool:
1131 """Return True if the Dataset is actually present in the Datastore.
1133 Parameters
1134 ----------
1135 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
 1136 When `DatasetRef` is provided, ``dataId`` should be `None`.
1137 Otherwise the `DatasetType` or name thereof.
1138 dataId : `dict` or `DataCoordinate`
1139 A `dict` of `Dimension` link name, value pairs that label the
1140 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1141 should be provided as the first argument.
1142 collections : Any, optional
1143 Collections to be searched, overriding ``self.collections``.
1144 Can be any of the types supported by the ``collections`` argument
1145 to butler construction.
1146 kwds
1147 Additional keyword arguments used to augment or construct a
1148 `DataCoordinate`. See `DataCoordinate.standardize`
1149 parameters.
1151 Raises
1152 ------
1153 LookupError
1154 Raised if the dataset is not even present in the Registry.
1155 ValueError
1156 Raised if a resolved `DatasetRef` was passed as an input, but it
1157 differs from the one found in the registry.
1158 TypeError
1159 Raised if no collections were provided.
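
 Examples
 --------
 A minimal sketch (dataset type and data ID values are hypothetical)::

     if butler.datasetExists("calexp", instrument="HSC", visit=12345,
                             detector=42):
         calexp = butler.get("calexp", instrument="HSC", visit=12345,
                             detector=42)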
1160 """
1161 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1162 return self.datastore.exists(ref)
1164 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1165 """Remove one or more `~CollectionType.RUN` collections and the
1166 datasets within them.
1168 Parameters
1169 ----------
1170 names : `Iterable` [ `str` ]
1171 The names of the collections to remove.
1172 unstore : `bool`, optional
1173 If `True` (default), delete datasets from all datastores in which
 1174 they are present, and attempt to roll back the registry deletions if
1175 datastore deletions fail (which may not always be possible). If
1176 `False`, datastore records for these datasets are still removed,
1177 but any artifacts (e.g. files) will not be.
1179 Raises
1180 ------
1181 TypeError
1182 Raised if one or more collections are not of type
1183 `~CollectionType.RUN`.
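
 Examples
 --------
 A minimal sketch (run names are hypothetical)::

     butler.removeRuns(["u/alice/scratch-1", "u/alice/scratch-2"])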
1184 """
1185 if not self.isWriteable():
1186 raise TypeError("Butler is read-only.")
1187 names = list(names)
1188 refs: List[DatasetRef] = []
1189 for name in names:
1190 collectionType = self.registry.getCollectionType(name)
1191 if collectionType is not CollectionType.RUN:
1192 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1193 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1194 with self.registry.transaction():
1195 if unstore:
1196 for ref in refs:
1197 if self.datastore.exists(ref):
1198 self.datastore.trash(ref)
1199 else:
1200 self.datastore.forget(refs)
1201 for name in names:
1202 self.registry.removeCollection(name)
1203 if unstore:
1204 # Point of no return for removing artifacts
1205 self.datastore.emptyTrash()
1207 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False) -> None:
1208 """Remove a collection and possibly prune datasets within it.
1210 Parameters
1211 ----------
1212 name : `str`
1213 Name of the collection to remove. If this is a
1214 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1215 datasets within the collection are not modified unless ``unstore``
1216 is `True`. If this is a `~CollectionType.RUN` collection,
1217 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1218 are fully removed from the data repository.
1219 purge : `bool`, optional
1220 If `True`, permit `~CollectionType.RUN` collections to be removed,
1221 fully removing datasets within them. Requires ``unstore=True`` as
1222 well as an added precaution against accidental deletion. Must be
1223 `False` (default) if the collection is not a ``RUN``.
 1224 unstore : `bool`, optional
1225 If `True`, remove all datasets in the collection from all
1226 datastores in which they appear.
1228 Raises
1229 ------
1230 TypeError
1231 Raised if the butler is read-only or arguments are mutually
1232 inconsistent.
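
 Examples
 --------
 A hedged sketch (collection names are hypothetical)::

     # Remove a TAGGED or CHAINED collection, leaving its datasets alone.
     butler.pruneCollection("u/alice/my-tagged")
     # Fully remove a RUN collection and the datasets stored in it.
     butler.pruneCollection("u/alice/old-run", purge=True, unstore=True)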
1233 """
1235 # See pruneDatasets comments for more information about the logic here;
1236 # the cases are almost the same, but here we can rely on Registry to
 1237 # take care of everything but Datastore deletion when we remove the
1238 # collection.
1239 if not self.isWriteable():
1240 raise TypeError("Butler is read-only.")
1241 collectionType = self.registry.getCollectionType(name)
1242 if purge and not unstore:
1243 raise PurgeWithoutUnstorePruneCollectionsError()
1244 if collectionType is CollectionType.RUN and not purge:
1245 raise RunWithoutPurgePruneCollectionsError(collectionType)
1246 if collectionType is not CollectionType.RUN and purge:
1247 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1249 with self.registry.transaction():
1250 if unstore:
1251 for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
1252 if self.datastore.exists(ref):
1253 self.datastore.trash(ref)
1254 self.registry.removeCollection(name)
1255 if unstore:
1256 # Point of no return for removing artifacts
1257 self.datastore.emptyTrash()
1259 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1260 disassociate: bool = True,
1261 unstore: bool = False,
1262 tags: Iterable[str] = (),
1263 purge: bool = False,
1264 run: Optional[str] = None) -> None:
1265 """Remove one or more datasets from a collection and/or storage.
1267 Parameters
1268 ----------
1269 refs : `~collections.abc.Iterable` of `DatasetRef`
1270 Datasets to prune. These must be "resolved" references (not just
1271 a `DatasetType` and data ID).
1272 disassociate : `bool`, optional
1273 Disassociate pruned datasets from ``tags``, or from all collections
1274 if ``purge=True``.
1275 unstore : `bool`, optional
1276 If `True` (`False` is default) remove these datasets from all
1277 datastores known to this butler. Note that this will make it
1278 impossible to retrieve these datasets even via other collections.
 1279 Datasets that are not currently stored are ignored by this option.
1280 tags : `Iterable` [ `str` ], optional
1281 `~CollectionType.TAGGED` collections to disassociate the datasets
1282 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1283 `True`.
1284 purge : `bool`, optional
1285 If `True` (`False` is default), completely remove the dataset from
1286 the `Registry`. To prevent accidental deletions, ``purge`` may
1287 only be `True` if all of the following conditions are met:
 1289 - All given datasets are in the given run;
1290 - ``disassociate`` is `True`;
1291 - ``unstore`` is `True`.
1293 This mode may remove provenance information from datasets other
1294 than those provided, and should be used with extreme care.
1296 Raises
1297 ------
1298 TypeError
1299 Raised if the butler is read-only, if no collection was provided,
1300 or the conditions for ``purge=True`` were not met.
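
 Examples
 --------
 A hedged sketch (dataset type and collection names are hypothetical)::

     refs = list(butler.registry.queryDatasets("calexp",
                                               collections="u/alice/tagged"))
     # Remove the datasets from the TAGGED collection only; the stored
     # artifacts and registry entries are retained.
     butler.pruneDatasets(refs, disassociate=True, tags=["u/alice/tagged"])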
1301 """
1302 if not self.isWriteable():
1303 raise TypeError("Butler is read-only.")
1304 if purge:
1305 if not disassociate:
1306 raise TypeError("Cannot pass purge=True without disassociate=True.")
1307 if not unstore:
1308 raise TypeError("Cannot pass purge=True without unstore=True.")
1309 elif disassociate:
1310 tags = tuple(tags)
1311 if not tags:
1312 raise TypeError("No tags provided but disassociate=True.")
1313 for tag in tags:
1314 collectionType = self.registry.getCollectionType(tag)
1315 if collectionType is not CollectionType.TAGGED:
1316 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1317 f"of non-TAGGED type {collectionType.name}.")
1318 # Transform possibly-single-pass iterable into something we can iterate
1319 # over multiple times.
1320 refs = list(refs)
1321 # Pruning a component of a DatasetRef makes no sense since registry
1322 # doesn't know about components and datastore might not store
1323 # components in a separate file
1324 for ref in refs:
1325 if ref.datasetType.component():
1326 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1327 # We don't need an unreliable Datastore transaction for this, because
1328 # we've been extra careful to ensure that Datastore.trash only involves
1329 # mutating the Registry (it can _look_ at Datastore-specific things,
1330 # but shouldn't change them), and hence all operations here are
1331 # Registry operations.
1332 with self.registry.transaction():
1333 if unstore:
1334 for ref in refs:
1335 # There is a difference between a concrete composite
1336 # and virtual composite. In a virtual composite the
1337 # datastore is never given the top level DatasetRef. In
1338 # the concrete composite the datastore knows all the
1339 # refs and will clean up itself if asked to remove the
1340 # parent ref. We can not check configuration for this
1341 # since we can not trust that the configuration is the
1342 # same. We therefore have to ask if the ref exists or
1343 # not. This is consistent with the fact that we want
1344 # to ignore already-removed-from-datastore datasets
1345 # anyway.
1346 if self.datastore.exists(ref):
1347 self.datastore.trash(ref)
1348 if purge:
1349 self.registry.removeDatasets(refs)
1350 elif disassociate:
1351 assert tags, "Guaranteed by earlier logic in this function."
1352 for tag in tags:
1353 self.registry.disassociate(tag, refs)
1354 # We've exited the Registry transaction, and apparently committed.
1355 # (if there was an exception, everything rolled back, and it's as if
1356 # nothing happened - and we never get here).
1357 # Datastore artifacts are not yet gone, but they're clearly marked
1358 # as trash, so if we fail to delete now because of (e.g.) filesystem
1359 # problems we can try again later, and if manual administrative
1360 # intervention is required, it's pretty clear what that should entail:
1361 # deleting everything on disk and in private Datastore tables that is
1362 # in the dataset_location_trash table.
1363 if unstore:
1364 # Point of no return for removing artifacts
1365 self.datastore.emptyTrash()
1367 @transactional
1368 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1369 ) -> None:
1370 """Store and register one or more datasets that already exist on disk.
1372 Parameters
1373 ----------
1374 datasets : `FileDataset`
1375 Each positional argument is a struct containing information about
1376 a file to be ingested, including its path (either absolute or
1377 relative to the datastore root, if applicable), a `DatasetRef`,
1378 and optionally a formatter class or its fully-qualified string
1379 name. If a formatter is not provided, the formatter that would be
1380 used for `put` is assumed. On successful return, all
1381 `FileDataset.ref` attributes will have their `DatasetRef.id`
1382 attribute populated and all `FileDataset.formatter` attributes will
1383 be set to the formatter class used. `FileDataset.path` attributes
1384 may be modified to put paths in whatever the datastore considers a
1385 standardized form.
1386 transfer : `str`, optional
1387 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1388 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer
1389 the file.
1390 run : `str`, optional
1391 The name of the run ingested datasets should be added to,
1392 overriding ``self.run``.
1394 Raises
1395 ------
1396 TypeError
1397 Raised if the butler is read-only or if no run was provided.
1398 NotImplementedError
1399 Raised if the `Datastore` does not support the given transfer mode.
1400 DatasetTypeNotSupportedError
1401 Raised if one or more files to be ingested have a dataset type that
 1402 is not supported by the `Datastore`.
1403 FileNotFoundError
1404 Raised if one of the given files does not exist.
1405 FileExistsError
1406 Raised if transfer is not `None` but the (internal) location the
1407 file would be moved to is already occupied.
1409 Notes
1410 -----
1411 This operation is not fully exception safe: if a database operation
1412 fails, the given `FileDataset` instances may be only partially updated.
1414 It is atomic in terms of database operations (they will either all
1415 succeed or all fail) providing the database engine implements
1416 transactions correctly. It will attempt to be atomic in terms of
1417 filesystem operations as well, but this cannot be implemented
1418 rigorously for most datastores.
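Examples
--------
A minimal sketch of a typical call; ``ref`` is assumed to be an
unresolved `DatasetRef` for an already-registered dataset type, and the
file path and run name are hypothetical::

    dataset = FileDataset(path="/data/raw_0001.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy", run="ingest/run")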
1419 """
1420 if not self.isWriteable():
1421 raise TypeError("Butler is read-only.")
1422 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1423 # Reorganize the inputs so they're grouped by DatasetType and then
1424 # data ID. We also include a list of DatasetRefs for each FileDataset
1425 # to hold the resolved DatasetRefs returned by the Registry, before
1426 # it's safe to swap them into FileDataset.refs.
1427 # Some type annotation aliases to make that clearer:
1428 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1429 GroupedData = MutableMapping[DatasetType, GroupForType]
1430 # The actual data structure:
1431 groupedData: GroupedData = defaultdict(dict)
1432 # And the nested loop that populates it:
1433 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1434 # This list is intentionally shared across the inner loop, since it's
1435 # associated with `dataset`.
1436 resolvedRefs: List[DatasetRef] = []
1437 for ref in dataset.refs:
1438 if ref.dataId in groupedData[ref.datasetType]:
1439 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same"
1440 " DataId as another ingest dataset"
1441 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1442 f" ({ref.dataId})")
1443 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1445 # Now we can bulk-insert into Registry for each DatasetType.
1446 allResolvedRefs: List[DatasetRef] = []
1447 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(),
1448 desc="Bulk-inserting datasets by type"):
1449 refs = self.registry.insertDatasets(datasetType,
1450 dataIds=groupForType.keys(),
1451 run=run)
1452 # Append those resolved DatasetRefs to the new lists we set up for
1453 # them.
1454 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1455 resolvedRefs.append(ref)
1457 # Go back to the original FileDatasets to replace their refs with the
1458 # new resolved ones, and also build a big list of all refs.
1460 for groupForType in progress.iter_chunks(groupedData.values(),
1461 desc="Reassociating resolved dataset refs with files"):
1462 for dataset, resolvedRefs in groupForType.values():
1463 dataset.refs = resolvedRefs
1464 allResolvedRefs.extend(resolvedRefs)
1466 # Bulk-insert everything into Datastore.
1467 self.datastore.ingest(*datasets, transfer=transfer)
1469 @contextlib.contextmanager
1470 def export(self, *, directory: Optional[str] = None,
1471 filename: Optional[str] = None,
1472 format: Optional[str] = None,
1473 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1474 """Export datasets from the repository represented by this `Butler`.
1476 This method is a context manager that returns a helper object
1477 (`RepoExportContext`) that is used to indicate what information from
1478 the repository should be exported.
1480 Parameters
1481 ----------
1482 directory : `str`, optional
1483 Directory dataset files should be written to if ``transfer`` is not
1484 `None`.
1485 filename : `str`, optional
1486 Name for the file that will include database information associated
1487 with the exported datasets. If this is not an absolute path and
1488 ``directory`` is not `None`, it will be written to ``directory``
1489 instead of the current working directory. Defaults to
1490 "export.{format}".
1491 format : `str`, optional
1492 File format for the database information file. If `None`, the
1493 extension of ``filename`` will be used.
1494 transfer : `str`, optional
1495 Transfer mode passed to `Datastore.export`.
1497 Raises
1498 ------
1499 TypeError
1500 Raised if the set of arguments passed is inconsistent.
1502 Examples
1503 --------
1504 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1505 methods are used to provide the iterables over data IDs and/or datasets
1506 to be exported::
1508 with butler.export(filename="exports.yaml") as export:
1509 # Export all flats, but none of the dimension element rows
1510 # (i.e. data ID information) associated with them.
1511 export.saveDatasets(butler.registry.queryDatasets("flat"),
1512 elements=())
1513 # Export all datasets that start with "deepCoadd_" and all of
1514 # their associated data ID information.
1515 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1516 """
1517 if directory is None and transfer is not None:
1518 raise TypeError("Cannot transfer without providing a directory.")
1519 if transfer == "move":
1520 raise TypeError("Transfer may not be 'move': export is read-only")
1521 if format is None:
1522 if filename is None:
1523 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1524 else:
1525 _, format = os.path.splitext(filename)
1526 elif filename is None:
1527 filename = f"export.{format}"
1528 if directory is not None:
1529 filename = os.path.join(directory, filename)
1530 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1531 with open(filename, 'w') as stream:
1532 backend = BackendClass(stream)
1533 try:
1534 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1535 directory=directory, transfer=transfer)
1536 yield helper
1537 except BaseException:
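# The caller's code inside the context raised: re-raise without
# calling _finish(), so the incomplete export file is not finalized.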
1538 raise
1539 else:
1540 helper._finish()
1542 def import_(self, *, directory: Optional[str] = None,
1543 filename: Union[str, TextIO, None] = None,
1544 format: Optional[str] = None,
1545 transfer: Optional[str] = None,
1546 skip_dimensions: Optional[Set] = None) -> None:
1547 """Import datasets into this repository that were exported from a
1548 different butler repository via `~lsst.daf.butler.Butler.export`.
1550 Parameters
1551 ----------
1552 directory : `str`, optional
1553 Directory containing dataset files to import from. If `None`,
1554 ``filename`` and all dataset file paths specified therein must
1555 be absolute.
1556 filename : `str` or `TextIO`, optional
1557 A stream or name of file that contains database information
1558 associated with the exported datasets, typically generated by
1559 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1560 is not an absolute path, does not exist in the current working
1561 directory, and ``directory`` is not `None`, it is assumed to be in
1562 ``directory``. Defaults to "export.{format}".
1563 format : `str`, optional
1564 File format for ``filename``. If `None`, the extension of
1565 ``filename`` will be used.
1566 transfer : `str`, optional
1567 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1568 skip_dimensions : `set`, optional
1569 Names of dimensions that should be skipped and not imported.
1571 Raises
1572 ------
1573 TypeError
1574 Raised if the set of arguments passed is inconsistent, or if the
1575 butler is read-only.
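Examples
--------
A sketch of a typical call; the directory and file names here are
hypothetical::

    butler.import_(directory="/path/to/export", filename="export.yaml",
                   transfer="auto")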
1576 """
1577 if not self.isWriteable():
1578 raise TypeError("Butler is read-only.")
1579 if format is None:
1580 if filename is None:
1581 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1582 else:
1583 _, format = os.path.splitext(filename) # type: ignore
1584 elif filename is None:
1585 filename = f"export.{format}"
1586 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1587 filename = os.path.join(directory, filename)
1588 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1590 def doImport(importStream: TextIO) -> None:
1591 backend = BackendClass(importStream, self.registry)
1592 backend.register()
1593 with self.transaction():
1594 backend.load(self.datastore, directory=directory, transfer=transfer,
1595 skip_dimensions=skip_dimensions)
1597 if isinstance(filename, str):
1598 with open(filename, "r") as stream:
1599 doImport(stream)
1600 else:
1601 doImport(filename)
1603 def validateConfiguration(self, logFailures: bool = False,
1604 datasetTypeNames: Optional[Iterable[str]] = None,
1605 ignore: Optional[Iterable[str]] = None) -> None:
1606 """Validate butler configuration.
1608 Checks that each `DatasetType` can be stored in the `Datastore`.
1610 Parameters
1611 ----------
1612 logFailures : `bool`, optional
1613 If `True`, output a log message for every validation error
1614 detected.
1615 datasetTypeNames : iterable of `str`, optional
1616 The `DatasetType` names that should be checked. This allows
1617 only a subset to be selected.
1618 ignore : iterable of `str`, optional
1619 Names of DatasetTypes to skip over. This can be used to skip
1620 known problems. If a named `DatasetType` corresponds to a
1621 composite, all components of that `DatasetType` will also be
1622 ignored.
1624 Raises
1625 ------
1626 ButlerValidationError
1627 Raised if there is some inconsistency with how this Butler
1628 is configured.
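Examples
--------
A sketch that logs every problem found and skips a (hypothetical)
known-bad dataset type::

    butler.validateConfiguration(logFailures=True, ignore=["bad_type"])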
1629 """
1630 if datasetTypeNames:
1631 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1632 else:
1633 datasetTypes = list(self.registry.queryDatasetTypes())
1635 # filter out anything from the ignore list
1636 if ignore:
1637 ignore = set(ignore)
1638 datasetTypes = [e for e in datasetTypes
1639 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1640 else:
1641 ignore = set()
1643 # Find all the registered instruments
1644 instruments = set(
1645 record.name for record in self.registry.queryDimensionRecords("instrument")
1646 )
1648 # For each datasetType that has an instrument dimension, create
1649 # a DatasetRef for each defined instrument
1650 datasetRefs = []
1652 for datasetType in datasetTypes:
1653 if "instrument" in datasetType.dimensions:
1654 for instrument in instruments:
1655 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1656 conform=False)
1657 datasetRefs.append(datasetRef)
1659 entities: List[Union[DatasetType, DatasetRef]] = []
1660 entities.extend(datasetTypes)
1661 entities.extend(datasetRefs)
1663 datastoreErrorStr = None
1664 try:
1665 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1666 except ValidationError as e:
1667 datastoreErrorStr = str(e)
1669 # Also check that the LookupKeys used by the datastores match
1670 # registry and storage class definitions
1671 keys = self.datastore.getLookupKeys()
1673 failedNames = set()
1674 failedDataId = set()
1675 for key in keys:
1676 if key.name is not None:
1677 if key.name in ignore:
1678 continue
1680 # skip if specific datasetType names were requested and this
1681 # name does not match
1682 if datasetTypeNames and key.name not in datasetTypeNames:
1683 continue
1685 # See if it is a StorageClass or a DatasetType
1686 if key.name in self.storageClasses:
1687 pass
1688 else:
1689 try:
1690 self.registry.getDatasetType(key.name)
1691 except KeyError:
1692 if logFailures:
1693 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1694 failedNames.add(key)
1695 else:
1696 # Dimensions are checked for consistency when the Butler
1697 # is created and rendezvoused with a universe.
1698 pass
1700 # Check that any data ID override refers to a known instrument;
1701 # 'instrument' is currently the only supported data ID key.
1702 if key.dataId:
1703 dataIdKeys = set(key.dataId)
1704 if dataIdKeys != {"instrument"}:
1705 if logFailures:
1706 log.critical("Key '%s' has unsupported DataId override", key)
1707 failedDataId.add(key)
1708 elif key.dataId["instrument"] not in instruments:
1709 if logFailures:
1710 log.critical("Key '%s' has unknown instrument", key)
1711 failedDataId.add(key)
1713 messages = []
1715 if datastoreErrorStr:
1716 messages.append(datastoreErrorStr)
1718 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1719 (failedDataId, "Keys with bad DataId entries: ")):
1720 if failed:
1721 msg += ", ".join(str(k) for k in failed)
1722 messages.append(msg)
1724 if messages:
1725 raise ValidationError(";\n".join(messages))
1727 @property
1728 def collections(self) -> CollectionSearch:
1729 """The collections to search by default, in order (`CollectionSearch`).
1731 This is an alias for ``self.registry.defaults.collections``. It cannot
1732 be set directly in isolation, but all defaults may be changed together
1733 by assigning a new `RegistryDefaults` instance to
1734 ``self.registry.defaults``.
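For example, a sketch with hypothetical collection and run names::

    butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
                                                run="u/someone/analysis")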
1735 """
1736 return self.registry.defaults.collections
1738 @property
1739 def run(self) -> Optional[str]:
1740 """Name of the run this butler writes outputs to by default (`str` or
1741 `None`).
1743 This is an alias for ``self.registry.defaults.run``. It cannot be set
1744 directly in isolation, but all defaults may be changed together by
1745 assigning a new `RegistryDefaults` instance to
1746 ``self.registry.defaults``.
1747 """
1748 return self.registry.defaults.run
1750 registry: Registry
1751 """The object that manages dataset metadata and relationships (`Registry`).
1753 Most operations that don't involve reading or writing butler datasets are
1754 accessible only via `Registry` methods.
1755 """
1757 datastore: Datastore
1758 """The object that manages actual dataset storage (`Datastore`).
1760 Direct user access to the datastore should rarely be necessary; the primary
1761 exception is the case where a `Datastore` implementation provides extra
1762 functionality beyond what the base class defines.
1763 """
1765 storageClasses: StorageClassFactory
1766 """An object that maps known storage class names to objects that fully
1767 describe them (`StorageClassFactory`).
1768 """