Coverage for python/lsst/daf/butler/_butler.py : 8%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 Mapping,
51 MutableMapping,
52 Optional,
53 Set,
54 TextIO,
55 Tuple,
56 Type,
57 Union,
58)
60try:
61 import boto3
62except ImportError:
63 boto3 = None
65from lsst.utils import doImport
66from .core import (
67 AmbiguousDatasetError,
68 ButlerURI,
69 Config,
70 ConfigSubset,
71 DataCoordinate,
72 DataId,
73 DataIdValue,
74 DatasetRef,
75 DatasetType,
76 Datastore,
77 Dimension,
78 DimensionConfig,
79 FileDataset,
80 StorageClassFactory,
81 Timespan,
82 ValidationError,
83)
84from .core.repoRelocation import BUTLER_ROOT_TAG
85from .core.utils import transactional, getClassOf
86from ._deferredDatasetHandle import DeferredDatasetHandle
87from ._butlerConfig import ButlerConfig
88from .registry import Registry, RegistryConfig, CollectionType
89from .registry.wildcards import CollectionSearch
90from .transfers import RepoExportContext
92log = logging.getLogger(__name__)
95class ButlerValidationError(ValidationError):
96 """There is a problem with the Butler configuration."""
97 pass
100class PruneCollectionsArgsError(TypeError):
101 """Base class for errors relating to Butler.pruneCollections input
102 arguments.
103 """
104 pass
107class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
108 """Raised when purge and unstore are both required to be True, and
109 purge is True but unstore is False.
110 """
112 def __init__(self) -> None:
113 super().__init__("Cannot pass purge=True without unstore=True.")
116class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
117 """Raised when pruning a RUN collection but purge is False."""
119 def __init__(self, collectionType: CollectionType):
120 self.collectionType = collectionType
121 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
124class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
125 """Raised when purge is True but is not supported for the given
126 collection."""
128 def __init__(self, collectionType: CollectionType):
129 self.collectionType = collectionType
130 super().__init__(
131 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
134class Butler:
135 """Main entry point for the data access system.
137 Parameters
138 ----------
139 config : `ButlerConfig`, `Config` or `str`, optional.
140 Configuration. Anything acceptable to the
141 `ButlerConfig` constructor. If a directory path
142 is given the configuration will be read from a ``butler.yaml`` file in
143 that location. If `None` is given default values will be used.
144 butler : `Butler`, optional.
145 If provided, construct a new Butler that uses the same registry and
146 datastore as the given one, but with the given collection and run.
147 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
148 arguments.
149 collections : `Any`, optional
150 An expression specifying the collections to be searched (in order) when
151 reading datasets, and optionally dataset type restrictions on them.
152 This may be:
153 - a `str` collection name;
154 - a tuple of (collection name, *dataset type restriction*);
155 - an iterable of either of the above;
156 - a mapping from `str` to *dataset type restriction*.
158 See :ref:`daf_butler_collection_expressions` for more information,
159 including the definition of a *dataset type restriction*. All
160 collections must either already exist or be specified to be created
161 by other arguments.
162 run : `str`, optional
163 Name of the run datasets should be output to. If the run
164 does not exist, it will be created. If ``collections`` is `None`, it
165 will be set to ``[run]``. If this is not set (and ``writeable`` is
166 not set either), a read-only butler will be created.
167 tags : `Iterable` [ `str` ], optional
168 A list of `~CollectionType.TAGGED` collections that datasets should be
169 associated with in `put` or `ingest` and disassociated from in
170 `pruneDatasets`. If any of these collections does not exist, it will
171 be created.
172 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
173 A mapping from the names of new `~CollectionType.CHAINED` collections
174 to an expression identifying their child collections (which takes the
175 same form as the ``collections`` argument). Chains may be nested only
176 if children precede their parents in this mapping.
177 searchPaths : `list` of `str`, optional
178 Directory paths to search when calculating the full Butler
179 configuration. Not used if the supplied config is already a
180 `ButlerConfig`.
181 writeable : `bool`, optional
182 Explicitly sets whether the butler supports write operations. If not
183 provided, a read-write butler is created if any of ``run``, ``tags``,
184 or ``chains`` is non-empty.
186 Examples
187 --------
188 While there are many ways to control exactly how a `Butler` interacts with
189 the collections in its `Registry`, the most common cases are still simple.
191 For a read-only `Butler` that searches one collection, do::
193 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
195 For a read-write `Butler` that writes to and reads from a
196 `~CollectionType.RUN` collection::
198 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
200 The `Butler` passed to a ``PipelineTask`` is often much more complex,
201 because we want to write to one `~CollectionType.RUN` collection but read
202 from several others (as well), while defining a new
203 `~CollectionType.CHAINED` collection that combines them all::
205 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
206 collections=["u/alice/DM-50000"],
207 chains={
208 "u/alice/DM-50000": ["u/alice/DM-50000/a",
209 "u/bob/DM-49998",
210 "raw/hsc"]
211 })
213 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
214 they'll also be available from the chained collection ``u/alice/DM-50000``.
215 Datasets will be read first from that run (since it appears first in the
216 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
217 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
218 would be unnecessary. We could also construct a butler that performs
219 exactly the same `put` and `get` operations without actually creating a
220 chained collection, just by passing multiple items in ``collections``::
222 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
223 collections=["u/alice/DM-50000/a",
224 "u/bob/DM-49998",
225 "raw/hsc"])
227 Finally, one can always create a `Butler` with no collections::
229 butler = Butler("/path/to/repo", writeable=True)
231 This can be extremely useful when you just want to use ``butler.registry``,
232 e.g. for inserting dimension data or managing collections, or when the
233 collections you want to use with the butler are not consistent.
234 Passing ``writeable`` explicitly here is only necessary if you want to be
235 able to make changes to the repo - usually the value for ``writeable``
236 can be guessed from the collection arguments provided, but it defaults to
237 `False` when no collection arguments are given.
238 """
239 def __init__(self, config: Union[Config, str, None] = None, *,
240 butler: Optional[Butler] = None,
241 collections: Any = None,
242 run: Optional[str] = None,
243 tags: Iterable[str] = (),
244 chains: Optional[Mapping[str, Any]] = None,
245 searchPaths: Optional[List[str]] = None,
246 writeable: Optional[bool] = None,
247 ):
248 # Transform any single-pass iterator into an actual sequence so we
249 # can see if it's empty.
250 self.tags = tuple(tags)
251 # Load registry, datastore, etc. from config or existing butler.
252 if butler is not None:
253 if config is not None or searchPaths is not None or writeable is not None:
254 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
255 "arguments with 'butler' argument.")
256 self.registry = butler.registry
257 self.datastore = butler.datastore
258 self.storageClasses = butler.storageClasses
259 self._config: ButlerConfig = butler._config
260 else:
261 self._config = ButlerConfig(config, searchPaths=searchPaths)
262 if "root" in self._config:
263 butlerRoot = self._config["root"]
264 else:
265 butlerRoot = self._config.configDir
266 if writeable is None:
267 writeable = run is not None or chains is not None or bool(self.tags)
268 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
269 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
270 butlerRoot=butlerRoot)
271 self.storageClasses = StorageClassFactory()
272 self.storageClasses.addFromConfig(self._config)
273 # Check the many collection arguments for consistency and create any
274 # needed collections that don't exist.
275 if collections is None:
276 if run is not None:
277 collections = (run,)
278 else:
279 collections = ()
280 self.collections = CollectionSearch.fromExpression(collections)
281 if chains is None:
282 chains = {}
283 self.run = run
284 if "run" in self._config or "collection" in self._config:
285 raise ValueError("Passing a run or collection via configuration is no longer supported.")
286 if self.run is not None:
287 self.registry.registerCollection(self.run, type=CollectionType.RUN)
288 for tag in self.tags:
289 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
290 for parent, children in chains.items():
291 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
292 self.registry.setCollectionChain(parent, children)
294 GENERATION: ClassVar[int] = 3
295 """This is a Generation 3 Butler.
297 This attribute may be removed in the future, once the Generation 2 Butler
298 interface has been fully retired; it should only be used in transitional
299 code.
300 """
302 @staticmethod
303 def makeRepo(root: str, config: Union[Config, str, None] = None,
304 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
305 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
306 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
307 """Create an empty data repository by adding a butler.yaml config
308 to a repository root directory.
310 Parameters
311 ----------
312 root : `str` or `ButlerURI`
313 Path or URI to the root location of the new repository. Will be
314 created if it does not exist.
315 config : `Config` or `str`, optional
316 Configuration to write to the repository, after setting any
317 root-dependent Registry or Datastore config options. Can not
318 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
319 configuration will be used. Root-dependent config options
320 specified in this config are overwritten if ``forceConfigRoot``
321 is `True`.
322 dimensionConfig : `Config` or `str`, optional
323 Configuration for dimensions, will be used to initialize registry
324 database.
325 standalone : `bool`
326 If `True`, write all expanded defaults, not just customized or
327 repository-specific settings.
328 This (mostly) decouples the repository from the default
329 configuration, insulating it from changes to the defaults (which
330 may be good or bad, depending on the nature of the changes).
331 Future *additions* to the defaults will still be picked up when
332 initializing `Butlers` to repos created with ``standalone=True``.
333 searchPaths : `list` of `str`, optional
334 Directory paths to search when calculating the full butler
335 configuration.
336 forceConfigRoot : `bool`, optional
337 If `False`, any values present in the supplied ``config`` that
338 would normally be reset are not overridden and will appear
339 directly in the output config. This allows non-standard overrides
340 of the root directory for a datastore or registry to be given.
341 If this parameter is `True` the values for ``root`` will be
342 forced into the resulting config if appropriate.
343 outfile : `str`, optional
344 If not-`None`, the output configuration will be written to this
345 location rather than into the repository itself. Can be a URI
346 string. Can refer to a directory that will be used to write
347 ``butler.yaml``.
348 overwrite : `bool`, optional
349 Create a new configuration file even if one already exists
350 in the specified output location. Default is to raise
351 an exception.
353 Returns
354 -------
355 config : `Config`
356 The updated `Config` instance written to the repo.
358 Raises
359 ------
360 ValueError
361 Raised if a ButlerConfig or ConfigSubset is passed instead of a
362 regular Config (as these subclasses would make it impossible to
363 support ``standalone=False``).
364 FileExistsError
365 Raised if the output config file already exists.
366 os.error
367 Raised if the directory does not exist, exists but is not a
368 directory, or cannot be created.
370 Notes
371 -----
372 Note that when ``standalone=False`` (the default), the configuration
373 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
374 construct the repository should also be used to construct any Butlers
375 to avoid configuration inconsistencies.
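For example, an illustrative sketch (the repository path is hypothetical); the new
repository is created and then opened for writing::
    config = Butler.makeRepo("/tmp/example_repo")
    butler = Butler("/tmp/example_repo", writeable=True)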
376 """
377 if isinstance(config, (ButlerConfig, ConfigSubset)):
378 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
380 # Ensure that the root of the repository exists or can be made
381 uri = ButlerURI(root, forceDirectory=True)
382 uri.mkdir()
384 config = Config(config)
386 # If we are creating a new repo from scratch with relative roots,
387 # do not propagate an explicit root from the config file
388 if "root" in config:
389 del config["root"]
391 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
392 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
393 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
395 # if key exists in given config, parse it, otherwise parse the defaults
396 # in the expanded config
397 if config.get(("registry", "db")):
398 registryConfig = RegistryConfig(config)
399 else:
400 registryConfig = RegistryConfig(full)
401 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
402 if defaultDatabaseUri is not None:
403 Config.updateParameters(RegistryConfig, config, full,
404 toUpdate={"db": defaultDatabaseUri},
405 overwrite=forceConfigRoot)
406 else:
407 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
408 overwrite=forceConfigRoot)
410 if standalone:
411 config.merge(full)
412 else:
413 # Always expand the registry.managers section into the per-repo
414 # config, because after the database schema is created, it's not
415 # allowed to change anymore. Note that in the standalone=True
416 # branch, _everything_ in the config is expanded, so there's no
417 # need to special case this.
418 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
419 configURI: Union[str, ButlerURI]
420 if outfile is not None:
421 # When writing to a separate location we must include
422 # the root of the butler repo in the config else it won't know
423 # where to look.
424 config["root"] = uri.geturl()
425 configURI = outfile
426 else:
427 configURI = uri
428 config.dumpToUri(configURI, overwrite=overwrite)
430 # Create Registry and populate tables
431 registryConfig = RegistryConfig(config.get("registry"))
432 dimensionConfig = DimensionConfig(dimensionConfig)
433 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
435 return config
437 @classmethod
438 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
439 tags: Tuple[str, ...], writeable: bool) -> Butler:
440 """Callable used to unpickle a Butler.
442 We prefer not to use ``Butler.__init__`` directly so we can force some
443 of its many arguments to be keyword-only (note that ``__reduce__``
444 can only invoke callables with positional arguments).
446 Parameters
447 ----------
448 config : `ButlerConfig`
449 Butler configuration, already coerced into a true `ButlerConfig`
450 instance (and hence after any search paths for overrides have been
451 utilized).
452 collections : `CollectionSearch`
453 Names of collections to read from.
454 run : `str`, optional
455 Name of `~CollectionType.RUN` collection to write to.
456 tags : `tuple` [`str`]
457 Names of `~CollectionType.TAGGED` collections to associate with.
458 writeable : `bool`
459 Whether the Butler should support write operations.
461 Returns
462 -------
463 butler : `Butler`
464 A new `Butler` instance.
465 """
466 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
468 def __reduce__(self) -> tuple:
469 """Support pickling.
470 """
471 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
472 self.registry.isWriteable()))
474 def __str__(self) -> str:
475 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
476 self.collections, self.run, self.tags, self.datastore, self.registry)
478 def isWriteable(self) -> bool:
479 """Return `True` if this `Butler` supports write operations.
480 """
481 return self.registry.isWriteable()
483 @contextlib.contextmanager
484 def transaction(self) -> Iterator[None]:
485 """Context manager supporting `Butler` transactions.
487 Transactions can be nested.
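For example, a sketch (dataset types and data IDs are hypothetical); if the
second put raises, the first is rolled back as well::
    with butler.transaction():
        butler.put(catalog, "src", dataId)
        butler.put(exposure, "calexp", dataId)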
488 """
489 with self.registry.transaction():
490 with self.datastore.transaction():
491 yield
493 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
494 dataId: Optional[DataId] = None, **kwds: Any
495 ) -> Tuple[DatasetType, Optional[DataId]]:
496 """Standardize the arguments passed to several Butler APIs.
498 Parameters
499 ----------
500 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
501 When `DatasetRef` the `dataId` should be `None`.
502 Otherwise the `DatasetType` or name thereof.
503 dataId : `dict` or `DataCoordinate`
504 A `dict` of `Dimension` link name, value pairs that label the
505 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
506 should be provided as the second argument.
507 kwds
508 Additional keyword arguments used to augment or construct a
509 `DataCoordinate`. See `DataCoordinate.standardize`
510 parameters.
512 Returns
513 -------
514 datasetType : `DatasetType`
515 A `DatasetType` instance extracted from ``datasetRefOrType``.
516 dataId : `dict` or `DataId`, optional
517 Argument that can be used (along with ``kwds``) to construct a
518 `DataId`.
520 Notes
521 -----
522 Butler APIs that conceptually need a DatasetRef also allow passing a
523 `DatasetType` (or the name of one) and a `DataId` (or a dict and
524 keyword arguments that can be used to construct one) separately. This
525 method accepts those arguments and always returns a true `DatasetType`
526 and a `DataId` or `dict`.
528 Standardization of `dict` vs `DataId` is best handled by passing the
529 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
530 generally similarly flexible.
531 """
532 externalDatasetType: Optional[DatasetType] = None
533 internalDatasetType: Optional[DatasetType] = None
534 if isinstance(datasetRefOrType, DatasetRef):
535 if dataId is not None or kwds:
536 raise ValueError("DatasetRef given, cannot use dataId as well")
537 externalDatasetType = datasetRefOrType.datasetType
538 dataId = datasetRefOrType.dataId
539 else:
540 # Don't check whether DataId is provided, because Registry APIs
541 # can usually construct a better error message when it wasn't.
542 if isinstance(datasetRefOrType, DatasetType):
543 externalDatasetType = datasetRefOrType
544 else:
545 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
547 # Check that they are self-consistent
548 if externalDatasetType is not None:
549 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
550 if externalDatasetType != internalDatasetType:
551 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
552 f"registry definition ({internalDatasetType})")
554 assert internalDatasetType is not None
555 return internalDatasetType, dataId
557 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
558 dataId: Optional[DataId] = None, *,
559 collections: Any = None,
560 allowUnresolved: bool = False,
561 **kwds: Any) -> DatasetRef:
562 """Shared logic for methods that start with a search for a dataset in
563 the registry.
565 Parameters
566 ----------
567 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
568 When `DatasetRef` the `dataId` should be `None`.
569 Otherwise the `DatasetType` or name thereof.
570 dataId : `dict` or `DataCoordinate`, optional
571 A `dict` of `Dimension` link name, value pairs that label the
572 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
573 should be provided as the first argument.
574 collections : Any, optional
575 Collections to be searched, overriding ``self.collections``.
576 Can be any of the types supported by the ``collections`` argument
577 to butler construction.
578 allowUnresolved : `bool`, optional
579 If `True`, return an unresolved `DatasetRef` if finding a resolved
580 one in the `Registry` fails. Defaults to `False`.
581 kwds
582 Additional keyword arguments used to augment or construct a
583 `DataId`. See `DataId` parameters.
585 Returns
586 -------
587 ref : `DatasetRef`
588 A reference to the dataset identified by the given arguments.
590 Raises
591 ------
592 LookupError
593 Raised if no matching dataset exists in the `Registry` (and
594 ``allowUnresolved is False``).
595 ValueError
596 Raised if a resolved `DatasetRef` was passed as an input, but it
597 differs from the one found in the registry.
598 TypeError
599 Raised if no collections were provided.
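For example (a sketch with hypothetical values), a record-style key such as
``exposure.obs_id`` is translated into the corresponding ``exposure`` primary
key before the registry search::
    butler.get("raw", {"exposure.obs_id": "XX_O_0001_000123", "instrument": "DummyCam"})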
600 """
601 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
602 if isinstance(datasetRefOrType, DatasetRef):
603 idNumber = datasetRefOrType.id
604 else:
605 idNumber = None
606 timespan: Optional[Timespan] = None
608 # Process dimension records that are using record information
609 # rather than ids
610 newDataId: Dict[str, DataIdValue] = {}
611 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
613 # if all the dataId comes from keyword parameters we do not need
614 # to do anything here because they can't be of the form
615 # exposure.obs_id because a "." is not allowed in a keyword parameter.
616 if dataId:
617 for k, v in dataId.items():
618 # If we have a Dimension we do not need to do anything
619 # because it cannot be a compound key.
620 if isinstance(k, str) and "." in k:
621 # Someone is using a more human-readable dataId
622 dimensionName, record = k.split(".", 1)
623 byRecord[dimensionName][record] = v
624 elif isinstance(k, Dimension):
625 newDataId[k.name] = v
626 else:
627 newDataId[k] = v
629 # Go through the updated dataId and check the type in case someone is
630 # using an alternate key. We have already filtered out the compound
631 # keys dimensions.record format.
632 not_dimensions = {}
634 # Will need to look in the dataId and the keyword arguments
635 # and will remove them if they need to be fixed or are unrecognized.
636 for dataIdDict in (newDataId, kwds):
637 # Use a list so we can adjust the dict safely in the loop
638 for dimensionName in list(dataIdDict):
639 value = dataIdDict[dimensionName]
640 try:
641 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
642 except KeyError:
643 # This is not a real dimension
644 not_dimensions[dimensionName] = value
645 del dataIdDict[dimensionName]
646 continue
648 # Convert an integral type to an explicit int to simplify
649 # comparisons here
650 if isinstance(value, numbers.Integral):
651 value = int(value)
653 if not isinstance(value, dimension.primaryKey.getPythonType()):
654 for alternate in dimension.alternateKeys:
655 if isinstance(value, alternate.getPythonType()):
656 byRecord[dimensionName][alternate.name] = value
657 del dataIdDict[dimensionName]
658 log.debug("Converting dimension %s to %s.%s=%s",
659 dimensionName, dimensionName, alternate.name, value)
660 break
661 else:
662 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
663 "Could not find matching alternative (primary key has type %s) "
664 "so attempting to use as-is.",
665 value, dimensionName, dimension.primaryKey.getPythonType())
667 # If we have some unrecognized dimensions we have to try to connect
668 # them to records in other dimensions. This is made more complicated
669 # by some dimensions having records with clashing names. A mitigation
670 # is that we can tell by this point which dimensions are missing
671 # for the DatasetType but this does not work for calibrations
672 # where additional dimensions can be used to constrain the temporal
673 # axis.
674 if not_dimensions:
675 # Calculate missing dimensions
676 provided = set(newDataId) | set(kwds) | set(byRecord)
677 missingDimensions = datasetType.dimensions.names - provided
679 # For calibrations we may well be needing temporal dimensions
680 # so rather than always including all dimensions in the scan
681 # restrict things a little. It is still possible for there
682 # to be confusion over day_obs in visit vs exposure for example.
683 # If we are not searching calibration collections things may
684 # fail but they are going to fail anyway because of the
685 # ambiguousness of the dataId...
686 candidateDimensions: Set[str] = set()
687 candidateDimensions.update(missingDimensions)
688 if datasetType.isCalibration():
689 for dim in self.registry.dimensions.getStaticDimensions():
690 if dim.temporal:
691 candidateDimensions.add(str(dim))
693 # Look up table for the first association with a dimension
694 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
696 # Keep track of whether an item is associated with multiple
697 # dimensions.
698 counter: Counter[str] = Counter()
699 assigned: Dict[str, Set[str]] = defaultdict(set)
701 # Go through the missing dimensions and associate the
702 # given names with records within those dimensions
703 for dimensionName in candidateDimensions:
704 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
705 fields = dimension.metadata.names | dimension.uniqueKeys.names
706 for field in not_dimensions:
707 if field in fields:
708 guessedAssociation[dimensionName][field] = not_dimensions[field]
709 counter[dimensionName] += 1
710 assigned[field].add(dimensionName)
712 # There is a chance we have allocated a single dataId item
713 # to multiple dimensions. Need to decide which should be retained.
714 # For now assume that the most popular alternative wins.
715 # This means that day_obs with seq_num will result in
716 # exposure.day_obs and not visit.day_obs
717 # Also prefer an explicitly missing dimension over an inferred
718 # temporal dimension.
719 for fieldName, assignedDimensions in assigned.items():
720 if len(assignedDimensions) > 1:
721 # Pick the most popular (preferring mandatory dimensions)
722 requiredButMissing = assignedDimensions.intersection(missingDimensions)
723 if requiredButMissing:
724 candidateDimensions = requiredButMissing
725 else:
726 candidateDimensions = assignedDimensions
728 # Select the relevant items and get a new restricted
729 # counter.
730 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
731 duplicatesCounter: Counter[str] = Counter()
732 duplicatesCounter.update(theseCounts)
734 # Choose the most common. If they are equally common
735 # we will pick the one that was found first.
736 # Returns a list of tuples
737 selected = duplicatesCounter.most_common(1)[0][0]
739 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
740 " Removed ambiguity by choosing dimension %s.",
741 fieldName, ", ".join(assignedDimensions), selected)
743 for candidateDimension in assignedDimensions:
744 if candidateDimension != selected:
745 del guessedAssociation[candidateDimension][fieldName]
747 # Update the record look up dict with the new associations
748 for dimensionName, values in guessedAssociation.items():
749 if values: # A dict might now be empty
750 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
751 dimensionName, values)
752 byRecord[dimensionName].update(values)
754 if byRecord:
755 # Some record specifiers were found so we need to convert
756 # them to the Id form
757 for dimensionName, values in byRecord.items():
758 if dimensionName in newDataId:
759 log.warning("DataId specified explicit %s dimension value of %s in addition to"
760 " general record specifiers for it of %s. Ignoring record information.",
761 dimensionName, newDataId[dimensionName], str(values))
762 continue
764 # Build up a WHERE expression -- use single quotes
765 def quote(s: Any) -> str:
766 if isinstance(s, str):
767 return f"'{s}'"
768 else:
769 return s
771 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
772 for k, v in values.items())
774 # Hopefully we get a single record that matches
775 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
776 where=where, **kwds))
778 if len(records) != 1:
779 if len(records) > 1:
780 log.debug("Received %d records from constraints of %s", len(records), str(values))
781 for r in records:
782 log.debug("- %s", str(r))
783 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
784 f" uniquely constrained to a single dataset by {values}."
785 f" Got {len(records)} results.")
786 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
787 f" records when constrained by {values}")
789 # Get the primary key from the real dimension object
790 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
791 if not isinstance(dimension, Dimension):
792 raise RuntimeError(
793 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
794 )
795 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
797 # We have modified the dataId so need to switch to it
798 dataId = newDataId
800 if datasetType.isCalibration():
801 # Because this is a calibration dataset, first try to
802 # standardize the data ID without restricting the dimensions to
803 # those of the dataset type requested, because there may be extra
804 # dimensions that provide temporal information for a validity-range
805 # lookup.
806 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, **kwds)
807 if dataId.graph.temporal:
808 dataId = self.registry.expandDataId(dataId)
809 timespan = dataId.timespan
810 else:
811 # Standardize the data ID to just the dimensions of the dataset
812 # type instead of letting registry.findDataset do it, so we get the
813 # result even if no dataset is found.
814 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, **kwds)
815 if collections is None:
816 collections = self.collections
817 if not collections:
818 raise TypeError("No input collections provided.")
819 else:
820 collections = CollectionSearch.fromExpression(collections)
821 # Always lookup the DatasetRef, even if one is given, to ensure it is
822 # present in the current collection.
823 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
824 if ref is None:
825 if allowUnresolved:
826 return DatasetRef(datasetType, dataId)
827 else:
828 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
829 f"could not be found in collections {collections}.")
830 if idNumber is not None and idNumber != ref.id:
831 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
832 f"id ({ref.id}) in registry in collections {collections}.")
833 return ref
835 @transactional
836 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
837 dataId: Optional[DataId] = None, *,
838 run: Optional[str] = None,
839 tags: Optional[Iterable[str]] = None,
840 **kwds: Any) -> DatasetRef:
841 """Store and register a dataset.
843 Parameters
844 ----------
845 obj : `object`
846 The dataset.
847 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
848 When `DatasetRef` is provided, ``dataId`` should be `None`.
849 Otherwise the `DatasetType` or name thereof.
850 dataId : `dict` or `DataCoordinate`
851 A `dict` of `Dimension` link name, value pairs that label the
852 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
853 should be provided as the second argument.
854 run : `str`, optional
855 The name of the run the dataset should be added to, overriding
856 ``self.run``.
857 tags : `Iterable` [ `str` ], optional
858 The names of `~CollectionType.TAGGED` collections to associate
859 the dataset with, overriding ``self.tags``. These collections
860 must have already been added to the `Registry`.
861 kwds
862 Additional keyword arguments used to augment or construct a
863 `DataCoordinate`. See `DataCoordinate.standardize`
864 parameters.
866 Returns
867 -------
868 ref : `DatasetRef`
869 A reference to the stored dataset, updated with the correct id if
870 given.
872 Raises
873 ------
874 TypeError
875 Raised if the butler is read-only or if no run has been provided.
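For example, a sketch with a hypothetical dataset type and data ID::
    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(catalog, "src", instrument="DummyCam", visit=42, detector=12)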
876 """
877 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
878 if not self.isWriteable():
879 raise TypeError("Butler is read-only.")
880 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
881 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
882 raise ValueError("DatasetRef must not be in registry, must have None id")
884 if run is None:
885 if self.run is None:
886 raise TypeError("No run provided.")
887 run = self.run
888 # No need to check type for run; first thing we do is
889 # insertDatasets, and that will check for us.
891 if tags is None:
892 tags = self.tags
893 else:
894 tags = tuple(tags)
895 for tag in tags:
896 # Check that these are tagged collections up front, because we want
897 # to avoid relying on Datastore transactionality to avoid modifying
898 # the repo if there's an error later.
899 collectionType = self.registry.getCollectionType(tag)
900 if collectionType is not CollectionType.TAGGED:
901 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
902 f"{collectionType.name}.")
904 # Add Registry Dataset entry.
905 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
906 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
908 # Add Datastore entry.
909 self.datastore.put(obj, ref)
911 for tag in tags:
912 self.registry.associate(tag, [ref])
914 return ref
916 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
917 """Retrieve a stored dataset.
919 Unlike `Butler.get`, this method allows datasets outside the Butler's
920 collection to be read as long as the `DatasetRef` that identifies them
921 can be obtained separately.
923 Parameters
924 ----------
925 ref : `DatasetRef`
926 Resolved reference to an already stored dataset.
927 parameters : `dict`
928 Additional StorageClass-defined options to control reading,
929 typically used to efficiently read only a subset of the dataset.
931 Returns
932 -------
933 obj : `object`
934 The dataset.
935 """
936 return self.datastore.get(ref, parameters=parameters)
938 def getDirectDeferred(self, ref: DatasetRef, *,
939 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
940 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
941 from a resolved `DatasetRef`.
943 Parameters
944 ----------
945 ref : `DatasetRef`
946 Resolved reference to an already stored dataset.
947 parameters : `dict`
948 Additional StorageClass-defined options to control reading,
949 typically used to efficiently read only a subset of the dataset.
951 Returns
952 -------
953 obj : `DeferredDatasetHandle`
954 A handle which can be used to retrieve a dataset at a later time.
956 Raises
957 ------
958 AmbiguousDatasetError
959 Raised if ``ref.id is None``, i.e. the reference is unresolved.
960 """
961 if ref.id is None:
962 raise AmbiguousDatasetError(
963 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
964 )
965 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
967 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
968 dataId: Optional[DataId] = None, *,
969 parameters: Union[dict, None] = None,
970 collections: Any = None,
971 **kwds: Any) -> DeferredDatasetHandle:
972 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
973 after an immediate registry lookup.
975 Parameters
976 ----------
977 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
978 When `DatasetRef` the `dataId` should be `None`.
979 Otherwise the `DatasetType` or name thereof.
980 dataId : `dict` or `DataCoordinate`, optional
981 A `dict` of `Dimension` link name, value pairs that label the
982 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
983 should be provided as the first argument.
984 parameters : `dict`
985 Additional StorageClass-defined options to control reading,
986 typically used to efficiently read only a subset of the dataset.
987 collections : Any, optional
988 Collections to be searched, overriding ``self.collections``.
989 Can be any of the types supported by the ``collections`` argument
990 to butler construction.
991 kwds
992 Additional keyword arguments used to augment or construct a
993 `DataId`. See `DataId` parameters.
995 Returns
996 -------
997 obj : `DeferredDatasetHandle`
998 A handle which can be used to retrieve a dataset at a later time.
1000 Raises
1001 ------
1002 LookupError
1003 Raised if no matching dataset exists in the `Registry` (and
1004 ``allowUnresolved is False``).
1005 ValueError
1006 Raised if a resolved `DatasetRef` was passed as an input, but it
1007 differs from the one found in the registry.
1008 TypeError
1009 Raised if no collections were provided.
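For example, a sketch with hypothetical names; the registry lookup happens
immediately but the datastore read is deferred until ``get`` is called::
    handle = butler.getDeferred("calexp", dataId, parameters={"bbox": bbox})
    cutout = handle.get()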
1010 """
1011 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1012 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
1014 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1015 dataId: Optional[DataId] = None, *,
1016 parameters: Optional[Dict[str, Any]] = None,
1017 collections: Any = None,
1018 **kwds: Any) -> Any:
1019 """Retrieve a stored dataset.
1021 Parameters
1022 ----------
1023 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1024 When `DatasetRef` the `dataId` should be `None`.
1025 Otherwise the `DatasetType` or name thereof.
1026 dataId : `dict` or `DataCoordinate`
1027 A `dict` of `Dimension` link name, value pairs that label the
1028 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1029 should be provided as the first argument.
1030 parameters : `dict`
1031 Additional StorageClass-defined options to control reading,
1032 typically used to efficiently read only a subset of the dataset.
1033 collections : Any, optional
1034 Collections to be searched, overriding ``self.collections``.
1035 Can be any of the types supported by the ``collections`` argument
1036 to butler construction.
1037 kwds
1038 Additional keyword arguments used to augment or construct a
1039 `DataCoordinate`. See `DataCoordinate.standardize`
1040 parameters.
1042 Returns
1043 -------
1044 obj : `object`
1045 The dataset.
1047 Raises
1048 ------
1049 ValueError
1050 Raised if a resolved `DatasetRef` was passed as an input, but it
1051 differs from the one found in the registry.
1052 LookupError
1053 Raised if no matching dataset exists in the `Registry`.
1054 TypeError
1055 Raised if no collections were provided.
1057 Notes
1058 -----
1059 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1060 this method requires that the given data ID include temporal dimensions
1061 beyond the dimensions of the dataset type itself, in order to find the
1062 dataset with the appropriate validity range. For example, a "bias"
1063 dataset with native dimensions ``{instrument, detector}`` could be
1064 fetched with a ``{instrument, detector, exposure}`` data ID, because
1065 ``exposure`` is a temporal dimension.
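For example, a sketch with hypothetical collection and data ID values::
    bias = butler.get("bias", instrument="DummyCam", detector=12, exposure=42,
                      collections="DummyCam/calib")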
1066 """
1067 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1068 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1069 return self.getDirect(ref, parameters=parameters)
1071 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1072 dataId: Optional[DataId] = None, *,
1073 predict: bool = False,
1074 collections: Any = None,
1075 run: Optional[str] = None,
1076 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1077 """Returns the URIs associated with the dataset.
1079 Parameters
1080 ----------
1081 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1082 When `DatasetRef` the `dataId` should be `None`.
1083 Otherwise the `DatasetType` or name thereof.
1084 dataId : `dict` or `DataCoordinate`
1085 A `dict` of `Dimension` link name, value pairs that label the
1086 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1087 should be provided as the first argument.
1088 predict : `bool`
1089 If `True`, allow URIs to be returned for datasets that have not
1090 been written.
1091 collections : Any, optional
1092 Collections to be searched, overriding ``self.collections``.
1093 Can be any of the types supported by the ``collections`` argument
1094 to butler construction.
1095 run : `str`, optional
1096 Run to use for predictions, overriding ``self.run``.
1097 kwds
1098 Additional keyword arguments used to augment or construct a
1099 `DataCoordinate`. See `DataCoordinate.standardize`
1100 parameters.
1102 Returns
1103 -------
1104 primary : `ButlerURI`
1105 The URI to the primary artifact associated with this dataset.
1106 If the dataset was disassembled within the datastore this
1107 may be `None`.
1108 components : `dict`
1109 URIs to any components associated with the dataset artifact.
1110 Can be empty if there are no components.
1111 """
1112 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1113 collections=collections, **kwds)
1114 if ref.id is None: # only possible if predict is True
1115 if run is None:
1116 run = self.run
1117 if run is None:
1118 raise TypeError("Cannot predict location with run=None.")
1119 # Lie about ID, because we can't guess it, and only
1120 # Datastore.getURIs() will ever see it (and it doesn't use it).
1121 ref = ref.resolved(id=0, run=run)
1122 return self.datastore.getURIs(ref, predict)
1124 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1125 dataId: Optional[DataId] = None, *,
1126 predict: bool = False,
1127 collections: Any = None,
1128 run: Optional[str] = None,
1129 **kwds: Any) -> ButlerURI:
1130 """Return the URI to the Dataset.
1132 Parameters
1133 ----------
1134 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1135 When `DatasetRef` the `dataId` should be `None`.
1136 Otherwise the `DatasetType` or name thereof.
1137 dataId : `dict` or `DataCoordinate`
1138 A `dict` of `Dimension` link name, value pairs that label the
1139 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1140 should be provided as the first argument.
1141 predict : `bool`
1142 If `True`, allow URIs to be returned for datasets that have not
1143 been written.
1144 collections : Any, optional
1145 Collections to be searched, overriding ``self.collections``.
1146 Can be any of the types supported by the ``collections`` argument
1147 to butler construction.
1148 run : `str`, optional
1149 Run to use for predictions, overriding ``self.run``.
1150 kwds
1151 Additional keyword arguments used to augment or construct a
1152 `DataCoordinate`. See `DataCoordinate.standardize`
1153 parameters.
1155 Returns
1156 -------
1157 uri : `ButlerURI`
1158 URI pointing to the Dataset within the datastore. If the
1159 Dataset does not exist in the datastore, and if ``predict`` is
1160 `True`, the URI will be a prediction and will include a URI
1161 fragment "#predicted".
1162 If the datastore does not have entities that relate well
1163 to the concept of a URI, the returned URI string will be
1164 descriptive. The returned URI is not guaranteed to be obtainable.
1166 Raises
1167 ------
1168 LookupError
1169 A URI has been requested for a dataset that does not exist and
1170 guessing is not allowed.
1171 ValueError
1172 Raised if a resolved `DatasetRef` was passed as an input, but it
1173 differs from the one found in the registry.
1174 TypeError
1175 Raised if no collections were provided.
1176 RuntimeError
1177 Raised if a URI is requested for a dataset that consists of
1178 multiple artifacts.
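For example, a sketch with hypothetical names::
    uri = butler.getURI("calexp", instrument="DummyCam", visit=42, detector=12)
    print(uri.geturl())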
1179 """
1180 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1181 collections=collections, run=run, **kwds)
1183 if primary is None or components:
1184 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1185 "Use Butler.getURIs() instead.")
1186 return primary
1188 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1189 dataId: Optional[DataId] = None, *,
1190 collections: Any = None,
1191 **kwds: Any) -> bool:
1192 """Return True if the Dataset is actually present in the Datastore.
1194 Parameters
1195 ----------
1196 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1197 When `DatasetRef` the `dataId` should be `None`.
1198 Otherwise the `DatasetType` or name thereof.
1199 dataId : `dict` or `DataCoordinate`
1200 A `dict` of `Dimension` link name, value pairs that label the
1201 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1202 should be provided as the first argument.
1203 collections : Any, optional
1204 Collections to be searched, overriding ``self.collections``.
1205 Can be any of the types supported by the ``collections`` argument
1206 to butler construction.
1207 kwds
1208 Additional keyword arguments used to augment or construct a
1209 `DataCoordinate`. See `DataCoordinate.standardize`
1210 parameters.
1212 Raises
1213 ------
1214 LookupError
1215 Raised if the dataset is not even present in the Registry.
1216 ValueError
1217 Raised if a resolved `DatasetRef` was passed as an input, but it
1218 differs from the one found in the registry.
1219 TypeError
1220 Raised if no collections were provided.
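For example, a sketch with a hypothetical data ID::
    if butler.datasetExists("calexp", dataId):
        calexp = butler.get("calexp", dataId)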
1221 """
1222 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1223 return self.datastore.exists(ref)
1225 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False) -> None:
1226 """Remove a collection and possibly prune datasets within it.
1228 Parameters
1229 ----------
1230 name : `str`
1231 Name of the collection to remove. If this is a
1232 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1233 datasets within the collection are not modified unless ``unstore``
1234 is `True`. If this is a `~CollectionType.RUN` collection,
1235 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1236 are fully removed from the data repository.
1237 purge : `bool`, optional
1238 If `True`, permit `~CollectionType.RUN` collections to be removed,
1239 fully removing datasets within them. Requires ``unstore=True`` as
1240 well as an added precaution against accidental deletion. Must be
1241 `False` (default) if the collection is not a ``RUN``.
1242 unstore : `bool`, optional
1243 If `True`, remove all datasets in the collection from all
1244 datastores in which they appear.
1246 Raises
1247 ------
1248 TypeError
1249 Raised if the butler is read-only or arguments are mutually
1250 inconsistent.
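For example, to remove a `~CollectionType.RUN` collection and fully delete its
datasets (a sketch; the collection name is hypothetical)::
    butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)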
1251 """
1253 # See pruneDatasets comments for more information about the logic here;
1254 # the cases are almost the same, but here we can rely on Registry to
1255 # take care of everything but Datastore deletion when we remove the
1256 # collection.
1257 if not self.isWriteable():
1258 raise TypeError("Butler is read-only.")
1259 collectionType = self.registry.getCollectionType(name)
1260 if purge and not unstore:
1261 raise PurgeWithoutUnstorePruneCollectionsError()
1262 if collectionType is CollectionType.RUN and not purge:
1263 raise RunWithoutPurgePruneCollectionsError(collectionType)
1264 if collectionType is not CollectionType.RUN and purge:
1265 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1267 with self.registry.transaction():
1268 if unstore:
1269 for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
1270 if self.datastore.exists(ref):
1271 self.datastore.trash(ref)
1272 self.registry.removeCollection(name)
1273 if unstore:
1274 # Point of no return for removing artifacts
1275 self.datastore.emptyTrash()
1277 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1278 disassociate: bool = True,
1279 unstore: bool = False,
1280 tags: Optional[Iterable[str]] = None,
1281 purge: bool = False,
1282 run: Optional[str] = None) -> None:
1283 """Remove one or more datasets from a collection and/or storage.
1285 Parameters
1286 ----------
1287 refs : `~collections.abc.Iterable` of `DatasetRef`
1288 Datasets to prune. These must be "resolved" references (not just
1289 a `DatasetType` and data ID).
1290 disassociate : `bool`, optional
1291 Disassociate pruned datasets from ``self.tags`` (or the collections
1292 given via the ``tags`` argument).
1293 unstore : `bool`, optional
1294 If `True` (`False` is default) remove these datasets from all
1295 datastores known to this butler. Note that this will make it
1296 impossible to retrieve these datasets even via other collections.
1297 Datasets that are already not stored are ignored by this option.
1298 tags : `Iterable` [ `str` ], optional
1299 `~CollectionType.TAGGED` collections to disassociate the datasets
1300 from, overriding ``self.tags``. Ignored if ``disassociate`` is
1301 `False` or ``purge`` is `True`.
1302 purge : `bool`, optional
1303 If `True` (`False` is default), completely remove the dataset from
1304 the `Registry`. To prevent accidental deletions, ``purge`` may
1305 only be `True` if all of the following conditions are met:
1307 - All given datasets are in the given run;
1308 - ``disassociate`` is `True`;
1309 - ``unstore`` is `True`.
1311 This mode may remove provenance information from datasets other
1312 than those provided, and should be used with extreme care.
1313 run : `str`, optional
1314 `~CollectionType.RUN` collection to purge from, overriding
1315 ``self.run``. Ignored unless ``purge`` is `True`.
1317 Raises
1318 ------
1319 TypeError
1320 Raised if the butler is read-only, if no collection was provided,
1321 or the conditions for ``purge=True`` were not met.
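For example, a sketch that fully removes every dataset in a hypothetical run
(the query and collection name are illustrative)::
    refs = butler.registry.queryDatasets(..., collections="u/alice/scratch")
    butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True,
                         run="u/alice/scratch")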
1322 """
1323 if not self.isWriteable():
1324 raise TypeError("Butler is read-only.")
1325 if purge:
1326 if not disassociate:
1327 raise TypeError("Cannot pass purge=True without disassociate=True.")
1328 if not unstore:
1329 raise TypeError("Cannot pass purge=True without unstore=True.")
1330 if run is None:
1331 run = self.run
1332 if run is None:
1333 raise TypeError("No run provided but purge=True.")
1334 collectionType = self.registry.getCollectionType(run)
1335 if collectionType is not CollectionType.RUN:
1336 raise TypeError(f"Cannot purge from collection '{run}' "
1337 f"of non-RUN type {collectionType.name}.")
1338 elif disassociate:
1339 if tags is None:
1340 tags = self.tags
1341 else:
1342 tags = tuple(tags)
1343 if not tags:
1344 raise TypeError("No tags provided but disassociate=True.")
1345 for tag in tags:
1346 collectionType = self.registry.getCollectionType(tag)
1347 if collectionType is not CollectionType.TAGGED:
1348 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1349 f"of non-TAGGED type {collectionType.name}.")
1350 # Transform possibly-single-pass iterable into something we can iterate
1351 # over multiple times.
1352 refs = list(refs)
1353 # Pruning a component of a DatasetRef makes no sense since registry
1354 # doesn't know about components and datastore might not store
1355 # components in a separate file
1356 for ref in refs:
1357 if ref.datasetType.component():
1358 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1359 # We don't need an unreliable Datastore transaction for this, because
1360 # we've been extra careful to ensure that Datastore.trash only involves
1361 # mutating the Registry (it can _look_ at Datastore-specific things,
1362 # but shouldn't change them), and hence all operations here are
1363 # Registry operations.
1364 with self.registry.transaction():
1365 if unstore:
1366 for ref in refs:
1367 # There is a difference between a concrete composite
1368 # and virtual composite. In a virtual composite the
1369 # datastore is never given the top level DatasetRef. In
1370 # the concrete composite the datastore knows all the
1371 # refs and will clean up itself if asked to remove the
1372 # parent ref. We can not check configuration for this
1373 # since we can not trust that the configuration is the
1374 # same. We therefore have to ask if the ref exists or
1375 # not. This is consistent with the fact that we want
1376 # to ignore already-removed-from-datastore datasets
1377 # anyway.
1378 if self.datastore.exists(ref):
1379 self.datastore.trash(ref)
1380 if purge:
1381 self.registry.removeDatasets(refs)
1382 elif disassociate:
1383 assert tags, "Guaranteed by earlier logic in this function."
1384 for tag in tags:
1385 self.registry.disassociate(tag, refs)
1386 # We've exited the Registry transaction, and apparently committed.
1387 # (if there was an exception, everything rolled back, and it's as if
1388 # nothing happened - and we never get here).
1389 # Datastore artifacts are not yet gone, but they're clearly marked
1390 # as trash, so if we fail to delete now because of (e.g.) filesystem
1391 # problems we can try again later, and if manual administrative
1392 # intervention is required, it's pretty clear what that should entail:
1393 # deleting everything on disk and in private Datastore tables that is
1394 # in the dataset_location_trash table.
1395 if unstore:
1396 # Point of no return for removing artifacts
1397 self.datastore.emptyTrash()
1399 @transactional
1400 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1401 tags: Optional[Iterable[str]] = None,) -> None:
1402 """Store and register one or more datasets that already exist on disk.
1404 Parameters
1405 ----------
1406 datasets : `FileDataset`
1407 Each positional argument is a struct containing information about
1408 a file to be ingested, including its path (either absolute or
1409 relative to the datastore root, if applicable), a `DatasetRef`,
1410 and optionally a formatter class or its fully-qualified string
1411 name. If a formatter is not provided, the formatter that would be
1412 used for `put` is assumed. On successful return, all
1413 `FileDataset.ref` attributes will have their `DatasetRef.id`
1414 attribute populated and all `FileDataset.formatter` attributes will
1415 be set to the formatter class used. `FileDataset.path` attributes
1416 may be modified to put paths in whatever the datastore considers a
1417 standardized form.
1418 transfer : `str`, optional
1419 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1420 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer
1421 the file.
1422 run : `str`, optional
1423 The name of the run ingested datasets should be added to,
1424 overriding ``self.run``.
1425 tags : `Iterable` [ `str` ], optional
1426 The names of `~CollectionType.TAGGED` collections to associate
1427 the datasets with, overriding ``self.tags``. These collections
1428 must have already been added to the `Registry`.
1430 Raises
1431 ------
1432 TypeError
1433 Raised if the butler is read-only or if no run was provided.
1434 NotImplementedError
1435 Raised if the `Datastore` does not support the given transfer mode.
1436 DatasetTypeNotSupportedError
1437 Raised if one or more files to be ingested have a dataset type that
1438 is not supported by the `Datastore`.
1439 FileNotFoundError
1440 Raised if one of the given files does not exist.
1441 FileExistsError
1442 Raised if transfer is not `None` but the (internal) location the
1443 file would be moved to is already occupied.
1445 Notes
1446 -----
1447 This operation is not fully exception safe: if a database operation
1448 fails, the given `FileDataset` instances may be only partially updated.
1450 It is atomic in terms of database operations (they will either all
1451 succeed or all fail), provided the database engine implements
1452 transactions correctly. It will attempt to be atomic in terms of
1453 filesystem operations as well, but this cannot be implemented
1454 rigorously for most datastores.
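Examples
--------
A minimal usage sketch. The dataset type name, data ID values, and file
path below are illustrative assumptions rather than part of this API::

    ref = DatasetRef(butler.registry.getDatasetType("raw"),
                     {"instrument": "HSC", "exposure": 903334,
                      "detector": 16})
    dataset = FileDataset(path="/data/raw_903334_16.fits", refs=ref)
    # "symlink" leaves the original file in place; "move" and "copy"
    # are also supported, as described above.
    butler.ingest(dataset, transfer="symlink")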
1455 """
1456 if not self.isWriteable():
1457 raise TypeError("Butler is read-only.")
1458 if run is None:
1459 if self.run is None:
1460 raise TypeError("No run provided.")
1461 run = self.run
1462 # No need to check run type, since insertDatasets will do that
1463 # (safely) for us.
1464 if tags is None:
1465 tags = self.tags
1466 else:
1467 tags = tuple(tags)
1468 for tag in tags:
1469 # Check that these are tagged collections up front, because we do
1470 # not want to rely on Datastore transactionality to keep the repo
1471 # unmodified if there is an error later.
1472 collectionType = self.registry.getCollectionType(tag)
1473 if collectionType is not CollectionType.TAGGED:
1474 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1475 f"{collectionType.name}.")
1476 # Reorganize the inputs so they're grouped by DatasetType and then
1477 # data ID. We also include a list of DatasetRefs for each FileDataset
1478 # to hold the resolved DatasetRefs returned by the Registry, before
1479 # it's safe to swap them into FileDataset.refs.
1480 # Some type annotation aliases to make that clearer:
1481 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1482 GroupedData = MutableMapping[DatasetType, GroupForType]
1483 # The actual data structure:
1484 groupedData: GroupedData = defaultdict(dict)
1485 # And the nested loop that populates it:
1486 for dataset in datasets:
1487 # This list is intentionally shared across the inner loop, since it
1488 # is associated with `dataset`.
1489 resolvedRefs: List[DatasetRef] = []
1490 for ref in dataset.refs:
1491 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1493 # Now we can bulk-insert into Registry for each DatasetType.
1494 allResolvedRefs: List[DatasetRef] = []
1495 for datasetType, groupForType in groupedData.items():
1496 refs = self.registry.insertDatasets(datasetType,
1497 dataIds=groupForType.keys(),
1498 run=run)
1499 # Append those resolved DatasetRefs to the new lists we set up for
1500 # them.
1501 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1502 resolvedRefs.append(ref)
1504 # Go back to the original FileDatasets to replace their refs with the
1505 # new resolved ones, and also build a big list of all refs.
1507 for groupForType in groupedData.values():
1508 for dataset, resolvedRefs in groupForType.values():
1509 dataset.refs = resolvedRefs
1510 allResolvedRefs.extend(resolvedRefs)
1512 # Bulk-associate everything with any tagged collections.
1513 for tag in tags:
1514 self.registry.associate(tag, allResolvedRefs)
1516 # Bulk-insert everything into Datastore.
1517 self.datastore.ingest(*datasets, transfer=transfer)
1519 @contextlib.contextmanager
1520 def export(self, *, directory: Optional[str] = None,
1521 filename: Optional[str] = None,
1522 format: Optional[str] = None,
1523 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1524 """Export datasets from the repository represented by this `Butler`.
1526 This method is a context manager that returns a helper object
1527 (`RepoExportContext`) that is used to indicate what information from
1528 the repository should be exported.
1530 Parameters
1531 ----------
1532 directory : `str`, optional
1533 Directory dataset files should be written to if ``transfer`` is not
1534 `None`.
1535 filename : `str`, optional
1536 Name for the file that will include database information associated
1537 with the exported datasets. If this is not an absolute path and
1538 ``directory`` is not `None`, it will be written to ``directory``
1539 instead of the current working directory. Defaults to
1540 "export.{format}".
1541 format : `str`, optional
1542 File format for the database information file. If `None`, the
1543 extension of ``filename`` will be used.
1544 transfer : `str`, optional
1545 Transfer mode passed to `Datastore.export`.
1547 Raises
1548 ------
1549 TypeError
1550 Raised if the set of arguments passed is inconsistent.
1552 Examples
1553 --------
1554 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1555 methods are used to provide the iterables over data IDs and/or datasets
1556 to be exported::
1558 with butler.export(filename="exports.yaml") as export:
1559 # Export all flats, but none of the dimension element rows
1560 # (i.e. data ID information) associated with them.
1561 export.saveDatasets(butler.registry.queryDatasets("flat"),
1562 elements=())
1563 # Export all datasets that start with "deepCoadd_" and all of
1564 # their associated data ID information.
1565 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1566 """
1567 if directory is None and transfer is not None:
1568 raise TypeError("Cannot transfer without providing a directory.")
1569 if transfer == "move":
1570 raise TypeError("Transfer may not be 'move': export is read-only")
1571 if format is None:
1572 if filename is None:
1573 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1574 else:
1575 _, format = os.path.splitext(filename)
1576 elif filename is None:
1577 filename = f"export.{format}"
1578 if directory is not None:
1579 filename = os.path.join(directory, filename)
1580 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1581 with open(filename, 'w') as stream:
1582 backend = BackendClass(stream)
1583 try:
1584 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1585 directory=directory, transfer=transfer)
1586 yield helper
1587 except BaseException:
1588 raise  # Re-raise unchanged; _finish() is intentionally not called on error.
1589 else:
1590 helper._finish()
1592 def import_(self, *, directory: Optional[str] = None,
1593 filename: Union[str, TextIO, None] = None,
1594 format: Optional[str] = None,
1595 transfer: Optional[str] = None,
1596 skip_dimensions: Optional[Set] = None) -> None:
1597 """Import datasets exported from a different butler repository.
1599 Parameters
1600 ----------
1601 directory : `str`, optional
1602 Directory containing dataset files. If `None`, all file paths
1603 must be absolute.
1604 filename : `str` or `TextIO`, optional
1605 A stream or name of file that contains database information
1606 associated with the exported datasets. If this is a string (name) and
1607 is not an absolute path, does not exist in the current working
1608 directory, and ``directory`` is not `None`, it is assumed to be in
1609 ``directory``. Defaults to "export.{format}".
1610 format : `str`, optional
1611 File format for the database information file. If `None`, the
1612 extension of ``filename`` will be used.
1613 transfer : `str`, optional
1614 Transfer mode passed to `Datastore.ingest`.
1615 skip_dimensions : `set`, optional
1616 Names of dimensions that should be skipped and not imported.
1618 Raises
1619 ------
1620 TypeError
1621 Raised if the set of arguments passed is inconsistent, or if the
1622 butler is read-only.
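Examples
--------
A minimal sketch of importing a previously exported repository subset;
the directory and file names are illustrative assumptions::

    butler.import_(directory="/path/to/exports",
                   filename="exports.yaml",
                   transfer="copy")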
1623 """
1624 if not self.isWriteable():
1625 raise TypeError("Butler is read-only.")
1626 if format is None:
1627 if filename is None:
1628 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1629 else:
1630 _, format = os.path.splitext(filename) # type: ignore
1631 elif filename is None:
1632 filename = f"export.{format}"
1633 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1634 filename = os.path.join(directory, filename)
1635 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1637 def doImport(importStream: TextIO) -> None:
1638 backend = BackendClass(importStream, self.registry)
1639 backend.register()
1640 with self.transaction():
1641 backend.load(self.datastore, directory=directory, transfer=transfer,
1642 skip_dimensions=skip_dimensions)
1644 if isinstance(filename, str):
1645 with open(filename, "r") as stream:
1646 doImport(stream)
1647 else:
1648 doImport(filename)
1650 def validateConfiguration(self, logFailures: bool = False,
1651 datasetTypeNames: Optional[Iterable[str]] = None,
1652 ignore: Optional[Iterable[str]] = None) -> None:
1653 """Validate butler configuration.
1655 Checks that each `DatasetType` can be stored in the `Datastore`.
1657 Parameters
1658 ----------
1659 logFailures : `bool`, optional
1660 If `True`, output a log message for every validation error
1661 detected.
1662 datasetTypeNames : iterable of `str`, optional
1663 The `DatasetType` names that should be checked. This allows
1664 only a subset to be selected.
1665 ignore : iterable of `str`, optional
1666 Names of DatasetTypes to skip over. This can be used to skip
1667 known problems. If a named `DatasetType` corresponds to a
1668 composite, all components of that `DatasetType` will also be
1669 ignored.
1671 Raises
1672 ------
1673 ButlerValidationError
1674 Raised if there is some inconsistency with how this Butler
1675 is configured.
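Examples
--------
A sketch of a typical validation pass; the ignored dataset type name is
an illustrative assumption::

    try:
        butler.validateConfiguration(logFailures=True, ignore=["packages"])
    except ValidationError as err:
        print(f"Repository configuration problems:\n{err}")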
1676 """
1677 if datasetTypeNames:
1678 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1679 else:
1680 datasetTypes = list(self.registry.queryDatasetTypes())
1682 # filter out anything from the ignore list
1683 if ignore:
1684 ignore = set(ignore)
1685 datasetTypes = [e for e in datasetTypes
1686 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1687 else:
1688 ignore = set()
1690 # Find all the registered instruments
1691 instruments = set(
1692 record.name for record in self.registry.queryDimensionRecords("instrument")
1693 )
1695 # For each datasetType that has an instrument dimension, create
1696 # a DatasetRef for each defined instrument
1697 datasetRefs = []
1699 for datasetType in datasetTypes:
1700 if "instrument" in datasetType.dimensions:
1701 for instrument in instruments:
1702 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1703 conform=False)
1704 datasetRefs.append(datasetRef)
1706 entities: List[Union[DatasetType, DatasetRef]] = []
1707 entities.extend(datasetTypes)
1708 entities.extend(datasetRefs)
1710 datastoreErrorStr = None
1711 try:
1712 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1713 except ValidationError as e:
1714 datastoreErrorStr = str(e)
1716 # Also check that the LookupKeys used by the datastores match
1717 # registry and storage class definitions
1718 keys = self.datastore.getLookupKeys()
1720 failedNames = set()
1721 failedDataId = set()
1722 for key in keys:
1723 if key.name is not None:
1724 if key.name in ignore:
1725 continue
1727 # skip if specific datasetType names were requested and this
1728 # name does not match
1729 if datasetTypeNames and key.name not in datasetTypeNames:
1730 continue
1732 # See if it is a StorageClass or a DatasetType
1733 if key.name in self.storageClasses:
1734 pass  # Known StorageClass; nothing further to check.
1735 else:
1736 try:
1737 self.registry.getDatasetType(key.name)
1738 except KeyError:
1739 if logFailures:
1740 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1741 failedNames.add(key)
1742 else:
1743 # Dimensions are checked for consistency when the Butler
1744 # is created and rendezvoused with a universe.
1745 pass
1747 # Check that any data ID override refers to a valid instrument;
1748 # "instrument" is currently the only supported data ID key.
1749 if key.dataId:
1750 dataIdKeys = set(key.dataId)
1751 if set(["instrument"]) != dataIdKeys:
1752 if logFailures:
1753 log.fatal("Key '%s' has unsupported DataId override", key)
1754 failedDataId.add(key)
1755 elif key.dataId["instrument"] not in instruments:
1756 if logFailures:
1757 log.fatal("Key '%s' has unknown instrument", key)
1758 failedDataId.add(key)
1760 messages = []
1762 if datastoreErrorStr:
1763 messages.append(datastoreErrorStr)
1765 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1766 (failedDataId, "Keys with bad DataId entries: ")):
1767 if failed:
1768 msg += ", ".join(str(k) for k in failed)
1769 messages.append(msg)
1771 if messages:
1772 raise ValidationError(";\n".join(messages))
1774 registry: Registry
1775 """The object that manages dataset metadata and relationships (`Registry`).
1777 Most operations that don't involve reading or writing butler datasets are
1778 accessible only via `Registry` methods.
1779 """
1781 datastore: Datastore
1782 """The object that manages actual dataset storage (`Datastore`).
1784 Direct user access to the datastore should rarely be necessary; the primary
1785 exception is the case where a `Datastore` implementation provides extra
1786 functionality beyond what the base class defines.
1787 """
1789 storageClasses: StorageClassFactory
1790 """An object that maps known storage class names to objects that fully
1791 describe them (`StorageClassFactory`).
1792 """
1794 collections: Optional[CollectionSearch]
1795 """The collections to search and any restrictions on the dataset types to
1796 search for within them, in order (`CollectionSearch`).
1797 """
1799 run: Optional[str]
1800 """Name of the run this butler writes outputs to (`str` or `None`).
1801 """
1803 tags: Tuple[str, ...]
1804 """Names of `~CollectionType.TAGGED` collections this butler associates
1805 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1806 (`tuple` [ `str` ]).
1807 """