Coverage for python/lsst/daf/butler/_butler.py : 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImport
65from .core import (
66 AmbiguousDatasetError,
67 ButlerURI,
68 Config,
69 ConfigSubset,
70 DataCoordinate,
71 DataId,
72 DataIdValue,
73 DatasetRef,
74 DatasetType,
75 Datastore,
76 Dimension,
77 DimensionConfig,
78 FileDataset,
79 StorageClassFactory,
80 Timespan,
81 ValidationError,
82)
83from .core.repoRelocation import BUTLER_ROOT_TAG
84from .core.utils import transactional, getClassOf
85from ._deferredDatasetHandle import DeferredDatasetHandle
86from ._butlerConfig import ButlerConfig
87from .registry import Registry, RegistryConfig, RegistryDefaults, CollectionType, ConflictingDefinitionError
88from .registry.wildcards import CollectionSearch
89from .transfers import RepoExportContext
91log = logging.getLogger(__name__)
94class ButlerValidationError(ValidationError):
95 """There is a problem with the Butler configuration."""
96 pass
99class PruneCollectionsArgsError(TypeError):
100 """Base class for errors relating to Butler.pruneCollections input
101 arguments.
102 """
103 pass
106class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
107 """Raised when ``purge=True`` is passed but ``unstore`` is `False`;
108 purging a collection requires ``unstore=True``.
109 """
111 def __init__(self) -> None:
112 super().__init__("Cannot pass purge=True without unstore=True.")
115class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
116 """Raised when pruning a RUN collection but purge is False."""
118 def __init__(self, collectionType: CollectionType):
119 self.collectionType = collectionType
120 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
123class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
124 """Raised when purge is True but is not supported for the given
125 collection."""
127 def __init__(self, collectionType: CollectionType):
128 self.collectionType = collectionType
129 super().__init__(
130 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
133class Butler:
134 """Main entry point for the data access system.
136 Parameters
137 ----------
138 config : `ButlerConfig`, `Config` or `str`, optional
139 Configuration. Anything acceptable to the
140 `ButlerConfig` constructor. If a directory path
141 is given the configuration will be read from a ``butler.yaml`` file in
142 that location. If `None` is given default values will be used.
143 butler : `Butler`, optional
144 If provided, construct a new Butler that uses the same registry and
145 datastore as the given one, but with the given collection and run.
146 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
147 arguments.
148 collections : `str` or `Iterable` [ `str` ], optional
149 An expression specifying the collections to be searched (in order) when
150 reading datasets.
151 This may be a `str` collection name or an iterable thereof.
152 See :ref:`daf_butler_collection_expressions` for more information.
153 These collections are not registered automatically and must be
154 registered manually before they are used by any method, though that
155 registration may happen after the `Butler` is initialized.
156 run : `str`, optional
157 Name of the `~CollectionType.RUN` collection new datasets should be
158 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
159 ``collections`` will be set to ``[run]``. If not `None`, this
160 collection will automatically be registered. If this is not set (and
161 ``writeable`` is not set either), a read-only butler will be created.
162 searchPaths : `list` of `str`, optional
163 Directory paths to search when calculating the full Butler
164 configuration. Not used if the supplied config is already a
165 `ButlerConfig`.
166 writeable : `bool`, optional
167 Explicitly sets whether the butler supports write operations. If not
168 provided, a read-write butler is created if ``run`` is set.
170 inferDefaults : `bool`, optional
171 If `True` (default) infer default data ID values from the values
172 present in the datasets in ``collections``: if all collections have the
173 same value (or no value) for a governor dimension, that value will be
174 the default for that dimension. Nonexistent collections are ignored.
175 If a default value is provided explicitly for a governor dimension via
176 ``**kwargs``, no default will be inferred for that dimension.
177 **kwargs : `str`
178 Default data ID key-value pairs. These may only identify "governor"
179 dimensions like ``instrument`` and ``skymap``.
181 Examples
182 --------
183 While there are many ways to control exactly how a `Butler` interacts with
184 the collections in its `Registry`, the most common cases are still simple.
186 For a read-only `Butler` that searches one collection, do::
188 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
190 For a read-write `Butler` that writes to and reads from a
191 `~CollectionType.RUN` collection::
193 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
195 The `Butler` passed to a ``PipelineTask`` is often much more complex,
196 because we want to write to one `~CollectionType.RUN` collection but read
197 from several others (as well)::
199 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
200 collections=["u/alice/DM-50000/a",
201 "u/bob/DM-49998",
202 "HSC/defaults"])
204 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
205 Datasets will be read first from that run (since it appears first in the
206 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
208 Finally, one can always create a `Butler` with no collections::
210 butler = Butler("/path/to/repo", writeable=True)
212 This can be extremely useful when you just want to use ``butler.registry``,
213 e.g. for inserting dimension data or managing collections, or when the
214 collections you want to use with the butler are not consistent.
215 Passing ``writeable`` explicitly here is only necessary if you want to be
216 able to make changes to the repo; usually the value for ``writeable`` can
217 be guessed from the collection arguments provided, but it defaults to
218 `False` when there are no collection arguments.
219 """
220 def __init__(self, config: Union[Config, str, None] = None, *,
221 butler: Optional[Butler] = None,
222 collections: Any = None,
223 run: Optional[str] = None,
224 searchPaths: Optional[List[str]] = None,
225 writeable: Optional[bool] = None,
226 inferDefaults: bool = True,
227 **kwargs: str,
228 ):
229 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
230 # Load registry, datastore, etc. from config or existing butler.
231 if butler is not None:
232 if config is not None or searchPaths is not None or writeable is not None:
233 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
234 "arguments with 'butler' argument.")
235 self.registry = butler.registry.copy(defaults)
236 self.datastore = butler.datastore
237 self.storageClasses = butler.storageClasses
238 self._config: ButlerConfig = butler._config
239 else:
240 self._config = ButlerConfig(config, searchPaths=searchPaths)
241 if "root" in self._config:
242 butlerRoot = self._config["root"]
243 else:
244 butlerRoot = self._config.configDir
245 if writeable is None:
246 writeable = run is not None
247 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
248 defaults=defaults)
249 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
250 butlerRoot=butlerRoot)
251 self.storageClasses = StorageClassFactory()
252 self.storageClasses.addFromConfig(self._config)
253 if "run" in self._config or "collection" in self._config:
254 raise ValueError("Passing a run or collection via configuration is no longer supported.")
256 GENERATION: ClassVar[int] = 3
257 """This is a Generation 3 Butler.
259 This attribute may be removed in the future, once the Generation 2 Butler
260 interface has been fully retired; it should only be used in transitional
261 code.
262 """
264 @staticmethod
265 def makeRepo(root: str, config: Union[Config, str, None] = None,
266 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
267 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
268 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
269 """Create an empty data repository by adding a butler.yaml config
270 to a repository root directory.
272 Parameters
273 ----------
274 root : `str` or `ButlerURI`
275 Path or URI to the root location of the new repository. Will be
276 created if it does not exist.
277 config : `Config` or `str`, optional
278 Configuration to write to the repository, after setting any
279 root-dependent Registry or Datastore config options. Can not
280 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
281 configuration will be used. Root-dependent config options
282 specified in this config are overwritten if ``forceConfigRoot``
283 is `True`.
284 dimensionConfig : `Config` or `str`, optional
285 Configuration for dimensions, used to initialize the registry
286 database.
287 standalone : `bool`
288 If `True`, write all expanded defaults, not just customized or
289 repository-specific settings.
290 This (mostly) decouples the repository from the default
291 configuration, insulating it from changes to the defaults (which
292 may be good or bad, depending on the nature of the changes).
293 Future *additions* to the defaults will still be picked up when
294 initializing `Butlers` to repos created with ``standalone=True``.
295 searchPaths : `list` of `str`, optional
296 Directory paths to search when calculating the full butler
297 configuration.
298 forceConfigRoot : `bool`, optional
299 If `False`, any values present in the supplied ``config`` that
300 would normally be reset are not overridden and will appear
301 directly in the output config. This allows non-standard overrides
302 of the root directory for a datastore or registry to be given.
303 If this parameter is `True` the values for ``root`` will be
304 forced into the resulting config if appropriate.
305 outfile : `str`, optional
306 If not-`None`, the output configuration will be written to this
307 location rather than into the repository itself. Can be a URI
308 string. Can refer to a directory that will be used to write
309 ``butler.yaml``.
310 overwrite : `bool`, optional
311 Create a new configuration file even if one already exists
312 in the specified output location. Default is to raise
313 an exception.
315 Returns
316 -------
317 config : `Config`
318 The updated `Config` instance written to the repo.
320 Raises
321 ------
322 ValueError
323 Raised if a ButlerConfig or ConfigSubset is passed instead of a
324 regular Config (as these subclasses would make it impossible to
325 support ``standalone=False``).
326 FileExistsError
327 Raised if the output config file already exists.
328 os.error
329 Raised if the directory does not exist, exists but is not a
330 directory, or cannot be created.
332 Notes
333 -----
334 Note that when ``standalone=False`` (the default), the configuration
335 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
336 construct the repository should also be used to construct any Butlers
337 to avoid configuration inconsistencies.
338 """
339 if isinstance(config, (ButlerConfig, ConfigSubset)):
340 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
342 # Ensure that the root of the repository exists or can be made
343 uri = ButlerURI(root, forceDirectory=True)
344 uri.mkdir()
346 config = Config(config)
348 # If we are creating a new repo from scratch with relative roots,
349 # do not propagate an explicit root from the config file
350 if "root" in config:
351 del config["root"]
353 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
354 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
355 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
357 # if key exists in given config, parse it, otherwise parse the defaults
358 # in the expanded config
359 if config.get(("registry", "db")):
360 registryConfig = RegistryConfig(config)
361 else:
362 registryConfig = RegistryConfig(full)
363 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
364 if defaultDatabaseUri is not None:
365 Config.updateParameters(RegistryConfig, config, full,
366 toUpdate={"db": defaultDatabaseUri},
367 overwrite=forceConfigRoot)
368 else:
369 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
370 overwrite=forceConfigRoot)
372 if standalone:
373 config.merge(full)
374 else:
375 # Always expand the registry.managers section into the per-repo
376 # config, because after the database schema is created, it's not
377 # allowed to change anymore. Note that in the standalone=True
378 # branch, _everything_ in the config is expanded, so there's no
379 # need to special case this.
380 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
381 configURI: Union[str, ButlerURI]
382 if outfile is not None:
383 # When writing to a separate location we must include
384 # the root of the butler repo in the config else it won't know
385 # where to look.
386 config["root"] = uri.geturl()
387 configURI = outfile
388 else:
389 configURI = uri
390 config.dumpToUri(configURI, overwrite=overwrite)
392 # Create Registry and populate tables
393 registryConfig = RegistryConfig(config.get("registry"))
394 dimensionConfig = DimensionConfig(dimensionConfig)
395 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
397 return config
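# A minimal usage sketch for creating and then opening a new repository; the
# path below is hypothetical and the default registry, datastore, and
# dimension configurations are assumed.
from lsst.daf.butler import Butler

Butler.makeRepo("/tmp/demo_repo")                       # writes butler.yaml and creates registry tables
demo_butler = Butler("/tmp/demo_repo", writeable=True)  # open the new, empty repository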
399 @classmethod
400 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
401 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
402 """Callable used to unpickle a Butler.
404 We prefer not to use ``Butler.__init__`` directly so we can force some
405 of its many arguments to be keyword-only (note that ``__reduce__``
406 can only invoke callables with positional arguments).
408 Parameters
409 ----------
410 config : `ButlerConfig`
411 Butler configuration, already coerced into a true `ButlerConfig`
412 instance (and hence after any search paths for overrides have been
413 utilized).
414 collections : `CollectionSearch`
415 Names of the default collections to read from.
416 run : `str`, optional
417 Name of the default `~CollectionType.RUN` collection to write to.
418 defaultDataId : `dict` [ `str`, `str` ]
419 Default data ID values.
420 writeable : `bool`
421 Whether the Butler should support write operations.
423 Returns
424 -------
425 butler : `Butler`
426 A new `Butler` instance.
427 """
428 # MyPy doesn't recognize that the kwargs below are totally valid; it
429 # seems to think ``**defaultDataId`` is a _positional_ argument!
430 return cls(config=config, collections=collections, run=run, writeable=writeable,
431 **defaultDataId) # type: ignore
433 def __reduce__(self) -> tuple:
434 """Support pickling.
435 """
436 return (Butler._unpickle, (self._config, self.collections, self.run,
437 self.registry.defaults.dataId.byName(),
438 self.registry.isWriteable()))
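# A minimal sketch showing why ``__reduce__``/``_unpickle`` exist: a configured
# Butler can be round-tripped through pickle (e.g. to worker processes), and
# the clone reconnects with the same configuration, defaults, and writeability.
# The repository path is hypothetical.
import pickle

from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections=["HSC/defaults"])
clone = pickle.loads(pickle.dumps(butler))  # dispatches to Butler._unpickle
assert clone.collections == butler.collections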
440 def __str__(self) -> str:
441 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
442 self.collections, self.run, self.datastore, self.registry)
444 def isWriteable(self) -> bool:
445 """Return `True` if this `Butler` supports write operations.
446 """
447 return self.registry.isWriteable()
449 @contextlib.contextmanager
450 def transaction(self) -> Iterator[None]:
451 """Context manager supporting `Butler` transactions.
453 Transactions can be nested.
454 """
455 with self.registry.transaction():
456 with self.datastore.transaction():
457 yield
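# A minimal sketch of grouping several writes in one transaction, assuming
# ``butler`` is a writeable Butler and the objects, dataset type names, and
# data IDs (all hypothetical) are defined elsewhere.
with butler.transaction():
    butler.put(first_object, "typeA", data_id_a)
    butler.put(second_object, "typeB", data_id_b)
# If the second put raises, the registry entries from the first put are rolled
# back, and the datastore attempts to undo its write as well.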
459 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
460 dataId: Optional[DataId] = None, **kwds: Any
461 ) -> Tuple[DatasetType, Optional[DataId]]:
462 """Standardize the arguments passed to several Butler APIs.
464 Parameters
465 ----------
466 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
467 When `DatasetRef` the `dataId` should be `None`.
468 Otherwise the `DatasetType` or name thereof.
469 dataId : `dict` or `DataCoordinate`
470 A `dict` of `Dimension` link name, value pairs that label the
471 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
472 should be provided as the first argument.
473 kwds
474 Additional keyword arguments used to augment or construct a
475 `DataCoordinate`. See `DataCoordinate.standardize`
476 parameters.
478 Returns
479 -------
480 datasetType : `DatasetType`
481 A `DatasetType` instance extracted from ``datasetRefOrType``.
482 dataId : `dict` or `DataId`, optional
483 Argument that can be used (along with ``kwds``) to construct a
484 `DataId`.
486 Notes
487 -----
488 Butler APIs that conceptually need a DatasetRef also allow passing a
489 `DatasetType` (or the name of one) and a `DataId` (or a dict and
490 keyword arguments that can be used to construct one) separately. This
491 method accepts those arguments and always returns a true `DatasetType`
492 and a `DataId` or `dict`.
494 Standardization of `dict` vs `DataId` is best handled by passing the
495 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
496 generally similarly flexible.
497 """
498 externalDatasetType: Optional[DatasetType] = None
499 internalDatasetType: Optional[DatasetType] = None
500 if isinstance(datasetRefOrType, DatasetRef):
501 if dataId is not None or kwds:
502 raise ValueError("DatasetRef given, cannot use dataId as well")
503 externalDatasetType = datasetRefOrType.datasetType
504 dataId = datasetRefOrType.dataId
505 else:
506 # Don't check whether DataId is provided, because Registry APIs
507 # can usually construct a better error message when it wasn't.
508 if isinstance(datasetRefOrType, DatasetType):
509 externalDatasetType = datasetRefOrType
510 else:
511 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
513 # Check that they are self-consistent
514 if externalDatasetType is not None:
515 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
516 if externalDatasetType != internalDatasetType:
517 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
518 f"registry definition ({internalDatasetType})")
520 assert internalDatasetType is not None
521 return internalDatasetType, dataId
523 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
524 dataId: Optional[DataId] = None, *,
525 collections: Any = None,
526 allowUnresolved: bool = False,
527 **kwds: Any) -> DatasetRef:
528 """Shared logic for methods that start with a search for a dataset in
529 the registry.
531 Parameters
532 ----------
533 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
534 When `DatasetRef` the `dataId` should be `None`.
535 Otherwise the `DatasetType` or name thereof.
536 dataId : `dict` or `DataCoordinate`, optional
537 A `dict` of `Dimension` link name, value pairs that label the
538 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
539 should be provided as the first argument.
540 collections : Any, optional
541 Collections to be searched, overriding ``self.collections``.
542 Can be any of the types supported by the ``collections`` argument
543 to butler construction.
544 allowUnresolved : `bool`, optional
545 If `True`, return an unresolved `DatasetRef` if finding a resolved
546 one in the `Registry` fails. Defaults to `False`.
547 kwds
548 Additional keyword arguments used to augment or construct a
549 `DataId`. See `DataId` parameters.
551 Returns
552 -------
553 ref : `DatasetRef`
554 A reference to the dataset identified by the given arguments.
556 Raises
557 ------
558 LookupError
559 Raised if no matching dataset exists in the `Registry` (and
560 ``allowUnresolved is False``).
561 ValueError
562 Raised if a resolved `DatasetRef` was passed as an input, but it
563 differs from the one found in the registry.
564 TypeError
565 Raised if no collections were provided.
566 """
567 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
568 if isinstance(datasetRefOrType, DatasetRef):
569 idNumber = datasetRefOrType.id
570 else:
571 idNumber = None
572 timespan: Optional[Timespan] = None
574 # Process dimension records that are using record information
575 # rather than ids
576 newDataId: Dict[str, DataIdValue] = {}
577 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
579 # if all the dataId comes from keyword parameters we do not need
580 # to do anything here because they can't be of the form
581 # exposure.obs_id because a "." is not allowed in a keyword parameter.
582 if dataId:
583 for k, v in dataId.items():
584 # If we have a Dimension we do not need to do anything
585 # because it cannot be a compound key.
586 if isinstance(k, str) and "." in k:
587 # Someone is using a more human-readable dataId
588 dimensionName, record = k.split(".", 1)
589 byRecord[dimensionName][record] = v
590 elif isinstance(k, Dimension):
591 newDataId[k.name] = v
592 else:
593 newDataId[k] = v
595 # Go through the updated dataId and check the type in case someone is
596 # using an alternate key. We have already filtered out the compound
597 # keys dimensions.record format.
598 not_dimensions = {}
600 # Will need to look in the dataId and the keyword arguments
601 # and will remove them if they need to be fixed or are unrecognized.
602 for dataIdDict in (newDataId, kwds):
603 # Use a list so we can adjust the dict safely in the loop
604 for dimensionName in list(dataIdDict):
605 value = dataIdDict[dimensionName]
606 try:
607 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
608 except KeyError:
609 # This is not a real dimension
610 not_dimensions[dimensionName] = value
611 del dataIdDict[dimensionName]
612 continue
614 # Convert an integral type to an explicit int to simplify
615 # comparisons here
616 if isinstance(value, numbers.Integral):
617 value = int(value)
619 if not isinstance(value, dimension.primaryKey.getPythonType()):
620 for alternate in dimension.alternateKeys:
621 if isinstance(value, alternate.getPythonType()):
622 byRecord[dimensionName][alternate.name] = value
623 del dataIdDict[dimensionName]
624 log.debug("Converting dimension %s to %s.%s=%s",
625 dimensionName, dimensionName, alternate.name, value)
626 break
627 else:
628 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
629 "Could not find matching alternative (primary key has type %s) "
630 "so attempting to use as-is.",
631 value, dimensionName, dimension.primaryKey.getPythonType())
633 # If we have some unrecognized dimensions we have to try to connect
634 # them to records in other dimensions. This is made more complicated
635 # by some dimensions having records with clashing names. A mitigation
636 # is that we can tell by this point which dimensions are missing
637 # for the DatasetType but this does not work for calibrations
638 # where additional dimensions can be used to constrain the temporal
639 # axis.
640 if not_dimensions:
641 # Calculate missing dimensions
642 provided = set(newDataId) | set(kwds) | set(byRecord)
643 missingDimensions = datasetType.dimensions.names - provided
645 # For calibrations we may well be needing temporal dimensions
646 # so rather than always including all dimensions in the scan
647 # restrict things a little. It is still possible for there
648 # to be confusion over day_obs in visit vs exposure for example.
649 # If we are not searching calibration collections things may
650 # fail but they are going to fail anyway because of the
651 # ambiguousness of the dataId...
652 candidateDimensions: Set[str] = set()
653 candidateDimensions.update(missingDimensions)
654 if datasetType.isCalibration():
655 for dim in self.registry.dimensions.getStaticDimensions():
656 if dim.temporal:
657 candidateDimensions.add(str(dim))
659 # Look up table for the first association with a dimension
660 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
662 # Keep track of whether an item is associated with multiple
663 # dimensions.
664 counter: Counter[str] = Counter()
665 assigned: Dict[str, Set[str]] = defaultdict(set)
667 # Go through the missing dimensions and associate the
668 # given names with records within those dimensions
669 for dimensionName in candidateDimensions:
670 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
671 fields = dimension.metadata.names | dimension.uniqueKeys.names
672 for field in not_dimensions:
673 if field in fields:
674 guessedAssociation[dimensionName][field] = not_dimensions[field]
675 counter[dimensionName] += 1
676 assigned[field].add(dimensionName)
678 # There is a chance we have allocated a single dataId item
679 # to multiple dimensions. Need to decide which should be retained.
680 # For now assume that the most popular alternative wins.
681 # This means that day_obs with seq_num will result in
682 # exposure.day_obs and not visit.day_obs
683 # Also prefer an explicitly missing dimension over an inferred
684 # temporal dimension.
685 for fieldName, assignedDimensions in assigned.items():
686 if len(assignedDimensions) > 1:
687 # Pick the most popular (preferring mandatory dimensions)
688 requiredButMissing = assignedDimensions.intersection(missingDimensions)
689 if requiredButMissing:
690 candidateDimensions = requiredButMissing
691 else:
692 candidateDimensions = assignedDimensions
694 # Select the relevant items and get a new restricted
695 # counter.
696 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
697 duplicatesCounter: Counter[str] = Counter()
698 duplicatesCounter.update(theseCounts)
700 # Choose the most common. If they are equally common
701 # we will pick the one that was found first.
702 # Returns a list of tuples
703 selected = duplicatesCounter.most_common(1)[0][0]
705 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
706 " Removed ambiguity by choosing dimension %s.",
707 fieldName, ", ".join(assignedDimensions), selected)
709 for candidateDimension in assignedDimensions:
710 if candidateDimension != selected:
711 del guessedAssociation[candidateDimension][fieldName]
713 # Update the record look up dict with the new associations
714 for dimensionName, values in guessedAssociation.items():
715 if values: # A dict might now be empty
716 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
717 dimensionName, values)
718 byRecord[dimensionName].update(values)
720 if byRecord:
721 # Some record specifiers were found so we need to convert
722 # them to the Id form
723 for dimensionName, values in byRecord.items():
724 if dimensionName in newDataId:
725 log.warning("DataId specified explicit %s dimension value of %s in addition to"
726 " general record specifiers for it of %s. Ignoring record information.",
727 dimensionName, newDataId[dimensionName], str(values))
728 continue
730 # Build up a WHERE expression -- use single quotes
731 def quote(s: Any) -> str:
732 if isinstance(s, str):
733 return f"'{s}'"
734 else:
735 return s
737 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
738 for k, v in values.items())
740 # Hopefully we get a single record that matches
741 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
742 where=where, **kwds))
744 if len(records) != 1:
745 if len(records) > 1:
746 log.debug("Received %d records from constraints of %s", len(records), str(values))
747 for r in records:
748 log.debug("- %s", str(r))
749 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
750 f" uniquely constrained to a single dataset by {values}."
751 f" Got {len(records)} results.")
752 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
753 f" records when constrained by {values}")
755 # Get the primary key from the real dimension object
756 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
757 if not isinstance(dimension, Dimension):
758 raise RuntimeError(
759 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
760 )
761 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
763 # We have modified the dataId so need to switch to it
764 dataId = newDataId
766 if datasetType.isCalibration():
767 # Because this is a calibration dataset, first try to
768 # standardize the data ID without restricting the dimensions to
769 # those of the dataset type requested, because there may be extra
770 # dimensions that provide temporal information for a validity-range
771 # lookup.
772 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
773 defaults=self.registry.defaults.dataId, **kwds)
774 if dataId.graph.temporal:
775 dataId = self.registry.expandDataId(dataId)
776 timespan = dataId.timespan
777 else:
778 # Standardize the data ID to just the dimensions of the dataset
779 # type instead of letting registry.findDataset do it, so we get the
780 # result even if no dataset is found.
781 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
782 defaults=self.registry.defaults.dataId, **kwds)
783 # Always lookup the DatasetRef, even if one is given, to ensure it is
784 # present in the current collection.
785 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
786 if ref is None:
787 if allowUnresolved:
788 return DatasetRef(datasetType, dataId)
789 else:
790 if collections is None:
791 collections = self.registry.defaults.collections
792 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
793 f"could not be found in collections {collections}.")
794 if idNumber is not None and idNumber != ref.id:
795 if collections is None:
796 collections = self.registry.defaults.collections
797 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
798 f"id ({ref.id}) in registry in collections {collections}.")
799 return ref
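# A minimal sketch of the record-based data ID spelling handled above: a
# compound key such as "exposure.obs_id" is resolved to the exposure primary
# key via a dimension-record query before the dataset lookup. The instrument,
# observation identifier, and dataset type name are hypothetical, and
# ``butler`` is assumed to be an existing Butler.
raw = butler.get(
    "raw",
    {"instrument": "HyperCam", "detector": 1, "exposure.obs_id": "HC_20210512_000042"},
)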
801 @transactional
802 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
803 dataId: Optional[DataId] = None, *,
804 run: Optional[str] = None,
805 **kwds: Any) -> DatasetRef:
806 """Store and register a dataset.
808 Parameters
809 ----------
810 obj : `object`
811 The dataset.
812 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
813 When `DatasetRef` is provided, ``dataId`` should be `None`.
814 Otherwise the `DatasetType` or name thereof.
815 dataId : `dict` or `DataCoordinate`
816 A `dict` of `Dimension` link name, value pairs that label the
817 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
818 should be provided as the second argument.
819 run : `str`, optional
820 The name of the run the dataset should be added to, overriding
821 ``self.run``.
822 kwds
823 Additional keyword arguments used to augment or construct a
824 `DataCoordinate`. See `DataCoordinate.standardize`
825 parameters.
827 Returns
828 -------
829 ref : `DatasetRef`
830 A reference to the stored dataset, updated with the correct id if
831 given.
833 Raises
834 ------
835 TypeError
836 Raised if the butler is read-only or if no run has been provided.
837 """
838 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
839 if not self.isWriteable():
840 raise TypeError("Butler is read-only.")
841 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
842 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
843 raise ValueError("DatasetRef must not be in registry, must have None id")
845 # Add Registry Dataset entry.
846 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
847 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
849 # Add Datastore entry.
850 self.datastore.put(obj, ref)
852 return ref
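# A minimal sketch of a typical ``put`` call; the repository path, run name,
# dataset type name, and data ID values are hypothetical, and ``catalog``
# stands for any in-memory object compatible with the dataset type's storage
# class.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", run="u/alice/example-run")
ref = butler.put(catalog, "sourceCatalog", instrument="HyperCam", visit=42, detector=1)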
854 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
855 """Retrieve a stored dataset.
857 Unlike `Butler.get`, this method allows datasets outside the Butler's
858 collection to be read as long as the `DatasetRef` that identifies them
859 can be obtained separately.
861 Parameters
862 ----------
863 ref : `DatasetRef`
864 Resolved reference to an already stored dataset.
865 parameters : `dict`
866 Additional StorageClass-defined options to control reading,
867 typically used to efficiently read only a subset of the dataset.
869 Returns
870 -------
871 obj : `object`
872 The dataset.
873 """
874 return self.datastore.get(ref, parameters=parameters)
876 def getDirectDeferred(self, ref: DatasetRef, *,
877 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
878 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
879 from a resolved `DatasetRef`.
881 Parameters
882 ----------
883 ref : `DatasetRef`
884 Resolved reference to an already stored dataset.
885 parameters : `dict`
886 Additional StorageClass-defined options to control reading,
887 typically used to efficiently read only a subset of the dataset.
889 Returns
890 -------
891 obj : `DeferredDatasetHandle`
892 A handle which can be used to retrieve a dataset at a later time.
894 Raises
895 ------
896 AmbiguousDatasetError
897 Raised if ``ref.id is None``, i.e. the reference is unresolved.
898 """
899 if ref.id is None:
900 raise AmbiguousDatasetError(
901 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
902 )
903 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
905 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
906 dataId: Optional[DataId] = None, *,
907 parameters: Union[dict, None] = None,
908 collections: Any = None,
909 **kwds: Any) -> DeferredDatasetHandle:
910 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
911 after an immediate registry lookup.
913 Parameters
914 ----------
915 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
916 When `DatasetRef` the `dataId` should be `None`.
917 Otherwise the `DatasetType` or name thereof.
918 dataId : `dict` or `DataCoordinate`, optional
919 A `dict` of `Dimension` link name, value pairs that label the
920 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
921 should be provided as the first argument.
922 parameters : `dict`
923 Additional StorageClass-defined options to control reading,
924 typically used to efficiently read only a subset of the dataset.
925 collections : Any, optional
926 Collections to be searched, overriding ``self.collections``.
927 Can be any of the types supported by the ``collections`` argument
928 to butler construction.
929 kwds
930 Additional keyword arguments used to augment or construct a
931 `DataId`. See `DataId` parameters.
933 Returns
934 -------
935 obj : `DeferredDatasetHandle`
936 A handle which can be used to retrieve a dataset at a later time.
938 Raises
939 ------
940 LookupError
941 Raised if no matching dataset exists in the `Registry`.
943 ValueError
944 Raised if a resolved `DatasetRef` was passed as an input, but it
945 differs from the one found in the registry.
946 TypeError
947 Raised if no collections were provided.
948 """
949 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
950 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
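# A minimal sketch of deferred retrieval: the registry lookup happens
# immediately (so a missing dataset fails fast), but the datastore read is
# postponed until ``get`` is called on the handle. Names and data ID values
# are hypothetical, and ``butler`` is an existing Butler.
handle = butler.getDeferred("deepCoadd", tract=0, patch=42, band="r", skymap="discrete")
# ... later, only when the pixels are actually needed:
coadd = handle.get()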
952 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
953 dataId: Optional[DataId] = None, *,
954 parameters: Optional[Dict[str, Any]] = None,
955 collections: Any = None,
956 **kwds: Any) -> Any:
957 """Retrieve a stored dataset.
959 Parameters
960 ----------
961 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
962 When `DatasetRef` the `dataId` should be `None`.
963 Otherwise the `DatasetType` or name thereof.
964 dataId : `dict` or `DataCoordinate`
965 A `dict` of `Dimension` link name, value pairs that label the
966 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
967 should be provided as the first argument.
968 parameters : `dict`
969 Additional StorageClass-defined options to control reading,
970 typically used to efficiently read only a subset of the dataset.
971 collections : Any, optional
972 Collections to be searched, overriding ``self.collections``.
973 Can be any of the types supported by the ``collections`` argument
974 to butler construction.
975 kwds
976 Additional keyword arguments used to augment or construct a
977 `DataCoordinate`. See `DataCoordinate.standardize`
978 parameters.
980 Returns
981 -------
982 obj : `object`
983 The dataset.
985 Raises
986 ------
987 ValueError
988 Raised if a resolved `DatasetRef` was passed as an input, but it
989 differs from the one found in the registry.
990 LookupError
991 Raised if no matching dataset exists in the `Registry`.
992 TypeError
993 Raised if no collections were provided.
995 Notes
996 -----
997 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
998 this method requires that the given data ID include temporal dimensions
999 beyond the dimensions of the dataset type itself, in order to find the
1000 dataset with the appropriate validity range. For example, a "bias"
1001 dataset with native dimensions ``{instrument, detector}`` could be
1002 fetched with a ``{instrument, detector, exposure}`` data ID, because
1003 ``exposure`` is a temporal dimension.
1004 """
1005 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1006 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1007 return self.getDirect(ref, parameters=parameters)
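# A minimal sketch of the two equivalent calling conventions accepted by
# ``get``: a dataset type name plus data ID keywords, or a resolved
# `DatasetRef` on its own. Names and values are hypothetical, and ``butler``
# is an existing Butler with suitable default collections.
calexp = butler.get("calexp", instrument="HyperCam", visit=42, detector=1)
same_calexp = butler.get(ref)  # ref: a resolved DatasetRef, e.g. returned by put() or a registry query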
1009 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1010 dataId: Optional[DataId] = None, *,
1011 predict: bool = False,
1012 collections: Any = None,
1013 run: Optional[str] = None,
1014 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1015 """Return the URIs associated with the dataset.
1017 Parameters
1018 ----------
1019 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1020 When `DatasetRef` the `dataId` should be `None`.
1021 Otherwise the `DatasetType` or name thereof.
1022 dataId : `dict` or `DataCoordinate`
1023 A `dict` of `Dimension` link name, value pairs that label the
1024 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1025 should be provided as the first argument.
1026 predict : `bool`
1027 If `True`, allow URIs to be returned of datasets that have not
1028 been written.
1029 collections : Any, optional
1030 Collections to be searched, overriding ``self.collections``.
1031 Can be any of the types supported by the ``collections`` argument
1032 to butler construction.
1033 run : `str`, optional
1034 Run to use for predictions, overriding ``self.run``.
1035 kwds
1036 Additional keyword arguments used to augment or construct a
1037 `DataCoordinate`. See `DataCoordinate.standardize`
1038 parameters.
1040 Returns
1041 -------
1042 primary : `ButlerURI`
1043 The URI to the primary artifact associated with this dataset.
1044 If the dataset was disassembled within the datastore this
1045 may be `None`.
1046 components : `dict`
1047 URIs to any components associated with the dataset artifact.
1048 Can be empty if there are no components.
1049 """
1050 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1051 collections=collections, **kwds)
1052 if ref.id is None: # only possible if predict is True
1053 if run is None:
1054 run = self.run
1055 if run is None:
1056 raise TypeError("Cannot predict location with run=None.")
1057 # Lie about ID, because we can't guess it, and only
1058 # Datastore.getURIs() will ever see it (and it doesn't use it).
1059 ref = ref.resolved(id=0, run=run)
1060 return self.datastore.getURIs(ref, predict)
1062 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1063 dataId: Optional[DataId] = None, *,
1064 predict: bool = False,
1065 collections: Any = None,
1066 run: Optional[str] = None,
1067 **kwds: Any) -> ButlerURI:
1068 """Return the URI to the Dataset.
1070 Parameters
1071 ----------
1072 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1073 When `DatasetRef` the `dataId` should be `None`.
1074 Otherwise the `DatasetType` or name thereof.
1075 dataId : `dict` or `DataCoordinate`
1076 A `dict` of `Dimension` link name, value pairs that label the
1077 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1078 should be provided as the first argument.
1079 predict : `bool`
1080 If `True`, allow URIs to be returned of datasets that have not
1081 been written.
1082 collections : Any, optional
1083 Collections to be searched, overriding ``self.collections``.
1084 Can be any of the types supported by the ``collections`` argument
1085 to butler construction.
1086 run : `str`, optional
1087 Run to use for predictions, overriding ``self.run``.
1088 kwds
1089 Additional keyword arguments used to augment or construct a
1090 `DataCoordinate`. See `DataCoordinate.standardize`
1091 parameters.
1093 Returns
1094 -------
1095 uri : `ButlerURI`
1096 URI pointing to the Dataset within the datastore. If the
1097 Dataset does not exist in the datastore, and if ``predict`` is
1098 `True`, the URI will be a prediction and will include a URI
1099 fragment "#predicted".
1100 If the datastore does not have entities that relate well
1101 to the concept of a URI the returned URI string will be
1102 descriptive. The returned URI is not guaranteed to be obtainable.
1104 Raises
1105 ------
1106 LookupError
1107 Raised if a URI has been requested for a dataset that does not
1108 exist and guessing is not allowed.
1109 ValueError
1110 Raised if a resolved `DatasetRef` was passed as an input, but it
1111 differs from the one found in the registry.
1112 TypeError
1113 Raised if no collections were provided.
1114 RuntimeError
1115 Raised if a URI is requested for a dataset that consists of
1116 multiple artifacts.
1117 """
1118 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1119 collections=collections, run=run, **kwds)
1121 if primary is None or components:
1122 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1123 "Use Butler.getURIs() instead.")
1124 return primary
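# A minimal sketch of locating a stored dataset's artifact without reading it.
# Names and data ID values are hypothetical, and ``butler`` is an existing
# Butler.
uri = butler.getURI("calexp", instrument="HyperCam", visit=42, detector=1)
print(uri.geturl())  # e.g. a file:// or s3:// URL, depending on the datastore
# For disassembled composites, getURIs() returns (primary, components) instead.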
1126 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1127 dataId: Optional[DataId] = None, *,
1128 collections: Any = None,
1129 **kwds: Any) -> bool:
1130 """Return True if the Dataset is actually present in the Datastore.
1132 Parameters
1133 ----------
1134 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1135 When `DatasetRef` the `dataId` should be `None`.
1136 Otherwise the `DatasetType` or name thereof.
1137 dataId : `dict` or `DataCoordinate`
1138 A `dict` of `Dimension` link name, value pairs that label the
1139 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1140 should be provided as the first argument.
1141 collections : Any, optional
1142 Collections to be searched, overriding ``self.collections``.
1143 Can be any of the types supported by the ``collections`` argument
1144 to butler construction.
1145 kwds
1146 Additional keyword arguments used to augment or construct a
1147 `DataCoordinate`. See `DataCoordinate.standardize`
1148 parameters.
1150 Raises
1151 ------
1152 LookupError
1153 Raised if the dataset is not even present in the Registry.
1154 ValueError
1155 Raised if a resolved `DatasetRef` was passed as an input, but it
1156 differs from the one found in the registry.
1157 TypeError
1158 Raised if no collections were provided.
1159 """
1160 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1161 return self.datastore.exists(ref)
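# A minimal sketch distinguishing "known to the registry but not stored" from
# "not registered at all": ``datasetExists`` returns a bool for the former and
# raises LookupError for the latter. Names are hypothetical and ``butler`` is
# an existing Butler.
try:
    stored = butler.datasetExists("calexp", instrument="HyperCam", visit=42, detector=1)
except LookupError:
    stored = False  # no such dataset in the searched collections at all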
1163 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1164 """Remove one or more `~CollectionType.RUN` collections and the
1165 datasets within them.
1167 Parameters
1168 ----------
1169 names : `Iterable` [ `str` ]
1170 The names of the collections to remove.
1171 unstore : `bool`, optional
1172 If `True` (default), delete datasets from all datastores in which
1173 they are present, and attempt to roll back the registry deletions if
1174 datastore deletions fail (which may not always be possible). If
1175 `False`, datastore records for these datasets are still removed,
1176 but any artifacts (e.g. files) will not be.
1178 Raises
1179 ------
1180 TypeError
1181 Raised if one or more collections are not of type
1182 `~CollectionType.RUN`.
1183 """
1184 if not self.isWriteable():
1185 raise TypeError("Butler is read-only.")
1186 names = list(names)
1187 refs: List[DatasetRef] = []
1188 for name in names:
1189 collectionType = self.registry.getCollectionType(name)
1190 if collectionType is not CollectionType.RUN:
1191 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1192 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1193 with self.registry.transaction():
1194 if unstore:
1195 for ref in refs:
1196 if self.datastore.exists(ref):
1197 self.datastore.trash(ref)
1198 else:
1199 self.datastore.forget(refs)
1200 for name in names:
1201 self.registry.removeCollection(name)
1202 if unstore:
1203 # Point of no return for removing artifacts
1204 self.datastore.emptyTrash()
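# A minimal sketch of deleting a scratch RUN collection along with its
# datasets and (with the default ``unstore=True``) their stored artifacts.
# The repository path and run name are hypothetical.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", writeable=True)
butler.removeRuns(["u/alice/scratch-run"])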
1206 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False) -> None:
1207 """Remove a collection and possibly prune datasets within it.
1209 Parameters
1210 ----------
1211 name : `str`
1212 Name of the collection to remove. If this is a
1213 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1214 datasets within the collection are not modified unless ``unstore``
1215 is `True`. If this is a `~CollectionType.RUN` collection,
1216 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1217 are fully removed from the data repository.
1218 purge : `bool`, optional
1219 If `True`, permit `~CollectionType.RUN` collections to be removed,
1220 fully removing datasets within them. Requires ``unstore=True`` as
1221 well as an added precaution against accidental deletion. Must be
1222 `False` (default) if the collection is not a ``RUN``.
1223 unstore: `bool`, optional
1224 If `True`, remove all datasets in the collection from all
1225 datastores in which they appear.
1227 Raises
1228 ------
1229 TypeError
1230 Raised if the butler is read-only or arguments are mutually
1231 inconsistent.
1232 """
1234 # See pruneDatasets comments for more information about the logic here;
1235 # the cases are almost the same, but here we can rely on Registry to
1236 # take care of everything but Datastore deletion when we remove the
1237 # collection.
1238 if not self.isWriteable():
1239 raise TypeError("Butler is read-only.")
1240 collectionType = self.registry.getCollectionType(name)
1241 if purge and not unstore:
1242 raise PurgeWithoutUnstorePruneCollectionsError()
1243 if collectionType is CollectionType.RUN and not purge:
1244 raise RunWithoutPurgePruneCollectionsError(collectionType)
1245 if collectionType is not CollectionType.RUN and purge:
1246 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1248 with self.registry.transaction():
1249 if unstore:
1250 for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
1251 if self.datastore.exists(ref):
1252 self.datastore.trash(ref)
1253 self.registry.removeCollection(name)
1254 if unstore:
1255 # Point of no return for removing artifacts
1256 self.datastore.emptyTrash()
1258 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1259 disassociate: bool = True,
1260 unstore: bool = False,
1261 tags: Iterable[str] = (),
1262 purge: bool = False,
1263 run: Optional[str] = None) -> None:
1264 """Remove one or more datasets from a collection and/or storage.
1266 Parameters
1267 ----------
1268 refs : `~collections.abc.Iterable` of `DatasetRef`
1269 Datasets to prune. These must be "resolved" references (not just
1270 a `DatasetType` and data ID).
1271 disassociate : `bool`, optional
1272 Disassociate pruned datasets from ``tags``, or from all collections
1273 if ``purge=True``.
1274 unstore : `bool`, optional
1275 If `True` (`False` is default) remove these datasets from all
1276 datastores known to this butler. Note that this will make it
1277 impossible to retrieve these datasets even via other collections.
1278 Datasets that are already not stored are ignored by this option.
1279 tags : `Iterable` [ `str` ], optional
1280 `~CollectionType.TAGGED` collections to disassociate the datasets
1281 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1282 `True`.
1283 purge : `bool`, optional
1284 If `True` (`False` is default), completely remove the dataset from
1285 the `Registry`. To prevent accidental deletions, ``purge`` may
1286 only be `True` if all of the following conditions are met:
1288 - All given datasets are in the given run.
1289 - ``disassociate`` is `True`;
1290 - ``unstore`` is `True`.
1292 This mode may remove provenance information from datasets other
1293 than those provided, and should be used with extreme care.
1295 Raises
1296 ------
1297 TypeError
1298 Raised if the butler is read-only, if no collection was provided,
1299 or the conditions for ``purge=True`` were not met.
1300 """
1301 if not self.isWriteable():
1302 raise TypeError("Butler is read-only.")
1303 if purge:
1304 if not disassociate:
1305 raise TypeError("Cannot pass purge=True without disassociate=True.")
1306 if not unstore:
1307 raise TypeError("Cannot pass purge=True without unstore=True.")
1308 elif disassociate:
1309 tags = tuple(tags)
1310 if not tags:
1311 raise TypeError("No tags provided but disassociate=True.")
1312 for tag in tags:
1313 collectionType = self.registry.getCollectionType(tag)
1314 if collectionType is not CollectionType.TAGGED:
1315 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1316 f"of non-TAGGED type {collectionType.name}.")
1317 # Transform possibly-single-pass iterable into something we can iterate
1318 # over multiple times.
1319 refs = list(refs)
1320 # Pruning a component of a DatasetRef makes no sense since registry
1321 # doesn't know about components and datastore might not store
1322 # components in a separate file
1323 for ref in refs:
1324 if ref.datasetType.component():
1325 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1326 # We don't need an unreliable Datastore transaction for this, because
1327 # we've been extra careful to ensure that Datastore.trash only involves
1328 # mutating the Registry (it can _look_ at Datastore-specific things,
1329 # but shouldn't change them), and hence all operations here are
1330 # Registry operations.
1331 with self.registry.transaction():
1332 if unstore:
1333 for ref in refs:
1334 # There is a difference between a concrete composite
1335 # and virtual composite. In a virtual composite the
1336 # datastore is never given the top level DatasetRef. In
1337 # the concrete composite the datastore knows all the
1338 # refs and will clean up itself if asked to remove the
1339 # parent ref. We can not check configuration for this
1340 # since we can not trust that the configuration is the
1341 # same. We therefore have to ask if the ref exists or
1342 # not. This is consistent with the fact that we want
1343 # to ignore already-removed-from-datastore datasets
1344 # anyway.
1345 if self.datastore.exists(ref):
1346 self.datastore.trash(ref)
1347 if purge:
1348 self.registry.removeDatasets(refs)
1349 elif disassociate:
1350 assert tags, "Guaranteed by earlier logic in this function."
1351 for tag in tags:
1352 self.registry.disassociate(tag, refs)
1353 # We've exited the Registry transaction, and apparently committed.
1354 # (if there was an exception, everything rolled back, and it's as if
1355 # nothing happened - and we never get here).
1356 # Datastore artifacts are not yet gone, but they're clearly marked
1357 # as trash, so if we fail to delete now because of (e.g.) filesystem
1358 # problems we can try again later, and if manual administrative
1359 # intervention is required, it's pretty clear what that should entail:
1360 # deleting everything on disk and in private Datastore tables that is
1361 # in the dataset_location_trash table.
1362 if unstore:
1363 # Point of no return for removing artifacts
1364 self.datastore.emptyTrash()
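# A minimal sketch of removing stored artifacts for a set of datasets while
# keeping their registry entries (so they could be re-stored later). The
# dataset type and collection names are hypothetical, and ``butler`` is a
# writeable Butler.
refs = butler.registry.queryDatasets("calexp", collections="u/alice/example-run")
butler.pruneDatasets(refs, disassociate=False, unstore=True)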
1366 @transactional
1367 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1368 ) -> None:
1369 """Store and register one or more datasets that already exist on disk.
1371 Parameters
1372 ----------
1373 datasets : `FileDataset`
1374 Each positional argument is a struct containing information about
1375 a file to be ingested, including its path (either absolute or
1376 relative to the datastore root, if applicable), a `DatasetRef`,
1377 and optionally a formatter class or its fully-qualified string
1378 name. If a formatter is not provided, the formatter that would be
1379 used for `put` is assumed. On successful return, all
1380 `FileDataset.ref` attributes will have their `DatasetRef.id`
1381 attribute populated and all `FileDataset.formatter` attributes will
1382 be set to the formatter class used. `FileDataset.path` attributes
1383 may be modified to put paths in whatever the datastore considers a
1384 standardized form.
1385 transfer : `str`, optional
1386 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1387 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer
1388 the file.
1389 run : `str`, optional
1390 The name of the run ingested datasets should be added to,
1391 overriding ``self.run``.
1393 Raises
1394 ------
1395 TypeError
1396 Raised if the butler is read-only or if no run was provided.
1397 NotImplementedError
1398 Raised if the `Datastore` does not support the given transfer mode.
1399 DatasetTypeNotSupportedError
1400 Raised if one or more files to be ingested have a dataset type that
1401 is not supported by the `Datastore`.
1402 FileNotFoundError
1403 Raised if one of the given files does not exist.
1404 FileExistsError
1405 Raised if transfer is not `None` but the (internal) location the
1406 file would be moved to is already occupied.
1408 Notes
1409 -----
1410 This operation is not fully exception safe: if a database operation
1411 fails, the given `FileDataset` instances may be only partially updated.
1413 It is atomic in terms of database operations (they will either all
1414 succeed or all fail) provided the database engine implements
1415 transactions correctly. It will attempt to be atomic in terms of
1416 filesystem operations as well, but this cannot be implemented
1417 rigorously for most datastores.
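Examples
--------
A minimal sketch of ingesting a single file that already exists on disk,
assuming ``rawType`` is an already-registered `DatasetType` whose
dimensions match the data ID; the path, data ID values, and run name are
illustrative only::

    ref = DatasetRef(rawType, {"instrument": "MyCam", "exposure": 1,
                               "detector": 0})
    dataset = FileDataset(path="/data/staging/exp1_det0.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy", run="MyCam/raw/all")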
1418 """
1419 if not self.isWriteable():
1420 raise TypeError("Butler is read-only.")
1421 # Reorganize the inputs so they're grouped by DatasetType and then
1422 # data ID. We also include a list of DatasetRefs for each FileDataset
1423 # to hold the resolved DatasetRefs returned by the Registry, before
1424 # it's safe to swap them into FileDataset.refs.
1425 # Some type annotation aliases to make that clearer:
1426 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1427 GroupedData = MutableMapping[DatasetType, GroupForType]
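# Concretely, the grouping has the shape
#     {datasetType: {dataId: (fileDataset, [resolvedRef, ...]), ...}, ...}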
1428 # The actual data structure:
1429 groupedData: GroupedData = defaultdict(dict)
1430 # And the nested loop that populates it:
1431 for dataset in datasets:
1432 # This list is intentionally shared across the inner loop, since it's
1433 # associated with `dataset`.
1434 resolvedRefs: List[DatasetRef] = []
1435 for ref in dataset.refs:
1436 if ref.dataId in groupedData[ref.datasetType]:
1437 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the"
1438 " same DataId as another ingest dataset"
1439 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1440 f" ({ref.dataId})")
1441 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1443 # Now we can bulk-insert into Registry for each DatasetType.
1445 for datasetType, groupForType in groupedData.items():
1446 refs = self.registry.insertDatasets(datasetType,
1447 dataIds=groupForType.keys(),
1448 run=run)
1449 # Append those resolved DatasetRefs to the new lists we set up for
1450 # them.
1451 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1452 resolvedRefs.append(ref)
1454 # Go back to the original FileDatasets to replace their refs with the
1455 # new resolved ones, and also build a big list of all refs.
1456 allResolvedRefs: List[DatasetRef] = []
1457 for groupForType in groupedData.values():
1458 for dataset, resolvedRefs in groupForType.values():
1459 dataset.refs = resolvedRefs
1460 allResolvedRefs.extend(resolvedRefs)
1462 # Bulk-insert everything into Datastore.
1463 self.datastore.ingest(*datasets, transfer=transfer)
1465 @contextlib.contextmanager
1466 def export(self, *, directory: Optional[str] = None,
1467 filename: Optional[str] = None,
1468 format: Optional[str] = None,
1469 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1470 """Export datasets from the repository represented by this `Butler`.
1472 This method is a context manager that returns a helper object
1473 (`RepoExportContext`) that is used to indicate what information from
1474 the repository should be exported.
1476 Parameters
1477 ----------
1478 directory : `str`, optional
1479 Directory dataset files should be written to if ``transfer`` is not
1480 `None`.
1481 filename : `str`, optional
1482 Name for the file that will include database information associated
1483 with the exported datasets. If this is not an absolute path and
1484 ``directory`` is not `None`, it will be written to ``directory``
1485 instead of the current working directory. Defaults to
1486 "export.{format}".
1487 format : `str`, optional
1488 File format for the database information file. If `None`, the
1489 extension of ``filename`` will be used.
1490 transfer : `str`, optional
1491 Transfer mode passed to `Datastore.export`.
1493 Raises
1494 ------
1495 TypeError
1496 Raised if the set of arguments passed is inconsistent.
1498 Examples
1499 --------
1500 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1501 methods are used to provide the iterables over data IDs and/or datasets
1502 to be exported::
1504 with butler.export(filename="exports.yaml") as export:
1505 # Export all flats, but none of the dimension element rows
1506 # (i.e. data ID information) associated with them.
1507 export.saveDatasets(butler.registry.queryDatasets("flat"),
1508 elements=())
1509 # Export all datasets that start with "deepCoadd_" and all of
1510 # their associated data ID information.
1511 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
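When ``transfer`` is not `None` the dataset files themselves are also
written to ``directory``; a sketch, with an illustrative path and dataset
type name::

    with butler.export(directory="/tmp/my_export",
                       filename="export.yaml",
                       transfer="copy") as export:
        export.saveDatasets(butler.registry.queryDatasets("flat"))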
1512 """
1513 if directory is None and transfer is not None:
1514 raise TypeError("Cannot transfer without providing a directory.")
1515 if transfer == "move":
1516 raise TypeError("Transfer may not be 'move': export is read-only")
1517 if format is None:
1518 if filename is None:
1519 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1520 else:
1521 format = os.path.splitext(filename)[1].lstrip(".")  # e.g. "yaml", not ".yaml"
1522 elif filename is None:
1523 filename = f"export.{format}"
1524 if directory is not None:
1525 filename = os.path.join(directory, filename)
1526 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1527 with open(filename, 'w') as stream:
1528 backend = BackendClass(stream)
1529 try:
1530 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1531 directory=directory, transfer=transfer)
1532 yield helper
1533 except BaseException:
1534 raise
1535 else:
1536 helper._finish()
1538 def import_(self, *, directory: Optional[str] = None,
1539 filename: Union[str, TextIO, None] = None,
1540 format: Optional[str] = None,
1541 transfer: Optional[str] = None,
1542 skip_dimensions: Optional[Set] = None) -> None:
1543 """Import datasets into this repository that were exported from a
1544 different butler repository via `~lsst.daf.butler.Butler.export`.
1546 Parameters
1547 ----------
1548 directory : `str`, optional
1549 Directory containing dataset files to import from. If `None`,
1550 ``filename`` and all dataset file paths specified therein must
1551 be absolute.
1552 filename : `str` or `TextIO`, optional
1553 A stream or name of file that contains database information
1554 associated with the exported datasets, typically generated by
1555 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1556 is not an absolute path, does not exist in the current working
1557 directory, and ``directory`` is not `None`, it is assumed to be in
1558 ``directory``. Defaults to "export.{format}".
1559 format : `str`, optional
1560 File format for ``filename``. If `None`, the extension of
1561 ``filename`` will be used.
1562 transfer : `str`, optional
1563 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1564 skip_dimensions : `set`, optional
1565 Names of dimensions that should be skipped and not imported.
1567 Raises
1568 ------
1569 TypeError
1570 Raised if the set of arguments passed is inconsistent, or if the
1571 butler is read-only.
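Examples
--------
A sketch of importing a previously exported repository subset; the
directory and file names are illustrative only::

    butler.import_(directory="/tmp/my_export", filename="export.yaml",
                   transfer="symlink")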
1572 """
1573 if not self.isWriteable():
1574 raise TypeError("Butler is read-only.")
1575 if format is None:
1576 if filename is None:
1577 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1578 else:
1579 format = os.path.splitext(filename)[1].lstrip(".")  # type: ignore
1580 elif filename is None:
1581 filename = f"export.{format}"
1582 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1583 filename = os.path.join(directory, filename)
1584 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1586 def doImport(importStream: TextIO) -> None:
1587 backend = BackendClass(importStream, self.registry)
1588 backend.register()
1589 with self.transaction():
1590 backend.load(self.datastore, directory=directory, transfer=transfer,
1591 skip_dimensions=skip_dimensions)
1593 if isinstance(filename, str):
1594 with open(filename, "r") as stream:
1595 doImport(stream)
1596 else:
1597 doImport(filename)
1599 def validateConfiguration(self, logFailures: bool = False,
1600 datasetTypeNames: Optional[Iterable[str]] = None,
1601 ignore: Optional[Iterable[str]] = None) -> None:
1602 """Validate butler configuration.
1604 Checks that each `DatasetType` can be stored in the `Datastore`.
1606 Parameters
1607 ----------
1608 logFailures : `bool`, optional
1609 If `True`, output a log message for every validation error
1610 detected.
1611 datasetTypeNames : iterable of `str`, optional
1612 The `DatasetType` names that should be checked. This allows
1613 only a subset to be selected.
1614 ignore : iterable of `str`, optional
1615 Names of DatasetTypes to skip over. This can be used to skip
1616 known problems. If a named `DatasetType` corresponds to a
1617 composite, all components of that `DatasetType` will also be
1618 ignored.
1620 Raises
1621 ------
1622 ButlerValidationError
1623 Raised if there is some inconsistency with how this Butler
1624 is configured.
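Examples
--------
Validate a subset of dataset types, logging each failure and skipping a
type with a known problem; the dataset type names are illustrative only::

    butler.validateConfiguration(logFailures=True,
                                 datasetTypeNames=["calexp", "src"],
                                 ignore=["skyMap"])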
1625 """
1626 if datasetTypeNames:
1627 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1628 else:
1629 datasetTypes = list(self.registry.queryDatasetTypes())
1631 # filter out anything from the ignore list
1632 if ignore:
1633 ignore = set(ignore)
1634 datasetTypes = [e for e in datasetTypes
1635 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1636 else:
1637 ignore = set()
1639 # Find all the registered instruments
1640 instruments = set(
1641 record.name for record in self.registry.queryDimensionRecords("instrument")
1642 )
1644 # For each datasetType that has an instrument dimension, create
1645 # a DatasetRef for each defined instrument
1646 datasetRefs = []
1648 for datasetType in datasetTypes:
1649 if "instrument" in datasetType.dimensions:
1650 for instrument in instruments:
1651 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1652 conform=False)
1653 datasetRefs.append(datasetRef)
1655 entities: List[Union[DatasetType, DatasetRef]] = []
1656 entities.extend(datasetTypes)
1657 entities.extend(datasetRefs)
1659 datastoreErrorStr = None
1660 try:
1661 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1662 except ValidationError as e:
1663 datastoreErrorStr = str(e)
1665 # Also check that the LookupKeys used by the datastores match
1666 # registry and storage class definitions
1667 keys = self.datastore.getLookupKeys()
1669 failedNames = set()
1670 failedDataId = set()
1671 for key in keys:
1672 if key.name is not None:
1673 if key.name in ignore:
1674 continue
1676 # skip if specific datasetType names were requested and this
1677 # name does not match
1678 if datasetTypeNames and key.name not in datasetTypeNames:
1679 continue
1681 # See if it is a StorageClass or a DatasetType
1682 if key.name in self.storageClasses:
1683 pass
1684 else:
1685 try:
1686 self.registry.getDatasetType(key.name)
1687 except KeyError:
1688 if logFailures:
1689 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1690 failedNames.add(key)
1691 else:
1692 # Dimensions are checked for consistency when the Butler
1693 # is created and rendezvoused with a universe.
1694 pass
1696 # Check that the instrument is a valid instrument.
1697 # Currently only instrument DataId overrides are supported, so check for that.
1698 if key.dataId:
1699 dataIdKeys = set(key.dataId)
1700 if dataIdKeys != {"instrument"}:
1701 if logFailures:
1702 log.critical("Key '%s' has unsupported DataId override", key)
1703 failedDataId.add(key)
1704 elif key.dataId["instrument"] not in instruments:
1705 if logFailures:
1706 log.critical("Key '%s' has unknown instrument", key)
1707 failedDataId.add(key)
1709 messages = []
1711 if datastoreErrorStr:
1712 messages.append(datastoreErrorStr)
1714 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1715 (failedDataId, "Keys with bad DataId entries: ")):
1716 if failed:
1717 msg += ", ".join(str(k) for k in failed)
1718 messages.append(msg)
1720 if messages:
1721 raise ValidationError(";\n".join(messages))
1723 @property
1724 def collections(self) -> CollectionSearch:
1725 """The collections to search by default, in order (`CollectionSearch`).
1727 This is an alias for ``self.registry.defaults.collections``. It cannot
1728 be set directly in isolation, but all defaults may be changed together
1729 by assigning a new `RegistryDefaults` instance to
1730 ``self.registry.defaults``.
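For example, a sketch of replacing the defaults (the collection and run
names are illustrative only)::

    from lsst.daf.butler.registry import RegistryDefaults
    butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
                                                run="u/someone/scratch")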
1731 """
1732 return self.registry.defaults.collections
1734 @property
1735 def run(self) -> Optional[str]:
1736 """Name of the run this butler writes outputs to by default (`str` or
1737 `None`).
1739 This is an alias for ``self.registry.defaults.run``. It cannot be set
1740 directly in isolation, but all defaults may be changed together by
1741 assigning a new `RegistryDefaults` instance to
1742 ``self.registry.defaults``.
1743 """
1744 return self.registry.defaults.run
1746 registry: Registry
1747 """The object that manages dataset metadata and relationships (`Registry`).
1749 Most operations that don't involve reading or writing butler datasets are
1750 accessible only via `Registry` methods.
1751 """
1753 datastore: Datastore
1754 """The object that manages actual dataset storage (`Datastore`).
1756 Direct user access to the datastore should rarely be necessary; the primary
1757 exception is the case where a `Datastore` implementation provides extra
1758 functionality beyond what the base class defines.
1759 """
1761 storageClasses: StorageClassFactory
1762 """An object that maps known storage class names to objects that fully
1763 describe them (`StorageClassFactory`).
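A sketch of typical use; the storage class name is illustrative and depends
on the repository configuration::

    exposure_sc = butler.storageClasses.getStorageClass("ExposureF")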
1764 """