1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImport
65from .core import (
66 AmbiguousDatasetError,
67 ButlerURI,
68 Config,
69 ConfigSubset,
70 DataCoordinate,
71 DataId,
72 DataIdValue,
73 DatasetRef,
74 DatasetType,
75 Datastore,
76 Dimension,
77 DimensionConfig,
78 FileDataset,
79 Progress,
80 StorageClassFactory,
81 Timespan,
82 ValidationError,
83 VERBOSE,
84)
85from .core.repoRelocation import BUTLER_ROOT_TAG
86from .core.utils import transactional, getClassOf
87from ._deferredDatasetHandle import DeferredDatasetHandle
88from ._butlerConfig import ButlerConfig
89from .registry import (
90 Registry,
91 RegistryConfig,
92 RegistryDefaults,
93 CollectionSearch,
94 CollectionType,
95 ConflictingDefinitionError,
96 DatasetIdGenEnum,
97)
98from .transfers import RepoExportContext
100log = logging.getLogger(__name__)
103class ButlerValidationError(ValidationError):
104 """There is a problem with the Butler configuration."""
105 pass
108class PruneCollectionsArgsError(TypeError):
109 """Base class for errors relating to Butler.pruneCollections input
110 arguments.
111 """
112 pass
115class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
116 """Raised when purge and unstore are both required to be True, and
117 purge is True but unstore is False.
118 """
120 def __init__(self) -> None:
121 super().__init__("Cannot pass purge=True without unstore=True.")
124class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
125 """Raised when pruning a RUN collection but purge is False."""
127 def __init__(self, collectionType: CollectionType):
128 self.collectionType = collectionType
129 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
132class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
133 """Raised when purge is True but is not supported for the given
134 collection."""
136 def __init__(self, collectionType: CollectionType):
137 self.collectionType = collectionType
138 super().__init__(
139 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
142class Butler:
143 """Main entry point for the data access system.
145 Parameters
146 ----------
147 config : `ButlerConfig`, `Config` or `str`, optional.
148 Configuration. Anything acceptable to the
149 `ButlerConfig` constructor. If a directory path
150 is given the configuration will be read from a ``butler.yaml`` file in
151 that location. If `None` is given default values will be used.
152 butler : `Butler`, optional.
153 If provided, construct a new Butler that uses the same registry and
154 datastore as the given one, but with the given collection and run.
155 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
156 arguments.
157 collections : `str` or `Iterable` [ `str` ], optional
158 An expression specifying the collections to be searched (in order) when
159 reading datasets.
160 This may be a `str` collection name or an iterable thereof.
161 See :ref:`daf_butler_collection_expressions` for more information.
162 These collections are not registered automatically and must be
163 manually registered before they are used by any method, but they may be
164 manually registered after the `Butler` is initialized.
165 run : `str`, optional
166 Name of the `~CollectionType.RUN` collection new datasets should be
167 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
168 ``collections`` will be set to ``[run]``. If not `None`, this
169 collection will automatically be registered. If this is not set (and
170 ``writeable`` is not set either), a read-only butler will be created.
171 searchPaths : `list` of `str`, optional
172 Directory paths to search when calculating the full Butler
173 configuration. Not used if the supplied config is already a
174 `ButlerConfig`.
175 writeable : `bool`, optional
176 Explicitly sets whether the butler supports write operations. If not
177 provided, a read-write butler is created if ``run`` is set and a
178 read-only butler otherwise.
179 inferDefaults : `bool`, optional
180 If `True` (default) infer default data ID values from the values
181 present in the datasets in ``collections``: if all collections have the
182 same value (or no value) for a governor dimension, that value will be
183 the default for that dimension. Nonexistent collections are ignored.
184 If a default value is provided explicitly for a governor dimension via
185 ``**kwargs``, no default will be inferred for that dimension.
186 **kwargs : `str`
187 Default data ID key-value pairs. These may only identify "governor"
188 dimensions like ``instrument`` and ``skymap``.
190 Examples
191 --------
192 While there are many ways to control exactly how a `Butler` interacts with
193 the collections in its `Registry`, the most common cases are still simple.
195 For a read-only `Butler` that searches one collection, do::
197 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
199 For a read-write `Butler` that writes to and reads from a
200 `~CollectionType.RUN` collection::
202 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
204 The `Butler` passed to a ``PipelineTask`` is often much more complex,
205 because we want to write to one `~CollectionType.RUN` collection but read
206 from several others (as well)::
208 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
209 collections=["u/alice/DM-50000/a",
210 "u/bob/DM-49998",
211 "HSC/defaults"])
213 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
214 Datasets will be read first from that run (since it appears first in the
215 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
217 Finally, one can always create a `Butler` with no collections::
219 butler = Butler("/path/to/repo", writeable=True)
221 This can be extremely useful when you just want to use ``butler.registry``,
222 e.g. for inserting dimension data or managing collections, or when the
223 collections you want to use with the butler are not consistent.
224 Passing ``writeable`` explicitly here is only necessary if you want to be
225 able to make changes to the repo - usually the value for ``writeable`` can
226 be guessed from the collection arguments provided, but it defaults to
227 `False` when there are no collection arguments.
228 """
229 def __init__(self, config: Union[Config, str, None] = None, *,
230 butler: Optional[Butler] = None,
231 collections: Any = None,
232 run: Optional[str] = None,
233 searchPaths: Optional[List[str]] = None,
234 writeable: Optional[bool] = None,
235 inferDefaults: bool = True,
236 **kwargs: str,
237 ):
238 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
239 # Load registry, datastore, etc. from config or existing butler.
240 if butler is not None:
241 if config is not None or searchPaths is not None or writeable is not None:
242 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
243 "arguments with 'butler' argument.")
244 self.registry = butler.registry.copy(defaults)
245 self.datastore = butler.datastore
246 self.storageClasses = butler.storageClasses
247 self._config: ButlerConfig = butler._config
248 else:
249 self._config = ButlerConfig(config, searchPaths=searchPaths)
250 if "root" in self._config:
251 butlerRoot = self._config["root"]
252 else:
253 butlerRoot = self._config.configDir
254 if writeable is None:
255 writeable = run is not None
256 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
257 defaults=defaults)
258 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
259 butlerRoot=butlerRoot)
260 self.storageClasses = StorageClassFactory()
261 self.storageClasses.addFromConfig(self._config)
262 if "run" in self._config or "collection" in self._config:
263 raise ValueError("Passing a run or collection via configuration is no longer supported.")
265 GENERATION: ClassVar[int] = 3
266 """This is a Generation 3 Butler.
268 This attribute may be removed in the future, once the Generation 2 Butler
269 interface has been fully retired; it should only be used in transitional
270 code.
271 """
273 @staticmethod
274 def makeRepo(root: str, config: Union[Config, str, None] = None,
275 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
276 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
277 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
278 """Create an empty data repository by adding a butler.yaml config
279 to a repository root directory.
281 Parameters
282 ----------
283 root : `str` or `ButlerURI`
284 Path or URI to the root location of the new repository. Will be
285 created if it does not exist.
286 config : `Config` or `str`, optional
287 Configuration to write to the repository, after setting any
288 root-dependent Registry or Datastore config options. Can not
289 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
290 configuration will be used. Root-dependent config options
291 specified in this config are overwritten if ``forceConfigRoot``
292 is `True`.
293 dimensionConfig : `Config` or `str`, optional
294 Configuration for dimensions, will be used to initialize registry
295 database.
296 standalone : `bool`
297 If True, write all expanded defaults, not just customized or
298 repository-specific settings.
299 This (mostly) decouples the repository from the default
300 configuration, insulating it from changes to the defaults (which
301 may be good or bad, depending on the nature of the changes).
302 Future *additions* to the defaults will still be picked up when
303 initializing `Butlers` to repos created with ``standalone=True``.
304 searchPaths : `list` of `str`, optional
305 Directory paths to search when calculating the full butler
306 configuration.
307 forceConfigRoot : `bool`, optional
308 If `False`, any values present in the supplied ``config`` that
309 would normally be reset are not overridden and will appear
310 directly in the output config. This allows non-standard overrides
311 of the root directory for a datastore or registry to be given.
312 If this parameter is `True` the values for ``root`` will be
313 forced into the resulting config if appropriate.
314 outfile : `str`, optional
315 If not-`None`, the output configuration will be written to this
316 location rather than into the repository itself. Can be a URI
317 string. Can refer to a directory that will be used to write
318 ``butler.yaml``.
319 overwrite : `bool`, optional
320 Create a new configuration file even if one already exists
321 in the specified output location. Default is to raise
322 an exception.
324 Returns
325 -------
326 config : `Config`
327 The updated `Config` instance written to the repo.
329 Raises
330 ------
331 ValueError
332 Raised if a ButlerConfig or ConfigSubset is passed instead of a
333 regular Config (as these subclasses would make it impossible to
334 support ``standalone=False``).
335 FileExistsError
336 Raised if the output config file already exists.
337 os.error
338 Raised if the directory does not exist, exists but is not a
339 directory, or cannot be created.
341 Notes
342 -----
343 Note that when ``standalone=False`` (the default), the configuration
344 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
345 construct the repository should also be used to construct any Butlers
346 to avoid configuration inconsistencies.
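Examples
--------
A minimal sketch of creating a repository and then constructing a
butler against it (the path is illustrative)::

    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)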
347 """
348 if isinstance(config, (ButlerConfig, ConfigSubset)):
349 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
351 # Ensure that the root of the repository exists or can be made
352 uri = ButlerURI(root, forceDirectory=True)
353 uri.mkdir()
355 config = Config(config)
357 # If we are creating a new repo from scratch with relative roots,
358 # do not propagate an explicit root from the config file
359 if "root" in config:
360 del config["root"]
362 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
363 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
364 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
366 # if key exists in given config, parse it, otherwise parse the defaults
367 # in the expanded config
368 if config.get(("registry", "db")):
369 registryConfig = RegistryConfig(config)
370 else:
371 registryConfig = RegistryConfig(full)
372 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
373 if defaultDatabaseUri is not None:
374 Config.updateParameters(RegistryConfig, config, full,
375 toUpdate={"db": defaultDatabaseUri},
376 overwrite=forceConfigRoot)
377 else:
378 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
379 overwrite=forceConfigRoot)
381 if standalone:
382 config.merge(full)
383 else:
384 # Always expand the registry.managers section into the per-repo
385 # config, because after the database schema is created, it's not
386 # allowed to change anymore. Note that in the standalone=True
387 # branch, _everything_ in the config is expanded, so there's no
388 # need to special case this.
389 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
390 configURI: Union[str, ButlerURI]
391 if outfile is not None:
392 # When writing to a separate location we must include
393 # the root of the butler repo in the config else it won't know
394 # where to look.
395 config["root"] = uri.geturl()
396 configURI = outfile
397 else:
398 configURI = uri
399 config.dumpToUri(configURI, overwrite=overwrite)
401 # Create Registry and populate tables
402 registryConfig = RegistryConfig(config.get("registry"))
403 dimensionConfig = DimensionConfig(dimensionConfig)
404 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
406 log.log(VERBOSE, "Wrote new Butler configuration file to %s", configURI)
408 return config
410 @classmethod
411 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
412 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
413 """Callable used to unpickle a Butler.
415 We prefer not to use ``Butler.__init__`` directly so we can force some
416 of its many arguments to be keyword-only (note that ``__reduce__``
417 can only invoke callables with positional arguments).
419 Parameters
420 ----------
421 config : `ButlerConfig`
422 Butler configuration, already coerced into a true `ButlerConfig`
423 instance (and hence after any search paths for overrides have been
424 utilized).
425 collections : `CollectionSearch`
426 Names of the default collections to read from.
427 run : `str`, optional
428 Name of the default `~CollectionType.RUN` collection to write to.
429 defaultDataId : `dict` [ `str`, `str` ]
430 Default data ID values.
431 writeable : `bool`
432 Whether the Butler should support write operations.
434 Returns
435 -------
436 butler : `Butler`
437 A new `Butler` instance.
438 """
439 # MyPy doesn't recognize that the kwargs below are totally valid; it
440 # seems to think ``**defaultDataId`` is a _positional_ argument!
441 return cls(config=config, collections=collections, run=run, writeable=writeable,
442 **defaultDataId) # type: ignore
444 def __reduce__(self) -> tuple:
445 """Support pickling.
446 """
447 return (Butler._unpickle, (self._config, self.collections, self.run,
448 self.registry.defaults.dataId.byName(),
449 self.registry.isWriteable()))
451 def __str__(self) -> str:
452 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
453 self.collections, self.run, self.datastore, self.registry)
455 def isWriteable(self) -> bool:
456 """Return `True` if this `Butler` supports write operations.
457 """
458 return self.registry.isWriteable()
460 @contextlib.contextmanager
461 def transaction(self) -> Iterator[None]:
462 """Context manager supporting `Butler` transactions.
464 Transactions can be nested.
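For example, a sketch in which a write is wrapped in a transaction (the
object, dataset type, data ID and run are illustrative)::

    with butler.transaction():
        butler.put(exposure, "calexp", dataId, run="u/alice/DM-50000/a")

If the block raises, the registry and datastore changes made inside it
are rolled back together.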
465 """
466 with self.registry.transaction():
467 with self.datastore.transaction():
468 yield
470 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
471 dataId: Optional[DataId] = None, **kwds: Any
472 ) -> Tuple[DatasetType, Optional[DataId]]:
473 """Standardize the arguments passed to several Butler APIs.
475 Parameters
476 ----------
477 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
478 When `DatasetRef` the `dataId` should be `None`.
479 Otherwise the `DatasetType` or name thereof.
480 dataId : `dict` or `DataCoordinate`
481 A `dict` of `Dimension` link name, value pairs that label the
482 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
483 should be provided as the first argument.
484 kwds
485 Additional keyword arguments used to augment or construct a
486 `DataCoordinate`. See `DataCoordinate.standardize`
487 parameters.
489 Returns
490 -------
491 datasetType : `DatasetType`
492 A `DatasetType` instance extracted from ``datasetRefOrType``.
493 dataId : `dict` or `DataId`, optional
494 Argument that can be used (along with ``kwds``) to construct a
495 `DataId`.
497 Notes
498 -----
499 Butler APIs that conceptually need a DatasetRef also allow passing a
500 `DatasetType` (or the name of one) and a `DataId` (or a dict and
501 keyword arguments that can be used to construct one) separately. This
502 method accepts those arguments and always returns a true `DatasetType`
503 and a `DataId` or `dict`.
505 Standardization of `dict` vs `DataId` is best handled by passing the
506 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
507 generally similarly flexible.
508 """
509 externalDatasetType: Optional[DatasetType] = None
510 internalDatasetType: Optional[DatasetType] = None
511 if isinstance(datasetRefOrType, DatasetRef):
512 if dataId is not None or kwds:
513 raise ValueError("DatasetRef given, cannot use dataId as well")
514 externalDatasetType = datasetRefOrType.datasetType
515 dataId = datasetRefOrType.dataId
516 else:
517 # Don't check whether DataId is provided, because Registry APIs
518 # can usually construct a better error message when it wasn't.
519 if isinstance(datasetRefOrType, DatasetType):
520 externalDatasetType = datasetRefOrType
521 else:
522 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
524 # Check that they are self-consistent
525 if externalDatasetType is not None:
526 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
527 if externalDatasetType != internalDatasetType:
528 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
529 f"registry definition ({internalDatasetType})")
531 assert internalDatasetType is not None
532 return internalDatasetType, dataId
534 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
535 dataId: Optional[DataId] = None, *,
536 collections: Any = None,
537 allowUnresolved: bool = False,
538 **kwds: Any) -> DatasetRef:
539 """Shared logic for methods that start with a search for a dataset in
540 the registry.
542 Parameters
543 ----------
544 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
545 When `DatasetRef` the `dataId` should be `None`.
546 Otherwise the `DatasetType` or name thereof.
547 dataId : `dict` or `DataCoordinate`, optional
548 A `dict` of `Dimension` link name, value pairs that label the
549 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
550 should be provided as the first argument.
551 collections : Any, optional
552 Collections to be searched, overriding ``self.collections``.
553 Can be any of the types supported by the ``collections`` argument
554 to butler construction.
555 allowUnresolved : `bool`, optional
556 If `True`, return an unresolved `DatasetRef` if finding a resolved
557 one in the `Registry` fails. Defaults to `False`.
558 kwds
559 Additional keyword arguments used to augment or construct a
560 `DataId`. See `DataId` parameters.
562 Returns
563 -------
564 ref : `DatasetRef`
565 A reference to the dataset identified by the given arguments.
567 Raises
568 ------
569 LookupError
570 Raised if no matching dataset exists in the `Registry` (and
571 ``allowUnresolved is False``).
572 ValueError
573 Raised if a resolved `DatasetRef` was passed as an input, but it
574 differs from the one found in the registry.
575 TypeError
576 Raised if no collections were provided.
577 """
578 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
579 if isinstance(datasetRefOrType, DatasetRef):
580 idNumber = datasetRefOrType.id
581 else:
582 idNumber = None
583 timespan: Optional[Timespan] = None
585 # Process dimension records that are using record information
586 # rather than ids
587 newDataId: Dict[str, DataIdValue] = {}
588 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
590 # if all the dataId comes from keyword parameters we do not need
591 # to do anything here because they can't be of the form
592 # exposure.obs_id because a "." is not allowed in a keyword parameter.
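# As an illustrative (hypothetical) example, a caller may pass
# dataId={"exposure.obs_id": "AT_O_20210101_000123"}; the compound key
# is split below into the "exposure" dimension and its "obs_id" record.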
593 if dataId:
594 for k, v in dataId.items():
595 # If we have a Dimension we do not need to do anything
596 # because it cannot be a compound key.
597 if isinstance(k, str) and "." in k:
598 # Someone is using a more human-readable dataId
599 dimensionName, record = k.split(".", 1)
600 byRecord[dimensionName][record] = v
601 elif isinstance(k, Dimension):
602 newDataId[k.name] = v
603 else:
604 newDataId[k] = v
606 # Go through the updated dataId and check the type in case someone is
607 # using an alternate key. We have already filtered out the compound
608 # keys dimensions.record format.
609 not_dimensions = {}
611 # Will need to look in the dataId and the keyword arguments
612 # and will remove them if they need to be fixed or are unrecognized.
613 for dataIdDict in (newDataId, kwds):
614 # Use a list so we can adjust the dict safely in the loop
615 for dimensionName in list(dataIdDict):
616 value = dataIdDict[dimensionName]
617 try:
618 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
619 except KeyError:
620 # This is not a real dimension
621 not_dimensions[dimensionName] = value
622 del dataIdDict[dimensionName]
623 continue
625 # Convert an integral type to an explicit int to simplify
626 # comparisons here
627 if isinstance(value, numbers.Integral):
628 value = int(value)
630 if not isinstance(value, dimension.primaryKey.getPythonType()):
631 for alternate in dimension.alternateKeys:
632 if isinstance(value, alternate.getPythonType()):
633 byRecord[dimensionName][alternate.name] = value
634 del dataIdDict[dimensionName]
635 log.debug("Converting dimension %s to %s.%s=%s",
636 dimensionName, dimensionName, alternate.name, value)
637 break
638 else:
639 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
640 "Could not find matching alternative (primary key has type %s) "
641 "so attempting to use as-is.",
642 value, dimensionName, dimension.primaryKey.getPythonType())
644 # If we have some unrecognized dimensions we have to try to connect
645 # them to records in other dimensions. This is made more complicated
646 # by some dimensions having records with clashing names. A mitigation
647 # is that we can tell by this point which dimensions are missing
648 # for the DatasetType but this does not work for calibrations
649 # where additional dimensions can be used to constrain the temporal
650 # axis.
651 if not_dimensions:
652 # Calculate missing dimensions
653 provided = set(newDataId) | set(kwds) | set(byRecord)
654 missingDimensions = datasetType.dimensions.names - provided
656 # For calibrations we may well be needing temporal dimensions
657 # so rather than always including all dimensions in the scan
658 # restrict things a little. It is still possible for there
659 # to be confusion over day_obs in visit vs exposure for example.
660 # If we are not searching calibration collections things may
661 # fail but they are going to fail anyway because of the
662 # ambiguousness of the dataId...
663 candidateDimensions: Set[str] = set()
664 candidateDimensions.update(missingDimensions)
665 if datasetType.isCalibration():
666 for dim in self.registry.dimensions.getStaticDimensions():
667 if dim.temporal:
668 candidateDimensions.add(str(dim))
670 # Look up table for the first association with a dimension
671 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
673 # Keep track of whether an item is associated with multiple
674 # dimensions.
675 counter: Counter[str] = Counter()
676 assigned: Dict[str, Set[str]] = defaultdict(set)
678 # Go through the missing dimensions and associate the
679 # given names with records within those dimensions
680 for dimensionName in candidateDimensions:
681 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
682 fields = dimension.metadata.names | dimension.uniqueKeys.names
683 for field in not_dimensions:
684 if field in fields:
685 guessedAssociation[dimensionName][field] = not_dimensions[field]
686 counter[dimensionName] += 1
687 assigned[field].add(dimensionName)
689 # There is a chance we have allocated a single dataId item
690 # to multiple dimensions. Need to decide which should be retained.
691 # For now assume that the most popular alternative wins.
692 # This means that day_obs with seq_num will result in
693 # exposure.day_obs and not visit.day_obs
694 # Also prefer an explicitly missing dimension over an inferred
695 # temporal dimension.
696 for fieldName, assignedDimensions in assigned.items():
697 if len(assignedDimensions) > 1:
698 # Pick the most popular (preferring mandatory dimensions)
699 requiredButMissing = assignedDimensions.intersection(missingDimensions)
700 if requiredButMissing:
701 candidateDimensions = requiredButMissing
702 else:
703 candidateDimensions = assignedDimensions
705 # Select the relevant items and get a new restricted
706 # counter.
707 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
708 duplicatesCounter: Counter[str] = Counter()
709 duplicatesCounter.update(theseCounts)
711 # Choose the most common. If they are equally common
712 # we will pick the one that was found first.
713 # Returns a list of tuples
714 selected = duplicatesCounter.most_common(1)[0][0]
716 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
717 " Removed ambiguity by choosing dimension %s.",
718 fieldName, ", ".join(assignedDimensions), selected)
720 for candidateDimension in assignedDimensions:
721 if candidateDimension != selected:
722 del guessedAssociation[candidateDimension][fieldName]
724 # Update the record look up dict with the new associations
725 for dimensionName, values in guessedAssociation.items():
726 if values: # A dict might now be empty
727 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
728 dimensionName, values)
729 byRecord[dimensionName].update(values)
731 if byRecord:
732 # Some record specifiers were found so we need to convert
733 # them to the Id form
734 for dimensionName, values in byRecord.items():
735 if dimensionName in newDataId:
736 log.warning("DataId specified explicit %s dimension value of %s in addition to"
737 " general record specifiers for it of %s. Ignoring record information.",
738 dimensionName, newDataId[dimensionName], str(values))
739 continue
741 # Build up a WHERE expression -- use single quotes
742 def quote(s: Any) -> str:
743 if isinstance(s, str):
744 return f"'{s}'"
745 else:
746 return s
748 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
749 for k, v in values.items())
751 # Hopefully we get a single record that matches
752 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
753 where=where, **kwds))
755 if len(records) != 1:
756 if len(records) > 1:
757 log.debug("Received %d records from constraints of %s", len(records), str(values))
758 for r in records:
759 log.debug("- %s", str(r))
760 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
761 f" uniquely constrained to a single dataset by {values}."
762 f" Got {len(records)} results.")
763 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
764 f" records when constrained by {values}")
766 # Get the primary key from the real dimension object
767 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
768 if not isinstance(dimension, Dimension):
769 raise RuntimeError(
770 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
771 )
772 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
774 # We have modified the dataId so need to switch to it
775 dataId = newDataId
777 if datasetType.isCalibration():
778 # Because this is a calibration dataset, first try to standardize
779 # the data ID without restricting the dimensions to
780 # those of the dataset type requested, because there may be extra
781 # dimensions that provide temporal information for a validity-range
782 # lookup.
783 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
784 defaults=self.registry.defaults.dataId, **kwds)
785 if dataId.graph.temporal:
786 dataId = self.registry.expandDataId(dataId)
787 timespan = dataId.timespan
788 else:
789 # Standardize the data ID to just the dimensions of the dataset
790 # type instead of letting registry.findDataset do it, so we get the
791 # result even if no dataset is found.
792 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
793 defaults=self.registry.defaults.dataId, **kwds)
794 # Always lookup the DatasetRef, even if one is given, to ensure it is
795 # present in the current collection.
796 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
797 if ref is None:
798 if allowUnresolved:
799 return DatasetRef(datasetType, dataId)
800 else:
801 if collections is None:
802 collections = self.registry.defaults.collections
803 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
804 f"could not be found in collections {collections}.")
805 if idNumber is not None and idNumber != ref.id:
806 if collections is None:
807 collections = self.registry.defaults.collections
808 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
809 f"id ({ref.id}) in registry in collections {collections}.")
810 return ref
812 @transactional
813 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
814 dataId: Optional[DataId] = None, *,
815 run: Optional[str] = None,
816 **kwds: Any) -> DatasetRef:
817 """Store and register a dataset.
819 Parameters
820 ----------
821 obj : `object`
822 The dataset.
823 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
824 When `DatasetRef` is provided, ``dataId`` should be `None`.
825 Otherwise the `DatasetType` or name thereof.
826 dataId : `dict` or `DataCoordinate`
827 A `dict` of `Dimension` link name, value pairs that label the
828 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
829 should be provided as the second argument.
830 run : `str`, optional
831 The name of the run the dataset should be added to, overriding
832 ``self.run``.
833 kwds
834 Additional keyword arguments used to augment or construct a
835 `DataCoordinate`. See `DataCoordinate.standardize`
836 parameters.
838 Returns
839 -------
840 ref : `DatasetRef`
841 A reference to the stored dataset, updated with the correct id if
842 given.
844 Raises
845 ------
846 TypeError
847 Raised if the butler is read-only or if no run has been provided.
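Examples
--------
A minimal sketch of writing a dataset (the object, dataset type name,
data ID and run are illustrative)::

    ref = butler.put(exposure, "calexp",
                     instrument="HSC", visit=903334, detector=20,
                     run="u/alice/processing")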
848 """
849 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
850 if not self.isWriteable():
851 raise TypeError("Butler is read-only.")
852 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
853 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
854 raise ValueError("DatasetRef must not be in registry, must have None id")
856 # Add Registry Dataset entry.
857 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
858 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
860 # Add Datastore entry.
861 self.datastore.put(obj, ref)
863 return ref
865 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
866 """Retrieve a stored dataset.
868 Unlike `Butler.get`, this method allows datasets outside the Butler's
869 collection to be read as long as the `DatasetRef` that identifies them
870 can be obtained separately.
872 Parameters
873 ----------
874 ref : `DatasetRef`
875 Resolved reference to an already stored dataset.
876 parameters : `dict`
877 Additional StorageClass-defined options to control reading,
878 typically used to efficiently read only a subset of the dataset.
880 Returns
881 -------
882 obj : `object`
883 The dataset.
884 """
885 return self.datastore.get(ref, parameters=parameters)
887 def getDirectDeferred(self, ref: DatasetRef, *,
888 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
889 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
890 from a resolved `DatasetRef`.
892 Parameters
893 ----------
894 ref : `DatasetRef`
895 Resolved reference to an already stored dataset.
896 parameters : `dict`
897 Additional StorageClass-defined options to control reading,
898 typically used to efficiently read only a subset of the dataset.
900 Returns
901 -------
902 obj : `DeferredDatasetHandle`
903 A handle which can be used to retrieve a dataset at a later time.
905 Raises
906 ------
907 AmbiguousDatasetError
908 Raised if ``ref.id is None``, i.e. the reference is unresolved.
909 """
910 if ref.id is None:
911 raise AmbiguousDatasetError(
912 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
913 )
914 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
916 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
917 dataId: Optional[DataId] = None, *,
918 parameters: Union[dict, None] = None,
919 collections: Any = None,
920 **kwds: Any) -> DeferredDatasetHandle:
921 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
922 after an immediate registry lookup.
924 Parameters
925 ----------
926 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
927 When `DatasetRef` the `dataId` should be `None`.
928 Otherwise the `DatasetType` or name thereof.
929 dataId : `dict` or `DataCoordinate`, optional
930 A `dict` of `Dimension` link name, value pairs that label the
931 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
932 should be provided as the first argument.
933 parameters : `dict`
934 Additional StorageClass-defined options to control reading,
935 typically used to efficiently read only a subset of the dataset.
936 collections : Any, optional
937 Collections to be searched, overriding ``self.collections``.
938 Can be any of the types supported by the ``collections`` argument
939 to butler construction.
940 kwds
941 Additional keyword arguments used to augment or construct a
942 `DataId`. See `DataId` parameters.
944 Returns
945 -------
946 obj : `DeferredDatasetHandle`
947 A handle which can be used to retrieve a dataset at a later time.
949 Raises
950 ------
951 LookupError
952 Raised if no matching dataset exists in the `Registry` (and
953 ``allowUnresolved is False``).
954 ValueError
955 Raised if a resolved `DatasetRef` was passed as an input, but it
956 differs from the one found in the registry.
957 TypeError
958 Raised if no collections were provided.
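Examples
--------
A sketch deferring the actual read until the data are needed (the
dataset type and data ID are illustrative)::

    handle = butler.getDeferred("calexp", instrument="HSC",
                                visit=903334, detector=20)
    # ... later, only now is the dataset actually read:
    exposure = handle.get()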
959 """
960 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
961 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
963 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
964 dataId: Optional[DataId] = None, *,
965 parameters: Optional[Dict[str, Any]] = None,
966 collections: Any = None,
967 **kwds: Any) -> Any:
968 """Retrieve a stored dataset.
970 Parameters
971 ----------
972 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
973 When `DatasetRef` the `dataId` should be `None`.
974 Otherwise the `DatasetType` or name thereof.
975 dataId : `dict` or `DataCoordinate`
976 A `dict` of `Dimension` link name, value pairs that label the
977 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
978 should be provided as the first argument.
979 parameters : `dict`
980 Additional StorageClass-defined options to control reading,
981 typically used to efficiently read only a subset of the dataset.
982 collections : Any, optional
983 Collections to be searched, overriding ``self.collections``.
984 Can be any of the types supported by the ``collections`` argument
985 to butler construction.
986 kwds
987 Additional keyword arguments used to augment or construct a
988 `DataCoordinate`. See `DataCoordinate.standardize`
989 parameters.
991 Returns
992 -------
993 obj : `object`
994 The dataset.
996 Raises
997 ------
998 ValueError
999 Raised if a resolved `DatasetRef` was passed as an input, but it
1000 differs from the one found in the registry.
1001 LookupError
1002 Raised if no matching dataset exists in the `Registry`.
1003 TypeError
1004 Raised if no collections were provided.
1006 Notes
1007 -----
1008 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1009 this method requires that the given data ID include temporal dimensions
1010 beyond the dimensions of the dataset type itself, in order to find the
1011 dataset with the appropriate validity range. For example, a "bias"
1012 dataset with native dimensions ``{instrument, detector}`` could be
1013 fetched with a ``{instrument, detector, exposure}`` data ID, because
1014 ``exposure`` is a temporal dimension.
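Examples
--------
A sketch of an ordinary read and of the calibration lookup described
above (dataset types, data IDs and collections are illustrative)::

    calexp = butler.get("calexp", instrument="HSC", visit=903334,
                        detector=20)
    bias = butler.get("bias", instrument="HSC", detector=20,
                      exposure=903334, collections="HSC/calib")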
1015 """
1016 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1017 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1018 return self.getDirect(ref, parameters=parameters)
1020 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1021 dataId: Optional[DataId] = None, *,
1022 predict: bool = False,
1023 collections: Any = None,
1024 run: Optional[str] = None,
1025 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1026 """Returns the URIs associated with the dataset.
1028 Parameters
1029 ----------
1030 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1031 When `DatasetRef` the `dataId` should be `None`.
1032 Otherwise the `DatasetType` or name thereof.
1033 dataId : `dict` or `DataCoordinate`
1034 A `dict` of `Dimension` link name, value pairs that label the
1035 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1036 should be provided as the first argument.
1037 predict : `bool`
1038 If `True`, allow URIs to be returned of datasets that have not
1039 been written.
1040 collections : Any, optional
1041 Collections to be searched, overriding ``self.collections``.
1042 Can be any of the types supported by the ``collections`` argument
1043 to butler construction.
1044 run : `str`, optional
1045 Run to use for predictions, overriding ``self.run``.
1046 kwds
1047 Additional keyword arguments used to augment or construct a
1048 `DataCoordinate`. See `DataCoordinate.standardize`
1049 parameters.
1051 Returns
1052 -------
1053 primary : `ButlerURI`
1054 The URI to the primary artifact associated with this dataset.
1055 If the dataset was disassembled within the datastore this
1056 may be `None`.
1057 components : `dict`
1058 URIs to any components associated with the dataset artifact.
1059 Can be empty if there are no components.
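Examples
--------
A sketch (the dataset type and data ID are illustrative)::

    primary, components = butler.getURIs("calexp", instrument="HSC",
                                         visit=903334, detector=20)
    if primary is not None:
        print(primary.geturl())
    for component, uri in components.items():
        print(component, uri.geturl())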
1060 """
1061 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1062 collections=collections, **kwds)
1063 if ref.id is None: # only possible if predict is True
1064 if run is None:
1065 run = self.run
1066 if run is None:
1067 raise TypeError("Cannot predict location with run=None.")
1068 # Lie about ID, because we can't guess it, and only
1069 # Datastore.getURIs() will ever see it (and it doesn't use it).
1070 ref = ref.resolved(id=0, run=run)
1071 return self.datastore.getURIs(ref, predict)
1073 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1074 dataId: Optional[DataId] = None, *,
1075 predict: bool = False,
1076 collections: Any = None,
1077 run: Optional[str] = None,
1078 **kwds: Any) -> ButlerURI:
1079 """Return the URI to the Dataset.
1081 Parameters
1082 ----------
1083 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1084 When `DatasetRef` the `dataId` should be `None`.
1085 Otherwise the `DatasetType` or name thereof.
1086 dataId : `dict` or `DataCoordinate`
1087 A `dict` of `Dimension` link name, value pairs that label the
1088 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1089 should be provided as the first argument.
1090 predict : `bool`
1091 If `True`, allow URIs to be returned of datasets that have not
1092 been written.
1093 collections : Any, optional
1094 Collections to be searched, overriding ``self.collections``.
1095 Can be any of the types supported by the ``collections`` argument
1096 to butler construction.
1097 run : `str`, optional
1098 Run to use for predictions, overriding ``self.run``.
1099 kwds
1100 Additional keyword arguments used to augment or construct a
1101 `DataCoordinate`. See `DataCoordinate.standardize`
1102 parameters.
1104 Returns
1105 -------
1106 uri : `ButlerURI`
1107 URI pointing to the Dataset within the datastore. If the
1108 Dataset does not exist in the datastore, and if ``predict`` is
1109 `True`, the URI will be a prediction and will include a URI
1110 fragment "#predicted".
1111 If the datastore does not have entities that relate well
1112 to the concept of a URI the returned URI string will be
1113 descriptive. The returned URI is not guaranteed to be obtainable.
1115 Raises
1116 ------
1117 LookupError
1118 A URI has been requested for a dataset that does not exist and
1119 guessing is not allowed.
1120 ValueError
1121 Raised if a resolved `DatasetRef` was passed as an input, but it
1122 differs from the one found in the registry.
1123 TypeError
1124 Raised if no collections were provided.
1125 RuntimeError
1126 Raised if a URI is requested for a dataset that consists of
1127 multiple artifacts.
1128 """
1129 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1130 collections=collections, run=run, **kwds)
1132 if primary is None or components:
1133 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1134 "Use Butler.getURIs() instead.")
1135 return primary
1137 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1138 destination: Union[str, ButlerURI], transfer: str = "auto",
1139 preserve_path: bool = True,
1140 overwrite: bool = False) -> List[ButlerURI]:
1141 """Retrieve the artifacts associated with the supplied refs.
1143 Parameters
1144 ----------
1145 refs : iterable of `DatasetRef`
1146 The datasets for which artifacts are to be retrieved.
1147 A single ref can result in multiple artifacts. The refs must
1148 be resolved.
1149 destination : `ButlerURI` or `str`
1150 Location to write the artifacts.
1151 transfer : `str`, optional
1152 Method to use to transfer the artifacts. Must be one of the options
1153 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1154 preserve_path : `bool`, optional
1155 If `True` the full path of the artifact within the datastore
1156 is preserved. If `False` the final file component of the path
1157 is used.
1158 overwrite : `bool`, optional
1159 If `True` allow transfers to overwrite existing files at the
1160 destination.
1162 Returns
1163 -------
1164 targets : `list` of `ButlerURI`
1165 URIs of file artifacts in destination location. Order is not
1166 preserved.
1168 Notes
1169 -----
1170 For non-file datastores the artifacts written to the destination
1171 may not match the representation inside the datastore. For example
1172 a hierarchical data structure in a NoSQL database may well be stored
1173 as a JSON file.
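Examples
--------
A sketch copying the file artifacts for a query result to a local
directory (the query, collection and destination are illustrative)::

    refs = butler.registry.queryDatasets("calexp",
                                         collections="HSC/runs/RC2")
    targets = butler.retrieveArtifacts(refs, "/tmp/export",
                                       transfer="copy")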
1174 """
1175 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer,
1176 preserve_path=preserve_path, overwrite=overwrite)
1178 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1179 dataId: Optional[DataId] = None, *,
1180 collections: Any = None,
1181 **kwds: Any) -> bool:
1182 """Return True if the Dataset is actually present in the Datastore.
1184 Parameters
1185 ----------
1186 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1187 When `DatasetRef` the `dataId` should be `None`.
1188 Otherwise the `DatasetType` or name thereof.
1189 dataId : `dict` or `DataCoordinate`
1190 A `dict` of `Dimension` link name, value pairs that label the
1191 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1192 should be provided as the first argument.
1193 collections : Any, optional
1194 Collections to be searched, overriding ``self.collections``.
1195 Can be any of the types supported by the ``collections`` argument
1196 to butler construction.
1197 kwds
1198 Additional keyword arguments used to augment or construct a
1199 `DataCoordinate`. See `DataCoordinate.standardize`
1200 parameters.
1202 Raises
1203 ------
1204 LookupError
1205 Raised if the dataset is not even present in the Registry.
1206 ValueError
1207 Raised if a resolved `DatasetRef` was passed as an input, but it
1208 differs from the one found in the registry.
1209 TypeError
1210 Raised if no collections were provided.
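Examples
--------
A sketch guarding a read on datastore existence (names are
illustrative)::

    if butler.datasetExists("calexp", instrument="HSC", visit=903334,
                            detector=20):
        calexp = butler.get("calexp", instrument="HSC", visit=903334,
                            detector=20)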
1211 """
1212 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1213 return self.datastore.exists(ref)
1215 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1216 """Remove one or more `~CollectionType.RUN` collections and the
1217 datasets within them.
1219 Parameters
1220 ----------
1221 names : `Iterable` [ `str` ]
1222 The names of the collections to remove.
1223 unstore : `bool`, optional
1224 If `True` (default), delete datasets from all datastores in which
1226 they are present, and attempt to roll back the registry deletions if
1226 datastore deletions fail (which may not always be possible). If
1227 `False`, datastore records for these datasets are still removed,
1228 but any artifacts (e.g. files) will not be.
1230 Raises
1231 ------
1232 TypeError
1233 Raised if one or more collections are not of type
1234 `~CollectionType.RUN`.
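Examples
--------
A sketch removing two RUN collections together with their stored
artifacts (the collection names are illustrative)::

    butler.removeRuns(["u/alice/scratch-1", "u/alice/scratch-2"],
                      unstore=True)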
1235 """
1236 if not self.isWriteable():
1237 raise TypeError("Butler is read-only.")
1238 names = list(names)
1239 refs: List[DatasetRef] = []
1240 for name in names:
1241 collectionType = self.registry.getCollectionType(name)
1242 if collectionType is not CollectionType.RUN:
1243 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1244 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1245 with self.registry.transaction():
1246 if unstore:
1247 self.datastore.trash(refs)
1248 else:
1249 self.datastore.forget(refs)
1250 for name in names:
1251 self.registry.removeCollection(name)
1252 if unstore:
1253 # Point of no return for removing artifacts
1254 self.datastore.emptyTrash()
1256 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
1257 unlink: Optional[List[str]] = None) -> None:
1258 """Remove a collection and possibly prune datasets within it.
1260 Parameters
1261 ----------
1262 name : `str`
1263 Name of the collection to remove. If this is a
1264 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1265 datasets within the collection are not modified unless ``unstore``
1266 is `True`. If this is a `~CollectionType.RUN` collection,
1267 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1268 are fully removed from the data repository.
1269 purge : `bool`, optional
1270 If `True`, permit `~CollectionType.RUN` collections to be removed,
1271 fully removing datasets within them. Requires ``unstore=True`` as
1272 well as an added precaution against accidental deletion. Must be
1273 `False` (default) if the collection is not a ``RUN``.
1274 unstore : `bool`, optional
1275 If `True`, remove all datasets in the collection from all
1276 datastores in which they appear.
1277 unlink : `list` [ `str` ], optional
1278 Before removing the given collection, unlink it from these
1279 parent collections.
1281 Raises
1282 ------
1283 TypeError
1284 Raised if the butler is read-only or arguments are mutually
1285 inconsistent.
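Examples
--------
A sketch fully removing a RUN collection and its datasets (the
collection name is illustrative)::

    butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)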
1286 """
1287 # See pruneDatasets comments for more information about the logic here;
1288 # the cases are almost the same, but here we can rely on Registry to
1289 # take care of everything but Datastore deletion when we remove the
1290 # collection.
1291 if not self.isWriteable():
1292 raise TypeError("Butler is read-only.")
1293 collectionType = self.registry.getCollectionType(name)
1294 if purge and not unstore:
1295 raise PurgeWithoutUnstorePruneCollectionsError()
1296 if collectionType is CollectionType.RUN and not purge:
1297 raise RunWithoutPurgePruneCollectionsError(collectionType)
1298 if collectionType is not CollectionType.RUN and purge:
1299 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1301 def remove(child: str, parent: str) -> None:
1302 """Remove a child collection from a parent collection."""
1303 # Remove child from parent.
1304 chain = list(self.registry.getCollectionChain(parent))
1305 try:
1306 chain.remove(child)
1307 except ValueError as e:
1308 raise RuntimeError(f"{child} is not a child of {parent}") from e
1309 self.registry.setCollectionChain(parent, chain)
1311 with self.registry.transaction():
1312 if unlink:
1313 for parent in unlink:
1314 remove(name, parent)
1315 if unstore:
1316 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1317 self.datastore.trash(refs)
1318 self.registry.removeCollection(name)
1320 if unstore:
1321 # Point of no return for removing artifacts
1322 self.datastore.emptyTrash()
1324 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1325 disassociate: bool = True,
1326 unstore: bool = False,
1327 tags: Iterable[str] = (),
1328 purge: bool = False,
1329 run: Optional[str] = None) -> None:
1330 """Remove one or more datasets from a collection and/or storage.
1332 Parameters
1333 ----------
1334 refs : `~collections.abc.Iterable` of `DatasetRef`
1335 Datasets to prune. These must be "resolved" references (not just
1336 a `DatasetType` and data ID).
1337 disassociate : `bool`, optional
1338 Disassociate pruned datasets from ``tags``, or from all collections
1339 if ``purge=True``.
1340 unstore : `bool`, optional
1341 If `True` (`False` is default) remove these datasets from all
1342 datastores known to this butler. Note that this will make it
1343 impossible to retrieve these datasets even via other collections.
1344 Datasets that are already not stored are ignored by this option.
1345 tags : `Iterable` [ `str` ], optional
1346 `~CollectionType.TAGGED` collections to disassociate the datasets
1347 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1348 `True`.
1349 purge : `bool`, optional
1350 If `True` (`False` is default), completely remove the dataset from
1351 the `Registry`. To prevent accidental deletions, ``purge`` may
1352 only be `True` if all of the following conditions are met:
1354 - All given datasets are in the given run.
1355 - ``disassociate`` is `True`;
1356 - ``unstore`` is `True`.
1358 This mode may remove provenance information from datasets other
1359 than those provided, and should be used with extreme care.
1361 Raises
1362 ------
1363 TypeError
1364 Raised if the butler is read-only, if no collection was provided,
1365 or the conditions for ``purge=True`` were not met.
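Examples
--------
A sketch that fully removes the datasets found in one RUN collection
(the collection name is illustrative; all three flags are required for
a purge)::

    refs = butler.registry.queryDatasets(..., collections="u/alice/scratch",
                                         findFirst=True)
    butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)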
1366 """
1367 if not self.isWriteable():
1368 raise TypeError("Butler is read-only.")
1369 if purge:
1370 if not disassociate:
1371 raise TypeError("Cannot pass purge=True without disassociate=True.")
1372 if not unstore:
1373 raise TypeError("Cannot pass purge=True without unstore=True.")
1374 elif disassociate:
1375 tags = tuple(tags)
1376 if not tags:
1377 raise TypeError("No tags provided but disassociate=True.")
1378 for tag in tags:
1379 collectionType = self.registry.getCollectionType(tag)
1380 if collectionType is not CollectionType.TAGGED:
1381 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1382 f"of non-TAGGED type {collectionType.name}.")
1383 # Transform possibly-single-pass iterable into something we can iterate
1384 # over multiple times.
1385 refs = list(refs)
1386 # Pruning a component of a DatasetRef makes no sense since registry
1387 # doesn't know about components and datastore might not store
1388 # components in a separate file
1389 for ref in refs:
1390 if ref.datasetType.component():
1391 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1392 # We don't need an unreliable Datastore transaction for this, because
1393 # we've been extra careful to ensure that Datastore.trash only involves
1394 # mutating the Registry (it can _look_ at Datastore-specific things,
1395 # but shouldn't change them), and hence all operations here are
1396 # Registry operations.
1397 with self.registry.transaction():
1398 if unstore:
1399 self.datastore.trash(refs)
1400 if purge:
1401 self.registry.removeDatasets(refs)
1402 elif disassociate:
1403 assert tags, "Guaranteed by earlier logic in this function."
1404 for tag in tags:
1405 self.registry.disassociate(tag, refs)
1406 # We've exited the Registry transaction, and apparently committed.
1407 # (if there was an exception, everything rolled back, and it's as if
1408 # nothing happened - and we never get here).
1409 # Datastore artifacts are not yet gone, but they're clearly marked
1410 # as trash, so if we fail to delete now because of (e.g.) filesystem
1411 # problems we can try again later, and if manual administrative
1412 # intervention is required, it's pretty clear what that should entail:
1413 # deleting everything on disk and in private Datastore tables that is
1414 # in the dataset_location_trash table.
1415 if unstore:
1416 # Point of no return for removing artifacts
1417 self.datastore.emptyTrash()
1419 @transactional
1420 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1421 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1422 ) -> None:
1423 """Store and register one or more datasets that already exist on disk.
1425 Parameters
1426 ----------
1427 datasets : `FileDataset`
1428 Each positional argument is a struct containing information about
1429 a file to be ingested, including its path (either absolute or
1430 relative to the datastore root, if applicable), a `DatasetRef`,
1431 and optionally a formatter class or its fully-qualified string
1432 name. If a formatter is not provided, the formatter that would be
1433 used for `put` is assumed. On successful return, all
1434 `FileDataset.ref` attributes will have their `DatasetRef.id`
1435 attribute populated and all `FileDataset.formatter` attributes will
1436 be set to the formatter class used. `FileDataset.path` attributes
1437 may be modified to put paths in whatever the datastore considers a
1438 standardized form.
1439 transfer : `str`, optional
1440 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1441 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer
1442 the file.
1443 run : `str`, optional
1444 The name of the run ingested datasets should be added to,
1445 overriding ``self.run``.
1446 idGenerationMode : `DatasetIdGenEnum`, optional
1447 Specifies the option for generating dataset IDs. By default, unique IDs
1448 are generated for each inserted dataset.
1450 Raises
1451 ------
1452 TypeError
1453 Raised if the butler is read-only or if no run was provided.
1454 NotImplementedError
1455 Raised if the `Datastore` does not support the given transfer mode.
1456 DatasetTypeNotSupportedError
1457 Raised if one or more files to be ingested have a dataset type that
1458 is not supported by the `Datastore`.
1459 FileNotFoundError
1460 Raised if one of the given files does not exist.
1461 FileExistsError
1462 Raised if transfer is not `None` but the (internal) location the
1463 file would be moved to is already occupied.
1465 Notes
1466 -----
1467 This operation is not fully exception safe: if a database operation
1468 fails, the given `FileDataset` instances may be only partially updated.
1470 It is atomic in terms of database operations (they will either all
1471 succeed or all fail), provided the database engine implements
1472 transactions correctly. It will attempt to be atomic in terms of
1473 filesystem operations as well, but this cannot be implemented
1474 rigorously for most datastores.
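Examples
--------
A minimal sketch, assuming a writeable butler with a default run; the
dataset type name, data ID values, and file path are hypothetical::

    from lsst.daf.butler import DatasetRef, FileDataset

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType,
                     {"instrument": "MyCam", "exposure": 1, "detector": 0})
    butler.ingest(FileDataset(path="/data/exposure_1_0.fits", refs=[ref]),
                  transfer="copy")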
1475 """
1476 if not self.isWriteable():
1477 raise TypeError("Butler is read-only.")
1478 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1479 # Reorganize the inputs so they're grouped by DatasetType and then
1480 # data ID. We also include a list of DatasetRefs for each FileDataset
1481 # to hold the resolved DatasetRefs returned by the Registry, before
1482 # it's safe to swap them into FileDataset.refs.
1483 # Some type annotation aliases to make that clearer:
1484 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1485 GroupedData = MutableMapping[DatasetType, GroupForType]
1486 # The actual data structure:
1487 groupedData: GroupedData = defaultdict(dict)
1488 # And the nested loop that populates it:
1489 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1490 # This list is intentionally shared across the inner loop, since it's
1491 # associated with `dataset`.
1492 resolvedRefs: List[DatasetRef] = []
1493 for ref in dataset.refs:
1494 if ref.dataId in groupedData[ref.datasetType]:
1495 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same"
1496 " DataId as other ingest dataset"
1497 f" {groupedData[ref.datasetType][ref.dataId][0].path}"
1498 f" ({ref.dataId})")
1499 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1501 # Now we can bulk-insert into Registry for each DatasetType.
1502 allResolvedRefs: List[DatasetRef] = []
1503 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(),
1504 desc="Bulk-inserting datasets by type"):
1505 refs = self.registry.insertDatasets(
1506 datasetType,
1507 dataIds=groupForType.keys(),
1508 run=run,
1509 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1510 idGenerationMode=idGenerationMode,
1511 )
1512 # Append those resolved DatasetRefs to the new lists we set up for
1513 # them.
1514 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1515 resolvedRefs.append(ref)
1517 # Go back to the original FileDatasets to replace their refs with the
1518 # new resolved ones, and also build a big list of all refs.
1519 allResolvedRefs = []
1520 for groupForType in progress.iter_chunks(groupedData.values(),
1521 desc="Reassociating resolved dataset refs with files"):
1522 for dataset, resolvedRefs in groupForType.values():
1523 dataset.refs = resolvedRefs
1524 allResolvedRefs.extend(resolvedRefs)
1526 # Bulk-insert everything into Datastore.
1527 self.datastore.ingest(*datasets, transfer=transfer)
1529 @contextlib.contextmanager
1530 def export(self, *, directory: Optional[str] = None,
1531 filename: Optional[str] = None,
1532 format: Optional[str] = None,
1533 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1534 """Export datasets from the repository represented by this `Butler`.
1536 This method is a context manager that returns a helper object
1537 (`RepoExportContext`) that is used to indicate what information from
1538 the repository should be exported.
1540 Parameters
1541 ----------
1542 directory : `str`, optional
1543 Directory dataset files should be written to if ``transfer`` is not
1544 `None`.
1545 filename : `str`, optional
1546 Name for the file that will include database information associated
1547 with the exported datasets. If this is not an absolute path and
1548 ``directory`` is not `None`, it will be written to ``directory``
1549 instead of the current working directory. Defaults to
1550 "export.{format}".
1551 format : `str`, optional
1552 File format for the database information file. If `None`, the
1553 extension of ``filename`` will be used.
1554 transfer : `str`, optional
1555 Transfer mode passed to `Datastore.export`.
1557 Raises
1558 ------
1559 TypeError
1560 Raised if the set of arguments passed is inconsistent.
1562 Examples
1563 --------
1564 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1565 methods are used to provide the iterables over data IDs and/or datasets
1566 to be exported::
1568 with butler.export(filename="exports.yaml") as export:
1569 # Export all flats, but none of the dimension element rows
1570 # (i.e. data ID information) associated with them.
1571 export.saveDatasets(butler.registry.queryDatasets("flat"),
1572 elements=())
1573 # Export all datasets that start with "deepCoadd_" and all of
1574 # their associated data ID information.
1575 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
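Dataset files themselves can be copied as well by also passing a
``directory`` and a ``transfer`` mode (the paths here are hypothetical)::

    with butler.export(directory="/tmp/export", filename="export.yaml",
                       transfer="copy") as export:
        export.saveDatasets(butler.registry.queryDatasets("flat"))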
1576 """
1577 if directory is None and transfer is not None:
1578 raise TypeError("Cannot transfer without providing a directory.")
1579 if transfer == "move":
1580 raise TypeError("Transfer may not be 'move': export is read-only")
1581 if format is None:
1582 if filename is None:
1583 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1584 else:
1585 format = os.path.splitext(filename)[1].lstrip(".")  # extension without the leading "."
1586 elif filename is None:
1587 filename = f"export.{format}"
1588 if directory is not None:
1589 filename = os.path.join(directory, filename)
1590 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1591 with open(filename, 'w') as stream:
1592 backend = BackendClass(stream)
1593 try:
1594 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1595 directory=directory, transfer=transfer)
1596 yield helper
1597 except BaseException:
1598 raise
1599 else:
1600 helper._finish()
1602 def import_(self, *, directory: Optional[str] = None,
1603 filename: Union[str, TextIO, None] = None,
1604 format: Optional[str] = None,
1605 transfer: Optional[str] = None,
1606 skip_dimensions: Optional[Set] = None,
1607 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1608 reuseIds: bool = False) -> None:
1609 """Import datasets into this repository that were exported from a
1610 different butler repository via `~lsst.daf.butler.Butler.export`.
1612 Parameters
1613 ----------
1614 directory : `str`, optional
1615 Directory containing dataset files to import from. If `None`,
1616 ``filename`` and all dataset file paths specified therein must
1617 be absolute.
1618 filename : `str` or `TextIO`, optional
1619 A stream or name of file that contains database information
1620 associated with the exported datasets, typically generated by
1621 `~lsst.daf.butler.Butler.export`. If this is a string (name) and
1622 is not an absolute path, does not exist in the current working
1623 directory, and ``directory`` is not `None`, it is assumed to be in
1624 ``directory``. Defaults to "export.{format}".
1625 format : `str`, optional
1626 File format for ``filename``. If `None`, the extension of
1627 ``filename`` will be used.
1628 transfer : `str`, optional
1629 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1630 skip_dimensions : `set`, optional
1631 Names of dimensions that should be skipped and not imported.
1632 idGenerationMode : `DatasetIdGenEnum`, optional
1633 Specifies the option for generating dataset IDs when IDs are not
1634 provided or their type does not match the backend type. By default,
1635 unique IDs are generated for each inserted dataset.
1636 reuseIds : `bool`, optional
1637 If `True`, force re-use of imported dataset IDs for integer IDs,
1638 which are normally generated as auto-incremented; an exception
1639 will be raised if the imported IDs clash with existing ones. This
1640 option has no effect on globally-unique IDs, which are always
1641 re-used (or generated if integer IDs are being imported).
1643 Raises
1644 ------
1645 TypeError
1646 Raised if the set of arguments passed is inconsistent, or if the
1647 butler is read-only.
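Examples
--------
A minimal sketch that loads a previously exported repository subset into
this butler (the paths are hypothetical)::

    butler.import_(directory="/tmp/export", filename="export.yaml",
                   transfer="symlink")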
1648 """
1649 if not self.isWriteable():
1650 raise TypeError("Butler is read-only.")
1651 if format is None:
1652 if filename is None:
1653 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1654 else:
1655 format = os.path.splitext(filename)[1].lstrip(".")  # type: ignore
1656 elif filename is None:
1657 filename = f"export.{format}"
1658 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1659 filename = os.path.join(directory, filename)
1660 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1662 def doImport(importStream: TextIO) -> None:
1663 backend = BackendClass(importStream, self.registry)
1664 backend.register()
1665 with self.transaction():
1666 backend.load(self.datastore, directory=directory, transfer=transfer,
1667 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode,
1668 reuseIds=reuseIds)
1670 if isinstance(filename, str):
1671 with open(filename, "r") as stream:
1672 doImport(stream)
1673 else:
1674 doImport(filename)
1676 def validateConfiguration(self, logFailures: bool = False,
1677 datasetTypeNames: Optional[Iterable[str]] = None,
1678 ignore: Optional[Iterable[str]] = None) -> None:
1679 """Validate butler configuration.
1681 Checks that each `DatasetType` can be stored in the `Datastore`.
1683 Parameters
1684 ----------
1685 logFailures : `bool`, optional
1686 If `True`, output a log message for every validation error
1687 detected.
1688 datasetTypeNames : iterable of `str`, optional
1689 The `DatasetType` names that should be checked. This allows
1690 only a subset to be selected.
1691 ignore : iterable of `str`, optional
1692 Names of DatasetTypes to skip over. This can be used to skip
1693 known problems. If a named `DatasetType` corresponds to a
1694 composite, all components of that `DatasetType` will also be
1695 ignored.
1697 Raises
1698 ------
1699 ButlerValidationError
1700 Raised if there is some inconsistency with how this Butler
1701 is configured.
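Examples
--------
A sketch of a typical check that logs every problem found (the ignored
dataset type names are hypothetical)::

    butler.validateConfiguration(logFailures=True,
                                 ignore=["raw", "packages"])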
1702 """
1703 if datasetTypeNames:
1704 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1705 else:
1706 datasetTypes = list(self.registry.queryDatasetTypes())
1708 # filter out anything from the ignore list
1709 if ignore:
1710 ignore = set(ignore)
1711 datasetTypes = [e for e in datasetTypes
1712 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1713 else:
1714 ignore = set()
1716 # Find all the registered instruments
1717 instruments = set(
1718 record.name for record in self.registry.queryDimensionRecords("instrument")
1719 )
1721 # For each datasetType that has an instrument dimension, create
1722 # a DatasetRef for each defined instrument
1723 datasetRefs = []
1725 for datasetType in datasetTypes:
1726 if "instrument" in datasetType.dimensions:
1727 for instrument in instruments:
1728 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1729 conform=False)
1730 datasetRefs.append(datasetRef)
1732 entities: List[Union[DatasetType, DatasetRef]] = []
1733 entities.extend(datasetTypes)
1734 entities.extend(datasetRefs)
1736 datastoreErrorStr = None
1737 try:
1738 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1739 except ValidationError as e:
1740 datastoreErrorStr = str(e)
1742 # Also check that the LookupKeys used by the datastores match
1743 # registry and storage class definitions
1744 keys = self.datastore.getLookupKeys()
1746 failedNames = set()
1747 failedDataId = set()
1748 for key in keys:
1749 if key.name is not None:
1750 if key.name in ignore:
1751 continue
1753 # skip if specific datasetType names were requested and this
1754 # name does not match
1755 if datasetTypeNames and key.name not in datasetTypeNames:
1756 continue
1758 # See if it is a StorageClass or a DatasetType
1759 if key.name in self.storageClasses:
1760 pass
1761 else:
1762 try:
1763 self.registry.getDatasetType(key.name)
1764 except KeyError:
1765 if logFailures:
1766 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1767 failedNames.add(key)
1768 else:
1769 # Dimensions are checked for consistency when the Butler
1770 # is created and rendezvoused with a universe.
1771 pass
1773 # Check that the data ID refers to a known instrument.
1774 # Currently "instrument" is the only supported DataId override key.
1775 if key.dataId:
1776 dataIdKeys = set(key.dataId)
1777 if dataIdKeys != {"instrument"}:
1778 if logFailures:
1779 log.critical("Key '%s' has unsupported DataId override", key)
1780 failedDataId.add(key)
1781 elif key.dataId["instrument"] not in instruments:
1782 if logFailures:
1783 log.critical("Key '%s' has unknown instrument", key)
1784 failedDataId.add(key)
1786 messages = []
1788 if datastoreErrorStr:
1789 messages.append(datastoreErrorStr)
1791 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1792 (failedDataId, "Keys with bad DataId entries: ")):
1793 if failed:
1794 msg += ", ".join(str(k) for k in failed)
1795 messages.append(msg)
1797 if messages:
1798 raise ButlerValidationError(";\n".join(messages))
1800 @property
1801 def collections(self) -> CollectionSearch:
1802 """The collections to search by default, in order (`CollectionSearch`).
1804 This is an alias for ``self.registry.defaults.collections``. It cannot
1805 be set directly in isolation, but all defaults may be changed together
1806 by assigning a new `RegistryDefaults` instance to
1807 ``self.registry.defaults``.
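For example, all defaults can be replaced at once (the collection and run
names here are hypothetical)::

    from lsst.daf.butler.registry import RegistryDefaults

    butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
                                                run="u/someone/outputs")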
1808 """
1809 return self.registry.defaults.collections
1811 @property
1812 def run(self) -> Optional[str]:
1813 """Name of the run this butler writes outputs to by default (`str` or
1814 `None`).
1816 This is an alias for ``self.registry.defaults.run``. It cannot be set
1817 directly in isolation, but all defaults may be changed together by
1818 assigning a new `RegistryDefaults` instance to
1819 ``self.registry.defaults``.
1820 """
1821 return self.registry.defaults.run
1823 registry: Registry
1824 """The object that manages dataset metadata and relationships (`Registry`).
1826 Most operations that don't involve reading or writing butler datasets are
1827 accessible only via `Registry` methods.
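For example, collections and dataset types are listed through registry
queries::

    for name in butler.registry.queryCollections():
        print(name)
    for datasetType in butler.registry.queryDatasetTypes():
        print(datasetType.name)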
1828 """
1830 datastore: Datastore
1831 """The object that manages actual dataset storage (`Datastore`).
1833 Direct user access to the datastore should rarely be necessary; the primary
1834 exception is the case where a `Datastore` implementation provides extra
1835 functionality beyond what the base class defines.
1836 """
1838 storageClasses: StorageClassFactory
1839 """An object that maps known storage class names to objects that fully
1840 describe them (`StorageClassFactory`).
1841 """