Coverage for python/lsst/daf/butler/_butler.py : 9%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
37from collections import defaultdict
38import contextlib
39import logging
40import numbers
41import os
42from typing import (
43 Any,
44 ClassVar,
45 Counter,
46 Dict,
47 Iterable,
48 Iterator,
49 List,
50 MutableMapping,
51 Optional,
52 Set,
53 TextIO,
54 Tuple,
55 Type,
56 Union,
57)
59try:
60 import boto3
61except ImportError:
62 boto3 = None
64from lsst.utils import doImport
65from .core import (
66 AmbiguousDatasetError,
67 ButlerURI,
68 Config,
69 ConfigSubset,
70 DataCoordinate,
71 DataId,
72 DataIdValue,
73 DatasetRef,
74 DatasetType,
75 Datastore,
76 Dimension,
77 DimensionConfig,
78 FileDataset,
79 Progress,
80 StorageClassFactory,
81 Timespan,
82 ValidationError,
83)
84from .core.repoRelocation import BUTLER_ROOT_TAG
85from .core.utils import transactional, getClassOf
86from ._deferredDatasetHandle import DeferredDatasetHandle
87from ._butlerConfig import ButlerConfig
88from .registry import (
89 Registry,
90 RegistryConfig,
91 RegistryDefaults,
92 CollectionSearch,
93 CollectionType,
94 ConflictingDefinitionError,
95 DatasetIdGenEnum,
96)
97from .transfers import RepoExportContext
99log = logging.getLogger(__name__)
102class ButlerValidationError(ValidationError):
103 """There is a problem with the Butler configuration."""
104 pass
107class PruneCollectionsArgsError(TypeError):
108 """Base class for errors relating to Butler.pruneCollections input
109 arguments.
110 """
111 pass
114class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
115 """Raised when purge and unstore are both required to be True, and
116 purge is True but unstore is False.
117 """
119 def __init__(self) -> None:
120 super().__init__("Cannot pass purge=True without unstore=True.")
123class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
124 """Raised when pruning a RUN collection but purge is False."""
126 def __init__(self, collectionType: CollectionType):
127 self.collectionType = collectionType
128 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
131class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
132 """Raised when purge is True but is not supported for the given
133 collection."""
135 def __init__(self, collectionType: CollectionType):
136 self.collectionType = collectionType
137 super().__init__(
138 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
141class Butler:
142 """Main entry point for the data access system.
144 Parameters
145 ----------
146 config : `ButlerConfig`, `Config` or `str`, optional.
147 Configuration. Anything acceptable to the
148 `ButlerConfig` constructor. If a directory path
149 is given the configuration will be read from a ``butler.yaml`` file in
150 that location. If `None` is given default values will be used.
151 butler : `Butler`, optional.
152 If provided, construct a new Butler that uses the same registry and
153 datastore as the given one, but with the given collection and run.
154 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
155 arguments.
156 collections : `str` or `Iterable` [ `str` ], optional
157 An expression specifying the collections to be searched (in order) when
158 reading datasets.
159 This may be a `str` collection name or an iterable thereof.
160 See :ref:`daf_butler_collection_expressions` for more information.
161 These collections are not registered automatically and must be
162 manually registered before they are used by any method, but they may be
163 manually registered after the `Butler` is initialized.
164 run : `str`, optional
165 Name of the `~CollectionType.RUN` collection new datasets should be
166 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
167 ``collections`` will be set to ``[run]``. If not `None`, this
168 collection will automatically be registered. If this is not set (and
169 ``writeable`` is not set either), a read-only butler will be created.
170 searchPaths : `list` of `str`, optional
171 Directory paths to search when calculating the full Butler
172 configuration. Not used if the supplied config is already a
173 `ButlerConfig`.
174 writeable : `bool`, optional
175 Explicitly sets whether the butler supports write operations. If not
176 provided, a read-write butler is created if any of ``run``, ``tags``,
177 or ``chains`` is non-empty.
178 inferDefaults : `bool`, optional
179 If `True` (default) infer default data ID values from the values
180 present in the datasets in ``collections``: if all collections have the
181 same value (or no value) for a governor dimension, that value will be
182 the default for that dimension. Nonexistent collections are ignored.
183 If a default value is provided explicitly for a governor dimension via
184 ``**kwargs``, no default will be inferred for that dimension.
185 **kwargs : `str`
186 Default data ID key-value pairs. These may only identify "governor"
187 dimensions like ``instrument`` and ``skymap``.
189 Examples
190 --------
191 While there are many ways to control exactly how a `Butler` interacts with
192 the collections in its `Registry`, the most common cases are still simple.
194 For a read-only `Butler` that searches one collection, do::
196 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
198 For a read-write `Butler` that writes to and reads from a
199 `~CollectionType.RUN` collection::
201 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
203 The `Butler` passed to a ``PipelineTask`` is often much more complex,
204 because we want to write to one `~CollectionType.RUN` collection but read
205 from several others (as well)::
207 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
208 collections=["u/alice/DM-50000/a",
209 "u/bob/DM-49998",
210 "HSC/defaults"])
212 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
213 Datasets will be read first from that run (since it appears first in the
214 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
216 Finally, one can always create a `Butler` with no collections::
218 butler = Butler("/path/to/repo", writeable=True)
220 This can be extremely useful when you just want to use ``butler.registry``,
221 e.g. for inserting dimension data or managing collections, or when the
222 collections you want to use with the butler are not consistent.
223 Passing ``writeable`` explicitly here is only necessary if you want to be
224 able to make changes to the repo - usually the value for ``writeable`` can
225 be guessed from the collection arguments provided, but it defaults to
226 `False` when there are not collection arguments.
227 """
228 def __init__(self, config: Union[Config, str, None] = None, *,
229 butler: Optional[Butler] = None,
230 collections: Any = None,
231 run: Optional[str] = None,
232 searchPaths: Optional[List[str]] = None,
233 writeable: Optional[bool] = None,
234 inferDefaults: bool = True,
235 **kwargs: str,
236 ):
237 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
238 # Load registry, datastore, etc. from config or existing butler.
239 if butler is not None:
240 if config is not None or searchPaths is not None or writeable is not None:
241 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
242 "arguments with 'butler' argument.")
243 self.registry = butler.registry.copy(defaults)
244 self.datastore = butler.datastore
245 self.storageClasses = butler.storageClasses
246 self._config: ButlerConfig = butler._config
247 else:
248 self._config = ButlerConfig(config, searchPaths=searchPaths)
249 if "root" in self._config:
250 butlerRoot = self._config["root"]
251 else:
252 butlerRoot = self._config.configDir
253 if writeable is None:
254 writeable = run is not None
255 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
256 defaults=defaults)
257 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
258 butlerRoot=butlerRoot)
259 self.storageClasses = StorageClassFactory()
260 self.storageClasses.addFromConfig(self._config)
261 if "run" in self._config or "collection" in self._config:
262 raise ValueError("Passing a run or collection via configuration is no longer supported.")
264 GENERATION: ClassVar[int] = 3
265 """This is a Generation 3 Butler.
267 This attribute may be removed in the future, once the Generation 2 Butler
268 interface has been fully retired; it should only be used in transitional
269 code.
270 """
272 @staticmethod
273 def makeRepo(root: str, config: Union[Config, str, None] = None,
274 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
275 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
276 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
277 """Create an empty data repository by adding a butler.yaml config
278 to a repository root directory.
280 Parameters
281 ----------
282 root : `str` or `ButlerURI`
283 Path or URI to the root location of the new repository. Will be
284 created if it does not exist.
285 config : `Config` or `str`, optional
286 Configuration to write to the repository, after setting any
287 root-dependent Registry or Datastore config options. Can not
288 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
289 configuration will be used. Root-dependent config options
290 specified in this config are overwritten if ``forceConfigRoot``
291 is `True`.
292 dimensionConfig : `Config` or `str`, optional
293 Configuration for dimensions, will be used to initialize registry
294 database.
295 standalone : `bool`
296 If True, write all expanded defaults, not just customized or
297 repository-specific settings.
298 This (mostly) decouples the repository from the default
299 configuration, insulating it from changes to the defaults (which
300 may be good or bad, depending on the nature of the changes).
301 Future *additions* to the defaults will still be picked up when
302 initializing `Butlers` to repos created with ``standalone=True``.
303 searchPaths : `list` of `str`, optional
304 Directory paths to search when calculating the full butler
305 configuration.
306 forceConfigRoot : `bool`, optional
307 If `False`, any values present in the supplied ``config`` that
308 would normally be reset are not overridden and will appear
309 directly in the output config. This allows non-standard overrides
310 of the root directory for a datastore or registry to be given.
311 If this parameter is `True` the values for ``root`` will be
312 forced into the resulting config if appropriate.
313 outfile : `str`, optional
314 If not-`None`, the output configuration will be written to this
315 location rather than into the repository itself. Can be a URI
316 string. Can refer to a directory that will be used to write
317 ``butler.yaml``.
318 overwrite : `bool`, optional
319 Create a new configuration file even if one already exists
320 in the specified output location. Default is to raise
321 an exception.
323 Returns
324 -------
325 config : `Config`
326 The updated `Config` instance written to the repo.
328 Raises
329 ------
330 ValueError
331 Raised if a ButlerConfig or ConfigSubset is passed instead of a
332 regular Config (as these subclasses would make it impossible to
333 support ``standalone=False``).
334 FileExistsError
335 Raised if the output config file already exists.
336 os.error
337 Raised if the directory does not exist, exists but is not a
338 directory, or cannot be created.
340 Notes
341 -----
342 Note that when ``standalone=False`` (the default), the configuration
343 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
344 construct the repository should also be used to construct any Butlers
345 to avoid configuration inconsistencies.
346 """
347 if isinstance(config, (ButlerConfig, ConfigSubset)):
348 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
350 # Ensure that the root of the repository exists or can be made
351 uri = ButlerURI(root, forceDirectory=True)
352 uri.mkdir()
354 config = Config(config)
356 # If we are creating a new repo from scratch with relative roots,
357 # do not propagate an explicit root from the config file
358 if "root" in config:
359 del config["root"]
361 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
362 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
363 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
365 # if key exists in given config, parse it, otherwise parse the defaults
366 # in the expanded config
367 if config.get(("registry", "db")):
368 registryConfig = RegistryConfig(config)
369 else:
370 registryConfig = RegistryConfig(full)
371 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
372 if defaultDatabaseUri is not None:
373 Config.updateParameters(RegistryConfig, config, full,
374 toUpdate={"db": defaultDatabaseUri},
375 overwrite=forceConfigRoot)
376 else:
377 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
378 overwrite=forceConfigRoot)
380 if standalone:
381 config.merge(full)
382 else:
383 # Always expand the registry.managers section into the per-repo
384 # config, because after the database schema is created, it's not
385 # allowed to change anymore. Note that in the standalone=True
386 # branch, _everything_ in the config is expanded, so there's no
387 # need to special case this.
388 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
389 configURI: Union[str, ButlerURI]
390 if outfile is not None:
391 # When writing to a separate location we must include
392 # the root of the butler repo in the config else it won't know
393 # where to look.
394 config["root"] = uri.geturl()
395 configURI = outfile
396 else:
397 configURI = uri
398 config.dumpToUri(configURI, overwrite=overwrite)
400 # Create Registry and populate tables
401 registryConfig = RegistryConfig(config.get("registry"))
402 dimensionConfig = DimensionConfig(dimensionConfig)
403 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
405 return config
407 @classmethod
408 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
409 defaultDataId: Dict[str, str], writeable: bool) -> Butler:
410 """Callable used to unpickle a Butler.
412 We prefer not to use ``Butler.__init__`` directly so we can force some
413 of its many arguments to be keyword-only (note that ``__reduce__``
414 can only invoke callables with positional arguments).
416 Parameters
417 ----------
418 config : `ButlerConfig`
419 Butler configuration, already coerced into a true `ButlerConfig`
420 instance (and hence after any search paths for overrides have been
421 utilized).
422 collections : `CollectionSearch`
423 Names of the default collections to read from.
424 run : `str`, optional
425 Name of the default `~CollectionType.RUN` collection to write to.
426 defaultDataId : `dict` [ `str`, `str` ]
427 Default data ID values.
428 writeable : `bool`
429 Whether the Butler should support write operations.
431 Returns
432 -------
433 butler : `Butler`
434 A new `Butler` instance.
435 """
436 # MyPy doesn't recognize that the kwargs below are totally valid; it
437 # seems to think '**defaultDataId* is a _positional_ argument!
438 return cls(config=config, collections=collections, run=run, writeable=writeable,
439 **defaultDataId) # type: ignore
441 def __reduce__(self) -> tuple:
442 """Support pickling.
443 """
444 return (Butler._unpickle, (self._config, self.collections, self.run,
445 self.registry.defaults.dataId.byName(),
446 self.registry.isWriteable()))
448 def __str__(self) -> str:
449 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
450 self.collections, self.run, self.datastore, self.registry)
452 def isWriteable(self) -> bool:
453 """Return `True` if this `Butler` supports write operations.
454 """
455 return self.registry.isWriteable()
457 @contextlib.contextmanager
458 def transaction(self) -> Iterator[None]:
459 """Context manager supporting `Butler` transactions.
461 Transactions can be nested.
462 """
463 with self.registry.transaction():
464 with self.datastore.transaction():
465 yield
467 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
468 dataId: Optional[DataId] = None, **kwds: Any
469 ) -> Tuple[DatasetType, Optional[DataId]]:
470 """Standardize the arguments passed to several Butler APIs.
472 Parameters
473 ----------
474 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
475 When `DatasetRef` the `dataId` should be `None`.
476 Otherwise the `DatasetType` or name thereof.
477 dataId : `dict` or `DataCoordinate`
478 A `dict` of `Dimension` link name, value pairs that label the
479 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
480 should be provided as the second argument.
481 kwds
482 Additional keyword arguments used to augment or construct a
483 `DataCoordinate`. See `DataCoordinate.standardize`
484 parameters.
486 Returns
487 -------
488 datasetType : `DatasetType`
489 A `DatasetType` instance extracted from ``datasetRefOrType``.
490 dataId : `dict` or `DataId`, optional
491 Argument that can be used (along with ``kwds``) to construct a
492 `DataId`.
494 Notes
495 -----
496 Butler APIs that conceptually need a DatasetRef also allow passing a
497 `DatasetType` (or the name of one) and a `DataId` (or a dict and
498 keyword arguments that can be used to construct one) separately. This
499 method accepts those arguments and always returns a true `DatasetType`
500 and a `DataId` or `dict`.
502 Standardization of `dict` vs `DataId` is best handled by passing the
503 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
504 generally similarly flexible.
505 """
506 externalDatasetType: Optional[DatasetType] = None
507 internalDatasetType: Optional[DatasetType] = None
508 if isinstance(datasetRefOrType, DatasetRef):
509 if dataId is not None or kwds:
510 raise ValueError("DatasetRef given, cannot use dataId as well")
511 externalDatasetType = datasetRefOrType.datasetType
512 dataId = datasetRefOrType.dataId
513 else:
514 # Don't check whether DataId is provided, because Registry APIs
515 # can usually construct a better error message when it wasn't.
516 if isinstance(datasetRefOrType, DatasetType):
517 externalDatasetType = datasetRefOrType
518 else:
519 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
521 # Check that they are self-consistent
522 if externalDatasetType is not None:
523 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
524 if externalDatasetType != internalDatasetType:
525 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
526 f"registry definition ({internalDatasetType})")
528 assert internalDatasetType is not None
529 return internalDatasetType, dataId
531 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
532 dataId: Optional[DataId] = None, *,
533 collections: Any = None,
534 allowUnresolved: bool = False,
535 **kwds: Any) -> DatasetRef:
536 """Shared logic for methods that start with a search for a dataset in
537 the registry.
539 Parameters
540 ----------
541 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
542 When `DatasetRef` the `dataId` should be `None`.
543 Otherwise the `DatasetType` or name thereof.
544 dataId : `dict` or `DataCoordinate`, optional
545 A `dict` of `Dimension` link name, value pairs that label the
546 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
547 should be provided as the first argument.
548 collections : Any, optional
549 Collections to be searched, overriding ``self.collections``.
550 Can be any of the types supported by the ``collections`` argument
551 to butler construction.
552 allowUnresolved : `bool`, optional
553 If `True`, return an unresolved `DatasetRef` if finding a resolved
554 one in the `Registry` fails. Defaults to `False`.
555 kwds
556 Additional keyword arguments used to augment or construct a
557 `DataId`. See `DataId` parameters.
559 Returns
560 -------
561 ref : `DatasetRef`
562 A reference to the dataset identified by the given arguments.
564 Raises
565 ------
566 LookupError
567 Raised if no matching dataset exists in the `Registry` (and
568 ``allowUnresolved is False``).
569 ValueError
570 Raised if a resolved `DatasetRef` was passed as an input, but it
571 differs from the one found in the registry.
572 TypeError
573 Raised if no collections were provided.
574 """
575 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
576 if isinstance(datasetRefOrType, DatasetRef):
577 idNumber = datasetRefOrType.id
578 else:
579 idNumber = None
580 timespan: Optional[Timespan] = None
582 # Process dimension records that are using record information
583 # rather than ids
584 newDataId: Dict[str, DataIdValue] = {}
585 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)
587 # if all the dataId comes from keyword parameters we do not need
588 # to do anything here because they can't be of the form
589 # exposure.obs_id because a "." is not allowed in a keyword parameter.
590 if dataId:
591 for k, v in dataId.items():
592 # If we have a Dimension we do not need to do anything
593 # because it cannot be a compound key.
594 if isinstance(k, str) and "." in k:
595 # Someone is using a more human-readable dataId
596 dimensionName, record = k.split(".", 1)
597 byRecord[dimensionName][record] = v
598 elif isinstance(k, Dimension):
599 newDataId[k.name] = v
600 else:
601 newDataId[k] = v
603 # Go through the updated dataId and check the type in case someone is
604 # using an alternate key. We have already filtered out the compound
605 # keys dimensions.record format.
606 not_dimensions = {}
608 # Will need to look in the dataId and the keyword arguments
609 # and will remove them if they need to be fixed or are unrecognized.
610 for dataIdDict in (newDataId, kwds):
611 # Use a list so we can adjust the dict safely in the loop
612 for dimensionName in list(dataIdDict):
613 value = dataIdDict[dimensionName]
614 try:
615 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
616 except KeyError:
617 # This is not a real dimension
618 not_dimensions[dimensionName] = value
619 del dataIdDict[dimensionName]
620 continue
622 # Convert an integral type to an explicit int to simplify
623 # comparisons here
624 if isinstance(value, numbers.Integral):
625 value = int(value)
627 if not isinstance(value, dimension.primaryKey.getPythonType()):
628 for alternate in dimension.alternateKeys:
629 if isinstance(value, alternate.getPythonType()):
630 byRecord[dimensionName][alternate.name] = value
631 del dataIdDict[dimensionName]
632 log.debug("Converting dimension %s to %s.%s=%s",
633 dimensionName, dimensionName, alternate.name, value)
634 break
635 else:
636 log.warning("Type mismatch found for value '%r' provided for dimension %s. "
637 "Could not find matching alternative (primary key has type %s) "
638 "so attempting to use as-is.",
639 value, dimensionName, dimension.primaryKey.getPythonType())
641 # If we have some unrecognized dimensions we have to try to connect
642 # them to records in other dimensions. This is made more complicated
643 # by some dimensions having records with clashing names. A mitigation
644 # is that we can tell by this point which dimensions are missing
645 # for the DatasetType but this does not work for calibrations
646 # where additional dimensions can be used to constrain the temporal
647 # axis.
648 if not_dimensions:
649 # Calculate missing dimensions
650 provided = set(newDataId) | set(kwds) | set(byRecord)
651 missingDimensions = datasetType.dimensions.names - provided
653 # For calibrations we may well be needing temporal dimensions
654 # so rather than always including all dimensions in the scan
655 # restrict things a little. It is still possible for there
656 # to be confusion over day_obs in visit vs exposure for example.
657 # If we are not searching calibration collections things may
658 # fail but they are going to fail anyway because of the
659 # ambiguousness of the dataId...
660 candidateDimensions: Set[str] = set()
661 candidateDimensions.update(missingDimensions)
662 if datasetType.isCalibration():
663 for dim in self.registry.dimensions.getStaticDimensions():
664 if dim.temporal:
665 candidateDimensions.add(str(dim))
667 # Look up table for the first association with a dimension
668 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)
670 # Keep track of whether an item is associated with multiple
671 # dimensions.
672 counter: Counter[str] = Counter()
673 assigned: Dict[str, Set[str]] = defaultdict(set)
675 # Go through the missing dimensions and associate the
676 # given names with records within those dimensions
677 for dimensionName in candidateDimensions:
678 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
679 fields = dimension.metadata.names | dimension.uniqueKeys.names
680 for field in not_dimensions:
681 if field in fields:
682 guessedAssociation[dimensionName][field] = not_dimensions[field]
683 counter[dimensionName] += 1
684 assigned[field].add(dimensionName)
686 # There is a chance we have allocated a single dataId item
687 # to multiple dimensions. Need to decide which should be retained.
688 # For now assume that the most popular alternative wins.
689 # This means that day_obs with seq_num will result in
690 # exposure.day_obs and not visit.day_obs
691 # Also prefer an explicitly missing dimension over an inferred
692 # temporal dimension.
693 for fieldName, assignedDimensions in assigned.items():
694 if len(assignedDimensions) > 1:
695 # Pick the most popular (preferring mandatory dimensions)
696 requiredButMissing = assignedDimensions.intersection(missingDimensions)
697 if requiredButMissing:
698 candidateDimensions = requiredButMissing
699 else:
700 candidateDimensions = assignedDimensions
702 # Select the relevant items and get a new restricted
703 # counter.
704 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
705 duplicatesCounter: Counter[str] = Counter()
706 duplicatesCounter.update(theseCounts)
708 # Choose the most common. If they are equally common
709 # we will pick the one that was found first.
710 # Returns a list of tuples
711 selected = duplicatesCounter.most_common(1)[0][0]
713 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
714 " Removed ambiguity by choosing dimension %s.",
715 fieldName, ", ".join(assignedDimensions), selected)
717 for candidateDimension in assignedDimensions:
718 if candidateDimension != selected:
719 del guessedAssociation[candidateDimension][fieldName]
721 # Update the record look up dict with the new associations
722 for dimensionName, values in guessedAssociation.items():
723 if values: # A dict might now be empty
724 log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
725 dimensionName, values)
726 byRecord[dimensionName].update(values)
728 if byRecord:
729 # Some record specifiers were found so we need to convert
730 # them to the Id form
731 for dimensionName, values in byRecord.items():
732 if dimensionName in newDataId:
733 log.warning("DataId specified explicit %s dimension value of %s in addition to"
734 " general record specifiers for it of %s. Ignoring record information.",
735 dimensionName, newDataId[dimensionName], str(values))
736 continue
738 # Build up a WHERE expression -- use single quotes
739 def quote(s: Any) -> str:
740 if isinstance(s, str):
741 return f"'{s}'"
742 else:
743 return s
745 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
746 for k, v in values.items())
748 # Hopefully we get a single record that matches
749 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
750 where=where, **kwds))
752 if len(records) != 1:
753 if len(records) > 1:
754 log.debug("Received %d records from constraints of %s", len(records), str(values))
755 for r in records:
756 log.debug("- %s", str(r))
757 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
758 f" uniquely constrained to a single dataset by {values}."
759 f" Got {len(records)} results.")
760 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
761 f" records when constrained by {values}")
763 # Get the primary key from the real dimension object
764 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
765 if not isinstance(dimension, Dimension):
766 raise RuntimeError(
767 f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
768 )
769 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
771 # We have modified the dataId so need to switch to it
772 dataId = newDataId
774 if datasetType.isCalibration():
775 # Because this is a calibration dataset, first try to make a
776 # standardize the data ID without restricting the dimensions to
777 # those of the dataset type requested, because there may be extra
778 # dimensions that provide temporal information for a validity-range
779 # lookup.
780 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
781 defaults=self.registry.defaults.dataId, **kwds)
782 if dataId.graph.temporal:
783 dataId = self.registry.expandDataId(dataId)
784 timespan = dataId.timespan
785 else:
786 # Standardize the data ID to just the dimensions of the dataset
787 # type instead of letting registry.findDataset do it, so we get the
788 # result even if no dataset is found.
789 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
790 defaults=self.registry.defaults.dataId, **kwds)
791 # Always lookup the DatasetRef, even if one is given, to ensure it is
792 # present in the current collection.
793 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
794 if ref is None:
795 if allowUnresolved:
796 return DatasetRef(datasetType, dataId)
797 else:
798 if collections is None:
799 collections = self.registry.defaults.collections
800 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
801 f"could not be found in collections {collections}.")
802 if idNumber is not None and idNumber != ref.id:
803 if collections is None:
804 collections = self.registry.defaults.collections
805 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
806 f"id ({ref.id}) in registry in collections {collections}.")
807 return ref
809 @transactional
810 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
811 dataId: Optional[DataId] = None, *,
812 run: Optional[str] = None,
813 **kwds: Any) -> DatasetRef:
814 """Store and register a dataset.
816 Parameters
817 ----------
818 obj : `object`
819 The dataset.
820 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
821 When `DatasetRef` is provided, ``dataId`` should be `None`.
822 Otherwise the `DatasetType` or name thereof.
823 dataId : `dict` or `DataCoordinate`
824 A `dict` of `Dimension` link name, value pairs that label the
825 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
826 should be provided as the second argument.
827 run : `str`, optional
828 The name of the run the dataset should be added to, overriding
829 ``self.run``.
830 kwds
831 Additional keyword arguments used to augment or construct a
832 `DataCoordinate`. See `DataCoordinate.standardize`
833 parameters.
835 Returns
836 -------
837 ref : `DatasetRef`
838 A reference to the stored dataset, updated with the correct id if
839 given.
841 Raises
842 ------
843 TypeError
844 Raised if the butler is read-only or if no run has been provided.
845 """
846 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
847 if not self.isWriteable():
848 raise TypeError("Butler is read-only.")
849 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
850 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
851 raise ValueError("DatasetRef must not be in registry, must have None id")
853 # Add Registry Dataset entry.
854 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
855 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
857 # Add Datastore entry.
858 self.datastore.put(obj, ref)
860 return ref
862 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
863 """Retrieve a stored dataset.
865 Unlike `Butler.get`, this method allows datasets outside the Butler's
866 collection to be read as long as the `DatasetRef` that identifies them
867 can be obtained separately.
869 Parameters
870 ----------
871 ref : `DatasetRef`
872 Resolved reference to an already stored dataset.
873 parameters : `dict`
874 Additional StorageClass-defined options to control reading,
875 typically used to efficiently read only a subset of the dataset.
877 Returns
878 -------
879 obj : `object`
880 The dataset.
881 """
882 return self.datastore.get(ref, parameters=parameters)
884 def getDirectDeferred(self, ref: DatasetRef, *,
885 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
886 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
887 from a resolved `DatasetRef`.
889 Parameters
890 ----------
891 ref : `DatasetRef`
892 Resolved reference to an already stored dataset.
893 parameters : `dict`
894 Additional StorageClass-defined options to control reading,
895 typically used to efficiently read only a subset of the dataset.
897 Returns
898 -------
899 obj : `DeferredDatasetHandle`
900 A handle which can be used to retrieve a dataset at a later time.
902 Raises
903 ------
904 AmbiguousDatasetError
905 Raised if ``ref.id is None``, i.e. the reference is unresolved.
906 """
907 if ref.id is None:
908 raise AmbiguousDatasetError(
909 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
910 )
911 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
913 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
914 dataId: Optional[DataId] = None, *,
915 parameters: Union[dict, None] = None,
916 collections: Any = None,
917 **kwds: Any) -> DeferredDatasetHandle:
918 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
919 after an immediate registry lookup.
921 Parameters
922 ----------
923 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
924 When `DatasetRef` the `dataId` should be `None`.
925 Otherwise the `DatasetType` or name thereof.
926 dataId : `dict` or `DataCoordinate`, optional
927 A `dict` of `Dimension` link name, value pairs that label the
928 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
929 should be provided as the first argument.
930 parameters : `dict`
931 Additional StorageClass-defined options to control reading,
932 typically used to efficiently read only a subset of the dataset.
933 collections : Any, optional
934 Collections to be searched, overriding ``self.collections``.
935 Can be any of the types supported by the ``collections`` argument
936 to butler construction.
937 kwds
938 Additional keyword arguments used to augment or construct a
939 `DataId`. See `DataId` parameters.
941 Returns
942 -------
943 obj : `DeferredDatasetHandle`
944 A handle which can be used to retrieve a dataset at a later time.
946 Raises
947 ------
948 LookupError
949 Raised if no matching dataset exists in the `Registry` (and
950 ``allowUnresolved is False``).
951 ValueError
952 Raised if a resolved `DatasetRef` was passed as an input, but it
953 differs from the one found in the registry.
954 TypeError
955 Raised if no collections were provided.
956 """
957 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
958 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
960 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
961 dataId: Optional[DataId] = None, *,
962 parameters: Optional[Dict[str, Any]] = None,
963 collections: Any = None,
964 **kwds: Any) -> Any:
965 """Retrieve a stored dataset.
967 Parameters
968 ----------
969 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
970 When `DatasetRef` the `dataId` should be `None`.
971 Otherwise the `DatasetType` or name thereof.
972 dataId : `dict` or `DataCoordinate`
973 A `dict` of `Dimension` link name, value pairs that label the
974 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
975 should be provided as the first argument.
976 parameters : `dict`
977 Additional StorageClass-defined options to control reading,
978 typically used to efficiently read only a subset of the dataset.
979 collections : Any, optional
980 Collections to be searched, overriding ``self.collections``.
981 Can be any of the types supported by the ``collections`` argument
982 to butler construction.
983 kwds
984 Additional keyword arguments used to augment or construct a
985 `DataCoordinate`. See `DataCoordinate.standardize`
986 parameters.
988 Returns
989 -------
990 obj : `object`
991 The dataset.
993 Raises
994 ------
995 ValueError
996 Raised if a resolved `DatasetRef` was passed as an input, but it
997 differs from the one found in the registry.
998 LookupError
999 Raised if no matching dataset exists in the `Registry`.
1000 TypeError
1001 Raised if no collections were provided.
1003 Notes
1004 -----
1005 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
1006 this method requires that the given data ID include temporal dimensions
1007 beyond the dimensions of the dataset type itself, in order to find the
1008 dataset with the appropriate validity range. For example, a "bias"
1009 dataset with native dimensions ``{instrument, detector}`` could be
1010 fetched with a ``{instrument, detector, exposure}`` data ID, because
1011 ``exposure`` is a temporal dimension.
1012 """
1013 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
1014 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1015 return self.getDirect(ref, parameters=parameters)
1017 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1018 dataId: Optional[DataId] = None, *,
1019 predict: bool = False,
1020 collections: Any = None,
1021 run: Optional[str] = None,
1022 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
1023 """Returns the URIs associated with the dataset.
1025 Parameters
1026 ----------
1027 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1028 When `DatasetRef` the `dataId` should be `None`.
1029 Otherwise the `DatasetType` or name thereof.
1030 dataId : `dict` or `DataCoordinate`
1031 A `dict` of `Dimension` link name, value pairs that label the
1032 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1033 should be provided as the first argument.
1034 predict : `bool`
1035 If `True`, allow URIs to be returned of datasets that have not
1036 been written.
1037 collections : Any, optional
1038 Collections to be searched, overriding ``self.collections``.
1039 Can be any of the types supported by the ``collections`` argument
1040 to butler construction.
1041 run : `str`, optional
1042 Run to use for predictions, overriding ``self.run``.
1043 kwds
1044 Additional keyword arguments used to augment or construct a
1045 `DataCoordinate`. See `DataCoordinate.standardize`
1046 parameters.
1048 Returns
1049 -------
1050 primary : `ButlerURI`
1051 The URI to the primary artifact associated with this dataset.
1052 If the dataset was disassembled within the datastore this
1053 may be `None`.
1054 components : `dict`
1055 URIs to any components associated with the dataset artifact.
1056 Can be empty if there are no components.
1057 """
1058 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
1059 collections=collections, **kwds)
1060 if ref.id is None: # only possible if predict is True
1061 if run is None:
1062 run = self.run
1063 if run is None:
1064 raise TypeError("Cannot predict location with run=None.")
1065 # Lie about ID, because we can't guess it, and only
1066 # Datastore.getURIs() will ever see it (and it doesn't use it).
1067 ref = ref.resolved(id=0, run=run)
1068 return self.datastore.getURIs(ref, predict)
1070 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1071 dataId: Optional[DataId] = None, *,
1072 predict: bool = False,
1073 collections: Any = None,
1074 run: Optional[str] = None,
1075 **kwds: Any) -> ButlerURI:
1076 """Return the URI to the Dataset.
1078 Parameters
1079 ----------
1080 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1081 When `DatasetRef` the `dataId` should be `None`.
1082 Otherwise the `DatasetType` or name thereof.
1083 dataId : `dict` or `DataCoordinate`
1084 A `dict` of `Dimension` link name, value pairs that label the
1085 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1086 should be provided as the first argument.
1087 predict : `bool`
1088 If `True`, allow URIs to be returned of datasets that have not
1089 been written.
1090 collections : Any, optional
1091 Collections to be searched, overriding ``self.collections``.
1092 Can be any of the types supported by the ``collections`` argument
1093 to butler construction.
1094 run : `str`, optional
1095 Run to use for predictions, overriding ``self.run``.
1096 kwds
1097 Additional keyword arguments used to augment or construct a
1098 `DataCoordinate`. See `DataCoordinate.standardize`
1099 parameters.
1101 Returns
1102 -------
1103 uri : `ButlerURI`
1104 URI pointing to the Dataset within the datastore. If the
1105 Dataset does not exist in the datastore, and if ``predict`` is
1106 `True`, the URI will be a prediction and will include a URI
1107 fragment "#predicted".
1108 If the datastore does not have entities that relate well
1109 to the concept of a URI the returned URI string will be
1110 descriptive. The returned URI is not guaranteed to be obtainable.
1112 Raises
1113 ------
1114 LookupError
1115 A URI has been requested for a dataset that does not exist and
1116 guessing is not allowed.
1117 ValueError
1118 Raised if a resolved `DatasetRef` was passed as an input, but it
1119 differs from the one found in the registry.
1120 TypeError
1121 Raised if no collections were provided.
1122 RuntimeError
1123 Raised if a URI is requested for a dataset that consists of
1124 multiple artifacts.
1125 """
1126 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1127 collections=collections, run=run, **kwds)
1129 if primary is None or components:
1130 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1131 "Use Butler.getURIs() instead.")
1132 return primary
1134 def retrieveArtifacts(self, refs: Iterable[DatasetRef],
1135 destination: Union[str, ButlerURI], transfer: str = "auto",
1136 preserve_path: bool = True,
1137 overwrite: bool = False) -> List[ButlerURI]:
1138 """Retrieve the artifacts associated with the supplied refs.
1140 Parameters
1141 ----------
1142 refs : iterable of `DatasetRef`
1143 The datasets for which artifacts are to be retrieved.
1144 A single ref can result in multiple artifacts. The refs must
1145 be resolved.
1146 destination : `ButlerURI` or `str`
1147 Location to write the artifacts.
1148 transfer : `str`, optional
1149 Method to use to transfer the artifacts. Must be one of the options
1150 supported by `ButlerURI.transfer_from()`. "move" is not allowed.
1151 preserve_path : `bool`, optional
1152 If `True` the full path of the artifact within the datastore
1153 is preserved. If `False` the final file component of the path
1154 is used.
1155 overwrite : `bool`, optional
1156 If `True` allow transfers to overwrite existing files at the
1157 destination.
1159 Returns
1160 -------
1161 targets : `list` of `ButlerURI`
1162 URIs of file artifacts in destination location. Order is not
1163 preserved.
1165 Notes
1166 -----
1167 For non-file datastores the artifacts written to the destination
1168 may not match the representation inside the datastore. For example
1169 a hierarchical data structure in a NoSQL database may well be stored
1170 as a JSON file.
1171 """
1172 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer,
1173 preserve_path=preserve_path, overwrite=overwrite)
1175 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1176 dataId: Optional[DataId] = None, *,
1177 collections: Any = None,
1178 **kwds: Any) -> bool:
1179 """Return True if the Dataset is actually present in the Datastore.
1181 Parameters
1182 ----------
1183 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1184 When `DatasetRef` the `dataId` should be `None`.
1185 Otherwise the `DatasetType` or name thereof.
1186 dataId : `dict` or `DataCoordinate`
1187 A `dict` of `Dimension` link name, value pairs that label the
1188 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1189 should be provided as the first argument.
1190 collections : Any, optional
1191 Collections to be searched, overriding ``self.collections``.
1192 Can be any of the types supported by the ``collections`` argument
1193 to butler construction.
1194 kwds
1195 Additional keyword arguments used to augment or construct a
1196 `DataCoordinate`. See `DataCoordinate.standardize`
1197 parameters.
1199 Raises
1200 ------
1201 LookupError
1202 Raised if the dataset is not even present in the Registry.
1203 ValueError
1204 Raised if a resolved `DatasetRef` was passed as an input, but it
1205 differs from the one found in the registry.
1206 TypeError
1207 Raised if no collections were provided.
1208 """
1209 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1210 return self.datastore.exists(ref)
1212 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1213 """Remove one or more `~CollectionType.RUN` collections and the
1214 datasets within them.
1216 Parameters
1217 ----------
1218 names : `Iterable` [ `str` ]
1219 The names of the collections to remove.
1220 unstore : `bool`, optional
1221 If `True` (default), delete datasets from all datastores in which
1222 they are present, and attempt to rollback the registry deletions if
1223 datastore deletions fail (which may not always be possible). If
1224 `False`, datastore records for these datasets are still removed,
1225 but any artifacts (e.g. files) will not be.
1227 Raises
1228 ------
1229 TypeError
1230 Raised if one or more collections are not of type
1231 `~CollectionType.RUN`.
1232 """
1233 if not self.isWriteable():
1234 raise TypeError("Butler is read-only.")
1235 names = list(names)
1236 refs: List[DatasetRef] = []
1237 for name in names:
1238 collectionType = self.registry.getCollectionType(name)
1239 if collectionType is not CollectionType.RUN:
1240 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
1241 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
1242 with self.registry.transaction():
1243 if unstore:
1244 self.datastore.trash(refs)
1245 else:
1246 self.datastore.forget(refs)
1247 for name in names:
1248 self.registry.removeCollection(name)
1249 if unstore:
1250 # Point of no return for removing artifacts
1251 self.datastore.emptyTrash()
1253 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
1254 unlink: Optional[List[str]] = None) -> None:
1255 """Remove a collection and possibly prune datasets within it.
1257 Parameters
1258 ----------
1259 name : `str`
1260 Name of the collection to remove. If this is a
1261 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1262 datasets within the collection are not modified unless ``unstore``
1263 is `True`. If this is a `~CollectionType.RUN` collection,
1264 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1265 are fully removed from the data repository.
1266 purge : `bool`, optional
1267 If `True`, permit `~CollectionType.RUN` collections to be removed,
1268 fully removing datasets within them. Requires ``unstore=True`` as
1269 well as an added precaution against accidental deletion. Must be
1270 `False` (default) if the collection is not a ``RUN``.
1271 unstore: `bool`, optional
1272 If `True`, remove all datasets in the collection from all
1273 datastores in which they appear.
1274 unlink: `list` [`str`], optional
1275 Before removing the given `collection` unlink it from from these
1276 parent collections.
1278 Raises
1279 ------
1280 TypeError
1281 Raised if the butler is read-only or arguments are mutually
1282 inconsistent.
1283 """
1284 # See pruneDatasets comments for more information about the logic here;
1285 # the cases are almost the same, but here we can rely on Registry to
1286 # take care everything but Datastore deletion when we remove the
1287 # collection.
1288 if not self.isWriteable():
1289 raise TypeError("Butler is read-only.")
1290 collectionType = self.registry.getCollectionType(name)
1291 if purge and not unstore:
1292 raise PurgeWithoutUnstorePruneCollectionsError()
1293 if collectionType is CollectionType.RUN and not purge:
1294 raise RunWithoutPurgePruneCollectionsError(collectionType)
1295 if collectionType is not CollectionType.RUN and purge:
1296 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1298 def remove(child: str, parent: str) -> None:
1299 """Remove a child collection from a parent collection."""
1300 # Remove child from parent.
1301 chain = list(self.registry.getCollectionChain(parent))
1302 try:
1303 chain.remove(name)
1304 except ValueError as e:
1305 raise RuntimeError(f"{name} is not a child of {parent}") from e
1306 self.registry.setCollectionChain(parent, chain)
1308 with self.registry.transaction():
1309 if (unlink):
1310 for parent in unlink:
1311 remove(name, parent)
1312 if unstore:
1313 refs = self.registry.queryDatasets(..., collections=name, findFirst=True)
1314 self.datastore.trash(refs)
1315 self.registry.removeCollection(name)
1317 if unstore:
1318 # Point of no return for removing artifacts
1319 self.datastore.emptyTrash()
1321 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1322 disassociate: bool = True,
1323 unstore: bool = False,
1324 tags: Iterable[str] = (),
1325 purge: bool = False,
1326 run: Optional[str] = None) -> None:
1327 """Remove one or more datasets from a collection and/or storage.
1329 Parameters
1330 ----------
1331 refs : `~collections.abc.Iterable` of `DatasetRef`
1332 Datasets to prune. These must be "resolved" references (not just
1333 a `DatasetType` and data ID).
1334 disassociate : `bool`, optional
1335 Disassociate pruned datasets from ``tags``, or from all collections
1336 if ``purge=True``.
1337 unstore : `bool`, optional
1338 If `True` (`False` is default) remove these datasets from all
1339 datastores known to this butler. Note that this will make it
1340 impossible to retrieve these datasets even via other collections.
1341 Datasets that are already not stored are ignored by this option.
1342 tags : `Iterable` [ `str` ], optional
1343 `~CollectionType.TAGGED` collections to disassociate the datasets
1344 from. Ignored if ``disassociate`` is `False` or ``purge`` is
1345 `True`.
1346 purge : `bool`, optional
1347 If `True` (`False` is default), completely remove the dataset from
1348 the `Registry`. To prevent accidental deletions, ``purge`` may
1349 only be `True` if all of the following conditions are met:
1351 - All given datasets are in the given run.
1352 - ``disassociate`` is `True`;
1353 - ``unstore`` is `True`.
1355 This mode may remove provenance information from datasets other
1356 than those provided, and should be used with extreme care.
1358 Raises
1359 ------
1360 TypeError
1361 Raised if the butler is read-only, if no collection was provided,
1362 or the conditions for ``purge=True`` were not met.
1363 """
1364 if not self.isWriteable():
1365 raise TypeError("Butler is read-only.")
1366 if purge:
1367 if not disassociate:
1368 raise TypeError("Cannot pass purge=True without disassociate=True.")
1369 if not unstore:
1370 raise TypeError("Cannot pass purge=True without unstore=True.")
1371 elif disassociate:
1372 tags = tuple(tags)
1373 if not tags:
1374 raise TypeError("No tags provided but disassociate=True.")
1375 for tag in tags:
1376 collectionType = self.registry.getCollectionType(tag)
1377 if collectionType is not CollectionType.TAGGED:
1378 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1379 f"of non-TAGGED type {collectionType.name}.")
1380 # Transform possibly-single-pass iterable into something we can iterate
1381 # over multiple times.
1382 refs = list(refs)
1383 # Pruning a component of a DatasetRef makes no sense since registry
1384 # doesn't know about components and datastore might not store
1385 # components in a separate file
1386 for ref in refs:
1387 if ref.datasetType.component():
1388 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1389 # We don't need an unreliable Datastore transaction for this, because
1390 # we've been extra careful to ensure that Datastore.trash only involves
1391 # mutating the Registry (it can _look_ at Datastore-specific things,
1392 # but shouldn't change them), and hence all operations here are
1393 # Registry operations.
1394 with self.registry.transaction():
1395 if unstore:
1396 self.datastore.trash(refs)
1397 if purge:
1398 self.registry.removeDatasets(refs)
1399 elif disassociate:
1400 assert tags, "Guaranteed by earlier logic in this function."
1401 for tag in tags:
1402 self.registry.disassociate(tag, refs)
1403 # We've exited the Registry transaction, and apparently committed.
1404 # (if there was an exception, everything rolled back, and it's as if
1405 # nothing happened - and we never get here).
1406 # Datastore artifacts are not yet gone, but they're clearly marked
1407 # as trash, so if we fail to delete now because of (e.g.) filesystem
1408 # problems we can try again later, and if manual administrative
1409 # intervention is required, it's pretty clear what that should entail:
1410 # deleting everything on disk and in private Datastore tables that is
1411 # in the dataset_location_trash table.
1412 if unstore:
1413 # Point of no return for removing artifacts
1414 self.datastore.emptyTrash()
1416 @transactional
1417 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1418 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1419 ) -> None:
1420 """Store and register one or more datasets that already exist on disk.
1422 Parameters
1423 ----------
1424 datasets : `FileDataset`
1425 Each positional argument is a struct containing information about
1426 a file to be ingested, including its path (either absolute or
1427 relative to the datastore root, if applicable), a `DatasetRef`,
1428 and optionally a formatter class or its fully-qualified string
1429 name. If a formatter is not provided, the formatter that would be
1430 used for `put` is assumed. On successful return, all
1431 `FileDataset.ref` attributes will have their `DatasetRef.id`
1432 attribute populated and all `FileDataset.formatter` attributes will
1433 be set to the formatter class used. `FileDataset.path` attributes
1434 may be modified to put paths in whatever the datastore considers a
1435 standardized form.
1436 transfer : `str`, optional
1437 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1438 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer
1439 the file.
1440 run : `str`, optional
1441 The name of the run ingested datasets should be added to,
1442 overriding ``self.run``.
1443 idGenerationMode : `DatasetIdGenEnum`, optional
1444 Specifies option for generating dataset IDs. By default unique IDs
1445 are generated for each inserted dataset.
1447 Raises
1448 ------
1449 TypeError
1450 Raised if the butler is read-only or if no run was provided.
1451 NotImplementedError
1452 Raised if the `Datastore` does not support the given transfer mode.
1453 DatasetTypeNotSupportedError
1454 Raised if one or more files to be ingested have a dataset type that
1455 is not supported by the `Datastore`..
1456 FileNotFoundError
1457 Raised if one of the given files does not exist.
1458 FileExistsError
1459 Raised if transfer is not `None` but the (internal) location the
1460 file would be moved to is already occupied.
1462 Notes
1463 -----
1464 This operation is not fully exception safe: if a database operation
1465 fails, the given `FileDataset` instances may be only partially updated.
1467 It is atomic in terms of database operations (they will either all
1468 succeed or all fail) providing the database engine implements
1469 transactions correctly. It will attempt to be atomic in terms of
1470 filesystem operations as well, but this cannot be implemented
1471 rigorously for most datastores.
1472 """
1473 if not self.isWriteable():
1474 raise TypeError("Butler is read-only.")
1475 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
1476 # Reorganize the inputs so they're grouped by DatasetType and then
1477 # data ID. We also include a list of DatasetRefs for each FileDataset
1478 # to hold the resolved DatasetRefs returned by the Registry, before
1479 # it's safe to swap them into FileDataset.refs.
1480 # Some type annotation aliases to make that clearer:
1481 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1482 GroupedData = MutableMapping[DatasetType, GroupForType]
1483 # The actual data structure:
1484 groupedData: GroupedData = defaultdict(dict)
1485 # And the nested loop that populates it:
1486 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
1487 # This list intentionally shared across the inner loop, since it's
1488 # associated with `dataset`.
1489 resolvedRefs: List[DatasetRef] = []
1490 for ref in dataset.refs:
1491 if ref.dataId in groupedData[ref.datasetType]:
1492 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has same"
1493 " DataId as other ingest dataset"
1494 f" {groupedData[ref.datasetType][ref.dataId][0].path} "
1495 f" ({ref.dataId})")
1496 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1498 # Now we can bulk-insert into Registry for each DatasetType.
1499 allResolvedRefs: List[DatasetRef] = []
1500 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(),
1501 desc="Bulk-inserting datasets by type"):
1502 refs = self.registry.insertDatasets(
1503 datasetType,
1504 dataIds=groupForType.keys(),
1505 run=run,
1506 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType),
1507 idGenerationMode=idGenerationMode,
1508 )
1509 # Append those resolved DatasetRefs to the new lists we set up for
1510 # them.
1511 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1512 resolvedRefs.append(ref)
1514 # Go back to the original FileDatasets to replace their refs with the
1515 # new resolved ones, and also build a big list of all refs.
1516 allResolvedRefs = []
1517 for groupForType in progress.iter_chunks(groupedData.values(),
1518 desc="Reassociating resolved dataset refs with files"):
1519 for dataset, resolvedRefs in groupForType.values():
1520 dataset.refs = resolvedRefs
1521 allResolvedRefs.extend(resolvedRefs)
1523 # Bulk-insert everything into Datastore.
1524 self.datastore.ingest(*datasets, transfer=transfer)
1526 @contextlib.contextmanager
1527 def export(self, *, directory: Optional[str] = None,
1528 filename: Optional[str] = None,
1529 format: Optional[str] = None,
1530 transfer: Optional[str] = None) -> Iterator[RepoExportContext]:
1531 """Export datasets from the repository represented by this `Butler`.
1533 This method is a context manager that returns a helper object
1534 (`RepoExportContext`) that is used to indicate what information from
1535 the repository should be exported.
1537 Parameters
1538 ----------
1539 directory : `str`, optional
1540 Directory dataset files should be written to if ``transfer`` is not
1541 `None`.
1542 filename : `str`, optional
1543 Name for the file that will include database information associated
1544 with the exported datasets. If this is not an absolute path and
1545 ``directory`` is not `None`, it will be written to ``directory``
1546 instead of the current working directory. Defaults to
1547 "export.{format}".
1548 format : `str`, optional
1549 File format for the database information file. If `None`, the
1550 extension of ``filename`` will be used.
1551 transfer : `str`, optional
1552 Transfer mode passed to `Datastore.export`.
1554 Raises
1555 ------
1556 TypeError
1557 Raised if the set of arguments passed is inconsistent.
1559 Examples
1560 --------
1561 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1562 methods are used to provide the iterables over data IDs and/or datasets
1563 to be exported::
1565 with butler.export("exports.yaml") as export:
1566 # Export all flats, but none of the dimension element rows
1567 # (i.e. data ID information) associated with them.
1568 export.saveDatasets(butler.registry.queryDatasets("flat"),
1569 elements=())
1570 # Export all datasets that start with "deepCoadd_" and all of
1571 # their associated data ID information.
1572 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1573 """
1574 if directory is None and transfer is not None:
1575 raise TypeError("Cannot transfer without providing a directory.")
1576 if transfer == "move":
1577 raise TypeError("Transfer may not be 'move': export is read-only")
1578 if format is None:
1579 if filename is None:
1580 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1581 else:
1582 _, format = os.path.splitext(filename)
1583 elif filename is None:
1584 filename = f"export.{format}"
1585 if directory is not None:
1586 filename = os.path.join(directory, filename)
1587 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1588 with open(filename, 'w') as stream:
1589 backend = BackendClass(stream)
1590 try:
1591 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1592 directory=directory, transfer=transfer)
1593 yield helper
1594 except BaseException:
1595 raise
1596 else:
1597 helper._finish()
1599 def import_(self, *, directory: Optional[str] = None,
1600 filename: Union[str, TextIO, None] = None,
1601 format: Optional[str] = None,
1602 transfer: Optional[str] = None,
1603 skip_dimensions: Optional[Set] = None,
1604 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
1605 reuseIds: bool = False) -> None:
1606 """Import datasets into this repository that were exported from a
1607 different butler repository via `~lsst.daf.butler.Butler.export`.
1609 Parameters
1610 ----------
1611 directory : `str`, optional
1612 Directory containing dataset files to import from. If `None`,
1613 ``filename`` and all dataset file paths specified therein must
1614 be absolute.
1615 filename : `str` or `TextIO`, optional
1616 A stream or name of file that contains database information
1617 associated with the exported datasets, typically generated by
1618 `~lsst.daf.butler.Butler.export`. If this a string (name) and
1619 is not an absolute path, does not exist in the current working
1620 directory, and ``directory`` is not `None`, it is assumed to be in
1621 ``directory``. Defaults to "export.{format}".
1622 format : `str`, optional
1623 File format for ``filename``. If `None`, the extension of
1624 ``filename`` will be used.
1625 transfer : `str`, optional
1626 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1627 skip_dimensions : `set`, optional
1628 Names of dimensions that should be skipped and not imported.
1629 idGenerationMode : `DatasetIdGenEnum`, optional
1630 Specifies option for generating dataset IDs when IDs are not
1631 provided or their type does not match backend type. By default
1632 unique IDs are generated for each inserted dataset.
1633 reuseIds : `bool`, optional
1634 If `True` then forces re-use of imported dataset IDs for integer
1635 IDs which are normally generated as auto-incremented; exception
1636 will be raised if imported IDs clash with existing ones. This
1637 option has no effect on the use of globally-unique IDs which are
1638 always re-used (or generated if integer IDs are being imported).
1640 Raises
1641 ------
1642 TypeError
1643 Raised if the set of arguments passed is inconsistent, or if the
1644 butler is read-only.
1645 """
1646 if not self.isWriteable():
1647 raise TypeError("Butler is read-only.")
1648 if format is None:
1649 if filename is None:
1650 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1651 else:
1652 _, format = os.path.splitext(filename) # type: ignore
1653 elif filename is None:
1654 filename = f"export.{format}"
1655 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1656 filename = os.path.join(directory, filename)
1657 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1659 def doImport(importStream: TextIO) -> None:
1660 backend = BackendClass(importStream, self.registry)
1661 backend.register()
1662 with self.transaction():
1663 backend.load(self.datastore, directory=directory, transfer=transfer,
1664 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode,
1665 reuseIds=reuseIds)
1667 if isinstance(filename, str):
1668 with open(filename, "r") as stream:
1669 doImport(stream)
1670 else:
1671 doImport(filename)
1673 def validateConfiguration(self, logFailures: bool = False,
1674 datasetTypeNames: Optional[Iterable[str]] = None,
1675 ignore: Iterable[str] = None) -> None:
1676 """Validate butler configuration.
1678 Checks that each `DatasetType` can be stored in the `Datastore`.
1680 Parameters
1681 ----------
1682 logFailures : `bool`, optional
1683 If `True`, output a log message for every validation error
1684 detected.
1685 datasetTypeNames : iterable of `str`, optional
1686 The `DatasetType` names that should be checked. This allows
1687 only a subset to be selected.
1688 ignore : iterable of `str`, optional
1689 Names of DatasetTypes to skip over. This can be used to skip
1690 known problems. If a named `DatasetType` corresponds to a
1691 composite, all components of that `DatasetType` will also be
1692 ignored.
1694 Raises
1695 ------
1696 ButlerValidationError
1697 Raised if there is some inconsistency with how this Butler
1698 is configured.
1699 """
1700 if datasetTypeNames:
1701 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1702 else:
1703 datasetTypes = list(self.registry.queryDatasetTypes())
1705 # filter out anything from the ignore list
1706 if ignore:
1707 ignore = set(ignore)
1708 datasetTypes = [e for e in datasetTypes
1709 if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1710 else:
1711 ignore = set()
1713 # Find all the registered instruments
1714 instruments = set(
1715 record.name for record in self.registry.queryDimensionRecords("instrument")
1716 )
1718 # For each datasetType that has an instrument dimension, create
1719 # a DatasetRef for each defined instrument
1720 datasetRefs = []
1722 for datasetType in datasetTypes:
1723 if "instrument" in datasetType.dimensions:
1724 for instrument in instruments:
1725 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore
1726 conform=False)
1727 datasetRefs.append(datasetRef)
1729 entities: List[Union[DatasetType, DatasetRef]] = []
1730 entities.extend(datasetTypes)
1731 entities.extend(datasetRefs)
1733 datastoreErrorStr = None
1734 try:
1735 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1736 except ValidationError as e:
1737 datastoreErrorStr = str(e)
1739 # Also check that the LookupKeys used by the datastores match
1740 # registry and storage class definitions
1741 keys = self.datastore.getLookupKeys()
1743 failedNames = set()
1744 failedDataId = set()
1745 for key in keys:
1746 if key.name is not None:
1747 if key.name in ignore:
1748 continue
1750 # skip if specific datasetType names were requested and this
1751 # name does not match
1752 if datasetTypeNames and key.name not in datasetTypeNames:
1753 continue
1755 # See if it is a StorageClass or a DatasetType
1756 if key.name in self.storageClasses:
1757 pass
1758 else:
1759 try:
1760 self.registry.getDatasetType(key.name)
1761 except KeyError:
1762 if logFailures:
1763 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1764 failedNames.add(key)
1765 else:
1766 # Dimensions are checked for consistency when the Butler
1767 # is created and rendezvoused with a universe.
1768 pass
1770 # Check that the instrument is a valid instrument
1771 # Currently only support instrument so check for that
1772 if key.dataId:
1773 dataIdKeys = set(key.dataId)
1774 if set(["instrument"]) != dataIdKeys:
1775 if logFailures:
1776 log.critical("Key '%s' has unsupported DataId override", key)
1777 failedDataId.add(key)
1778 elif key.dataId["instrument"] not in instruments:
1779 if logFailures:
1780 log.critical("Key '%s' has unknown instrument", key)
1781 failedDataId.add(key)
1783 messages = []
1785 if datastoreErrorStr:
1786 messages.append(datastoreErrorStr)
1788 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1789 (failedDataId, "Keys with bad DataId entries: ")):
1790 if failed:
1791 msg += ", ".join(str(k) for k in failed)
1792 messages.append(msg)
1794 if messages:
1795 raise ValidationError(";\n".join(messages))
1797 @property
1798 def collections(self) -> CollectionSearch:
1799 """The collections to search by default, in order (`CollectionSearch`).
1801 This is an alias for ``self.registry.defaults.collections``. It cannot
1802 be set directly in isolation, but all defaults may be changed together
1803 by assigning a new `RegistryDefaults` instance to
1804 ``self.registry.defaults``.
1805 """
1806 return self.registry.defaults.collections
1808 @property
1809 def run(self) -> Optional[str]:
1810 """Name of the run this butler writes outputs to by default (`str` or
1811 `None`).
1813 This is an alias for ``self.registry.defaults.run``. It cannot be set
1814 directly in isolation, but all defaults may be changed together by
1815 assigning a new `RegistryDefaults` instance to
1816 ``self.registry.defaults``.
1817 """
1818 return self.registry.defaults.run
1820 registry: Registry
1821 """The object that manages dataset metadata and relationships (`Registry`).
1823 Most operations that don't involve reading or writing butler datasets are
1824 accessible only via `Registry` methods.
1825 """
1827 datastore: Datastore
1828 """The object that manages actual dataset storage (`Datastore`).
1830 Direct user access to the datastore should rarely be necessary; the primary
1831 exception is the case where a `Datastore` implementation provides extra
1832 functionality beyond what the base class defines.
1833 """
1835 storageClasses: StorageClassFactory
1836 """An object that maps known storage class names to objects that fully
1837 describe them (`StorageClassFactory`).
1838 """