Coverage for python/lsst/daf/butler/_butler.py : 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = (
28 "Butler",
29 "ButlerValidationError",
30 "PruneCollectionsArgsError",
31 "PurgeWithoutUnstorePruneCollectionsError",
32 "RunWithoutPurgePruneCollectionsError",
33 "PurgeUnsupportedPruneCollectionsError",
34)
37from collections import defaultdict
38import contextlib
39import logging
40import os
41from typing import (
42 Any,
43 ClassVar,
44 ContextManager,
45 Dict,
46 Iterable,
47 List,
48 Mapping,
49 MutableMapping,
50 Optional,
51 Set,
52 TextIO,
53 Tuple,
54 Union,
55)
57try:
58 import boto3
59except ImportError:
60 boto3 = None
62from lsst.utils import doImport
63from .core import (
64 AmbiguousDatasetError,
65 ButlerURI,
66 Config,
67 ConfigSubset,
68 DataCoordinate,
69 DataId,
70 DatasetRef,
71 DatasetType,
72 Datastore,
73 DimensionConfig,
74 FileDataset,
75 StorageClassFactory,
76 Timespan,
77 ValidationError,
78)
79from .core.repoRelocation import BUTLER_ROOT_TAG
80from .core.utils import transactional, getClassOf
81from ._deferredDatasetHandle import DeferredDatasetHandle
82from ._butlerConfig import ButlerConfig
83from .registry import Registry, RegistryConfig, CollectionType
84from .registry.wildcards import CollectionSearch
85from .transfers import RepoExportContext
87log = logging.getLogger(__name__)
90class ButlerValidationError(ValidationError):
91 """There is a problem with the Butler configuration."""
92 pass
95class PruneCollectionsArgsError(TypeError):
96 """Base class for errors relating to Butler.pruneCollections input
97 arguments.
98 """
99 pass
102class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
103 """Raised when purge and unstore are both required to be True, and
104 purge is True but unstore is False.
105 """
107 def __init__(self):
108 super().__init__("Cannot pass purge=True without unstore=True.")
111class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
112 """Raised when pruning a RUN collection but purge is False."""
114 def __init__(self, collectionType):
115 self.collectionType = collectionType
116 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")
119class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
120 """Raised when purge is True but is not supported for the given
121 collection."""
123 def __init__(self, collectionType):
124 self.collectionType = collectionType
125 super().__init__(
126 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")
129class Butler:
130 """Main entry point for the data access system.
132 Parameters
133 ----------
134 config : `ButlerConfig`, `Config` or `str`, optional.
135 Configuration. Anything acceptable to the
136 `ButlerConfig` constructor. If a directory path
137 is given the configuration will be read from a ``butler.yaml`` file in
138 that location. If `None` is given default values will be used.
139 butler : `Butler`, optional.
140 If provided, construct a new Butler that uses the same registry and
141 datastore as the given one, but with the given collection and run.
142 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
143 arguments.
144 collections : `Any`, optional
145 An expression specifying the collections to be searched (in order) when
146 reading datasets, and optionally dataset type restrictions on them.
147 This may be:
148 - a `str` collection name;
149 - a tuple of (collection name, *dataset type restriction*);
150 - an iterable of either of the above;
151 - a mapping from `str` to *dataset type restriction*.
153 See :ref:`daf_butler_collection_expressions` for more information,
154 including the definition of a *dataset type restriction*. All
155 collections must either already exist or be specified to be created
156 by other arguments.
157 run : `str`, optional
158 Name of the run datasets should be output to. If the run
159 does not exist, it will be created. If ``collections`` is `None`, it
160 will be set to ``[run]``. If this is not set (and ``writeable`` is
161 not set either), a read-only butler will be created.
162 tags : `Iterable` [ `str` ], optional
163 A list of `~CollectionType.TAGGED` collections that datasets should be
164 associated with in `put` or `ingest` and disassociated from in
165 `pruneDatasets`. If any of these collections does not exist, it will
166 be created.
167 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
168 A mapping from the names of new `~CollectionType.CHAINED` collections
169 to an expression identifying their child collections (which takes the
170 same form as the ``collections`` argument). Chains may be nested only
171 if children precede their parents in this mapping.
172 searchPaths : `list` of `str`, optional
173 Directory paths to search when calculating the full Butler
174 configuration. Not used if the supplied config is already a
175 `ButlerConfig`.
176 writeable : `bool`, optional
177 Explicitly sets whether the butler supports write operations. If not
178 provided, a read-write butler is created if any of ``run``, ``tags``,
179 or ``chains`` is non-empty.
181 Examples
182 --------
183 While there are many ways to control exactly how a `Butler` interacts with
184 the collections in its `Registry`, the most common cases are still simple.
186 For a read-only `Butler` that searches one collection, do::
188 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
190 For a read-write `Butler` that writes to and reads from a
191 `~CollectionType.RUN` collection::
193 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
195 The `Butler` passed to a ``PipelineTask`` is often much more complex,
196 because we want to write to one `~CollectionType.RUN` collection but read
197 from several others (as well), while defining a new
198 `~CollectionType.CHAINED` collection that combines them all::
200 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
201 collections=["u/alice/DM-50000"],
202 chains={
203 "u/alice/DM-50000": ["u/alice/DM-50000/a",
204 "u/bob/DM-49998",
205 "raw/hsc"]
206 })
208 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
209 they'll also be available from the chained collection ``u/alice/DM-50000``.
210 Datasets will be read first from that run (since it appears first in the
211 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
212 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
213 would be unnecessary. We could also construct a butler that performs
214 exactly the same `put` and `get` operations without actually creating a
215 chained collection, just by passing multiple items in ``collections``::
217 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
218 collections=["u/alice/DM-50000/a",
219 "u/bob/DM-49998",
220 "raw/hsc"])
222 Finally, one can always create a `Butler` with no collections::
224 butler = Butler("/path/to/repo", writeable=True)
226 This can be extremely useful when you just want to use ``butler.registry``,
227 e.g. for inserting dimension data or managing collections, or when the
228 collections you want to use with the butler are not consistent.
229 Passing ``writeable`` explicitly here is only necessary if you want to be
230 able to make changes to the repo - usually the value for ``writeable``
231 can be guessed from the collection arguments provided, but it defaults to
232 `False` when no collection arguments are given.
233 """
234 def __init__(self, config: Union[Config, str, None] = None, *,
235 butler: Optional[Butler] = None,
236 collections: Any = None,
237 run: Optional[str] = None,
238 tags: Iterable[str] = (),
239 chains: Optional[Mapping[str, Any]] = None,
240 searchPaths: Optional[List[str]] = None,
241 writeable: Optional[bool] = None,
242 ):
243 # Transform any single-pass iterator into an actual sequence so we
244 # can see if it's empty.
245 self.tags = tuple(tags)
246 # Load registry, datastore, etc. from config or existing butler.
247 if butler is not None:
248 if config is not None or searchPaths is not None or writeable is not None:
249 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
250 "arguments with 'butler' argument.")
251 self.registry = butler.registry
252 self.datastore = butler.datastore
253 self.storageClasses = butler.storageClasses
254 self._config = butler._config
255 else:
256 self._config = ButlerConfig(config, searchPaths=searchPaths)
257 if "root" in self._config:
258 butlerRoot = self._config["root"]
259 else:
260 butlerRoot = self._config.configDir
261 if writeable is None:
262 writeable = run is not None or chains is not None or self.tags
263 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
264 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
265 butlerRoot=butlerRoot)
266 self.storageClasses = StorageClassFactory()
267 self.storageClasses.addFromConfig(self._config)
268 # Check the many collection arguments for consistency and create any
269 # needed collections that don't exist.
270 if collections is None:
271 if run is not None:
272 collections = (run,)
273 else:
274 collections = ()
275 self.collections = CollectionSearch.fromExpression(collections)
276 if chains is None:
277 chains = {}
278 self.run = run
279 if "run" in self._config or "collection" in self._config:
280 raise ValueError("Passing a run or collection via configuration is no longer supported.")
281 if self.run is not None:
282 self.registry.registerCollection(self.run, type=CollectionType.RUN)
283 for tag in self.tags:
284 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
285 for parent, children in chains.items():
286 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
287 self.registry.setCollectionChain(parent, children)
289 GENERATION: ClassVar[int] = 3
290 """This is a Generation 3 Butler.
292 This attribute may be removed in the future, once the Generation 2 Butler
293 interface has been fully retired; it should only be used in transitional
294 code.
295 """
297 @staticmethod
298 def makeRepo(root: str, config: Union[Config, str, None] = None,
299 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
300 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
301 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
302 """Create an empty data repository by adding a butler.yaml config
303 to a repository root directory.
305 Parameters
306 ----------
307 root : `str` or `ButlerURI`
308 Path or URI to the root location of the new repository. Will be
309 created if it does not exist.
310 config : `Config` or `str`, optional
311 Configuration to write to the repository, after setting any
312 root-dependent Registry or Datastore config options. Can not
313 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
314 configuration will be used. Root-dependent config options
315 specified in this config are overwritten if ``forceConfigRoot``
316 is `True`.
317 dimensionConfig : `Config` or `str`, optional
318 Configuration for dimensions, will be used to initialize registry
319 database.
320 standalone : `bool`
321 If True, write all expanded defaults, not just customized or
322 repository-specific settings.
323 This (mostly) decouples the repository from the default
324 configuration, insulating it from changes to the defaults (which
325 may be good or bad, depending on the nature of the changes).
326 Future *additions* to the defaults will still be picked up when
327 initializing `Butlers` to repos created with ``standalone=True``.
328 searchPaths : `list` of `str`, optional
329 Directory paths to search when calculating the full butler
330 configuration.
331 forceConfigRoot : `bool`, optional
332 If `False`, any values present in the supplied ``config`` that
333 would normally be reset are not overridden and will appear
334 directly in the output config. This allows non-standard overrides
335 of the root directory for a datastore or registry to be given.
336 If this parameter is `True` the values for ``root`` will be
337 forced into the resulting config if appropriate.
338 outfile : `str`, optional
339 If not `None`, the output configuration will be written to this
340 location rather than into the repository itself. Can be a URI
341 string. Can refer to a directory that will be used to write
342 ``butler.yaml``.
343 overwrite : `bool`, optional
344 Create a new configuration file even if one already exists
345 in the specified output location. Default is to raise
346 an exception.
348 Returns
349 -------
350 config : `Config`
351 The updated `Config` instance written to the repo.
353 Raises
354 ------
355 ValueError
356 Raised if a ButlerConfig or ConfigSubset is passed instead of a
357 regular Config (as these subclasses would make it impossible to
358 support ``standalone=False``).
359 FileExistsError
360 Raised if the output config file already exists.
361 os.error
362 Raised if the directory does not exist, exists but is not a
363 directory, or cannot be created.
365 Notes
366 -----
367 Note that when ``standalone=False`` (the default), the configuration
368 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
369 construct the repository should also be used to construct any Butlers
370 to avoid configuration inconsistencies.
371 """
372 if isinstance(config, (ButlerConfig, ConfigSubset)):
373 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
375 # Ensure that the root of the repository exists or can be made
376 uri = ButlerURI(root, forceDirectory=True)
377 uri.mkdir()
379 config = Config(config)
381 # If we are creating a new repo from scratch with relative roots,
382 # do not propagate an explicit root from the config file
383 if "root" in config:
384 del config["root"]
386 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
387 datastoreClass = doImport(full["datastore", "cls"])
388 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
390 # if key exists in given config, parse it, otherwise parse the defaults
391 # in the expanded config
392 if config.get(("registry", "db")):
393 registryConfig = RegistryConfig(config)
394 else:
395 registryConfig = RegistryConfig(full)
396 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
397 if defaultDatabaseUri is not None:
398 Config.updateParameters(RegistryConfig, config, full,
399 toUpdate={"db": defaultDatabaseUri},
400 overwrite=forceConfigRoot)
401 else:
402 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
403 overwrite=forceConfigRoot)
405 if standalone:
406 config.merge(full)
407 else:
408 # Always expand the registry.managers section into the per-repo
409 # config, because after the database schema is created, it's not
410 # allowed to change anymore. Note that in the standalone=True
411 # branch, _everything_ in the config is expanded, so there's no
412 # need to special case this.
413 Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
414 if outfile is not None:
415 # When writing to a separate location we must include
416 # the root of the butler repo in the config else it won't know
417 # where to look.
418 config["root"] = uri.geturl()
419 configURI = outfile
420 else:
421 configURI = uri
422 config.dumpToUri(configURI, overwrite=overwrite)
424 # Create Registry and populate tables
425 registryConfig = RegistryConfig(config.get("registry"))
426 dimensionConfig = DimensionConfig(dimensionConfig)
427 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)
429 return config
431 @classmethod
432 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
433 tags: Tuple[str, ...], writeable: bool) -> Butler:
434 """Callable used to unpickle a Butler.
436 We prefer not to use ``Butler.__init__`` directly so we can force some
437 of its many arguments to be keyword-only (note that ``__reduce__``
438 can only invoke callables with positional arguments).
440 Parameters
441 ----------
442 config : `ButlerConfig`
443 Butler configuration, already coerced into a true `ButlerConfig`
444 instance (and hence after any search paths for overrides have been
445 utilized).
446 collections : `CollectionSearch`
447 Names of collections to read from.
448 run : `str`, optional
449 Name of `~CollectionType.RUN` collection to write to.
450 tags : `tuple` [`str`]
451 Names of `~CollectionType.TAGGED` collections to associate with.
452 writeable : `bool`
453 Whether the Butler should support write operations.
455 Returns
456 -------
457 butler : `Butler`
458 A new `Butler` instance.
459 """
460 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
462 def __reduce__(self):
463 """Support pickling.
464 """
465 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
466 self.registry.isWriteable()))
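    # Example (illustrative sketch): because __reduce__ delegates to
    # Butler._unpickle, an existing butler can be round-tripped with the
    # standard pickle module:
    #
    #     import pickle
    #     clone = pickle.loads(pickle.dumps(butler))
    #
    # The clone is reconstructed from the stored ButlerConfig, collections,
    # run, tags, and writeable flag.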
468 def __str__(self):
469 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
470 self.collections, self.run, self.tags, self.datastore, self.registry)
472 def isWriteable(self) -> bool:
473 """Return `True` if this `Butler` supports write operations.
474 """
475 return self.registry.isWriteable()
477 @contextlib.contextmanager
478 def transaction(self):
479 """Context manager supporting `Butler` transactions.
481 Transactions can be nested.
482 """
483 with self.registry.transaction():
484 with self.datastore.transaction():
485 yield
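    # Example usage of transaction() (illustrative sketch; the dataset types
    # and data ID values are hypothetical):
    #
    #     with butler.transaction():
    #         butler.put(catalog, "src", visit=903334, detector=16)
    #         butler.put(exposure, "calexp", visit=903334, detector=16)
    #
    # If either put() raises, the registry and datastore changes are rolled
    # back together.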
487 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
488 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
489 """Standardize the arguments passed to several Butler APIs.
491 Parameters
492 ----------
493 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
494 When `DatasetRef` is provided, ``dataId`` should be `None`.
495 Otherwise the `DatasetType` or name thereof.
496 dataId : `dict` or `DataCoordinate`
497 A `dict` of `Dimension` link name, value pairs that label the
498 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
499 should be provided as the first argument.
500 kwds
501 Additional keyword arguments used to augment or construct a
502 `DataCoordinate`. See `DataCoordinate.standardize`
503 parameters.
505 Returns
506 -------
507 datasetType : `DatasetType`
508 A `DatasetType` instance extracted from ``datasetRefOrType``.
509 dataId : `dict` or `DataId`, optional
510 Argument that can be used (along with ``kwds``) to construct a
511 `DataId`.
513 Notes
514 -----
515 Butler APIs that conceptually need a DatasetRef also allow passing a
516 `DatasetType` (or the name of one) and a `DataId` (or a dict and
517 keyword arguments that can be used to construct one) separately. This
518 method accepts those arguments and always returns a true `DatasetType`
519 and a `DataId` or `dict`.
521 Standardization of `dict` vs `DataId` is best handled by passing the
522 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
523 generally similarly flexible.
524 """
525 externalDatasetType = None
526 internalDatasetType = None
527 if isinstance(datasetRefOrType, DatasetRef):
528 if dataId is not None or kwds:
529 raise ValueError("DatasetRef given, cannot use dataId as well")
530 externalDatasetType = datasetRefOrType.datasetType
531 dataId = datasetRefOrType.dataId
532 else:
533 # Don't check whether DataId is provided, because Registry APIs
534 # can usually construct a better error message when it wasn't.
535 if isinstance(datasetRefOrType, DatasetType):
536 externalDatasetType = datasetRefOrType
537 else:
538 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
540 # Check that they are self-consistent
541 if externalDatasetType is not None:
542 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
543 if externalDatasetType != internalDatasetType:
544 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
545 f"registry definition ({internalDatasetType})")
547 return internalDatasetType, dataId
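    # Example (illustrative sketch; names and values are hypothetical): the
    # flexibility described above means these calls identify the same dataset:
    #
    #     butler.get("raw", {"instrument": "HSC", "exposure": 903334}, detector=16)
    #     butler.get("raw", instrument="HSC", exposure=903334, detector=16)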
549 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
550 dataId: Optional[DataId] = None, *,
551 collections: Any = None,
552 allowUnresolved: bool = False,
553 **kwds: Any) -> DatasetRef:
554 """Shared logic for methods that start with a search for a dataset in
555 the registry.
557 Parameters
558 ----------
559 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
560 When `DatasetRef` is provided, ``dataId`` should be `None`.
561 Otherwise the `DatasetType` or name thereof.
562 dataId : `dict` or `DataCoordinate`, optional
563 A `dict` of `Dimension` link name, value pairs that label the
564 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
565 should be provided as the first argument.
566 collections : Any, optional
567 Collections to be searched, overriding ``self.collections``.
568 Can be any of the types supported by the ``collections`` argument
569 to butler construction.
570 allowUnresolved : `bool`, optional
571 If `True`, return an unresolved `DatasetRef` if finding a resolved
572 one in the `Registry` fails. Defaults to `False`.
573 kwds
574 Additional keyword arguments used to augment or construct a
575 `DataId`. See `DataId` parameters.
577 Returns
578 -------
579 ref : `DatasetRef`
580 A reference to the dataset identified by the given arguments.
582 Raises
583 ------
584 LookupError
585 Raised if no matching dataset exists in the `Registry` (and
586 ``allowUnresolved is False``).
587 ValueError
588 Raised if a resolved `DatasetRef` was passed as an input, but it
589 differs from the one found in the registry.
590 TypeError
591 Raised if no collections were provided.
592 """
593 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
594 if isinstance(datasetRefOrType, DatasetRef):
595 idNumber = datasetRefOrType.id
596 else:
597 idNumber = None
598 timespan: Optional[Timespan] = None
600 # Process dimension records that are using record information
601 # rather than ids
602 newDataId: dict[Any, Any] = {}
603 byRecord: dict[Any, dict[str, Any]] = defaultdict(dict)
605 # If the dataId is given entirely via keyword parameters we do not
606 # need to do anything here, because keys of the form "exposure.obs_id"
607 # cannot appear: a "." is not allowed in a keyword parameter name.
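        # For example (illustrative; the dimension and value are
        # hypothetical), a caller may pass
        # dataId={"exposure.obs_id": "HSCA90333400", "instrument": "HSC"}.
        # The loop below splits "exposure.obs_id" into dimension "exposure"
        # and record field "obs_id", so it can later be converted to the
        # primary-key form, e.g. {"exposure": 903334, "instrument": "HSC"}.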
608 if dataId:
609 for k, v in dataId.items():
610 # If we have a Dimension we do not need to do anything
611 # because it cannot be a compound key.
612 if isinstance(k, str) and "." in k:
613 # Someone is using a more human-readable dataId
614 dimension, record = k.split(".", 1)
615 byRecord[dimension][record] = v
616 else:
617 newDataId[k] = v
619 if byRecord:
620 # Some record specifiers were found so we need to convert
621 # them to the Id form
622 for dimensionName, values in byRecord.items():
623 if dimensionName in newDataId:
624 log.warning("DataId specified explicit %s dimension value of %s in addition to"
625 " general record specifiers for it of %s. Ignoring record information.",
626 dimensionName, newDataId[dimensionName], str(values))
627 continue
629 # Build up a WHERE expression -- use single quotes
630 def quote(s):
631 if isinstance(s, str):
632 return f"'{s}'"
633 else:
634 return s
636 where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
637 for k, v in values.items())
639 # Hopefully we get a single record that matches
640 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
641 where=where, **kwds))
643 if len(records) != 1:
644 if len(records) > 1:
645 log.debug("Received %d records from constraints of %s", len(records), str(values))
646 for r in records:
647 log.debug("- %s", str(r))
648 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
649 f" uniquely constrained to a single dataset by {values}."
650 f" Got {len(records)} results.")
651 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
652 f" records when constrained by {values}")
654 # Get the primary key from the real dimension object
655 dimension = self.registry.dimensions[dimensionName]
656 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
658 # We have modified the dataId so need to switch to it
659 dataId = newDataId
661 if datasetType.isCalibration():
662 # Because this is a calibration dataset, first try to standardize
663 # the data ID without restricting the dimensions to
664 # those of the dataset type requested, because there may be extra
665 # dimensions that provide temporal information for a validity-range
666 # lookup.
667 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, **kwds)
668 if dataId.graph.temporal:
669 dataId = self.registry.expandDataId(dataId)
670 timespan = dataId.timespan
671 else:
672 # Standardize the data ID to just the dimensions of the dataset
673 # type instead of letting registry.findDataset do it, so we get the
674 # result even if no dataset is found.
675 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, **kwds)
676 if collections is None:
677 collections = self.collections
678 if not collections:
679 raise TypeError("No input collections provided.")
680 else:
681 collections = CollectionSearch.fromExpression(collections)
682 # Always lookup the DatasetRef, even if one is given, to ensure it is
683 # present in the current collection.
684 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
685 if ref is None:
686 if allowUnresolved:
687 return DatasetRef(datasetType, dataId)
688 else:
689 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
690 f"could not be found in collections {collections}.")
691 if idNumber is not None and idNumber != ref.id:
692 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
693 f"id ({ref.id}) in registry in collections {collections}.")
694 return ref
696 @transactional
697 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
698 dataId: Optional[DataId] = None, *,
699 run: Optional[str] = None,
700 tags: Optional[Iterable[str]] = None,
701 **kwds: Any) -> DatasetRef:
702 """Store and register a dataset.
704 Parameters
705 ----------
706 obj : `object`
707 The dataset.
708 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
709 When `DatasetRef` is provided, ``dataId`` should be `None`.
710 Otherwise the `DatasetType` or name thereof.
711 dataId : `dict` or `DataCoordinate`
712 A `dict` of `Dimension` link name, value pairs that label the
713 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
714 should be provided as the second argument.
715 run : `str`, optional
716 The name of the run the dataset should be added to, overriding
717 ``self.run``.
718 tags : `Iterable` [ `str` ], optional
719 The names of `~CollectionType.TAGGED` collections to associate
720 the dataset with, overriding ``self.tags``. These collections
721 must have already been added to the `Registry`.
722 kwds
723 Additional keyword arguments used to augment or construct a
724 `DataCoordinate`. See `DataCoordinate.standardize`
725 parameters.
727 Returns
728 -------
729 ref : `DatasetRef`
730 A reference to the stored dataset, updated with the correct id if
731 given.
733 Raises
734 ------
735 TypeError
736 Raised if the butler is read-only or if no run has been provided.
737 """
738 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
739 if not self.isWriteable():
740 raise TypeError("Butler is read-only.")
741 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
742 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
743 raise ValueError("DatasetRef must not be in registry, must have None id")
745 if run is None:
746 if self.run is None:
747 raise TypeError("No run provided.")
748 run = self.run
749 # No need to check type for run; first thing we do is
750 # insertDatasets, and that will check for us.
752 if tags is None:
753 tags = self.tags
754 else:
755 tags = tuple(tags)
756 for tag in tags:
757 # Check that these are tagged collections up front, because we want
758 # to avoid relying on Datastore transactionality to avoid modifying
759 # the repo if there's an error later.
760 collectionType = self.registry.getCollectionType(tag)
761 if collectionType is not CollectionType.TAGGED:
762 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
763 f"{collectionType.name}.")
765 # Add Registry Dataset entry.
766 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
767 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
769 # Add Datastore entry.
770 self.datastore.put(obj, ref)
772 for tag in tags:
773 self.registry.associate(tag, [ref])
775 return ref
777 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
778 """Retrieve a stored dataset.
780 Unlike `Butler.get`, this method allows datasets outside the Butler's
781 collection to be read as long as the `DatasetRef` that identifies them
782 can be obtained separately.
784 Parameters
785 ----------
786 ref : `DatasetRef`
787 Resolved reference to an already stored dataset.
788 parameters : `dict`
789 Additional StorageClass-defined options to control reading,
790 typically used to efficiently read only a subset of the dataset.
792 Returns
793 -------
794 obj : `object`
795 The dataset.
796 """
797 return self.datastore.get(ref, parameters=parameters)
799 def getDirectDeferred(self, ref: DatasetRef, *,
800 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
801 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
802 from a resolved `DatasetRef`.
804 Parameters
805 ----------
806 ref : `DatasetRef`
807 Resolved reference to an already stored dataset.
808 parameters : `dict`
809 Additional StorageClass-defined options to control reading,
810 typically used to efficiently read only a subset of the dataset.
812 Returns
813 -------
814 obj : `DeferredDatasetHandle`
815 A handle which can be used to retrieve a dataset at a later time.
817 Raises
818 ------
819 AmbiguousDatasetError
820 Raised if ``ref.id is None``, i.e. the reference is unresolved.
821 """
822 if ref.id is None:
823 raise AmbiguousDatasetError(
824 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
825 )
826 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
828 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
829 dataId: Optional[DataId] = None, *,
830 parameters: Union[dict, None] = None,
831 collections: Any = None,
832 **kwds: Any) -> DeferredDatasetHandle:
833 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
834 after an immediate registry lookup.
836 Parameters
837 ----------
838 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
839 When `DatasetRef` is provided, ``dataId`` should be `None`.
840 Otherwise the `DatasetType` or name thereof.
841 dataId : `dict` or `DataCoordinate`, optional
842 A `dict` of `Dimension` link name, value pairs that label the
843 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
844 should be provided as the first argument.
845 parameters : `dict`
846 Additional StorageClass-defined options to control reading,
847 typically used to efficiently read only a subset of the dataset.
848 collections : Any, optional
849 Collections to be searched, overriding ``self.collections``.
850 Can be any of the types supported by the ``collections`` argument
851 to butler construction.
852 kwds
853 Additional keyword arguments used to augment or construct a
854 `DataId`. See `DataId` parameters.
856 Returns
857 -------
858 obj : `DeferredDatasetHandle`
859 A handle which can be used to retrieve a dataset at a later time.
861 Raises
862 ------
863 LookupError
864 Raised if no matching dataset exists in the `Registry` (and
865 ``allowUnresolved is False``).
866 ValueError
867 Raised if a resolved `DatasetRef` was passed as an input, but it
868 differs from the one found in the registry.
869 TypeError
870 Raised if no collections were provided.
871 """
872 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
873 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
875 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
876 dataId: Optional[DataId] = None, *,
877 parameters: Optional[Dict[str, Any]] = None,
878 collections: Any = None,
879 **kwds: Any) -> Any:
880 """Retrieve a stored dataset.
882 Parameters
883 ----------
884 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
885 When `DatasetRef` is provided, ``dataId`` should be `None`.
886 Otherwise the `DatasetType` or name thereof.
887 dataId : `dict` or `DataCoordinate`
888 A `dict` of `Dimension` link name, value pairs that label the
889 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
890 should be provided as the first argument.
891 parameters : `dict`
892 Additional StorageClass-defined options to control reading,
893 typically used to efficiently read only a subset of the dataset.
894 collections : Any, optional
895 Collections to be searched, overriding ``self.collections``.
896 Can be any of the types supported by the ``collections`` argument
897 to butler construction.
898 kwds
899 Additional keyword arguments used to augment or construct a
900 `DataCoordinate`. See `DataCoordinate.standardize`
901 parameters.
903 Returns
904 -------
905 obj : `object`
906 The dataset.
908 Raises
909 ------
910 ValueError
911 Raised if a resolved `DatasetRef` was passed as an input, but it
912 differs from the one found in the registry.
913 LookupError
914 Raised if no matching dataset exists in the `Registry`.
915 TypeError
916 Raised if no collections were provided.
918 Notes
919 -----
920 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
921 this method requires that the given data ID include temporal dimensions
922 beyond the dimensions of the dataset type itself, in order to find the
923 dataset with the appropriate validity range. For example, a "bias"
924 dataset with native dimensions ``{instrument, detector}`` could be
925 fetched with a ``{instrument, detector, exposure}`` data ID, because
926 ``exposure`` is a temporal dimension.
927 """
928 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
929 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
930 return self.getDirect(ref, parameters=parameters)
932 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
933 dataId: Optional[DataId] = None, *,
934 predict: bool = False,
935 collections: Any = None,
936 run: Optional[str] = None,
937 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
938 """Returns the URIs associated with the dataset.
940 Parameters
941 ----------
942 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
943 When `DatasetRef` is provided, ``dataId`` should be `None`.
944 Otherwise the `DatasetType` or name thereof.
945 dataId : `dict` or `DataCoordinate`
946 A `dict` of `Dimension` link name, value pairs that label the
947 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
948 should be provided as the first argument.
949 predict : `bool`
950 If `True`, allow URIs to be returned for datasets that have not
951 been written.
952 collections : Any, optional
953 Collections to be searched, overriding ``self.collections``.
954 Can be any of the types supported by the ``collections`` argument
955 to butler construction.
956 run : `str`, optional
957 Run to use for predictions, overriding ``self.run``.
958 kwds
959 Additional keyword arguments used to augment or construct a
960 `DataCoordinate`. See `DataCoordinate.standardize`
961 parameters.
963 Returns
964 -------
965 primary : `ButlerURI`
966 The URI to the primary artifact associated with this dataset.
967 If the dataset was disassembled within the datastore this
968 may be `None`.
969 components : `dict`
970 URIs to any components associated with the dataset artifact.
971 Can be empty if there are no components.
972 """
973 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
974 collections=collections, **kwds)
975 if ref.id is None: # only possible if predict is True
976 if run is None:
977 run = self.run
978 if run is None:
979 raise TypeError("Cannot predict location with run=None.")
980 # Lie about ID, because we can't guess it, and only
981 # Datastore.getURIs() will ever see it (and it doesn't use it).
982 ref = ref.resolved(id=0, run=run)
983 return self.datastore.getURIs(ref, predict)
985 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
986 dataId: Optional[DataId] = None, *,
987 predict: bool = False,
988 collections: Any = None,
989 run: Optional[str] = None,
990 **kwds: Any) -> ButlerURI:
991 """Return the URI to the Dataset.
993 Parameters
994 ----------
995 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
996 When `DatasetRef` is provided, ``dataId`` should be `None`.
997 Otherwise the `DatasetType` or name thereof.
998 dataId : `dict` or `DataCoordinate`
999 A `dict` of `Dimension` link name, value pairs that label the
1000 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1001 should be provided as the first argument.
1002 predict : `bool`
1003 If `True`, allow URIs to be returned for datasets that have not
1004 been written.
1005 collections : Any, optional
1006 Collections to be searched, overriding ``self.collections``.
1007 Can be any of the types supported by the ``collections`` argument
1008 to butler construction.
1009 run : `str`, optional
1010 Run to use for predictions, overriding ``self.run``.
1011 kwds
1012 Additional keyword arguments used to augment or construct a
1013 `DataCoordinate`. See `DataCoordinate.standardize`
1014 parameters.
1016 Returns
1017 -------
1018 uri : `ButlerURI`
1019 URI pointing to the Dataset within the datastore. If the
1020 Dataset does not exist in the datastore, and if ``predict`` is
1021 `True`, the URI will be a prediction and will include a URI
1022 fragment "#predicted".
1023 If the datastore does not have entities that relate well
1024 to the concept of a URI the returned URI string will be
1025 descriptive. The returned URI is not guaranteed to be obtainable.
1027 Raises
1028 ------
1029 LookupError
1030 A URI has been requested for a dataset that does not exist and
1031 guessing is not allowed.
1032 ValueError
1033 Raised if a resolved `DatasetRef` was passed as an input, but it
1034 differs from the one found in the registry.
1035 TypeError
1036 Raised if no collections were provided.
1037 RuntimeError
1038 Raised if a URI is requested for a dataset that consists of
1039 multiple artifacts.
1040 """
1041 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
1042 collections=collections, run=run, **kwds)
1044 if primary is None or components:
1045 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1046 "Use Butler.getURIs() instead.")
1047 return primary
1049 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
1050 dataId: Optional[DataId] = None, *,
1051 collections: Any = None,
1052 **kwds: Any) -> bool:
1053 """Return True if the Dataset is actually present in the Datastore.
1055 Parameters
1056 ----------
1057 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
1058 When `DatasetRef` is provided, ``dataId`` should be `None`.
1059 Otherwise the `DatasetType` or name thereof.
1060 dataId : `dict` or `DataCoordinate`
1061 A `dict` of `Dimension` link name, value pairs that label the
1062 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1063 should be provided as the first argument.
1064 collections : Any, optional
1065 Collections to be searched, overriding ``self.collections``.
1066 Can be any of the types supported by the ``collections`` argument
1067 to butler construction.
1068 kwds
1069 Additional keyword arguments used to augment or construct a
1070 `DataCoordinate`. See `DataCoordinate.standardize`
1071 parameters.
1073 Raises
1074 ------
1075 LookupError
1076 Raised if the dataset is not even present in the Registry.
1077 ValueError
1078 Raised if a resolved `DatasetRef` was passed as an input, but it
1079 differs from the one found in the registry.
1080 TypeError
1081 Raised if no collections were provided.
1082 """
1083 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
1084 return self.datastore.exists(ref)
1086 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
1087 """Remove a collection and possibly prune datasets within it.
1089 Parameters
1090 ----------
1091 name : `str`
1092 Name of the collection to remove. If this is a
1093 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
1094 datasets within the collection are not modified unless ``unstore``
1095 is `True`. If this is a `~CollectionType.RUN` collection,
1096 ``purge`` and ``unstore`` must be `True`, and all datasets in it
1097 are fully removed from the data repository.
1098 purge : `bool`, optional
1099 If `True`, permit `~CollectionType.RUN` collections to be removed,
1100 fully removing datasets within them. Requires ``unstore=True`` as
1101 well, as an added precaution against accidental deletion. Must be
1102 `False` (default) if the collection is not a ``RUN``.
1103 unstore : `bool`, optional
1104 If `True`, remove all datasets in the collection from all
1105 datastores in which they appear.
1107 Raises
1108 ------
1109 TypeError
1110 Raised if the butler is read-only or arguments are mutually
1111 inconsistent.
1112 """
1114 # See pruneDatasets comments for more information about the logic here;
1115 # the cases are almost the same, but here we can rely on Registry to
1116 # take care of everything but Datastore deletion when we remove the
1117 # collection.
1118 if not self.isWriteable():
1119 raise TypeError("Butler is read-only.")
1120 collectionType = self.registry.getCollectionType(name)
1121 if purge and not unstore:
1122 raise PurgeWithoutUnstorePruneCollectionsError()
1123 if collectionType is CollectionType.RUN and not purge:
1124 raise RunWithoutPurgePruneCollectionsError(collectionType)
1125 if collectionType is not CollectionType.RUN and purge:
1126 raise PurgeUnsupportedPruneCollectionsError(collectionType)
1128 with self.registry.transaction():
1129 if unstore:
1130 for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
1131 if self.datastore.exists(ref):
1132 self.datastore.trash(ref)
1133 self.registry.removeCollection(name)
1134 if unstore:
1135 # Point of no return for removing artifacts
1136 self.datastore.emptyTrash()
1138 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1139 disassociate: bool = True,
1140 unstore: bool = False,
1141 tags: Optional[Iterable[str]] = None,
1142 purge: bool = False,
1143 run: Optional[str] = None):
1144 """Remove one or more datasets from a collection and/or storage.
1146 Parameters
1147 ----------
1148 refs : `~collections.abc.Iterable` of `DatasetRef`
1149 Datasets to prune. These must be "resolved" references (not just
1150 a `DatasetType` and data ID).
1151 disassociate : `bool`, optional
1152 Disassociate pruned datasets from ``self.tags`` (or the collections
1153 given via the ``tags`` argument).
1154 unstore : `bool`, optional
1155 If `True` (`False` is default) remove these datasets from all
1156 datastores known to this butler. Note that this will make it
1157 impossible to retrieve these datasets even via other collections.
1158 Datasets that are already not stored are ignored by this option.
1159 tags : `Iterable` [ `str` ], optional
1160 `~CollectionType.TAGGED` collections to disassociate the datasets
1161 from, overriding ``self.tags``. Ignored if ``disassociate`` is
1162 `False` or ``purge`` is `True`.
1163 purge : `bool`, optional
1164 If `True` (`False` is default), completely remove the dataset from
1165 the `Registry`. To prevent accidental deletions, ``purge`` may
1166 only be `True` if all of the following conditions are met:
1168 - all given datasets are in the given run;
1169 - ``disassociate`` is `True`;
1170 - ``unstore`` is `True`.
1172 This mode may remove provenance information from datasets other
1173 than those provided, and should be used with extreme care.
1174 run : `str`, optional
1175 `~CollectionType.RUN` collection to purge from, overriding
1176 ``self.run``. Ignored unless ``purge`` is `True`.
1178 Raises
1179 ------
1180 TypeError
1181 Raised if the butler is read-only, if no collection was provided,
1182 or the conditions for ``purge=True`` were not met.
1183 """
1184 if not self.isWriteable():
1185 raise TypeError("Butler is read-only.")
1186 if purge:
1187 if not disassociate:
1188 raise TypeError("Cannot pass purge=True without disassociate=True.")
1189 if not unstore:
1190 raise TypeError("Cannot pass purge=True without unstore=True.")
1191 if run is None:
1192 run = self.run
1193 if run is None:
1194 raise TypeError("No run provided but purge=True.")
1195 collectionType = self.registry.getCollectionType(run)
1196 if collectionType is not CollectionType.RUN:
1197 raise TypeError(f"Cannot purge from collection '{run}' "
1198 f"of non-RUN type {collectionType.name}.")
1199 elif disassociate:
1200 if tags is None:
1201 tags = self.tags
1202 else:
1203 tags = tuple(tags)
1204 if not tags:
1205 raise TypeError("No tags provided but disassociate=True.")
1206 for tag in tags:
1207 collectionType = self.registry.getCollectionType(tag)
1208 if collectionType is not CollectionType.TAGGED:
1209 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1210 f"of non-TAGGED type {collectionType.name}.")
1211 # Transform possibly-single-pass iterable into something we can iterate
1212 # over multiple times.
1213 refs = list(refs)
1214 # Pruning a component of a DatasetRef makes no sense since registry
1215 # doesn't know about components and datastore might not store
1216 # components in a separate file
1217 for ref in refs:
1218 if ref.datasetType.component():
1219 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1220 # We don't need an unreliable Datastore transaction for this, because
1221 # we've been extra careful to ensure that Datastore.trash only involves
1222 # mutating the Registry (it can _look_ at Datastore-specific things,
1223 # but shouldn't change them), and hence all operations here are
1224 # Registry operations.
1225 with self.registry.transaction():
1226 if unstore:
1227 for ref in refs:
1228 # There is a difference between a concrete composite
1229 # and virtual composite. In a virtual composite the
1230 # datastore is never given the top level DatasetRef. In
1231 # the concrete composite the datastore knows all the
1232 # refs and will clean up itself if asked to remove the
1233 # parent ref. We can not check configuration for this
1234 # since we can not trust that the configuration is the
1235 # same. We therefore have to ask if the ref exists or
1236 # not. This is consistent with the fact that we want
1237 # to ignore already-removed-from-datastore datasets
1238 # anyway.
1239 if self.datastore.exists(ref):
1240 self.datastore.trash(ref)
1241 if purge:
1242 self.registry.removeDatasets(refs)
1243 elif disassociate:
1244 for tag in tags:
1245 self.registry.disassociate(tag, refs)
1246 # We've exited the Registry transaction, and apparently committed.
1247 # (if there was an exception, everything rolled back, and it's as if
1248 # nothing happened - and we never get here).
1249 # Datastore artifacts are not yet gone, but they're clearly marked
1250 # as trash, so if we fail to delete now because of (e.g.) filesystem
1251 # problems we can try again later, and if manual administrative
1252 # intervention is required, it's pretty clear what that should entail:
1253 # deleting everything on disk and in private Datastore tables that is
1254 # in the dataset_location_trash table.
1255 if unstore:
1256 # Point of no return for removing artifacts
1257 self.datastore.emptyTrash()
1259 @transactional
1260 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1261 tags: Optional[Iterable[str]] = None,):
1262 """Store and register one or more datasets that already exist on disk.
1264 Parameters
1265 ----------
1266 datasets : `FileDataset`
1267 Each positional argument is a struct containing information about
1268 a file to be ingested, including its path (either absolute or
1269 relative to the datastore root, if applicable), a `DatasetRef`,
1270 and optionally a formatter class or its fully-qualified string
1271 name. If a formatter is not provided, the formatter that would be
1272 used for `put` is assumed. On successful return, all
1273 `FileDataset.refs` attributes will have their `DatasetRef.id`
1274 attribute populated and all `FileDataset.formatter` attributes will
1275 be set to the formatter class used. `FileDataset.path` attributes
1276 may be modified to put paths in whatever the datastore considers a
1277 standardized form.
1278 transfer : `str`, optional
1279 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1280 'relsymlink' or 'symlink', indicating how to transfer the file.
1281 run : `str`, optional
1282 The name of the run ingested datasets should be added to,
1283 overriding ``self.run``.
1284 tags : `Iterable` [ `str` ], optional
1285 The names of `~CollectionType.TAGGED` collections to associate
1286 the dataset with, overriding ``self.tags``. These collections
1287 must have already been added to the `Registry`.
1289 Raises
1290 ------
1291 TypeError
1292 Raised if the butler is read-only or if no run was provided.
1293 NotImplementedError
1294 Raised if the `Datastore` does not support the given transfer mode.
1295 DatasetTypeNotSupportedError
1296 Raised if one or more files to be ingested have a dataset type that
1297 is not supported by the `Datastore`.
1298 FileNotFoundError
1299 Raised if one of the given files does not exist.
1300 FileExistsError
1301 Raised if transfer is not `None` but the (internal) location the
1302 file would be moved to is already occupied.
1304 Notes
1305 -----
1306 This operation is not fully exception safe: if a database operation
1307 fails, the given `FileDataset` instances may be only partially updated.
1309 It is atomic in terms of database operations (they will either all
1310 succeed or all fail), provided that the database engine implements
1311 transactions correctly. It will attempt to be atomic in terms of
1312 filesystem operations as well, but this cannot be implemented
1313 rigorously for most datastores.
1314 """
1315 if not self.isWriteable():
1316 raise TypeError("Butler is read-only.")
1317 if run is None:
1318 if self.run is None:
1319 raise TypeError("No run provided.")
1320 run = self.run
1321 # No need to check run type, since insertDatasets will do that
1322 # (safely) for us.
1323 if tags is None:
1324 tags = self.tags
1325 else:
1326 tags = tuple(tags)
1327 for tag in tags:
1328 # Check that these are tagged collections up front, because we want
1329 # to avoid relying on Datastore transactionality to avoid modifying
1330 # the repo if there's an error later.
1331 collectionType = self.registry.getCollectionType(tag)
1332 if collectionType is not CollectionType.TAGGED:
1333 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1334 f"{collectionType.name}.")
1335 # Reorganize the inputs so they're grouped by DatasetType and then
1336 # data ID. We also include a list of DatasetRefs for each FileDataset
1337 # to hold the resolved DatasetRefs returned by the Registry, before
1338 # it's safe to swap them into FileDataset.refs.
1339 # Some type annotation aliases to make that clearer:
1340 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1341 GroupedData = MutableMapping[DatasetType, GroupForType]
1342 # The actual data structure:
1343 groupedData: GroupedData = defaultdict(dict)
1344 # And the nested loop that populates it:
1345 for dataset in datasets:
1346 # This list is intentionally shared across the inner loop, since it's
1347 # associated with `dataset`.
1348 resolvedRefs = []
1349 for ref in dataset.refs:
1350 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1352 # Now we can bulk-insert into Registry for each DatasetType.
1353 allResolvedRefs = []
1354 for datasetType, groupForType in groupedData.items():
1355 refs = self.registry.insertDatasets(datasetType,
1356 dataIds=groupForType.keys(),
1357 run=run)
1358 # Append those resolved DatasetRefs to the new lists we set up for
1359 # them.
1360 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1361 resolvedRefs.append(ref)
1363 # Go back to the original FileDatasets to replace their refs with the
1364 # new resolved ones, and also build a big list of all refs.
1365 allResolvedRefs = []
1366 for groupForType in groupedData.values():
1367 for dataset, resolvedRefs in groupForType.values():
1368 dataset.refs = resolvedRefs
1369 allResolvedRefs.extend(resolvedRefs)
1371 # Bulk-associate everything with any tagged collections.
1372 for tag in tags:
1373 self.registry.associate(tag, allResolvedRefs)
1375 # Bulk-insert everything into Datastore.
1376 self.datastore.ingest(*datasets, transfer=transfer)
1378 @contextlib.contextmanager
1379 def export(self, *, directory: Optional[str] = None,
1380 filename: Optional[str] = None,
1381 format: Optional[str] = None,
1382 transfer: Optional[str] = None) -> ContextManager[RepoExportContext]:
1383 """Export datasets from the repository represented by this `Butler`.
1385 This method is a context manager that returns a helper object
1386 (`RepoExportContext`) that is used to indicate what information from
1387 the repository should be exported.
1389 Parameters
1390 ----------
1391 directory : `str`, optional
1392 Directory dataset files should be written to if ``transfer`` is not
1393 `None`.
1394 filename : `str`, optional
1395 Name for the file that will include database information associated
1396 with the exported datasets. If this is not an absolute path and
1397 ``directory`` is not `None`, it will be written to ``directory``
1398 instead of the current working directory. Defaults to
1399 "export.{format}".
1400 format : `str`, optional
1401 File format for the database information file. If `None`, the
1402 extension of ``filename`` will be used.
1403 transfer : `str`, optional
1404 Transfer mode passed to `Datastore.export`.
1406 Raises
1407 ------
1408 TypeError
1409 Raised if the set of arguments passed is inconsistent.
1411 Examples
1412 --------
1413 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1414 methods are used to provide the iterables over data IDs and/or datasets
1415 to be exported::
1417 with butler.export(filename="exports.yaml") as export:
1418 # Export all flats, but none of the dimension element rows
1419 # (i.e. data ID information) associated with them.
1420 export.saveDatasets(butler.registry.queryDatasets("flat"),
1421 elements=())
1422 # Export all datasets that start with "deepCoadd_" and all of
1423 # their associated data ID information.
1424 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1425 """
1426 if directory is None and transfer is not None:
1427 raise TypeError("Cannot transfer without providing a directory.")
1428 if transfer == "move":
1429 raise TypeError("Transfer may not be 'move': export is read-only")
1430 if format is None:
1431 if filename is None:
1432 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1433 else:
1434 _, format = os.path.splitext(filename)
1435 elif filename is None:
1436 filename = f"export.{format}"
1437 if directory is not None:
1438 filename = os.path.join(directory, filename)
1439 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1440 with open(filename, 'w') as stream:
1441 backend = BackendClass(stream)
1442 try:
1443 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1444 directory=directory, transfer=transfer)
1445 yield helper
1446 except BaseException:
1447 raise
1448 else:
1449 helper._finish()
1451 def import_(self, *, directory: Optional[str] = None,
1452 filename: Union[str, TextIO, None] = None,
1453 format: Optional[str] = None,
1454 transfer: Optional[str] = None,
1455 skip_dimensions: Optional[Set] = None):
1456 """Import datasets exported from a different butler repository.
1458 Parameters
1459 ----------
1460 directory : `str`, optional
1461 Directory containing dataset files. If `None`, all file paths
1462 must be absolute.
1463 filename : `str` or `TextIO`, optional
1464 A stream or name of file that contains database information
1465 associated with the exported datasets. If this is a string (name) and
1466 is not an absolute path, does not exist in the current working
1467 directory, and ``directory`` is not `None`, it is assumed to be in
1468 ``directory``. Defaults to "export.{format}".
1469 format : `str`, optional
1470 File format for the database information file. If `None`, the
1471 extension of ``filename`` will be used.
1472 transfer : `str`, optional
1473 Transfer mode passed to `Datastore.ingest`.
1474 skip_dimensions : `set`, optional
1475 Names of dimensions that should be skipped and not imported.
1477 Raises
1478 ------
1479 TypeError
1480 Raised if the set of arguments passed is inconsistent, or if the
1481 butler is read-only.
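Examples
--------
A minimal sketch of loading a previously exported repository subset; the
directory and file name below are placeholders, and the butler must be
writeable::
    butler.import_(directory="/tmp/export", filename="export.yaml",
                   transfer="copy")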
1482 """
1483 if not self.isWriteable():
1484 raise TypeError("Butler is read-only.")
1485 if format is None:
1486 if filename is None:
1487 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1488 else:
1489 _, format = os.path.splitext(filename)
1490 elif filename is None:
1491 filename = f"export.{format}"
1492 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1493 filename = os.path.join(directory, filename)
1494 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1496 def doImport(importStream):
1497 backend = BackendClass(importStream, self.registry)
1498 backend.register()
1499 with self.transaction():
1500 backend.load(self.datastore, directory=directory, transfer=transfer,
1501 skip_dimensions=skip_dimensions)
1503 if isinstance(filename, str):
1504 with open(filename, "r") as stream:
1505 doImport(stream)
1506 else:
1507 doImport(filename)
1509 def validateConfiguration(self, logFailures: bool = False,
1510 datasetTypeNames: Optional[Iterable[str]] = None,
1511 ignore: Optional[Iterable[str]] = None):
1512 """Validate butler configuration.
1514 Checks that each `DatasetType` can be stored in the `Datastore`.
1516 Parameters
1517 ----------
1518 logFailures : `bool`, optional
1519 If `True`, output a log message for every validation error
1520 detected.
1521 datasetTypeNames : iterable of `str`, optional
1522 The `DatasetType` names that should be checked. This allows
1523 only a subset to be selected.
1524 ignore : iterable of `str`, optional
1525 Names of DatasetTypes to skip over. This can be used to skip
1526 known problems. If a named `DatasetType` corresponds to a
1527 composite, all components of that `DatasetType` will also be
1528 ignored.
1530 Raises
1531 ------
1532 ButlerValidationError
1533 Raised if there is some inconsistency with how this Butler
1534 is configured.
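Examples
--------
A typical check over all registered dataset types, logging each problem as
it is found; the ignored dataset type name is purely illustrative::
    butler.validateConfiguration(logFailures=True, ignore=["raw"])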
1535 """
1536 if datasetTypeNames:
1537 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1538 else:
1539 entities = list(self.registry.queryDatasetTypes())
1541 # filter out anything from the ignore list
1542 if ignore:
1543 ignore = set(ignore)
1544 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1545 else:
1546 ignore = set()
1548 # Find all the registered instruments
1549 instruments = set(
1550 record.name for record in self.registry.queryDimensionRecords("instrument")
1551 )
1553 # For each datasetType that has an instrument dimension, create
1554 # a DatasetRef for each defined instrument
1555 datasetRefs = []
1557 for datasetType in entities:
1558 if "instrument" in datasetType.dimensions:
1559 for instrument in instruments:
1560 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1561 datasetRefs.append(datasetRef)
1563 entities.extend(datasetRefs)
1565 datastoreErrorStr = None
1566 try:
1567 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1568 except ValidationError as e:
1569 datastoreErrorStr = str(e)
1571 # Also check that the LookupKeys used by the datastores match
1572 # registry and storage class definitions
1573 keys = self.datastore.getLookupKeys()
1575 failedNames = set()
1576 failedDataId = set()
1577 for key in keys:
1578 datasetType = None
1579 if key.name is not None:
1580 if key.name in ignore:
1581 continue
1583 # skip if specific datasetType names were requested and this
1584 # name does not match
1585 if datasetTypeNames and key.name not in datasetTypeNames:
1586 continue
1588 # See if it is a StorageClass or a DatasetType
1589 if key.name in self.storageClasses:
1590 pass
1591 else:
1592 try:
1593 self.registry.getDatasetType(key.name)
1594 except KeyError:
1595 if logFailures:
1596 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1597 failedNames.add(key)
1598 else:
1599 # Dimensions are checked for consistency when the Butler
1600 # is created and rendezvoused with a universe.
1601 pass
1603 # Check that the instrument is a valid instrument
1604 # Currently only support instrument so check for that
1605 if key.dataId:
1606 dataIdKeys = set(key.dataId)
1607 if set(["instrument"]) != dataIdKeys:
1608 if logFailures:
1609 log.fatal("Key '%s' has unsupported DataId override", key)
1610 failedDataId.add(key)
1611 elif key.dataId["instrument"] not in instruments:
1612 if logFailures:
1613 log.fatal("Key '%s' has unknown instrument", key)
1614 failedDataId.add(key)
1616 messages = []
1618 if datastoreErrorStr:
1619 messages.append(datastoreErrorStr)
1621 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1622 (failedDataId, "Keys with bad DataId entries: ")):
1623 if failed:
1624 msg += ", ".join(str(k) for k in failed)
1625 messages.append(msg)
1627 if messages:
1628 raise ButlerValidationError(";\n".join(messages))
1630 registry: Registry
1631 """The object that manages dataset metadata and relationships (`Registry`).
1633 Most operations that don't involve reading or writing butler datasets are
1634 accessible only via `Registry` methods.
1635 """
1637 datastore: Datastore
1638 """The object that manages actual dataset storage (`Datastore`).
1640 Direct user access to the datastore should rarely be necessary; the primary
1641 exception is the case where a `Datastore` implementation provides extra
1642 functionality beyond what the base class defines.
1643 """
1645 storageClasses: StorageClassFactory
1646 """An object that maps known storage class names to objects that fully
1647 describe them (`StorageClassFactory`).
1648 """
1650 collections: Optional[CollectionSearch]
1651 """The collections to search and any restrictions on the dataset types to
1652 search for within them, in order (`CollectionSearch`).
1653 """
1655 run: Optional[str]
1656 """Name of the run this butler writes outputs to (`str` or `None`).
1657 """
1659 tags: Tuple[str, ...]
1660 """Names of `~CollectionType.TAGGED` collections this butler associates
1661 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1662 (`tuple` [ `str` ]).
1663 """