Coverage for python/lsst/daf/butler/_butler.py : 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top-level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 Set,
44 TextIO,
45 Tuple,
46 Union,
47)
49try:
50 import boto3
51except ImportError:
52 boto3 = None
54from lsst.utils import doImport
55from .core import (
56 AmbiguousDatasetError,
57 ButlerURI,
58 Config,
59 ConfigSubset,
60 DataCoordinate,
61 DataId,
62 DatasetRef,
63 DatasetType,
64 Datastore,
65 FileDataset,
66 StorageClassFactory,
67 Timespan,
68 ValidationError,
69)
70from .core.repoRelocation import BUTLER_ROOT_TAG
71from .core.utils import transactional, getClassOf
72from ._deferredDatasetHandle import DeferredDatasetHandle
73from ._butlerConfig import ButlerConfig
74from .registry import Registry, RegistryConfig, CollectionType
75from .registry.wildcards import CollectionSearch
76from .transfers import RepoExportContext
78log = logging.getLogger(__name__)
81class ButlerValidationError(ValidationError):
82 """There is a problem with the Butler configuration."""
83 pass
86class Butler:
87 """Main entry point for the data access system.
89 Parameters
90 ----------
91 config : `ButlerConfig`, `Config` or `str`, optional
92 Configuration. Anything acceptable to the
93 `ButlerConfig` constructor. If a directory path
94 is given the configuration will be read from a ``butler.yaml`` file in
95 that location. If `None` is given default values will be used.
96 butler : `Butler`, optional
97 If provided, construct a new Butler that uses the same registry and
98 datastore as the given one, but with the given collection and run.
99 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
100 arguments.
101 collections : `Any`, optional
102 An expression specifying the collections to be searched (in order) when
103 reading datasets, and optionally dataset type restrictions on them.
104 This may be:
105 - a `str` collection name;
106 - a tuple of (collection name, *dataset type restriction*);
107 - an iterable of either of the above;
108 - a mapping from `str` to *dataset type restriction*.
110 See :ref:`daf_butler_collection_expressions` for more information,
111 including the definition of a *dataset type restriction*. All
112 collections must either already exist or be specified to be created
113 by other arguments.
114 run : `str`, optional
115 Name of the run datasets should be output to. If the run
116 does not exist, it will be created. If ``collections`` is `None`, it
117 will be set to ``[run]``. If this is not set (and ``writeable`` is
118 not set either), a read-only butler will be created.
119 tags : `Iterable` [ `str` ], optional
120 A list of `~CollectionType.TAGGED` collections that datasets should be
121 associated with in `put` or `ingest` and disassociated from in
122 `pruneDatasets`. If any of these collections does not exist, it will
123 be created.
124 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
125 A mapping from the names of new `~CollectionType.CHAINED` collections
126 to an expression identifying their child collections (which takes the
127 same form as the ``collections`` argument). Chains may be nested only
128 if children precede their parents in this mapping.
129 searchPaths : `list` of `str`, optional
130 Directory paths to search when calculating the full Butler
131 configuration. Not used if the supplied config is already a
132 `ButlerConfig`.
133 writeable : `bool`, optional
134 Explicitly sets whether the butler supports write operations. If not
135 provided, a read-write butler is created if any of ``run``, ``tags``,
136 or ``chains`` is non-empty.
138 Examples
139 --------
140 While there are many ways to control exactly how a `Butler` interacts with
141 the collections in its `Registry`, the most common cases are still simple.
143 For a read-only `Butler` that searches one collection, do::
145 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
147 For a read-write `Butler` that writes to and reads from a
148 `~CollectionType.RUN` collection::
150 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
152 The `Butler` passed to a ``PipelineTask`` is often much more complex,
153 because we want to write to one `~CollectionType.RUN` collection but read
154 from several others (as well), while defining a new
155 `~CollectionType.CHAINED` collection that combines them all::
157 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
158 collections=["u/alice/DM-50000"],
159 chains={
160 "u/alice/DM-50000": ["u/alice/DM-50000/a",
161 "u/bob/DM-49998",
162 "raw/hsc"]
163 })
165 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
166 they'll also be available from the chained collection ``u/alice/DM-50000``.
167 Datasets will be read first from that run (since it appears first in the
168 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
169 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
170 would be unnecessary. We could also construct a butler that performs
171 exactly the same `put` and `get` operations without actually creating a
172 chained collection, just by passing multiple items in ``collections``::
174 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
175 collections=["u/alice/DM-50000/a",
176 "u/bob/DM-49998",
177 "raw/hsc"])
179 Finally, one can always create a `Butler` with no collections::
181 butler = Butler("/path/to/repo", writeable=True)
183 This can be extremely useful when you just want to use ``butler.registry``,
184 e.g. for inserting dimension data or managing collections, or when the
185 collections you want to use with the butler are not consistent.
186 Passing ``writeable`` explicitly here is only necessary if you want to be
187 able to make changes to the repo - usually the value for ``writeable``
188 can be guessed from the collection arguments provided, but it defaults to
189 `False` when there are no collection arguments.
190 """
191 def __init__(self, config: Union[Config, str, None] = None, *,
192 butler: Optional[Butler] = None,
193 collections: Any = None,
194 run: Optional[str] = None,
195 tags: Iterable[str] = (),
196 chains: Optional[Mapping[str, Any]] = None,
197 searchPaths: Optional[List[str]] = None,
198 writeable: Optional[bool] = None,
199 ):
200 # Transform any single-pass iterator into an actual sequence so we
201 # can see if it's empty.
202 self.tags = tuple(tags)
203 # Load registry, datastore, etc. from config or existing butler.
204 if butler is not None:
205 if config is not None or searchPaths is not None or writeable is not None:
206 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
207 "arguments with 'butler' argument.")
208 self.registry = butler.registry
209 self.datastore = butler.datastore
210 self.storageClasses = butler.storageClasses
211 self._config = butler._config
212 else:
213 self._config = ButlerConfig(config, searchPaths=searchPaths)
214 if "root" in self._config:
215 butlerRoot = self._config["root"]
216 else:
217 butlerRoot = self._config.configDir
218 if writeable is None:
219 writeable = run is not None or chains is not None or self.tags
220 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
221 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
222 butlerRoot=butlerRoot)
223 self.storageClasses = StorageClassFactory()
224 self.storageClasses.addFromConfig(self._config)
225 # Check the many collection arguments for consistency and create any
226 # needed collections that don't exist.
227 if collections is None:
228 if run is not None:
229 collections = (run,)
230 else:
231 collections = ()
232 self.collections = CollectionSearch.fromExpression(collections)
233 if chains is None:
234 chains = {}
235 self.run = run
236 if "run" in self._config or "collection" in self._config:
237 raise ValueError("Passing a run or collection via configuration is no longer supported.")
238 if self.run is not None:
239 self.registry.registerCollection(self.run, type=CollectionType.RUN)
240 for tag in self.tags:
241 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
242 for parent, children in chains.items():
243 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
244 self.registry.setCollectionChain(parent, children)
246 GENERATION: ClassVar[int] = 3
247 """This is a Generation 3 Butler.
249 This attribute may be removed in the future, once the Generation 2 Butler
250 interface has been fully retired; it should only be used in transitional
251 code.
252 """
254 @staticmethod
255 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
256 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
257 forceConfigRoot: bool = True, outfile: Optional[str] = None,
258 overwrite: bool = False) -> Config:
259 """Create an empty data repository by adding a butler.yaml config
260 to a repository root directory.
262 Parameters
263 ----------
264 root : `str` or `ButlerURI`
265 Path or URI to the root location of the new repository. Will be
266 created if it does not exist.
267 config : `Config` or `str`, optional
268 Configuration to write to the repository, after setting any
269 root-dependent Registry or Datastore config options. Cannot
270 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
271 configuration will be used. Root-dependent config options
272 specified in this config are overwritten if ``forceConfigRoot``
273 is `True`.
274 standalone : `bool`
275 If `True`, write all expanded defaults, not just customized or
276 repository-specific settings.
277 This (mostly) decouples the repository from the default
278 configuration, insulating it from changes to the defaults (which
279 may be good or bad, depending on the nature of the changes).
280 Future *additions* to the defaults will still be picked up when
281 initializing `Butlers` to repos created with ``standalone=True``.
282 createRegistry : `bool`, optional
283 If `True` create a new Registry.
284 searchPaths : `list` of `str`, optional
285 Directory paths to search when calculating the full butler
286 configuration.
287 forceConfigRoot : `bool`, optional
288 If `False`, any values present in the supplied ``config`` that
289 would normally be reset are not overridden and will appear
290 directly in the output config. This allows non-standard overrides
291 of the root directory for a datastore or registry to be given.
292 If this parameter is `True` the values for ``root`` will be
293 forced into the resulting config if appropriate.
294 outfile : `str`, optional
295 If not-`None`, the output configuration will be written to this
296 location rather than into the repository itself. Can be a URI
297 string. Can refer to a directory that will be used to write
298 ``butler.yaml``.
299 overwrite : `bool`, optional
300 Create a new configuration file even if one already exists
301 in the specified output location. Default is to raise
302 an exception.
304 Returns
305 -------
306 config : `Config`
307 The updated `Config` instance written to the repo.
309 Raises
310 ------
311 ValueError
312 Raised if a ButlerConfig or ConfigSubset is passed instead of a
313 regular Config (as these subclasses would make it impossible to
314 support ``standalone=False``).
315 FileExistsError
316 Raised if the output config file already exists.
317 os.error
318 Raised if the directory does not exist, exists but is not a
319 directory, or cannot be created.
321 Notes
322 -----
323 Note that when ``standalone=False`` (the default), the configuration
324 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
325 construct the repository should also be used to construct any Butlers
326 to avoid configuration inconsistencies.
327 """
328 if isinstance(config, (ButlerConfig, ConfigSubset)):
329 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
331 # Ensure that the root of the repository exists or can be made
332 uri = ButlerURI(root, forceDirectory=True)
333 uri.mkdir()
335 config = Config(config)
337 # If we are creating a new repo from scratch with relative roots,
338 # do not propagate an explicit root from the config file
339 if "root" in config:
340 del config["root"]
342 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
343 datastoreClass = doImport(full["datastore", "cls"])
344 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
346 # if key exists in given config, parse it, otherwise parse the defaults
347 # in the expanded config
348 if config.get(("registry", "db")):
349 registryConfig = RegistryConfig(config)
350 else:
351 registryConfig = RegistryConfig(full)
352 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
353 if defaultDatabaseUri is not None:
354 Config.updateParameters(RegistryConfig, config, full,
355 toUpdate={"db": defaultDatabaseUri},
356 overwrite=forceConfigRoot)
357 else:
358 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
359 overwrite=forceConfigRoot)
361 if standalone:
362 config.merge(full)
363 if outfile is not None:
364 # When writing to a separate location we must include
365 # the root of the butler repo in the config else it won't know
366 # where to look.
367 config["root"] = uri.geturl()
368 configURI = outfile
369 else:
370 configURI = uri
371 config.dumpToUri(configURI, overwrite=overwrite)
373 # Create Registry and populate tables
374 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
375 return config
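A minimal sketch of how ``makeRepo`` is typically used to bootstrap a repository before constructing a `Butler` against it; the repository path is a placeholder::

    from lsst.daf.butler import Butler

    # Write butler.yaml and create an empty registry under a new
    # repository root.
    Butler.makeRepo("/path/to/new_repo")

    # Construct a writeable Butler against the new root, e.g. to register
    # collections or insert dimension data.
    butler = Butler("/path/to/new_repo", writeable=True)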
377 @classmethod
378 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
379 tags: Tuple[str, ...], writeable: bool) -> Butler:
380 """Callable used to unpickle a Butler.
382 We prefer not to use ``Butler.__init__`` directly so we can force some
383 of its many arguments to be keyword-only (note that ``__reduce__``
384 can only invoke callables with positional arguments).
386 Parameters
387 ----------
388 config : `ButlerConfig`
389 Butler configuration, already coerced into a true `ButlerConfig`
390 instance (and hence after any search paths for overrides have been
391 utilized).
392 collections : `CollectionSearch`
393 Names of collections to read from.
394 run : `str`, optional
395 Name of `~CollectionType.RUN` collection to write to.
396 tags : `tuple` [`str`]
397 Names of `~CollectionType.TAGGED` collections to associate with.
398 writeable : `bool`
399 Whether the Butler should support write operations.
401 Returns
402 -------
403 butler : `Butler`
404 A new `Butler` instance.
405 """
406 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
408 def __reduce__(self):
409 """Support pickling.
410 """
411 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
412 self.registry.isWriteable()))
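Because ``__reduce__`` delegates to `Butler._unpickle`, a `Butler` survives a `pickle` round trip (useful for handing it to worker processes) with the same configuration, collections, run, and tags; a small sketch assuming ``butler`` is an existing instance::

    import pickle

    # The restored butler reconnects using the same ButlerConfig and
    # collection settings captured by __reduce__.
    restored = pickle.loads(pickle.dumps(butler))
    assert restored.run == butler.run
    assert restored.isWriteable() == butler.isWriteable()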
414 def __str__(self):
415 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
416 self.collections, self.run, self.tags, self.datastore, self.registry)
418 def isWriteable(self) -> bool:
419 """Return `True` if this `Butler` supports write operations.
420 """
421 return self.registry.isWriteable()
423 @contextlib.contextmanager
424 def transaction(self):
425 """Context manager supporting `Butler` transactions.
427 Transactions can be nested.
428 """
429 with self.registry.transaction():
430 with self.datastore.transaction():
431 yield
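A sketch of using the context manager to make several writes atomic; ``butler`` is assumed to be writeable, and the dataset type names, data ID keys, and in-memory objects (``flat``, ``bias``) are illustrative only::

    with butler.transaction():
        # If either put raises, both registry and datastore changes made
        # inside the block are rolled back together.
        butler.put(flat, "flat", instrument="HSC", detector=0,
                   physical_filter="HSC-R")
        butler.put(bias, "bias", instrument="HSC", detector=0)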
433 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
434 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
435 """Standardize the arguments passed to several Butler APIs.
437 Parameters
438 ----------
439 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
440 When `DatasetRef` is provided, ``dataId`` should be `None`.
441 Otherwise the `DatasetType` or name thereof.
442 dataId : `dict` or `DataCoordinate`
443 A `dict` of `Dimension` link name, value pairs that label the
444 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
445 should be provided as the first argument.
446 kwds
447 Additional keyword arguments used to augment or construct a
448 `DataCoordinate`. See `DataCoordinate.standardize`
449 parameters.
451 Returns
452 -------
453 datasetType : `DatasetType`
454 A `DatasetType` instance extracted from ``datasetRefOrType``.
455 dataId : `dict` or `DataId`, optional
456 Argument that can be used (along with ``kwds``) to construct a
457 `DataId`.
459 Notes
460 -----
461 Butler APIs that conceptually need a DatasetRef also allow passing a
462 `DatasetType` (or the name of one) and a `DataId` (or a dict and
463 keyword arguments that can be used to construct one) separately. This
464 method accepts those arguments and always returns a true `DatasetType`
465 and a `DataId` or `dict`.
467 Standardization of `dict` vs `DataId` is best handled by passing the
468 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
469 generally similarly flexible.
470 """
471 externalDatasetType = None
472 internalDatasetType = None
473 if isinstance(datasetRefOrType, DatasetRef):
474 if dataId is not None or kwds:
475 raise ValueError("DatasetRef given, cannot use dataId as well")
476 externalDatasetType = datasetRefOrType.datasetType
477 dataId = datasetRefOrType.dataId
478 else:
479 # Don't check whether DataId is provided, because Registry APIs
480 # can usually construct a better error message when it wasn't.
481 if isinstance(datasetRefOrType, DatasetType):
482 externalDatasetType = datasetRefOrType
483 else:
484 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
486 # Check that they are self-consistent
487 if externalDatasetType is not None:
488 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
489 if externalDatasetType != internalDatasetType:
490 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
491 f"registry definition ({internalDatasetType})")
493 return internalDatasetType, dataId
495 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
496 dataId: Optional[DataId] = None, *,
497 collections: Any = None,
498 allowUnresolved: bool = False,
499 **kwds: Any) -> DatasetRef:
500 """Shared logic for methods that start with a search for a dataset in
501 the registry.
503 Parameters
504 ----------
505 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
506 When `DatasetRef` is provided, ``dataId`` should be `None`.
507 Otherwise the `DatasetType` or name thereof.
508 dataId : `dict` or `DataCoordinate`, optional
509 A `dict` of `Dimension` link name, value pairs that label the
510 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
511 should be provided as the first argument.
512 collections : Any, optional
513 Collections to be searched, overriding ``self.collections``.
514 Can be any of the types supported by the ``collections`` argument
515 to butler construction.
516 allowUnresolved : `bool`, optional
517 If `True`, return an unresolved `DatasetRef` if finding a resolved
518 one in the `Registry` fails. Defaults to `False`.
519 kwds
520 Additional keyword arguments used to augment or construct a
521 `DataId`. See `DataId` parameters.
523 Returns
524 -------
525 ref : `DatasetRef`
526 A reference to the dataset identified by the given arguments.
528 Raises
529 ------
530 LookupError
531 Raised if no matching dataset exists in the `Registry` (and
532 ``allowUnresolved is False``).
533 ValueError
534 Raised if a resolved `DatasetRef` was passed as an input, but it
535 differs from the one found in the registry.
536 TypeError
537 Raised if no collections were provided.
538 """
539 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
540 if isinstance(datasetRefOrType, DatasetRef):
541 idNumber = datasetRefOrType.id
542 else:
543 idNumber = None
544 timespan: Optional[Timespan] = None
545 if datasetType.isCalibration():
546 # Because this is a calibration dataset, first try to
547 # standardize the data ID without restricting the dimensions to
548 # those of the dataset type requested, because there may be extra
549 # dimensions that provide temporal information for a validity-range
550 # lookup.
551 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, **kwds)
552 if dataId.graph.temporal:
553 dataId = self.registry.expandDataId(dataId)
554 timespan = dataId.timespan
555 else:
556 # Standardize the data ID to just the dimensions of the dataset
557 # type instead of letting registry.findDataset do it, so we get the
558 # result even if no dataset is found.
559 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, **kwds)
560 if collections is None:
561 collections = self.collections
562 if not collections:
563 raise TypeError("No input collections provided.")
564 else:
565 collections = CollectionSearch.fromExpression(collections)
566 # Always lookup the DatasetRef, even if one is given, to ensure it is
567 # present in the current collection.
568 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
569 if ref is None:
570 if allowUnresolved:
571 return DatasetRef(datasetType, dataId)
572 else:
573 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
574 f"could not be found in collections {collections}.")
575 if idNumber is not None and idNumber != ref.id:
576 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
577 f"id ({ref.id}) in registry in collections {collections}.")
578 return ref
580 @transactional
581 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
582 dataId: Optional[DataId] = None, *,
583 run: Optional[str] = None,
584 tags: Optional[Iterable[str]] = None,
585 **kwds: Any) -> DatasetRef:
586 """Store and register a dataset.
588 Parameters
589 ----------
590 obj : `object`
591 The dataset.
592 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
593 When `DatasetRef` is provided, ``dataId`` should be `None`.
594 Otherwise the `DatasetType` or name thereof.
595 dataId : `dict` or `DataCoordinate`
596 A `dict` of `Dimension` link name, value pairs that label the
597 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
598 should be provided as the second argument.
599 run : `str`, optional
600 The name of the run the dataset should be added to, overriding
601 ``self.run``.
602 tags : `Iterable` [ `str` ], optional
603 The names of `~CollectionType.TAGGED` collections to associate
604 the dataset with, overriding ``self.tags``. These collections
605 must have already been added to the `Registry`.
606 kwds
607 Additional keyword arguments used to augment or construct a
608 `DataCoordinate`. See `DataCoordinate.standardize`
609 parameters.
611 Returns
612 -------
613 ref : `DatasetRef`
614 A reference to the stored dataset, updated with the correct id if
615 given.
617 Raises
618 ------
619 TypeError
620 Raised if the butler is read-only or if no run has been provided.
621 """
622 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
623 if not self.isWriteable():
624 raise TypeError("Butler is read-only.")
625 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
626 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
627 raise ValueError("DatasetRef must not be in registry, must have None id")
629 if run is None:
630 if self.run is None:
631 raise TypeError("No run provided.")
632 run = self.run
633 # No need to check type for run; first thing we do is
634 # insertDatasets, and that will check for us.
636 if tags is None:
637 tags = self.tags
638 else:
639 tags = tuple(tags)
640 for tag in tags:
641 # Check that these are tagged collections up front, because we want
642 # to avoid relying on Datastore transactionality to avoid modifying
643 # the repo if there's an error later.
644 collectionType = self.registry.getCollectionType(tag)
645 if collectionType is not CollectionType.TAGGED:
646 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
647 f"{collectionType.name}.")
649 # Add Registry Dataset entry.
650 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
651 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
653 # Add Datastore entry.
654 self.datastore.put(obj, ref)
656 for tag in tags:
657 self.registry.associate(tag, [ref])
659 return ref
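For illustration, a hedged sketch of a typical `put`; the run name, dataset type ("calexp"), and data ID values are placeholders, the dataset type is assumed to be registered already, and ``exposure`` stands for an in-memory object matching its storage class::

    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(exposure, "calexp",
                     instrument="HSC", visit=903334, detector=42)
    # The returned DatasetRef is resolved (ref.id is set) and can be passed
    # to getDirect() or recorded for provenance.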
661 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
662 """Retrieve a stored dataset.
664 Unlike `Butler.get`, this method allows datasets outside the Butler's
665 collection to be read as long as the `DatasetRef` that identifies them
666 can be obtained separately.
668 Parameters
669 ----------
670 ref : `DatasetRef`
671 Resolved reference to an already stored dataset.
672 parameters : `dict`
673 Additional StorageClass-defined options to control reading,
674 typically used to efficiently read only a subset of the dataset.
676 Returns
677 -------
678 obj : `object`
679 The dataset.
680 """
681 return self.datastore.get(ref, parameters=parameters)
683 def getDirectDeferred(self, ref: DatasetRef, *,
684 parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
685 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
686 from a resolved `DatasetRef`.
688 Parameters
689 ----------
690 ref : `DatasetRef`
691 Resolved reference to an already stored dataset.
692 parameters : `dict`
693 Additional StorageClass-defined options to control reading,
694 typically used to efficiently read only a subset of the dataset.
696 Returns
697 -------
698 obj : `DeferredDatasetHandle`
699 A handle which can be used to retrieve a dataset at a later time.
701 Raises
702 ------
703 AmbiguousDatasetError
704 Raised if ``ref.id is None``, i.e. the reference is unresolved.
705 """
706 if ref.id is None:
707 raise AmbiguousDatasetError(
708 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
709 )
710 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
712 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
713 dataId: Optional[DataId] = None, *,
714 parameters: Union[dict, None] = None,
715 collections: Any = None,
716 **kwds: Any) -> DeferredDatasetHandle:
717 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
718 after an immediate registry lookup.
720 Parameters
721 ----------
722 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
723 When `DatasetRef` is provided, ``dataId`` should be `None`.
724 Otherwise the `DatasetType` or name thereof.
725 dataId : `dict` or `DataCoordinate`, optional
726 A `dict` of `Dimension` link name, value pairs that label the
727 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
728 should be provided as the first argument.
729 parameters : `dict`
730 Additional StorageClass-defined options to control reading,
731 typically used to efficiently read only a subset of the dataset.
732 collections : Any, optional
733 Collections to be searched, overriding ``self.collections``.
734 Can be any of the types supported by the ``collections`` argument
735 to butler construction.
736 kwds
737 Additional keyword arguments used to augment or construct a
738 `DataId`. See `DataId` parameters.
740 Returns
741 -------
742 obj : `DeferredDatasetHandle`
743 A handle which can be used to retrieve a dataset at a later time.
745 Raises
746 ------
747 LookupError
748 Raised if no matching dataset exists in the `Registry`.
750 ValueError
751 Raised if a resolved `DatasetRef` was passed as an input, but it
752 differs from the one found in the registry.
753 TypeError
754 Raised if no collections were provided.
755 """
756 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
757 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
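A sketch contrasting `getDeferred` with `get`: the registry lookup (and its error reporting) happens immediately, but the possibly large dataset is only read when the handle is used. The dataset type and data ID values are placeholders::

    handle = butler.getDeferred("calexp", instrument="HSC",
                                visit=903334, detector=42)
    # ... decide later whether the pixels are actually needed ...
    exposure = handle.get()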
759 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
760 dataId: Optional[DataId] = None, *,
761 parameters: Optional[Dict[str, Any]] = None,
762 collections: Any = None,
763 **kwds: Any) -> Any:
764 """Retrieve a stored dataset.
766 Parameters
767 ----------
768 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
769 When `DatasetRef` is provided, ``dataId`` should be `None`.
770 Otherwise the `DatasetType` or name thereof.
771 dataId : `dict` or `DataCoordinate`
772 A `dict` of `Dimension` link name, value pairs that label the
773 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
774 should be provided as the first argument.
775 parameters : `dict`
776 Additional StorageClass-defined options to control reading,
777 typically used to efficiently read only a subset of the dataset.
778 collections : Any, optional
779 Collections to be searched, overriding ``self.collections``.
780 Can be any of the types supported by the ``collections`` argument
781 to butler construction.
782 kwds
783 Additional keyword arguments used to augment or construct a
784 `DataCoordinate`. See `DataCoordinate.standardize`
785 parameters.
787 Returns
788 -------
789 obj : `object`
790 The dataset.
792 Raises
793 ------
794 ValueError
795 Raised if a resolved `DatasetRef` was passed as an input, but it
796 differs from the one found in the registry.
797 LookupError
798 Raised if no matching dataset exists in the `Registry`.
799 TypeError
800 Raised if no collections were provided.
802 Notes
803 -----
804 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
805 this method requires that the given data ID include temporal dimensions
806 beyond the dimensions of the dataset type itself, in order to find the
807 dataset with the appropriate validity range. For example, a "bias"
808 dataset with native dimensions ``{instrument, detector}`` could be
809 fetched with a ``{instrument, detector, exposure}`` data ID, because
810 ``exposure`` is a temporal dimension.
811 """
812 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
813 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
814 return self.getDirect(ref, parameters=parameters)
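A sketch of common read patterns; dataset type names, data ID keys, the ``parameters`` contents (and the ``bbox`` object), and the collection name are illustrative and depend on the repository and storage class in use::

    # Read a dataset found in self.collections.
    calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=42)

    # Read a subset (if the storage class defines a "bbox" parameter) from
    # an explicitly named collection.
    cutout = butler.get("calexp",
                        dataId={"instrument": "HSC", "visit": 903334,
                                "detector": 42},
                        parameters={"bbox": bbox},
                        collections="u/alice/DM-50000")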
816 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
817 dataId: Optional[DataId] = None, *,
818 predict: bool = False,
819 collections: Any = None,
820 run: Optional[str] = None,
821 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
822 """Returns the URIs associated with the dataset.
824 Parameters
825 ----------
826 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
827 When `DatasetRef` is provided, ``dataId`` should be `None`.
828 Otherwise the `DatasetType` or name thereof.
829 dataId : `dict` or `DataCoordinate`
830 A `dict` of `Dimension` link name, value pairs that label the
831 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
832 should be provided as the first argument.
833 predict : `bool`
834 If `True`, allow URIs to be returned for datasets that have not
835 been written.
836 collections : Any, optional
837 Collections to be searched, overriding ``self.collections``.
838 Can be any of the types supported by the ``collections`` argument
839 to butler construction.
840 run : `str`, optional
841 Run to use for predictions, overriding ``self.run``.
842 kwds
843 Additional keyword arguments used to augment or construct a
844 `DataCoordinate`. See `DataCoordinate.standardize`
845 parameters.
847 Returns
848 -------
849 primary : `ButlerURI`
850 The URI to the primary artifact associated with this dataset.
851 If the dataset was disassembled within the datastore this
852 may be `None`.
853 components : `dict`
854 URIs to any components associated with the dataset artifact.
855 Can be empty if there are no components.
856 """
857 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
858 collections=collections, **kwds)
859 if ref.id is None: # only possible if predict is True
860 if run is None:
861 run = self.run
862 if run is None:
863 raise TypeError("Cannot predict location with run=None.")
864 # Lie about ID, because we can't guess it, and only
865 # Datastore.getURIs() will ever see it (and it doesn't use it).
866 ref = ref.resolved(id=0, run=run)
867 return self.datastore.getURIs(ref, predict)
869 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
870 dataId: Optional[DataId] = None, *,
871 predict: bool = False,
872 collections: Any = None,
873 run: Optional[str] = None,
874 **kwds: Any) -> ButlerURI:
875 """Return the URI to the Dataset.
877 Parameters
878 ----------
879 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
880 When `DatasetRef` is provided, ``dataId`` should be `None`.
881 Otherwise the `DatasetType` or name thereof.
882 dataId : `dict` or `DataCoordinate`
883 A `dict` of `Dimension` link name, value pairs that label the
884 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
885 should be provided as the first argument.
886 predict : `bool`
887 If `True`, allow URIs to be returned for datasets that have not
888 been written.
889 collections : Any, optional
890 Collections to be searched, overriding ``self.collections``.
891 Can be any of the types supported by the ``collections`` argument
892 to butler construction.
893 run : `str`, optional
894 Run to use for predictions, overriding ``self.run``.
895 kwds
896 Additional keyword arguments used to augment or construct a
897 `DataCoordinate`. See `DataCoordinate.standardize`
898 parameters.
900 Returns
901 -------
902 uri : `ButlerURI`
903 URI pointing to the Dataset within the datastore. If the
904 Dataset does not exist in the datastore, and if ``predict`` is
905 `True`, the URI will be a prediction and will include a URI
906 fragment "#predicted".
907 If the datastore does not have entities that relate well
908 to the concept of a URI, the returned URI string will be
909 descriptive. The returned URI is not guaranteed to be obtainable.
911 Raises
912 ------
913 LookupError
914 Raised if a URI has been requested for a dataset that does not
915 exist and guessing is not allowed.
916 ValueError
917 Raised if a resolved `DatasetRef` was passed as an input, but it
918 differs from the one found in the registry.
919 TypeError
920 Raised if no collections were provided.
921 RuntimeError
922 Raised if a URI is requested for a dataset that consists of
923 multiple artifacts.
924 """
925 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
926 collections=collections, run=run, **kwds)
928 if primary is None or components:
929 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
930 "Use Butler.getURIs() instead.")
931 return primary
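A sketch of looking up artifact locations, including a predicted URI for a dataset that has not been written yet; all names and values are placeholders::

    uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42)
    print(uri.geturl())

    # Predict where a future output would be written in the given run.
    future_uri = butler.getURI("calexp", instrument="HSC", visit=903334,
                               detector=42, predict=True,
                               run="u/alice/DM-50000/a")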
933 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
934 dataId: Optional[DataId] = None, *,
935 collections: Any = None,
936 **kwds: Any) -> bool:
937 """Return True if the Dataset is actually present in the Datastore.
939 Parameters
940 ----------
941 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
942 When `DatasetRef` is provided, ``dataId`` should be `None`.
943 Otherwise the `DatasetType` or name thereof.
944 dataId : `dict` or `DataCoordinate`
945 A `dict` of `Dimension` link name, value pairs that label the
946 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
947 should be provided as the first argument.
948 collections : Any, optional
949 Collections to be searched, overriding ``self.collections``.
950 Can be any of the types supported by the ``collections`` argument
951 to butler construction.
952 kwds
953 Additional keyword arguments used to augment or construct a
954 `DataCoordinate`. See `DataCoordinate.standardize`
955 parameters.
957 Raises
958 ------
959 LookupError
960 Raised if the dataset is not even present in the Registry.
961 ValueError
962 Raised if a resolved `DatasetRef` was passed as an input, but it
963 differs from the one found in the registry.
964 TypeError
965 Raised if no collections were provided.
966 """
967 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
968 return self.datastore.exists(ref)
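A sketch of an existence check; because a dataset unknown to the registry raises `LookupError` rather than returning `False`, callers that are not sure the dataset was ever registered may want to catch it. The dataset type and data ID are placeholders::

    try:
        stored = butler.datasetExists("calexp", instrument="HSC",
                                      visit=903334, detector=42)
    except LookupError:
        stored = False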
970 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
971 """Remove a collection and possibly prune datasets within it.
973 Parameters
974 ----------
975 name : `str`
976 Name of the collection to remove. If this is a
977 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
978 datasets within the collection are not modified unless ``unstore``
979 is `True`. If this is a `~CollectionType.RUN` collection,
980 ``purge`` and ``unstore`` must be `True`, and all datasets in it
981 are fully removed from the data repository.
982 purge : `bool`, optional
983 If `True`, permit `~CollectionType.RUN` collections to be removed,
984 fully removing datasets within them. Requires ``unstore=True`` as
985 well as an added precaution against accidental deletion. Must be
986 `False` (default) if the collection is not a ``RUN``.
987 unstore : `bool`, optional
988 If `True`, remove all datasets in the collection from all
989 datastores in which they appear.
991 Raises
992 ------
993 TypeError
994 Raised if the butler is read-only or arguments are mutually
995 inconsistent.
996 """
997 # See pruneDatasets comments for more information about the logic here;
998 # the cases are almost the same, but here we can rely on Registry to
999 # take care of everything but Datastore deletion when we remove the
1000 # collection.
1001 if not self.isWriteable():
1002 raise TypeError("Butler is read-only.")
1003 if purge and not unstore:
1004 raise TypeError("Cannot pass purge=True without unstore=True.")
1005 collectionType = self.registry.getCollectionType(name)
1006 if collectionType is CollectionType.RUN and not purge:
1007 raise TypeError(f"Cannot prune RUN collection {name} without purge=True.")
1008 if collectionType is not CollectionType.RUN and purge:
1009 raise TypeError(f"Cannot prune {collectionType.name} collection {name} with purge=True.")
1010 with self.registry.transaction():
1011 if unstore:
1012 for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
1013 if self.datastore.exists(ref):
1014 self.datastore.trash(ref)
1015 self.registry.removeCollection(name)
1016 if unstore:
1017 # Point of no return for removing artifacts
1018 self.datastore.emptyTrash()
1020 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
1021 disassociate: bool = True,
1022 unstore: bool = False,
1023 tags: Optional[Iterable[str]] = None,
1024 purge: bool = False,
1025 run: Optional[str] = None):
1026 """Remove one or more datasets from a collection and/or storage.
1028 Parameters
1029 ----------
1030 refs : `~collections.abc.Iterable` of `DatasetRef`
1031 Datasets to prune. These must be "resolved" references (not just
1032 a `DatasetType` and data ID).
1033 disassociate : `bool`, optional
1034 Disassociate pruned datasets from ``self.tags`` (or the collections
1035 given via the ``tags`` argument).
1036 unstore : `bool`, optional
1037 If `True` (`False` is default) remove these datasets from all
1038 datastores known to this butler. Note that this will make it
1039 impossible to retrieve these datasets even via other collections.
1040 Datasets that are already not stored are ignored by this option.
1041 tags : `Iterable` [ `str` ], optional
1042 `~CollectionType.TAGGED` collections to disassociate the datasets
1043 from, overriding ``self.tags``. Ignored if ``disassociate`` is
1044 `False` or ``purge`` is `True`.
1045 purge : `bool`, optional
1046 If `True` (`False` is default), completely remove the dataset from
1047 the `Registry`. To prevent accidental deletions, ``purge`` may
1048 only be `True` if all of the following conditions are met:
1050 - All given datasets are in the given run;
1051 - ``disassociate`` is `True`;
1052 - ``unstore`` is `True`.
1054 This mode may remove provenance information from datasets other
1055 than those provided, and should be used with extreme care.
1056 run : `str`, optional
1057 `~CollectionType.RUN` collection to purge from, overriding
1058 ``self.run``. Ignored unless ``purge`` is `True`.
1060 Raises
1061 ------
1062 TypeError
1063 Raised if the butler is read-only, if no collection was provided,
1064 or the conditions for ``purge=True`` were not met.
1065 """
1066 if not self.isWriteable():
1067 raise TypeError("Butler is read-only.")
1068 if purge:
1069 if not disassociate:
1070 raise TypeError("Cannot pass purge=True without disassociate=True.")
1071 if not unstore:
1072 raise TypeError("Cannot pass purge=True without unstore=True.")
1073 if run is None:
1074 run = self.run
1075 if run is None:
1076 raise TypeError("No run provided but purge=True.")
1077 collectionType = self.registry.getCollectionType(run)
1078 if collectionType is not CollectionType.RUN:
1079 raise TypeError(f"Cannot purge from collection '{run}' "
1080 f"of non-RUN type {collectionType.name}.")
1081 elif disassociate:
1082 if tags is None:
1083 tags = self.tags
1084 else:
1085 tags = tuple(tags)
1086 if not tags:
1087 raise TypeError("No tags provided but disassociate=True.")
1088 for tag in tags:
1089 collectionType = self.registry.getCollectionType(tag)
1090 if collectionType is not CollectionType.TAGGED:
1091 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1092 f"of non-TAGGED type {collectionType.name}.")
1093 # Transform possibly-single-pass iterable into something we can iterate
1094 # over multiple times.
1095 refs = list(refs)
1096 # Pruning a component of a DatasetRef makes no sense since registry
1097 # doesn't know about components and datastore might not store
1098 # components in a separate file
1099 for ref in refs:
1100 if ref.datasetType.component():
1101 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1102 # We don't need an unreliable Datastore transaction for this, because
1103 # we've been extra careful to ensure that Datastore.trash only involves
1104 # mutating the Registry (it can _look_ at Datastore-specific things,
1105 # but shouldn't change them), and hence all operations here are
1106 # Registry operations.
1107 with self.registry.transaction():
1108 if unstore:
1109 for ref in refs:
1110 # There is a difference between a concrete composite
1111 # and virtual composite. In a virtual composite the
1112 # datastore is never given the top level DatasetRef. In
1113 # the concrete composite the datastore knows all the
1114 # refs and will clean up itself if asked to remove the
1115 # parent ref. We can not check configuration for this
1116 # since we can not trust that the configuration is the
1117 # same. We therefore have to ask if the ref exists or
1118 # not. This is consistent with the fact that we want
1119 # to ignore already-removed-from-datastore datasets
1120 # anyway.
1121 if self.datastore.exists(ref):
1122 self.datastore.trash(ref)
1123 if purge:
1124 self.registry.removeDatasets(refs)
1125 elif disassociate:
1126 for tag in tags:
1127 self.registry.disassociate(tag, refs)
1128 # We've exited the Registry transaction, and apparently committed.
1129 # (if there was an exception, everything rolled back, and it's as if
1130 # nothing happened - and we never get here).
1131 # Datastore artifacts are not yet gone, but they're clearly marked
1132 # as trash, so if we fail to delete now because of (e.g.) filesystem
1133 # problems we can try again later, and if manual administrative
1134 # intervention is required, it's pretty clear what that should entail:
1135 # deleting everything on disk and in private Datastore tables that is
1136 # in the dataset_location_trash table.
1137 if unstore:
1138 # Point of no return for removing artifacts
1139 self.datastore.emptyTrash()
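A sketch of fully deleting the outputs of an unwanted run, using `Registry.queryDatasets` to gather resolved references so the ``purge=True`` preconditions are met; the run name is a placeholder::

    bad_run = "u/alice/DM-50000/bad"
    refs = list(butler.registry.queryDatasets(..., collections=bad_run))
    butler.pruneDatasets(refs, disassociate=True, unstore=True,
                         purge=True, run=bad_run)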
1141 @transactional
1142 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1143 tags: Optional[Iterable[str]] = None,):
1144 """Store and register one or more datasets that already exist on disk.
1146 Parameters
1147 ----------
1148 datasets : `FileDataset`
1149 Each positional argument is a struct containing information about
1150 a file to be ingested, including its path (either absolute or
1151 relative to the datastore root, if applicable), a `DatasetRef`,
1152 and optionally a formatter class or its fully-qualified string
1153 name. If a formatter is not provided, the formatter that would be
1154 used for `put` is assumed. On successful return, all
1155 `FileDataset.ref` attributes will have their `DatasetRef.id`
1156 attribute populated and all `FileDataset.formatter` attributes will
1157 be set to the formatter class used. `FileDataset.path` attributes
1158 may be modified to put paths in whatever the datastore considers a
1159 standardized form.
1160 transfer : `str`, optional
1161 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1162 'relsymlink' or 'symlink', indicating how to transfer the file.
1163 run : `str`, optional
1164 The name of the run ingested datasets should be added to,
1165 overriding ``self.run``.
1166 tags : `Iterable` [ `str` ], optional
1167 The names of `~CollectionType.TAGGED` collections to associate
1168 the dataset with, overriding ``self.tags``. These collections
1169 must have already been added to the `Registry`.
1171 Raises
1172 ------
1173 TypeError
1174 Raised if the butler is read-only or if no run was provided.
1175 NotImplementedError
1176 Raised if the `Datastore` does not support the given transfer mode.
1177 DatasetTypeNotSupportedError
1178 Raised if one or more files to be ingested have a dataset type that
1179 is not supported by the `Datastore`.
1180 FileNotFoundError
1181 Raised if one of the given files does not exist.
1182 FileExistsError
1183 Raised if transfer is not `None` but the (internal) location the
1184 file would be moved to is already occupied.
1186 Notes
1187 -----
1188 This operation is not fully exception safe: if a database operation
1189 fails, the given `FileDataset` instances may be only partially updated.
1191 It is atomic in terms of database operations (they will either all
1192 succeed or all fail), provided the database engine implements
1193 transactions correctly. It will attempt to be atomic in terms of
1194 filesystem operations as well, but this cannot be implemented
1195 rigorously for most datastores.
1196 """
1197 if not self.isWriteable():
1198 raise TypeError("Butler is read-only.")
1199 if run is None:
1200 if self.run is None:
1201 raise TypeError("No run provided.")
1202 run = self.run
1203 # No need to check run type, since insertDatasets will do that
1204 # (safely) for us.
1205 if tags is None:
1206 tags = self.tags
1207 else:
1208 tags = tuple(tags)
1209 for tag in tags:
1210 # Check that these are tagged collections up front, because we want
1211 # to avoid relying on Datastore transactionality to avoid modifying
1212 # the repo if there's an error later.
1213 collectionType = self.registry.getCollectionType(tag)
1214 if collectionType is not CollectionType.TAGGED:
1215 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1216 f"{collectionType.name}.")
1217 # Reorganize the inputs so they're grouped by DatasetType and then
1218 # data ID. We also include a list of DatasetRefs for each FileDataset
1219 # to hold the resolved DatasetRefs returned by the Registry, before
1220 # it's safe to swap them into FileDataset.refs.
1221 # Some type annotation aliases to make that clearer:
1222 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1223 GroupedData = MutableMapping[DatasetType, GroupForType]
1224 # The actual data structure:
1225 groupedData: GroupedData = defaultdict(dict)
1226 # And the nested loop that populates it:
1227 for dataset in datasets:
1228 # This list is intentionally shared across the inner loop, since it's
1229 # associated with `dataset`.
1230 resolvedRefs = []
1231 for ref in dataset.refs:
1232 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1234 # Now we can bulk-insert into Registry for each DatasetType.
1235 allResolvedRefs = []
1236 for datasetType, groupForType in groupedData.items():
1237 refs = self.registry.insertDatasets(datasetType,
1238 dataIds=groupForType.keys(),
1239 run=run)
1240 # Append those resolved DatasetRefs to the new lists we set up for
1241 # them.
1242 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1243 resolvedRefs.append(ref)
1245 # Go back to the original FileDatasets to replace their refs with the
1246 # new resolved ones, and also build a big list of all refs.
1247 allResolvedRefs = []
1248 for groupForType in groupedData.values():
1249 for dataset, resolvedRefs in groupForType.values():
1250 dataset.refs = resolvedRefs
1251 allResolvedRefs.extend(resolvedRefs)
1253 # Bulk-associate everything with any tagged collections.
1254 for tag in tags:
1255 self.registry.associate(tag, allResolvedRefs)
1257 # Bulk-insert everything into Datastore.
1258 self.datastore.ingest(*datasets, transfer=transfer)
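A sketch of ingesting an existing file by symlinking it into the datastore; it assumes a dataset type named "raw" is already registered, that the data ID values already exist in the registry, and that the file path and run name are placeholders::

    from lsst.daf.butler import DatasetRef, FileDataset

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType, {"instrument": "HSC", "detector": 10,
                                   "exposure": 903334})
    butler.ingest(FileDataset(path="/data/HSC-903334-10.fits", refs=[ref]),
                  transfer="symlink", run="HSC/raw/all")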
1260 @contextlib.contextmanager
1261 def export(self, *, directory: Optional[str] = None,
1262 filename: Optional[str] = None,
1263 format: Optional[str] = None,
1264 transfer: Optional[str] = None) -> ContextManager[RepoExportContext]:
1265 """Export datasets from the repository represented by this `Butler`.
1267 This method is a context manager that returns a helper object
1268 (`RepoExportContext`) that is used to indicate what information from
1269 the repository should be exported.
1271 Parameters
1272 ----------
1273 directory : `str`, optional
1274 Directory dataset files should be written to if ``transfer`` is not
1275 `None`.
1276 filename : `str`, optional
1277 Name for the file that will include database information associated
1278 with the exported datasets. If this is not an absolute path and
1279 ``directory`` is not `None`, it will be written to ``directory``
1280 instead of the current working directory. Defaults to
1281 "export.{format}".
1282 format : `str`, optional
1283 File format for the database information file. If `None`, the
1284 extension of ``filename`` will be used.
1285 transfer : `str`, optional
1286 Transfer mode passed to `Datastore.export`.
1288 Raises
1289 ------
1290 TypeError
1291 Raised if the set of arguments passed is inconsistent.
1293 Examples
1294 --------
1295 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1296 methods are used to provide the iterables over data IDs and/or datasets
1297 to be exported::
1299 with butler.export(filename="exports.yaml") as export:
1300 # Export all flats, but none of the dimension element rows
1301 # (i.e. data ID information) associated with them.
1302 export.saveDatasets(butler.registry.queryDatasets("flat"),
1303 elements=())
1304 # Export all datasets that start with "deepCoadd_" and all of
1305 # their associated data ID information.
1306 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1307 """
1308 if directory is None and transfer is not None:
1309 raise TypeError("Cannot transfer without providing a directory.")
1310 if transfer == "move":
1311 raise TypeError("Transfer may not be 'move': export is read-only")
1312 if format is None:
1313 if filename is None:
1314 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1315 else:
1316 _, format = os.path.splitext(filename)
1317 elif filename is None:
1318 filename = f"export.{format}"
1319 if directory is not None:
1320 filename = os.path.join(directory, filename)
1321 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1322 with open(filename, 'w') as stream:
1323 backend = BackendClass(stream)
1324 try:
1325 helper = RepoExportContext(self.registry, self.datastore, backend=backend,
1326 directory=directory, transfer=transfer)
1327 yield helper
1328 except BaseException:
1329 raise
1330 else:
1331 helper._finish()
1333 def import_(self, *, directory: Optional[str] = None,
1334 filename: Union[str, TextIO, None] = None,
1335 format: Optional[str] = None,
1336 transfer: Optional[str] = None,
1337 skip_dimensions: Optional[Set] = None):
1338 """Import datasets exported from a different butler repository.
1340 Parameters
1341 ----------
1342 directory : `str`, optional
1343 Directory containing dataset files. If `None`, all file paths
1344 must be absolute.
1345 filename : `str` or `TextIO`, optional
1346 A stream or name of file that contains database information
1347 associated with the exported datasets. If this is a string (name) and
1348 is not an absolute path, does not exist in the current working
1349 directory, and ``directory`` is not `None`, it is assumed to be in
1350 ``directory``. Defaults to "export.{format}".
1351 format : `str`, optional
1352 File format for the database information file. If `None`, the
1353 extension of ``filename`` will be used.
1354 transfer : `str`, optional
1355 Transfer mode passed to `Datastore.ingest`.
1356 skip_dimensions : `set`, optional
1357 Names of dimensions that should be skipped and not imported.
1359 Raises
1360 ------
1361 TypeError
1362 Raised if the set of arguments passed is inconsistent, or if the
1363 butler is read-only.
1364 """
1365 if not self.isWriteable():
1366 raise TypeError("Butler is read-only.")
1367 if format is None:
1368 if filename is None:
1369 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1370 else:
1371 _, format = os.path.splitext(filename)
1372 elif filename is None:
1373 filename = f"export.{format}"
1374 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1375 filename = os.path.join(directory, filename)
1376 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1378 def doImport(importStream):
1379 backend = BackendClass(importStream, self.registry)
1380 backend.register()
1381 with self.transaction():
1382 backend.load(self.datastore, directory=directory, transfer=transfer,
1383 skip_dimensions=skip_dimensions)
1385 if isinstance(filename, str):
1386 with open(filename, "r") as stream:
1387 doImport(stream)
1388 else:
1389 doImport(filename)
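A sketch of moving content between two repositories with the `export` / `import_` pair; the paths, dataset type name, and the two butlers are placeholders (``source_butler`` must be readable and ``dest_butler`` writeable)::

    # In the source repository: write export.yaml plus copies of the files.
    with source_butler.export(directory="/tmp/transfer", filename="export.yaml",
                              transfer="copy") as export:
        export.saveDatasets(source_butler.registry.queryDatasets("flat",
                                                                 collections=...))

    # In the destination repository: load the database records and symlink
    # the copied files into the destination datastore.
    dest_butler.import_(directory="/tmp/transfer", filename="export.yaml",
                        transfer="symlink")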
1391 def validateConfiguration(self, logFailures: bool = False,
1392 datasetTypeNames: Optional[Iterable[str]] = None,
1393 ignore: Optional[Iterable[str]] = None):
1394 """Validate butler configuration.
1396 Checks that each `DatasetType` can be stored in the `Datastore`.
1398 Parameters
1399 ----------
1400 logFailures : `bool`, optional
1401 If `True`, output a log message for every validation error
1402 detected.
1403 datasetTypeNames : iterable of `str`, optional
1404 The `DatasetType` names that should be checked. This allows
1405 only a subset to be selected.
1406 ignore : iterable of `str`, optional
1407 Names of DatasetTypes to skip over. This can be used to skip
1408 known problems. If a named `DatasetType` corresponds to a
1409 composite, all components of that `DatasetType` will also be
1410 ignored.
1412 Raises
1413 ------
1414 ButlerValidationError
1415 Raised if there is some inconsistency with how this Butler
1416 is configured.
1417 """
1418 if datasetTypeNames:
1419 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1420 else:
1421 entities = list(self.registry.queryDatasetTypes())
1423 # filter out anything from the ignore list
1424 if ignore:
1425 ignore = set(ignore)
1426 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1427 else:
1428 ignore = set()
1430 # Find all the registered instruments
1431 instruments = set(
1432 record.name for record in self.registry.queryDimensionRecords("instrument")
1433 )
1435 # For each datasetType that has an instrument dimension, create
1436 # a DatasetRef for each defined instrument
1437 datasetRefs = []
1439 for datasetType in entities:
1440 if "instrument" in datasetType.dimensions:
1441 for instrument in instruments:
1442 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1443 datasetRefs.append(datasetRef)
1445 entities.extend(datasetRefs)
1447 datastoreErrorStr = None
1448 try:
1449 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1450 except ValidationError as e:
1451 datastoreErrorStr = str(e)
1453 # Also check that the LookupKeys used by the datastores match
1454 # registry and storage class definitions
1455 keys = self.datastore.getLookupKeys()
1457 failedNames = set()
1458 failedDataId = set()
1459 for key in keys:
1460 datasetType = None
1461 if key.name is not None:
1462 if key.name in ignore:
1463 continue
1465 # skip if specific datasetType names were requested and this
1466 # name does not match
1467 if datasetTypeNames and key.name not in datasetTypeNames:
1468 continue
1470 # See if it is a StorageClass or a DatasetType
1471 if key.name in self.storageClasses:
1472 pass
1473 else:
1474 try:
1475 self.registry.getDatasetType(key.name)
1476 except KeyError:
1477 if logFailures:
1478 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1479 failedNames.add(key)
1480 else:
1481 # Dimensions are checked for consistency when the Butler
1482 # is created and rendezvoused with a universe.
1483 pass
1485 # Check that the instrument is a valid instrument
1486 # Currently only support instrument so check for that
1487 if key.dataId:
1488 dataIdKeys = set(key.dataId)
1489 if set(["instrument"]) != dataIdKeys:
1490 if logFailures:
1491 log.fatal("Key '%s' has unsupported DataId override", key)
1492 failedDataId.add(key)
1493 elif key.dataId["instrument"] not in instruments:
1494 if logFailures:
1495 log.fatal("Key '%s' has unknown instrument", key)
1496 failedDataId.add(key)
1498 messages = []
1500 if datastoreErrorStr:
1501 messages.append(datastoreErrorStr)
1503 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1504 (failedDataId, "Keys with bad DataId entries: ")):
1505 if failed:
1506 msg += ", ".join(str(k) for k in failed)
1507 messages.append(msg)
1509 if messages:
1510 raise ValidationError(";\n".join(messages))
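A sketch of running the configuration check after editing datastore templates or formatter mappings; the ignored dataset type name is a placeholder for a known, deliberately skipped problem::

    from lsst.daf.butler import ValidationError

    try:
        butler.validateConfiguration(logFailures=True, ignore=["packages"])
    except ValidationError as err:
        print(f"Butler configuration problems:\n{err}")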
1512 registry: Registry
1513 """The object that manages dataset metadata and relationships (`Registry`).
1515 Most operations that don't involve reading or writing butler datasets are
1516 accessible only via `Registry` methods.
1517 """
1519 datastore: Datastore
1520 """The object that manages actual dataset storage (`Datastore`).
1522 Direct user access to the datastore should rarely be necessary; the primary
1523 exception is the case where a `Datastore` implementation provides extra
1524 functionality beyond what the base class defines.
1525 """
1527 storageClasses: StorageClassFactory
1528 """An object that maps known storage class names to objects that fully
1529 describe them (`StorageClassFactory`).
1530 """
1532 collections: Optional[CollectionSearch]
1533 """The collections to search and any restrictions on the dataset types to
1534 search for within them, in order (`CollectionSearch`).
1535 """
1537 run: Optional[str]
1538 """Name of the run this butler writes outputs to (`str` or `None`).
1539 """
1541 tags: Tuple[str, ...]
1542 """Names of `~CollectionType.TAGGED` collections this butler associates
1543 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1544 (`tuple` [ `str` ]).
1545 """