Coverage for python/lsst/daf/butler/_butler.py : 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 Tuple,
44 Union,
45)
47try:
48 import boto3
49except ImportError:
50 boto3 = None
52from lsst.utils import doImport
53from .core import (
54 ButlerURI,
55 Config,
56 ConfigSubset,
57 DataCoordinate,
58 DataId,
59 DatasetRef,
60 DatasetType,
61 Datastore,
62 FileDataset,
63 Quantum,
64 RepoExport,
65 StorageClassFactory,
66 ValidationError,
67)
68from .core.repoRelocation import BUTLER_ROOT_TAG
69from .core.utils import transactional, getClassOf, safeMakeDir
70from .core.s3utils import bucketExists
71from ._deferredDatasetHandle import DeferredDatasetHandle
72from ._butlerConfig import ButlerConfig
73from .registry import Registry, RegistryConfig, CollectionType
74from .registry.wildcards import CollectionSearch
76log = logging.getLogger(__name__)
79class ButlerValidationError(ValidationError):
80 """There is a problem with the Butler configuration."""
81 pass
84class Butler:
85 """Main entry point for the data access system.
87 Parameters
88 ----------
89 config : `ButlerConfig`, `Config` or `str`, optional
90 Configuration. Anything acceptable to the
91 `ButlerConfig` constructor. If a directory path
92 is given, the configuration will be read from a ``butler.yaml`` file in
93 that location. If `None` is given, default values will be used.
94 butler : `Butler`, optional
95 If provided, construct a new Butler that uses the same registry and
96 datastore as the given one, but with the given collection and run.
97 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
98 arguments.
99 collections : `Any`, optional
100 An expression specifying the collections to be searched (in order) when
101 reading datasets, and optionally dataset type restrictions on them.
102 This may be:
103 - a `str` collection name;
104 - a tuple of (collection name, *dataset type restriction*);
105 - an iterable of either of the above;
106 - a mapping from `str` to *dataset type restriction*.
108 See :ref:`daf_butler_collection_expressions` for more information,
109 including the definition of a *dataset type restriction*. All
110 collections must either already exist or be specified to be created
111 by other arguments.
112 run : `str`, optional
113 Name of the run datasets should be output to. If the run
114 does not exist, it will be created. If ``collections`` is `None`, it
115 will be set to ``[run]``. If this is not set (and ``writeable`` is
116 not set either), a read-only butler will be created.
117 tags : `Iterable` [ `str` ], optional
118 A list of `~CollectionType.TAGGED` collections that datasets should be
119 associated with in `put` or `ingest` and disassociated from in
120 `pruneDatasets`. If any of these collections does not exist, it will
121 be created.
122 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
123 A mapping from the names of new `~CollectionType.CHAINED` collections
124 to an expression identifying their child collections (which takes the
125 same form as the ``collections`` argument). Chains may be nested only
126 if children precede their parents in this mapping.
127 searchPaths : `list` of `str`, optional
128 Directory paths to search when calculating the full Butler
129 configuration. Not used if the supplied config is already a
130 `ButlerConfig`.
131 writeable : `bool`, optional
132 Explicitly sets whether the butler supports write operations. If not
133 provided, a read-write butler is created if any of ``run``, ``tags``,
134 or ``chains`` is non-empty.
136 Examples
137 --------
138 While there are many ways to control exactly how a `Butler` interacts with
139 the collections in its `Registry`, the most common cases are still simple.
141 For a read-only `Butler` that searches one collection, do::
143 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
145 For a read-write `Butler` that writes to and reads from a
146 `~CollectionType.RUN` collection::
148 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
150 The `Butler` passed to a ``PipelineTask`` is often much more complex,
151 because we want to write to one `~CollectionType.RUN` collection but read
152 from several others (as well), while defining a new
153 `~CollectionType.CHAINED` collection that combines them all::
155 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
156 collections=["u/alice/DM-50000"],
157 chains={
158 "u/alice/DM-50000": ["u/alice/DM-50000/a",
159 "u/bob/DM-49998",
160 "raw/hsc"]
161 })
163 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
164 they'll also be available from the chained collection ``u/alice/DM-50000``.
165 Datasets will be read first from that run (since it appears first in the
166 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
167 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
168 would be unnecessary. We could also construct a butler that performs
169 exactly the same `put` and `get` operations without actually creating a
170 chained collection, just by passing multiple items in ``collections``::
172 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
173 collections=["u/alice/DM-50000/a",
174 "u/bob/DM-49998",
175 "raw/hsc"])
177 Finally, one can always create a `Butler` with no collections::
179 butler = Butler("/path/to/repo", writeable=True)
181 This can be extremely useful when you just want to use ``butler.registry``,
182 e.g. for inserting dimension data or managing collections, or when the
183 collections you want to use with the butler are not consistent.
184 Passing ``writeable`` explicitly here is only necessary if you want to be
185 able to make changes to the repo - usually the value for ``writeable``
186 can be guessed from the collection arguments provided, but it defaults to
187 `False` when there are no collection arguments.
188 """
189 def __init__(self, config: Union[Config, str, None] = None, *,
190 butler: Optional[Butler] = None,
191 collections: Any = None,
192 run: Optional[str] = None,
193 tags: Iterable[str] = (),
194 chains: Optional[Mapping[str, Any]] = None,
195 searchPaths: Optional[List[str]] = None,
196 writeable: Optional[bool] = None):
197 # Transform any single-pass iterator into an actual sequence so we
198 # can see if it's empty
199 self.tags = tuple(tags)
200 # Load registry, datastore, etc. from config or existing butler.
201 if butler is not None:
202 if config is not None or searchPaths is not None or writeable is not None:
203 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
204 "arguments with 'butler' argument.")
205 self.registry = butler.registry
206 self.datastore = butler.datastore
207 self.storageClasses = butler.storageClasses
208 self._config = butler._config
209 else:
210 self._config = ButlerConfig(config, searchPaths=searchPaths)
211 if "root" in self._config:
212 butlerRoot = self._config["root"]
213 else:
214 butlerRoot = self._config.configDir
215 if writeable is None:
216 writeable = run is not None or chains is not None or self.tags
217 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
218 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
219 butlerRoot=butlerRoot)
220 self.storageClasses = StorageClassFactory()
221 self.storageClasses.addFromConfig(self._config)
222 # Check the many collection arguments for consistency and create any
223 # needed collections that don't exist.
224 if collections is None:
225 if run is not None:
226 collections = (run,)
227 else:
228 collections = ()
229 self.collections = CollectionSearch.fromExpression(collections)
230 if chains is None:
231 chains = {}
232 self.run = run
233 if "run" in self._config or "collection" in self._config:
234 raise ValueError("Passing a run or collection via configuration is no longer supported.")
235 if self.run is not None:
236 self.registry.registerCollection(self.run, type=CollectionType.RUN)
237 for tag in self.tags:
238 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
239 for parent, children in chains.items():
240 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
241 self.registry.setCollectionChain(parent, children)
243 GENERATION: ClassVar[int] = 3
244 """This is a Generation 3 Butler.
246 This attribute may be removed in the future, once the Generation 2 Butler
247 interface has been fully retired; it should only be used in transitional
248 code.
249 """
251 @staticmethod
252 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
253 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
254 forceConfigRoot: bool = True, outfile: Optional[str] = None,
255 overwrite: bool = False) -> Config:
256 """Create an empty data repository by adding a butler.yaml config
257 to a repository root directory.
259 Parameters
260 ----------
261 root : `str` or `ButlerURI`
262 Path or URI to the root location of the new repository. Will be
263 created if it does not exist.
264 config : `Config` or `str`, optional
265 Configuration to write to the repository, after setting any
266 root-dependent Registry or Datastore config options. Can not
267 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
268 configuration will be used. Root-dependent config options
269 specified in this config are overwritten if ``forceConfigRoot``
270 is `True`.
271 standalone : `bool`
272 If True, write all expanded defaults, not just customized or
273 repository-specific settings.
274 This (mostly) decouples the repository from the default
275 configuration, insulating it from changes to the defaults (which
276 may be good or bad, depending on the nature of the changes).
277 Future *additions* to the defaults will still be picked up when
278 initializing `Butlers` to repos created with ``standalone=True``.
279 createRegistry : `bool`, optional
280 If `True` create a new Registry.
281 searchPaths : `list` of `str`, optional
282 Directory paths to search when calculating the full butler
283 configuration.
284 forceConfigRoot : `bool`, optional
285 If `False`, any values present in the supplied ``config`` that
286 would normally be reset are not overridden and will appear
287 directly in the output config. This allows non-standard overrides
288 of the root directory for a datastore or registry to be given.
289 If this parameter is `True` the values for ``root`` will be
290 forced into the resulting config if appropriate.
291 outfile : `str`, optional
292 If not-`None`, the output configuration will be written to this
293 location rather than into the repository itself. Can be a URI
294 string. Can refer to a directory that will be used to write
295 ``butler.yaml``.
296 overwrite : `bool`, optional
297 Create a new configuration file even if one already exists
298 in the specified output location. Default is to raise
299 an exception.
301 Returns
302 -------
303 config : `Config`
304 The updated `Config` instance written to the repo.
306 Raises
307 ------
308 ValueError
309 Raised if a ButlerConfig or ConfigSubset is passed instead of a
310 regular Config (as these subclasses would make it impossible to
311 support ``standalone=False``).
312 FileExistsError
313 Raised if the output config file already exists.
314 os.error
315 Raised if the directory does not exist, exists but is not a
316 directory, or cannot be created.
318 Notes
319 -----
320 Note that when ``standalone=False`` (the default), the configuration
321 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
322 construct the repository should also be used to construct any Butlers
323 to avoid configuration inconsistencies.
324 """
325 if isinstance(config, (ButlerConfig, ConfigSubset)):
326 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
328 # for "file" schemes we are assuming POSIX semantics for paths, for
329 # schemeless URIs we are assuming os.path semantics.
330 uri = ButlerURI(root, forceDirectory=True)
331 if uri.scheme == "file" or not uri.scheme:
332 if not os.path.isdir(uri.ospath):
333 safeMakeDir(uri.ospath)
334 elif uri.scheme == "s3":
335 # bucket must already exist
336 if not bucketExists(uri.netloc):
337 raise ValueError(f"Bucket {uri.netloc} does not exist!")
338 s3 = boto3.client("s3")
339 # don't create S3 key when root is at the top-level of a bucket
340 if not uri.path == "/":
341 s3.put_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
342 else:
343 raise ValueError(f"Unrecognized scheme: {uri.scheme}")
344 config = Config(config)
346 # If we are creating a new repo from scratch with relative roots,
347 # do not propagate an explicit root from the config file
348 if "root" in config:
349 del config["root"]
351 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
352 datastoreClass = doImport(full["datastore", "cls"])
353 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
355 # if key exists in given config, parse it, otherwise parse the defaults
356 # in the expanded config
357 if config.get(("registry", "db")):
358 registryConfig = RegistryConfig(config)
359 else:
360 registryConfig = RegistryConfig(full)
361 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
362 if defaultDatabaseUri is not None:
363 Config.updateParameters(RegistryConfig, config, full,
364 toUpdate={"db": defaultDatabaseUri},
365 overwrite=forceConfigRoot)
366 else:
367 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
368 overwrite=forceConfigRoot)
370 if standalone:
371 config.merge(full)
372 if outfile is not None:
373 # When writing to a separate location we must include
374 # the root of the butler repo in the config else it won't know
375 # where to look.
376 config["root"] = uri.geturl()
377 configURI = outfile
378 else:
379 configURI = uri
380 config.dumpToUri(configURI, overwrite=overwrite)
382 # Create Registry and populate tables
383 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
384 return config
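# Illustrative sketch (not part of the module): create an empty repository and
# then open it read-write. The repository path and run name are hypothetical.
from lsst.daf.butler import Butler

Butler.makeRepo("/path/to/new/repo")
butler = Butler("/path/to/new/repo", run="u/alice/ingest")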
386 @classmethod
387 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
388 tags: Tuple[str, ...], writeable: bool) -> Butler:
389 """Callable used to unpickle a Butler.
391 We prefer not to use ``Butler.__init__`` directly so we can force some
392 of its many arguments to be keyword-only (note that ``__reduce__``
393 can only invoke callables with positional arguments).
395 Parameters
396 ----------
397 config : `ButlerConfig`
398 Butler configuration, already coerced into a true `ButlerConfig`
399 instance (and hence after any search paths for overrides have been
400 utilized).
401 collections : `CollectionSearch`
402 Names of collections to read from.
403 run : `str`, optional
404 Name of `~CollectionType.RUN` collection to write to.
405 tags : `tuple` [`str`]
406 Names of `~CollectionType.TAGGED` collections to associate with.
407 writeable : `bool`
408 Whether the Butler should support write operations.
410 Returns
411 -------
412 butler : `Butler`
413 A new `Butler` instance.
414 """
415 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
417 def __reduce__(self):
418 """Support pickling.
419 """
420 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
421 self.registry.isWriteable()))
423 def __str__(self):
424 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
425 self.collections, self.run, self.tags, self.datastore, self.registry)
427 def isWriteable(self) -> bool:
428 """Return `True` if this `Butler` supports write operations.
429 """
430 return self.registry.isWriteable()
432 @contextlib.contextmanager
433 def transaction(self):
434 """Context manager supporting `Butler` transactions.
436 Transactions can be nested.
437 """
438 with self.registry.transaction():
439 with self.datastore.transaction():
440 yield
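# Illustrative sketch: group related writes so that a failure rolls back the
# Registry and Datastore changes together. ``butler``, ``exposure``, and
# ``catalog`` are assumed to exist; dataset type names and data ID values are
# hypothetical.
with butler.transaction():
    butler.put(exposure, "calexp", instrument="HSC", visit=903334, detector=16)
    butler.put(catalog, "src", instrument="HSC", visit=903334, detector=16)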
442 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
443 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
444 """Standardize the arguments passed to several Butler APIs.
446 Parameters
447 ----------
448 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
449 When a `DatasetRef` is provided, ``dataId`` should be `None`.
450 Otherwise the `DatasetType` or name thereof.
451 dataId : `dict` or `DataCoordinate`
452 A `dict` of `Dimension` link name, value pairs that label the
453 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
454 should be provided as the first argument.
455 kwds
456 Additional keyword arguments used to augment or construct a
457 `DataCoordinate`. See `DataCoordinate.standardize`
458 parameters.
460 Returns
461 -------
462 datasetType : `DatasetType`
463 A `DatasetType` instance extracted from ``datasetRefOrType``.
464 dataId : `dict` or `DataId`, optional
465 Argument that can be used (along with ``kwds``) to construct a
466 `DataId`.
468 Notes
469 -----
470 Butler APIs that conceptually need a DatasetRef also allow passing a
471 `DatasetType` (or the name of one) and a `DataId` (or a dict and
472 keyword arguments that can be used to construct one) separately. This
473 method accepts those arguments and always returns a true `DatasetType`
474 and a `DataId` or `dict`.
476 Standardization of `dict` vs `DataId` is best handled by passing the
477 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
478 generally similarly flexible.
479 """
480 externalDatasetType = None
481 internalDatasetType = None
482 if isinstance(datasetRefOrType, DatasetRef):
483 if dataId is not None or kwds:
484 raise ValueError("DatasetRef given, cannot use dataId as well")
485 externalDatasetType = datasetRefOrType.datasetType
486 dataId = datasetRefOrType.dataId
487 else:
488 # Don't check whether DataId is provided, because Registry APIs
489 # can usually construct a better error message when it wasn't.
490 if isinstance(datasetRefOrType, DatasetType):
491 externalDatasetType = datasetRefOrType
492 else:
493 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
495 # Check that they are self-consistent
496 if externalDatasetType is not None:
497 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
498 if externalDatasetType != internalDatasetType:
499 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
500 f"registry definition ({internalDatasetType})")
502 return internalDatasetType, dataId
504 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
505 dataId: Optional[DataId] = None, *,
506 collections: Any = None,
507 allowUnresolved: bool = False,
508 **kwds: Any) -> DatasetRef:
509 """Shared logic for methods that start with a search for a dataset in
510 the registry.
512 Parameters
513 ----------
514 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
515 When a `DatasetRef` is provided, ``dataId`` should be `None`.
516 Otherwise the `DatasetType` or name thereof.
517 dataId : `dict` or `DataCoordinate`, optional
518 A `dict` of `Dimension` link name, value pairs that label the
519 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
520 should be provided as the first argument.
521 collections : Any, optional
522 Collections to be searched, overriding ``self.collections``.
523 Can be any of the types supported by the ``collections`` argument
524 to butler construction.
525 allowUnresolved : `bool`, optional
526 If `True`, return an unresolved `DatasetRef` if finding a resolved
527 one in the `Registry` fails. Defaults to `False`.
528 kwds
529 Additional keyword arguments used to augment or construct a
530 `DataId`. See `DataId` parameters.
532 Returns
533 -------
534 ref : `DatasetRef`
535 A reference to the dataset identified by the given arguments.
537 Raises
538 ------
539 LookupError
540 Raised if no matching dataset exists in the `Registry` (and
541 ``allowUnresolved is False``).
542 ValueError
543 Raised if a resolved `DatasetRef` was passed as an input, but it
544 differs from the one found in the registry.
545 TypeError
546 Raised if no collections were provided.
547 """
548 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
549 if isinstance(datasetRefOrType, DatasetRef):
550 idNumber = datasetRefOrType.id
551 else:
552 idNumber = None
553 # Expand the data ID first instead of letting registry.findDataset do
554 # it, so we get the result even if it returns None.
555 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
556 if collections is None:
557 collections = self.collections
558 if not collections:
559 raise TypeError("No input collections provided.")
560 else:
561 collections = CollectionSearch.fromExpression(collections)
562 # Always lookup the DatasetRef, even if one is given, to ensure it is
563 # present in the current collection.
564 ref = self.registry.findDataset(datasetType, dataId, collections=collections)
565 if ref is None:
566 if allowUnresolved:
567 return DatasetRef(datasetType, dataId)
568 else:
569 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
570 f"could not be found in collections {collections}.")
571 if idNumber is not None and idNumber != ref.id:
572 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
573 f"id ({ref.id}) in registry in collections {collections}.")
574 return ref
576 @transactional
577 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
578 dataId: Optional[DataId] = None, *,
579 producer: Optional[Quantum] = None,
580 run: Optional[str] = None,
581 tags: Optional[Iterable[str]] = None,
582 **kwds: Any) -> DatasetRef:
583 """Store and register a dataset.
585 Parameters
586 ----------
587 obj : `object`
588 The dataset.
589 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
590 When `DatasetRef` is provided, ``dataId`` should be `None`.
591 Otherwise the `DatasetType` or name thereof.
592 dataId : `dict` or `DataCoordinate`
593 A `dict` of `Dimension` link name, value pairs that label the
594 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
595 should be provided as the second argument.
596 producer : `Quantum`, optional
597 The producer.
598 run : `str`, optional
599 The name of the run the dataset should be added to, overriding
600 ``self.run``.
601 tags : `Iterable` [ `str` ], optional
602 The names of `~CollectionType.TAGGED` collections to associate
603 the dataset with, overriding ``self.tags``. These collections
604 must have already been added to the `Registry`.
605 kwds
606 Additional keyword arguments used to augment or construct a
607 `DataCoordinate`. See `DataCoordinate.standardize`
608 parameters.
610 Returns
611 -------
612 ref : `DatasetRef`
613 A reference to the stored dataset, updated with the correct id if
614 given.
616 Raises
617 ------
618 TypeError
619 Raised if the butler is read-only or if no run has been provided.
620 """
621 log.debug("Butler put: %s, dataId=%s, producer=%s, run=%s", datasetRefOrType, dataId, producer, run)
622 if not self.isWriteable():
623 raise TypeError("Butler is read-only.")
624 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
625 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
626 raise ValueError("DatasetRef must not be in registry, must have None id")
628 if run is None:
629 if self.run is None:
630 raise TypeError("No run provided.")
631 run = self.run
632 # No need to check type for run; first thing we do is
633 # insertDatasets, and that will check for us.
635 if tags is None:
636 tags = self.tags
637 else:
638 tags = tuple(tags)
639 for tag in tags:
640 # Check that these are tagged collections up front, because we want
641 # to avoid relying on Datastore transactionality to avoid modifying
642 # the repo if there's an error later.
643 collectionType = self.registry.getCollectionType(tag)
644 if collectionType is not CollectionType.TAGGED:
645 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
646 f"{collectionType.name}.")
648 # Add Registry Dataset entry.
649 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
650 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId],
651 producer=producer)
653 # Add Datastore entry.
654 self.datastore.put(obj, ref)
656 for tag in tags:
657 self.registry.associate(tag, [ref])
659 return ref
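# Illustrative sketch: store an in-memory object in this butler's run and get
# back a resolved DatasetRef. ``exposure`` is assumed to be an object matching
# the dataset type's StorageClass; names and data ID values are hypothetical.
butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
ref = butler.put(exposure, "calexp", instrument="HSC", visit=903334, detector=16)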
661 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
662 """Retrieve a stored dataset.
664 Unlike `Butler.get`, this method allows datasets outside the Butler's
665 collection to be read as long as the `DatasetRef` that identifies them
666 can be obtained separately.
668 Parameters
669 ----------
670 ref : `DatasetRef`
671 Reference to an already stored dataset.
672 parameters : `dict`
673 Additional StorageClass-defined options to control reading,
674 typically used to efficiently read only a subset of the dataset.
676 Returns
677 -------
678 obj : `object`
679 The dataset.
680 """
681 return self.datastore.get(ref, parameters=parameters)
683 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
684 dataId: Optional[DataId] = None, *,
685 parameters: Union[dict, None] = None,
686 collections: Any = None,
687 **kwds: Any) -> DeferredDatasetHandle:
688 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
690 Parameters
691 ----------
692 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
693 When a `DatasetRef` is provided, ``dataId`` should be `None`.
694 Otherwise the `DatasetType` or name thereof.
695 dataId : `dict` or `DataCoordinate`, optional
696 A `dict` of `Dimension` link name, value pairs that label the
697 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
698 should be provided as the first argument.
699 parameters : `dict`
700 Additional StorageClass-defined options to control reading,
701 typically used to efficiently read only a subset of the dataset.
702 collections : Any, optional
703 Collections to be searched, overriding ``self.collections``.
704 Can be any of the types supported by the ``collections`` argument
705 to butler construction.
706 kwds
707 Additional keyword arguments used to augment or construct a
708 `DataId`. See `DataId` parameters.
710 Returns
711 -------
712 obj : `DeferredDatasetHandle`
713 A handle which can be used to retrieve a dataset at a later time.
715 Raises
716 ------
717 LookupError
718 Raised if no matching dataset exists in the `Registry`.
720 ValueError
721 Raised if a resolved `DatasetRef` was passed as an input, but it
722 differs from the one found in the registry.
723 TypeError
724 Raised if no collections were provided.
725 """
726 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
727 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
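# Illustrative sketch: look up the dataset now, defer the actual read until it
# is needed. The dataset type and data ID are hypothetical; the handle's
# ``get()`` method performs the read.
handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=16)
exposure = handle.get()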
729 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
730 dataId: Optional[DataId] = None, *,
731 parameters: Optional[Dict[str, Any]] = None,
732 collections: Any = None,
733 **kwds: Any) -> Any:
734 """Retrieve a stored dataset.
736 Parameters
737 ----------
738 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
739 When a `DatasetRef` is provided, ``dataId`` should be `None`.
740 Otherwise the `DatasetType` or name thereof.
741 dataId : `dict` or `DataCoordinate`
742 A `dict` of `Dimension` link name, value pairs that label the
743 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
744 should be provided as the first argument.
745 parameters : `dict`
746 Additional StorageClass-defined options to control reading,
747 typically used to efficiently read only a subset of the dataset.
748 collections : Any, optional
749 Collections to be searched, overriding ``self.collections``.
750 Can be any of the types supported by the ``collections`` argument
751 to butler construction.
752 kwds
753 Additional keyword arguments used to augment or construct a
754 `DataCoordinate`. See `DataCoordinate.standardize`
755 parameters.
757 Returns
758 -------
759 obj : `object`
760 The dataset.
762 Raises
763 ------
764 ValueError
765 Raised if a resolved `DatasetRef` was passed as an input, but it
766 differs from the one found in the registry.
767 LookupError
768 Raised if no matching dataset exists in the `Registry`.
769 TypeError
770 Raised if no collections were provided.
771 """
772 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
773 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
774 return self.getDirect(ref, parameters=parameters)
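# Illustrative sketch: equivalent ways to read the same dataset, using keyword
# data ID values or an explicit ``dataId`` dict (names and values hypothetical).
calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=16)
calexp = butler.get("calexp", dataId={"instrument": "HSC", "visit": 903334, "detector": 16})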
776 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
777 dataId: Optional[DataId] = None, *,
778 predict: bool = False,
779 collections: Any = None,
780 run: Optional[str] = None,
781 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
782 """Returns the URIs associated with the dataset.
784 Parameters
785 ----------
786 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
787 When a `DatasetRef` is provided, ``dataId`` should be `None`.
788 Otherwise the `DatasetType` or name thereof.
789 dataId : `dict` or `DataCoordinate`
790 A `dict` of `Dimension` link name, value pairs that label the
791 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
792 should be provided as the first argument.
793 predict : `bool`
794 If `True`, allow URIs to be returned for datasets that have not
795 been written.
796 collections : Any, optional
797 Collections to be searched, overriding ``self.collections``.
798 Can be any of the types supported by the ``collections`` argument
799 to butler construction.
800 run : `str`, optional
801 Run to use for predictions, overriding ``self.run``.
802 kwds
803 Additional keyword arguments used to augment or construct a
804 `DataCoordinate`. See `DataCoordinate.standardize`
805 parameters.
807 Returns
808 -------
809 primary : `ButlerURI`
810 The URI to the primary artifact associated with this dataset.
811 If the dataset was disassembled within the datastore this
812 may be `None`.
813 components : `dict`
814 URIs to any components associated with the dataset artifact.
815 Can be empty if there are no components.
816 """
817 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
818 collections=collections, **kwds)
819 if ref.id is None: # only possible if predict is True
820 if run is None:
821 run = self.run
822 if run is None:
823 raise TypeError("Cannot predict location with run=None.")
824 # Lie about ID, because we can't guess it, and only
825 # Datastore.getURIs() will ever see it (and it doesn't use it).
826 ref = ref.resolved(id=0, run=run)
827 return self.datastore.getURIs(ref, predict)
829 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
830 dataId: Optional[DataId] = None, *,
831 predict: bool = False,
832 collections: Any = None,
833 run: Optional[str] = None,
834 **kwds: Any) -> ButlerURI:
835 """Return the URI to the Dataset.
837 Parameters
838 ----------
839 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
840 When a `DatasetRef` is provided, ``dataId`` should be `None`.
841 Otherwise the `DatasetType` or name thereof.
842 dataId : `dict` or `DataCoordinate`
843 A `dict` of `Dimension` link name, value pairs that label the
844 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
845 should be provided as the first argument.
846 predict : `bool`
847 If `True`, allow URIs to be returned for datasets that have not
848 been written.
849 collections : Any, optional
850 Collections to be searched, overriding ``self.collections``.
851 Can be any of the types supported by the ``collections`` argument
852 to butler construction.
853 run : `str`, optional
854 Run to use for predictions, overriding ``self.run``.
855 kwds
856 Additional keyword arguments used to augment or construct a
857 `DataCoordinate`. See `DataCoordinate.standardize`
858 parameters.
860 Returns
861 -------
862 uri : `ButlerURI`
863 URI pointing to the Dataset within the datastore. If the
864 Dataset does not exist in the datastore, and if ``predict`` is
865 `True`, the URI will be a prediction and will include a URI
866 fragment "#predicted".
867 If the datastore does not have entities that relate well
868 to the concept of a URI the returned URI string will be
869 descriptive. The returned URI is not guaranteed to be obtainable.
871 Raises
872 ------
873 LookupError
874 Raised if a URI has been requested for a dataset that does not
875 exist and guessing is not allowed.
876 ValueError
877 Raised if a resolved `DatasetRef` was passed as an input, but it
878 differs from the one found in the registry.
879 TypeError
880 Raised if no collections were provided.
881 RuntimeError
882 Raised if a URI is requested for a dataset that consists of
883 multiple artifacts.
884 """
885 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
886 collections=collections, run=run, **kwds)
888 if primary is None or components:
889 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
890 "Use Butler.getURIs() instead.")
891 return primary
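# Illustrative sketch: fetch the artifact URI of an existing dataset, then
# predict the URI of one not yet written (names and data ID values are
# hypothetical).
uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=16)
predicted = butler.getURI("calexp", instrument="HSC", visit=903335, detector=16,
                          predict=True, run="u/alice/DM-50000/a")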
893 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
894 dataId: Optional[DataId] = None, *,
895 collections: Any = None,
896 **kwds: Any) -> bool:
897 """Return True if the Dataset is actually present in the Datastore.
899 Parameters
900 ----------
901 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
902 When a `DatasetRef` is provided, ``dataId`` should be `None`.
903 Otherwise the `DatasetType` or name thereof.
904 dataId : `dict` or `DataCoordinate`
905 A `dict` of `Dimension` link name, value pairs that label the
906 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
907 should be provided as the first argument.
908 collections : Any, optional
909 Collections to be searched, overriding ``self.collections``.
910 Can be any of the types supported by the ``collections`` argument
911 to butler construction.
912 kwds
913 Additional keyword arguments used to augment or construct a
914 `DataCoordinate`. See `DataCoordinate.standardize`
915 parameters.
917 Raises
918 ------
919 LookupError
920 Raised if the dataset is not even present in the Registry.
921 ValueError
922 Raised if a resolved `DatasetRef` was passed as an input, but it
923 differs from the one found in the registry.
924 TypeError
925 Raised if no collections were provided.
926 """
927 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
928 return self.datastore.exists(ref)
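# Illustrative sketch: guard a read with a datastore-existence check. Note that
# datasetExists still raises LookupError if the dataset is not registered at
# all. The dataset type and data ID are hypothetical.
if butler.datasetExists("calexp", instrument="HSC", visit=903334, detector=16):
    calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=16)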
930 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
931 """Remove a collection and possibly prune datasets within it.
933 Parameters
934 ----------
935 name : `str`
936 Name of the collection to remove. If this is a
937 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
938 datasets within the collection are not modified unless ``unstore``
939 is `True`. If this is a `~CollectionType.RUN` collection,
940 ``purge`` and ``unstore`` must be `True`, and all datasets in it
941 are fully removed from the data repository.
942 purge : `bool`, optional
943 If `True`, permit `~CollectionType.RUN` collections to be removed,
944 fully removing datasets within them. Requires ``unstore=True`` as
945 well as an added precaution against accidental deletion. Must be
946 `False` (default) if the collection is not a ``RUN``.
947 unstore : `bool`, optional
948 If `True`, remove all datasets in the collection from all
949 datastores in which they appear.
951 Raises
952 ------
953 TypeError
954 Raised if the butler is read-only or arguments are mutually
955 inconsistent.
956 """
957 # See pruneDatasets comments for more information about the logic here;
958 # the cases are almost the same, but here we can rely on Registry to
959 # take care of everything but Datastore deletion when we remove the
960 # collection.
961 if not self.isWriteable():
962 raise TypeError("Butler is read-only.")
963 if purge and not unstore:
964 raise TypeError("Cannot pass purge=True without unstore=True.")
965 collectionType = self.registry.getCollectionType(name)
966 if collectionType is CollectionType.RUN and not purge:
967 raise TypeError(f"Cannot prune RUN collection {name} without purge=True.")
968 if collectionType is not CollectionType.RUN and purge:
969 raise TypeError(f"Cannot prune {collectionType.name} collection {name} with purge=True.")
970 with self.registry.transaction():
971 if unstore:
972 for ref in self.registry.queryDatasets(..., collections=name, deduplicate=True):
973 if self.datastore.exists(ref):
974 self.datastore.trash(ref)
975 self.registry.removeCollection(name)
976 if unstore:
977 # Point of no return for removing artifacts
978 self.datastore.emptyTrash()
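# Illustrative sketch: delete a scratch RUN collection and the artifacts of all
# datasets in it; both flags are required for RUN collections. The collection
# name is hypothetical.
butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)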
980 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
981 disassociate: bool = True,
982 unstore: bool = False,
983 tags: Optional[Iterable[str]] = None,
984 purge: bool = False,
985 run: Optional[str] = None):
986 """Remove one or more datasets from a collection and/or storage.
988 Parameters
989 ----------
990 refs : `~collections.abc.Iterable` of `DatasetRef`
991 Datasets to prune. These must be "resolved" references (not just
992 a `DatasetType` and data ID).
993 disassociate : `bool`, optional
994 Disassociate pruned datasets from ``self.tags`` (or the collections
995 given via the ``tags`` argument). Ignored if ``refs`` is ``...``.
996 unstore : `bool`, optional
997 If `True` (`False` is default) remove these datasets from all
998 datastores known to this butler. Note that this will make it
999 impossible to retrieve these datasets even via other collections.
1000 Datasets that are already not stored are ignored by this option.
1001 tags : `Iterable` [ `str` ], optional
1002 `~CollectionType.TAGGED` collections to disassociate the datasets
1003 from, overriding ``self.tags``. Ignored if ``disassociate`` is
1004 `False` or ``purge`` is `True`.
1005 purge : `bool`, optional
1006 If `True` (`False` is default), completely remove the dataset from
1007 the `Registry`. To prevent accidental deletions, ``purge`` may
1008 only be `True` if all of the following conditions are met:
1010 - All given datasets are in the given run.
1011 - ``disassociate`` is `True`;
1012 - ``unstore`` is `True`.
1014 This mode may remove provenance information from datasets other
1015 than those provided, and should be used with extreme care.
1016 run : `str`, optional
1017 `~CollectionType.RUN` collection to purge from, overriding
1018 ``self.run``. Ignored unless ``purge`` is `True`.
1020 Raises
1021 ------
1022 TypeError
1023 Raised if the butler is read-only, if no collection was provided,
1024 or the conditions for ``purge=True`` were not met.
1025 """
1026 if not self.isWriteable():
1027 raise TypeError("Butler is read-only.")
1028 if purge:
1029 if not disassociate:
1030 raise TypeError("Cannot pass purge=True without disassociate=True.")
1031 if not unstore:
1032 raise TypeError("Cannot pass purge=True without unstore=True.")
1033 if run is None:
1034 run = self.run
1035 if run is None:
1036 raise TypeError("No run provided but purge=True.")
1037 collectionType = self.registry.getCollectionType(run)
1038 if collectionType is not CollectionType.RUN:
1039 raise TypeError(f"Cannot purge from collection '{run}' "
1040 f"of non-RUN type {collectionType.name}.")
1041 elif disassociate:
1042 if tags is None:
1043 tags = self.tags
1044 else:
1045 tags = tuple(tags)
1046 if not tags:
1047 raise TypeError("No tags provided but disassociate=True.")
1048 for tag in tags:
1049 collectionType = self.registry.getCollectionType(tag)
1050 if collectionType is not CollectionType.TAGGED:
1051 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1052 f"of non-TAGGED type {collectionType.name}.")
1053 # Transform possibly-single-pass iterable into something we can iterate
1054 # over multiple times.
1055 refs = list(refs)
1056 # Pruning a component of a DatasetRef makes no sense since registry
1057 # doesn't know about components and datastore might not store
1058 # components in a separate file
1059 for ref in refs:
1060 if ref.datasetType.component():
1061 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1062 # We don't need an unreliable Datastore transaction for this, because
1063 # we've been extra careful to ensure that Datastore.trash only involves
1064 # mutating the Registry (it can _look_ at Datastore-specific things,
1065 # but shouldn't change them), and hence all operations here are
1066 # Registry operations.
1067 with self.registry.transaction():
1068 if unstore:
1069 for ref in refs:
1070 # There is a difference between a concrete composite
1071 # and virtual composite. In a virtual composite the
1072 # datastore is never given the top level DatasetRef. In
1073 # the concrete composite the datastore knows all the
1074 # refs and will clean up itself if asked to remove the
1075 # parent ref. We can not check configuration for this
1076 # since we can not trust that the configuration is the
1077 # same. We therefore have to ask if the ref exists or
1078 # not. This is consistent with the fact that we want
1079 # to ignore already-removed-from-datastore datasets
1080 # anyway.
1081 if self.datastore.exists(ref):
1082 self.datastore.trash(ref)
1083 if purge:
1084 self.registry.removeDatasets(refs)
1085 elif disassociate:
1086 for tag in tags:
1087 self.registry.disassociate(tag, refs)
1088 # We've exited the Registry transaction, and apparently committed.
1089 # (if there was an exception, everything rolled back, and it's as if
1090 # nothing happened - and we never get here).
1091 # Datastore artifacts are not yet gone, but they're clearly marked
1092 # as trash, so if we fail to delete now because of (e.g.) filesystem
1093 # problems we can try again later, and if manual administrative
1094 # intervention is required, it's pretty clear what that should entail:
1095 # deleting everything on disk and in private Datastore tables that is
1096 # in the dataset_location_trash table.
1097 if unstore:
1098 # Point of no return for removing artifacts
1099 self.datastore.emptyTrash()
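# Illustrative sketch: unstore (but keep registered) every "raw" dataset found
# in a hypothetical collection, without touching any TAGGED associations.
refs = butler.registry.queryDatasets("raw", collections=["u/alice/scratch"])
butler.pruneDatasets(refs, disassociate=False, unstore=True)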
1101 @transactional
1102 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None, run: Optional[str] = None,
1103 tags: Optional[Iterable[str]] = None,):
1104 """Store and register one or more datasets that already exist on disk.
1106 Parameters
1107 ----------
1108 datasets : `FileDataset`
1109 Each positional argument is a struct containing information about
1110 a file to be ingested, including its path (either absolute or
1111 relative to the datastore root, if applicable), a `DatasetRef`,
1112 and optionally a formatter class or its fully-qualified string
1113 name. If a formatter is not provided, the formatter that would be
1114 used for `put` is assumed. On successful return, all
1115 `FileDataset.refs` attributes will have their `DatasetRef.id`
1116 attribute populated and all `FileDataset.formatter` attributes will
1117 be set to the formatter class used. `FileDataset.path` attributes
1118 may be modified to put paths in whatever the datastore considers a
1119 standardized form.
1120 transfer : `str`, optional
1121 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1122 'relsymlink' or 'symlink', indicating how to transfer the file.
1123 run : `str`, optional
1124 The name of the run ingested datasets should be added to,
1125 overriding ``self.run``.
1126 tags : `Iterable` [ `str` ], optional
1127 The names of `~CollectionType.TAGGED` collections to associate
1128 the datasets with, overriding ``self.tags``. These collections
1129 must have already been added to the `Registry`.
1131 Raises
1132 ------
1133 TypeError
1134 Raised if the butler is read-only or if no run was provided.
1135 NotImplementedError
1136 Raised if the `Datastore` does not support the given transfer mode.
1137 DatasetTypeNotSupportedError
1138 Raised if one or more files to be ingested have a dataset type that
1139 is not supported by the `Datastore`.
1140 FileNotFoundError
1141 Raised if one of the given files does not exist.
1142 FileExistsError
1143 Raised if transfer is not `None` but the (internal) location the
1144 file would be moved to is already occupied.
1146 Notes
1147 -----
1148 This operation is not fully exception safe: if a database operation
1149 fails, the given `FileDataset` instances may be only partially updated.
1151 It is atomic in terms of database operations (they will either all
1152 succeed or all fail) providing the database engine implements
1153 transactions correctly. It will attempt to be atomic in terms of
1154 filesystem operations as well, but this cannot be implemented
1155 rigorously for most datastores.
1156 """
1157 if not self.isWriteable():
1158 raise TypeError("Butler is read-only.")
1159 if run is None:
1160 if self.run is None:
1161 raise TypeError("No run provided.")
1162 run = self.run
1163 # No need to check run type, since insertDatasets will do that
1164 # (safely) for us.
1165 if tags is None:
1166 tags = self.tags
1167 else:
1168 tags = tuple(tags)
1169 for tag in tags:
1170 # Check that these are tagged collections up front, because we want
1171 # to avoid relying on Datastore transactionality to avoid modifying
1172 # the repo if there's an error later.
1173 collectionType = self.registry.getCollectionType(tag)
1174 if collectionType is not CollectionType.TAGGED:
1175 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1176 f"{collectionType.name}.")
1177 # Reorganize the inputs so they're grouped by DatasetType and then
1178 # data ID. We also include a list of DatasetRefs for each FileDataset
1179 # to hold the resolved DatasetRefs returned by the Registry, before
1180 # it's safe to swap them into FileDataset.refs.
1181 # Some type annotation aliases to make that clearer:
1182 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1183 GroupedData = MutableMapping[DatasetType, GroupForType]
1184 # The actual data structure:
1185 groupedData: GroupedData = defaultdict(dict)
1186 # And the nested loop that populates it:
1187 for dataset in datasets:
1188 # This list intentionally shared across the inner loop, since it's
1189 # associated with `dataset`.
1190 resolvedRefs = []
1191 for ref in dataset.refs:
1192 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1194 # Now we can bulk-insert into Registry for each DatasetType.
1195 allResolvedRefs = []
1196 for datasetType, groupForType in groupedData.items():
1197 refs = self.registry.insertDatasets(datasetType,
1198 dataIds=groupForType.keys(),
1199 run=run)
1200 # Append those resolved DatasetRefs to the new lists we set up for
1201 # them.
1202 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1203 resolvedRefs.append(ref)
1205 # Go back to the original FileDatasets to replace their refs with the
1206 # new resolved ones, and also build a big list of all refs.
1207 allResolvedRefs = []
1208 for groupForType in groupedData.values():
1209 for dataset, resolvedRefs in groupForType.values():
1210 dataset.refs = resolvedRefs
1211 allResolvedRefs.extend(resolvedRefs)
1213 # Bulk-associate everything with any tagged collections.
1214 for tag in tags:
1215 self.registry.associate(tag, allResolvedRefs)
1217 # Bulk-insert everything into Datastore.
1218 self.datastore.ingest(*datasets, transfer=transfer)
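# Illustrative sketch: ingest a file that already exists on disk by symlinking
# it into the datastore. The path, dataset type, and data ID are hypothetical;
# the FileDataset is built from the ``path`` and ``refs`` attributes described
# in the docstring above.
from lsst.daf.butler import DatasetRef, FileDataset

datasetType = butler.registry.getDatasetType("raw")
ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 903334, "detector": 16})
butler.ingest(FileDataset(path="/data/staging/HSC-903334-16.fits", refs=[ref]),
              transfer="symlink")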
1220 @contextlib.contextmanager
1221 def export(self, *, directory: Optional[str] = None,
1222 filename: Optional[str] = None,
1223 format: Optional[str] = None,
1224 transfer: Optional[str] = None) -> ContextManager[RepoExport]:
1225 """Export datasets from the repository represented by this `Butler`.
1227 This method is a context manager that returns a helper object
1228 (`RepoExport`) that is used to indicate what information from the
1229 repository should be exported.
1231 Parameters
1232 ----------
1233 directory : `str`, optional
1234 Directory dataset files should be written to if ``transfer`` is not
1235 `None`.
1236 filename : `str`, optional
1237 Name for the file that will include database information associated
1238 with the exported datasets. If this is not an absolute path and
1239 ``directory`` is not `None`, it will be written to ``directory``
1240 instead of the current working directory. Defaults to
1241 "export.{format}".
1242 format : `str`, optional
1243 File format for the database information file. If `None`, the
1244 extension of ``filename`` will be used.
1245 transfer : `str`, optional
1246 Transfer mode passed to `Datastore.export`.
1248 Raises
1249 ------
1250 TypeError
1251 Raised if the set of arguments passed is inconsistent.
1253 Examples
1254 --------
1255 Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
1256 methods are used to provide the iterables over data IDs and/or datasets
1257 to be exported::
1259 with butler.export(filename="exports.yaml") as export:
1260 # Export all flats, and the calibration_label dimensions
1261 # associated with them.
1262 export.saveDatasets(butler.registry.queryDatasets("flat"),
1263 elements=[butler.registry.dimensions["calibration_label"]])
1264 # Export all datasets that start with "deepCoadd_" and all of
1265 # their associated data ID information.
1266 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1267 """
1268 if directory is None and transfer is not None:
1269 raise TypeError("Cannot transfer without providing a directory.")
1270 if transfer == "move":
1271 raise TypeError("Transfer may not be 'move': export is read-only")
1272 if format is None:
1273 if filename is None:
1274 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1275 else:
1276 _, format = os.path.splitext(filename)
1277 elif filename is None:
1278 filename = f"export.{format}"
1279 if directory is not None:
1280 filename = os.path.join(directory, filename)
1281 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1282 with open(filename, 'w') as stream:
1283 backend = BackendClass(stream)
1284 try:
1285 helper = RepoExport(self.registry, self.datastore, backend=backend,
1286 directory=directory, transfer=transfer)
1287 yield helper
1288 except BaseException:
1289 raise
1290 else:
1291 helper._finish()
1293 def import_(self, *, directory: Optional[str] = None,
1294 filename: Optional[str] = None,
1295 format: Optional[str] = None,
1296 transfer: Optional[str] = None):
1297 """Import datasets exported from a different butler repository.
1299 Parameters
1300 ----------
1301 directory : `str`, optional
1302 Directory containing dataset files. If `None`, all file paths
1303 must be absolute.
1304 filename : `str`, optional
1305 Name of the file containing database information associated
1306 with the exported datasets. If this is not an absolute path, does
1307 not exist in the current working directory, and ``directory`` is
1308 not `None`, it is assumed to be in ``directory``. Defaults to
1309 "export.{format}".
1310 format : `str`, optional
1311 File format for the database information file. If `None`, the
1312 extension of ``filename`` will be used.
1313 transfer : `str`, optional
1314 Transfer mode passed to `Datastore.export`.
1316 Raises
1317 ------
1318 TypeError
1319 Raised if the set of arguments passed is inconsistent, or if the
1320 butler is read-only.
1321 """
1322 if not self.isWriteable():
1323 raise TypeError("Butler is read-only.")
1324 if format is None:
1325 if filename is None:
1326 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1327 else:
1328 _, format = os.path.splitext(filename)
1329 elif filename is None:
1330 filename = f"export.{format}"
1331 if directory is not None and not os.path.exists(filename):
1332 filename = os.path.join(directory, filename)
1333 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1334 with open(filename, 'r') as stream:
1335 backend = BackendClass(stream, self.registry)
1336 backend.register()
1337 with self.transaction():
1338 backend.load(self.datastore, directory=directory, transfer=transfer)
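# Illustrative sketch: move datasets between repositories by pairing export on
# the source butler with import_ on the destination. Paths, collection names,
# and the dataset type are hypothetical.
source = Butler("/path/to/source/repo", collections=["HSC/calib"])
with source.export(filename="export.yaml", directory="/data/staging", transfer="copy") as export:
    export.saveDatasets(source.registry.queryDatasets("flat", collections=["HSC/calib"]))
dest = Butler("/path/to/dest/repo", writeable=True)
dest.import_(filename="export.yaml", directory="/data/staging", transfer="symlink")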
1340 def validateConfiguration(self, logFailures: bool = False,
1341 datasetTypeNames: Optional[Iterable[str]] = None,
1342 ignore: Optional[Iterable[str]] = None):
1343 """Validate butler configuration.
1345 Checks that each `DatasetType` can be stored in the `Datastore`.
1347 Parameters
1348 ----------
1349 logFailures : `bool`, optional
1350 If `True`, output a log message for every validation error
1351 detected.
1352 datasetTypeNames : iterable of `str`, optional
1353 The `DatasetType` names that should be checked. This allows
1354 only a subset to be selected.
1355 ignore : iterable of `str`, optional
1356 Names of DatasetTypes to skip over. This can be used to skip
1357 known problems. If a named `DatasetType` corresponds to a
1358 composite, all components of that `DatasetType` will also be
1359 ignored.
1361 Raises
1362 ------
1363 ButlerValidationError
1364 Raised if there is some inconsistency with how this Butler
1365 is configured.
1366 """
1367 if datasetTypeNames:
1368 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1369 else:
1370 entities = list(self.registry.queryDatasetTypes())
1372 # filter out anything from the ignore list
1373 if ignore:
1374 ignore = set(ignore)
1375 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1376 else:
1377 ignore = set()
1379 # Find all the registered instruments
1380 instruments = set(
1381 dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
1382 )
1384 # For each datasetType that has an instrument dimension, create
1385 # a DatasetRef for each defined instrument
1386 datasetRefs = []
1388 for datasetType in entities:
1389 if "instrument" in datasetType.dimensions:
1390 for instrument in instruments:
1391 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1392 datasetRefs.append(datasetRef)
1394 entities.extend(datasetRefs)
1396 datastoreErrorStr = None
1397 try:
1398 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1399 except ValidationError as e:
1400 datastoreErrorStr = str(e)
1402 # Also check that the LookupKeys used by the datastores match
1403 # registry and storage class definitions
1404 keys = self.datastore.getLookupKeys()
1406 failedNames = set()
1407 failedDataId = set()
1408 for key in keys:
1409 datasetType = None
1410 if key.name is not None:
1411 if key.name in ignore:
1412 continue
1414 # skip if specific datasetType names were requested and this
1415 # name does not match
1416 if datasetTypeNames and key.name not in datasetTypeNames:
1417 continue
1419 # See if it is a StorageClass or a DatasetType
1420 if key.name in self.storageClasses:
1421 pass
1422 else:
1423 try:
1424 self.registry.getDatasetType(key.name)
1425 except KeyError:
1426 if logFailures:
1427 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1428 failedNames.add(key)
1429 else:
1430 # Dimensions are checked for consistency when the Butler
1431 # is created and rendezvoused with a universe.
1432 pass
1434 # Check that the instrument is a valid instrument
1435 # Currently only support instrument so check for that
1436 if key.dataId:
1437 dataIdKeys = set(key.dataId)
1438 if set(["instrument"]) != dataIdKeys:
1439 if logFailures:
1440 log.fatal("Key '%s' has unsupported DataId override", key)
1441 failedDataId.add(key)
1442 elif key.dataId["instrument"] not in instruments:
1443 if logFailures:
1444 log.fatal("Key '%s' has unknown instrument", key)
1445 failedDataId.add(key)
1447 messages = []
1449 if datastoreErrorStr:
1450 messages.append(datastoreErrorStr)
1452 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1453 (failedDataId, "Keys with bad DataId entries: ")):
1454 if failed:
1455 msg += ", ".join(str(k) for k in failed)
1456 messages.append(msg)
1458 if messages:
1459 raise ValidationError(";\n".join(messages))
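# Illustrative sketch: check that every registered DatasetType can be handled
# by the configured Datastore, logging each failure; the ignored name is
# hypothetical.
butler = Butler("/path/to/repo")
butler.validateConfiguration(logFailures=True, ignore=["skyMap"])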
1461 registry: Registry
1462 """The object that manages dataset metadata and relationships (`Registry`).
1464 Most operations that don't involve reading or writing butler datasets are
1465 accessible only via `Registry` methods.
1466 """
1468 datastore: Datastore
1469 """The object that manages actual dataset storage (`Datastore`).
1471 Direct user access to the datastore should rarely be necessary; the primary
1472 exception is the case where a `Datastore` implementation provides extra
1473 functionality beyond what the base class defines.
1474 """
1476 storageClasses: StorageClassFactory
1477 """An object that maps known storage class names to objects that fully
1478 describe them (`StorageClassFactory`).
1479 """
1481 collections: Optional[CollectionSearch]
1482 """The collections to search and any restrictions on the dataset types to
1483 search for within them, in order (`CollectionSearch`).
1484 """
1486 run: Optional[str]
1487 """Name of the run this butler writes outputs to (`str` or `None`).
1488 """
1490 tags: Tuple[str, ...]
1491 """Names of `~CollectionType.TAGGED` collections this butler associates
1492 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1493 (`tuple` [ `str` ]).
1494 """