Coverage for python/lsst/daf/butler/_butler.py : 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 TextIO,
44 Tuple,
45 Union,
46)
48try:
49 import boto3
50except ImportError:
51 boto3 = None
53from lsst.utils import doImport
54from .core import (
55 ButlerURI,
56 Config,
57 ConfigSubset,
58 DataCoordinate,
59 DataId,
60 DatasetRef,
61 DatasetType,
62 Datastore,
63 FileDataset,
64 Quantum,
65 RepoExport,
66 StorageClassFactory,
67 ValidationError,
68)
69from .core.repoRelocation import BUTLER_ROOT_TAG
70from .core.utils import transactional, getClassOf, safeMakeDir
71from .core.s3utils import bucketExists
72from ._deferredDatasetHandle import DeferredDatasetHandle
73from ._butlerConfig import ButlerConfig
74from .registry import Registry, RegistryConfig, CollectionType
75from .registry.wildcards import CollectionSearch
77log = logging.getLogger(__name__)
80class ButlerValidationError(ValidationError):
81 """There is a problem with the Butler configuration."""
82 pass
85class Butler:
86 """Main entry point for the data access system.
88 Parameters
89 ----------
90 config : `ButlerConfig`, `Config` or `str`, optional.
91 Configuration. Anything acceptable to the
92 `ButlerConfig` constructor. If a directory path
93 is given the configuration will be read from a ``butler.yaml`` file in
94 that location. If `None` is given default values will be used.
95 butler : `Butler`, optional.
96 If provided, construct a new Butler that uses the same registry and
97 datastore as the given one, but with the given collection and run.
98 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
99 arguments.
100 collections : `Any`, optional
101 An expression specifying the collections to be searched (in order) when
102 reading datasets, and optionally dataset type restrictions on them.
103 This may be:
104 - a `str` collection name;
105 - a tuple of (collection name, *dataset type restriction*);
106 - an iterable of either of the above;
107 - a mapping from `str` to *dataset type restriction*.
109 See :ref:`daf_butler_collection_expressions` for more information,
110 including the definition of a *dataset type restriction*. All
111 collections must either already exist or be specified to be created
112 by other arguments.
113 run : `str`, optional
114 Name of the run datasets should be output to. If the run
115 does not exist, it will be created. If ``collections`` is `None`, it
116 will be set to ``[run]``. If this is not set (and ``writeable`` is
117 not set either), a read-only butler will be created.
118 tags : `Iterable` [ `str` ], optional
119 A list of `~CollectionType.TAGGED` collections that datasets should be
120 associated with in `put` or `ingest` and disassociated from in
121 `pruneDatasets`. If any of these collections does not exist, it will
122 be created.
123 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
124 A mapping from the names of new `~CollectionType.CHAINED` collections
125 to an expression identifying their child collections (which takes the
126 same form as the ``collections`` argument). Chains may be nested only
127 if children precede their parents in this mapping.
128 searchPaths : `list` of `str`, optional
129 Directory paths to search when calculating the full Butler
130 configuration. Not used if the supplied config is already a
131 `ButlerConfig`.
132 writeable : `bool`, optional
133 Explicitly sets whether the butler supports write operations. If not
134 provided, a read-write butler is created if any of ``run``, ``tags``,
135 or ``chains`` is non-empty.
137 Examples
138 --------
139 While there are many ways to control exactly how a `Butler` interacts with
140 the collections in its `Registry`, the most common cases are still simple.
142 For a read-only `Butler` that searches one collection, do::
144 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
146 For a read-write `Butler` that writes to and reads from a
147 `~CollectionType.RUN` collection::
149 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
151 The `Butler` passed to a ``PipelineTask`` is often much more complex,
152 because we want to write to one `~CollectionType.RUN` collection but read
153 from several others (as well), while defining a new
154 `~CollectionType.CHAINED` collection that combines them all::
156 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
157 collections=["u/alice/DM-50000"],
158 chains={
159 "u/alice/DM-50000": ["u/alice/DM-50000/a",
160 "u/bob/DM-49998",
161 "raw/hsc"]
162 })
164 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
165 they'll also be available from the chained collection ``u/alice/DM-50000``.
166 Datasets will be read first from that run (since it appears first in the
167 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
168 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
169 would be unnecessary. We could also construct a butler that performs
170 exactly the same `put` and `get` operations without actually creating a
171 chained collection, just by passing multiple items in ``collections``::
173 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
174 collections=["u/alice/DM-50000/a",
175 "u/bob/DM-49998",
176 "raw/hsc"])
178 Finally, one can always create a `Butler` with no collections::
180 butler = Butler("/path/to/repo", writeable=True)
182 This can be extremely useful when you just want to use ``butler.registry``,
183 e.g. for inserting dimension data or managing collections, or when the
184 collections you want to use with the butler are not consistent.
185 Passing ``writeable`` explicitly here is only necessary if you want to be
186 able to make changes to the repo; usually the value for ``writeable``
187 can be guessed from the collection arguments provided, but it defaults to
188 `False` when there are no collection arguments.
189 """
190 def __init__(self, config: Union[Config, str, None] = None, *,
191 butler: Optional[Butler] = None,
192 collections: Any = None,
193 run: Optional[str] = None,
194 tags: Iterable[str] = (),
195 chains: Optional[Mapping[str, Any]] = None,
196 searchPaths: Optional[List[str]] = None,
197 writeable: Optional[bool] = None):
198 # Transform any single-pass iterator into an actual sequence so we
199 # can see if it is empty.
200 self.tags = tuple(tags)
201 # Load registry, datastore, etc. from config or existing butler.
202 if butler is not None:
203 if config is not None or searchPaths is not None or writeable is not None:
204 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
205 "arguments with 'butler' argument.")
206 self.registry = butler.registry
207 self.datastore = butler.datastore
208 self.storageClasses = butler.storageClasses
209 self._config = butler._config
210 else:
211 self._config = ButlerConfig(config, searchPaths=searchPaths)
212 if "root" in self._config:
213 butlerRoot = self._config["root"]
214 else:
215 butlerRoot = self._config.configDir
216 if writeable is None:
217 writeable = run is not None or chains is not None or self.tags
218 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
219 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
220 butlerRoot=butlerRoot)
221 self.storageClasses = StorageClassFactory()
222 self.storageClasses.addFromConfig(self._config)
223 # Check the many collection arguments for consistency and create any
224 # needed collections that don't exist.
225 if collections is None:
226 if run is not None:
227 collections = (run,)
228 else:
229 collections = ()
230 self.collections = CollectionSearch.fromExpression(collections)
231 if chains is None:
232 chains = {}
233 self.run = run
234 if "run" in self._config or "collection" in self._config:
235 raise ValueError("Passing a run or collection via configuration is no longer supported.")
236 if self.run is not None:
237 self.registry.registerCollection(self.run, type=CollectionType.RUN)
238 for tag in self.tags:
239 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
240 for parent, children in chains.items():
241 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
242 self.registry.setCollectionChain(parent, children)
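# A minimal construction sketch, assuming a repository already exists at
# "/path/to/repo"; the collection and run names are placeholders.
from lsst.daf.butler import Butler

# Read-only butler searching a single existing collection.
read_butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
# Read-write butler; the run is created if missing and, with no explicit
# ``collections``, is also used for reads.
write_butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
# Lightweight copy sharing the same registry and datastore (the ``butler``
# argument branch of ``__init__`` above) but searching different collections.
bob_butler = Butler(butler=write_butler, collections=["u/bob/DM-49998"])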
244 GENERATION: ClassVar[int] = 3
245 """This is a Generation 3 Butler.
247 This attribute may be removed in the future, once the Generation 2 Butler
248 interface has been fully retired; it should only be used in transitional
249 code.
250 """
252 @staticmethod
253 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
254 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
255 forceConfigRoot: bool = True, outfile: Optional[str] = None,
256 overwrite: bool = False) -> Config:
257 """Create an empty data repository by adding a butler.yaml config
258 to a repository root directory.
260 Parameters
261 ----------
262 root : `str` or `ButlerURI`
263 Path or URI to the root location of the new repository. Will be
264 created if it does not exist.
265 config : `Config` or `str`, optional
266 Configuration to write to the repository, after setting any
267 root-dependent Registry or Datastore config options. Can not
268 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
269 configuration will be used. Root-dependent config options
270 specified in this config are overwritten if ``forceConfigRoot``
271 is `True`.
272 standalone : `bool`
273 If True, write all expanded defaults, not just customized or
274 repository-specific settings.
275 This (mostly) decouples the repository from the default
276 configuration, insulating it from changes to the defaults (which
277 may be good or bad, depending on the nature of the changes).
278 Future *additions* to the defaults will still be picked up when
279 initializing `Butlers` to repos created with ``standalone=True``.
280 createRegistry : `bool`, optional
281 If `True` create a new Registry.
282 searchPaths : `list` of `str`, optional
283 Directory paths to search when calculating the full butler
284 configuration.
285 forceConfigRoot : `bool`, optional
286 If `False`, any values present in the supplied ``config`` that
287 would normally be reset are not overridden and will appear
288 directly in the output config. This allows non-standard overrides
289 of the root directory for a datastore or registry to be given.
290 If this parameter is `True` the values for ``root`` will be
291 forced into the resulting config if appropriate.
292 outfile : `str`, optional
293 If not `None`, the output configuration will be written to this
294 location rather than into the repository itself. Can be a URI
295 string. Can refer to a directory that will be used to write
296 ``butler.yaml``.
297 overwrite : `bool`, optional
298 Create a new configuration file even if one already exists
299 in the specified output location. Default is to raise
300 an exception.
302 Returns
303 -------
304 config : `Config`
305 The updated `Config` instance written to the repo.
307 Raises
308 ------
309 ValueError
310 Raised if a ButlerConfig or ConfigSubset is passed instead of a
311 regular Config (as these subclasses would make it impossible to
312 support ``standalone=False``).
313 FileExistsError
314 Raised if the output config file already exists.
315 os.error
316 Raised if the directory does not exist, exists but is not a
317 directory, or cannot be created.
319 Notes
320 -----
321 Note that when ``standalone=False`` (the default), the configuration
322 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
323 construct the repository should also be used to construct any Butlers
324 to avoid configuration inconsistencies.
325 """
326 if isinstance(config, (ButlerConfig, ConfigSubset)):
327 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
329 # for "file" schemes we are assuming POSIX semantics for paths, for
330 # schemeless URIs we are assuming os.path semantics.
331 uri = ButlerURI(root, forceDirectory=True)
332 if uri.scheme == "file" or not uri.scheme:
333 if not os.path.isdir(uri.ospath):
334 safeMakeDir(uri.ospath)
335 elif uri.scheme == "s3":
336 # bucket must already exist
337 if not bucketExists(uri.netloc):
338 raise ValueError(f"Bucket {uri.netloc} does not exist!")
339 s3 = boto3.client("s3")
340 # don't create S3 key when root is at the top-level of a Bucket
341 if not uri.path == "/":
342 s3.put_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
343 else:
344 raise ValueError(f"Unrecognized scheme: {uri.scheme}")
345 config = Config(config)
347 # If we are creating a new repo from scratch with relative roots,
348 # do not propagate an explicit root from the config file
349 if "root" in config:
350 del config["root"]
352 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
353 datastoreClass = doImport(full["datastore", "cls"])
354 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
356 # if key exists in given config, parse it, otherwise parse the defaults
357 # in the expanded config
358 if config.get(("registry", "db")):
359 registryConfig = RegistryConfig(config)
360 else:
361 registryConfig = RegistryConfig(full)
362 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
363 if defaultDatabaseUri is not None:
364 Config.updateParameters(RegistryConfig, config, full,
365 toUpdate={"db": defaultDatabaseUri},
366 overwrite=forceConfigRoot)
367 else:
368 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
369 overwrite=forceConfigRoot)
371 if standalone:
372 config.merge(full)
373 if outfile is not None:
374 # When writing to a separate location we must include
375 # the root of the butler repo in the config else it won't know
376 # where to look.
377 config["root"] = uri.geturl()
378 configURI = outfile
379 else:
380 configURI = uri
381 config.dumpToUri(configURI, overwrite=overwrite)
383 # Create Registry and populate tables
384 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
385 return config
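# A repository-creation sketch; the path is a placeholder and the default
# configuration (SQLite registry, file-based datastore) is assumed.
from lsst.daf.butler import Butler

config = Butler.makeRepo("/tmp/example_repo")          # writes butler.yaml, creates registry
butler = Butler("/tmp/example_repo", run="demo/run")   # run name is arbitrary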
387 @classmethod
388 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
389 tags: Tuple[str, ...], writeable: bool) -> Butler:
390 """Callable used to unpickle a Butler.
392 We prefer not to use ``Butler.__init__`` directly so we can force some
393 of its many arguments to be keyword-only (note that ``__reduce__``
394 can only invoke callables with positional arguments).
396 Parameters
397 ----------
398 config : `ButlerConfig`
399 Butler configuration, already coerced into a true `ButlerConfig`
400 instance (and hence after any search paths for overrides have been
401 utilized).
402 collections : `CollectionSearch`
403 Names of collections to read from.
404 run : `str`, optional
405 Name of `~CollectionType.RUN` collection to write to.
406 tags : `tuple` [`str`]
407 Names of `~CollectionType.TAGGED` collections to associate with.
408 writeable : `bool`
409 Whether the Butler should support write operations.
411 Returns
412 -------
413 butler : `Butler`
414 A new `Butler` instance.
415 """
416 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
418 def __reduce__(self):
419 """Support pickling.
420 """
421 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
422 self.registry.isWriteable()))
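# A pickling sketch, assuming ``butler`` is an existing Butler instance: the
# unpickled copy is rebuilt via ``_unpickle`` from the saved configuration,
# collections, run, tags, and writeability, which is what makes Butlers safe
# to send to worker processes.
import pickle

clone = pickle.loads(pickle.dumps(butler))
assert clone.isWriteable() == butler.isWriteable()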
424 def __str__(self):
425 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
426 self.collections, self.run, self.tags, self.datastore, self.registry)
428 def isWriteable(self) -> bool:
429 """Return `True` if this `Butler` supports write operations.
430 """
431 return self.registry.isWriteable()
433 @contextlib.contextmanager
434 def transaction(self):
435 """Context manager supporting `Butler` transactions.
437 Transactions can be nested.
438 """
439 with self.registry.transaction():
440 with self.datastore.transaction():
441 yield
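# A transaction sketch, assuming ``butler`` is writeable and "flat" is a
# registered dataset type whose dimensions match the (placeholder) data IDs;
# ``flat_a`` and ``flat_b`` stand for the in-memory objects being stored.
# If anything inside the outer ``with`` block raises, the registry inserts
# and datastore writes made within it are rolled back together.
with butler.transaction():
    ref_a = butler.put(flat_a, "flat", instrument="HSC", detector=10)
    with butler.transaction():  # nested transactions are supported
        ref_b = butler.put(flat_b, "flat", instrument="HSC", detector=11)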
443 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
444 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
445 """Standardize the arguments passed to several Butler APIs.
447 Parameters
448 ----------
449 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
450 When `DatasetRef` the `dataId` should be `None`.
451 Otherwise the `DatasetType` or name thereof.
452 dataId : `dict` or `DataCoordinate`
453 A `dict` of `Dimension` link name, value pairs that label the
454 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
455 should be provided as the first argument.
456 kwds
457 Additional keyword arguments used to augment or construct a
458 `DataCoordinate`. See `DataCoordinate.standardize`
459 parameters.
461 Returns
462 -------
463 datasetType : `DatasetType`
464 A `DatasetType` instance extracted from ``datasetRefOrType``.
465 dataId : `dict` or `DataId`, optional
466 Argument that can be used (along with ``kwds``) to construct a
467 `DataId`.
469 Notes
470 -----
471 Butler APIs that conceptually need a DatasetRef also allow passing a
472 `DatasetType` (or the name of one) and a `DataId` (or a dict and
473 keyword arguments that can be used to construct one) separately. This
474 method accepts those arguments and always returns a true `DatasetType`
475 and a `DataId` or `dict`.
477 Standardization of `dict` vs `DataId` is best handled by passing the
478 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
479 generally similarly flexible.
480 """
481 externalDatasetType = None
482 internalDatasetType = None
483 if isinstance(datasetRefOrType, DatasetRef):
484 if dataId is not None or kwds:
485 raise ValueError("DatasetRef given, cannot use dataId as well")
486 externalDatasetType = datasetRefOrType.datasetType
487 dataId = datasetRefOrType.dataId
488 else:
489 # Don't check whether DataId is provided, because Registry APIs
490 # can usually construct a better error message when it wasn't.
491 if isinstance(datasetRefOrType, DatasetType):
492 externalDatasetType = datasetRefOrType
493 else:
494 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
496 # Check that they are self-consistent
497 if externalDatasetType is not None:
498 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
499 if externalDatasetType != internalDatasetType:
500 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
501 f"registry definition ({internalDatasetType})")
503 return internalDatasetType, dataId
505 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
506 dataId: Optional[DataId] = None, *,
507 collections: Any = None,
508 allowUnresolved: bool = False,
509 **kwds: Any) -> DatasetRef:
510 """Shared logic for methods that start with a search for a dataset in
511 the registry.
513 Parameters
514 ----------
515 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
516 When `DatasetRef` the `dataId` should be `None`.
517 Otherwise the `DatasetType` or name thereof.
518 dataId : `dict` or `DataCoordinate`, optional
519 A `dict` of `Dimension` link name, value pairs that label the
520 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
521 should be provided as the first argument.
522 collections : Any, optional
523 Collections to be searched, overriding ``self.collections``.
524 Can be any of the types supported by the ``collections`` argument
525 to butler construction.
526 allowUnresolved : `bool`, optional
527 If `True`, return an unresolved `DatasetRef` if finding a resolved
528 one in the `Registry` fails. Defaults to `False`.
529 kwds
530 Additional keyword arguments used to augment or construct a
531 `DataId`. See `DataId` parameters.
533 Returns
534 -------
535 ref : `DatasetRef`
536 A reference to the dataset identified by the given arguments.
538 Raises
539 ------
540 LookupError
541 Raised if no matching dataset exists in the `Registry` (and
542 ``allowUnresolved is False``).
543 ValueError
544 Raised if a resolved `DatasetRef` was passed as an input, but it
545 differs from the one found in the registry.
546 TypeError
547 Raised if no collections were provided.
548 """
549 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
550 if isinstance(datasetRefOrType, DatasetRef):
551 idNumber = datasetRefOrType.id
552 else:
553 idNumber = None
554 # Expand the data ID first instead of letting registry.findDataset do
555 # it, so we get the result even if it returns None.
556 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
557 if collections is None:
558 collections = self.collections
559 if not collections:
560 raise TypeError("No input collections provided.")
561 else:
562 collections = CollectionSearch.fromExpression(collections)
563 # Always lookup the DatasetRef, even if one is given, to ensure it is
564 # present in the current collection.
565 ref = self.registry.findDataset(datasetType, dataId, collections=collections)
566 if ref is None:
567 if allowUnresolved:
568 return DatasetRef(datasetType, dataId)
569 else:
570 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
571 f"could not be found in collections {collections}.")
572 if idNumber is not None and idNumber != ref.id:
573 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
574 f"id ({ref.id}) in registry in collections {collections}.")
575 return ref
577 @transactional
578 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
579 dataId: Optional[DataId] = None, *,
580 producer: Optional[Quantum] = None,
581 run: Optional[str] = None,
582 tags: Optional[Iterable[str]] = None,
583 **kwds: Any) -> DatasetRef:
584 """Store and register a dataset.
586 Parameters
587 ----------
588 obj : `object`
589 The dataset.
590 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
591 When `DatasetRef` is provided, ``dataId`` should be `None`.
592 Otherwise the `DatasetType` or name thereof.
593 dataId : `dict` or `DataCoordinate`
594 A `dict` of `Dimension` link name, value pairs that label the
595 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
596 should be provided as the second argument.
597 producer : `Quantum`, optional
598 The producer.
599 run : `str`, optional
600 The name of the run the dataset should be added to, overriding
601 ``self.run``.
602 tags : `Iterable` [ `str` ], optional
603 The names of `~CollectionType.TAGGED` collections to associate
604 the dataset with, overriding ``self.tags``. These collections
605 must have already been added to the `Registry`.
606 kwds
607 Additional keyword arguments used to augment or construct a
608 `DataCoordinate`. See `DataCoordinate.standardize`
609 parameters.
611 Returns
612 -------
613 ref : `DatasetRef`
614 A reference to the stored dataset, updated with the correct id if
615 given.
617 Raises
618 ------
619 TypeError
620 Raised if the butler is read-only or if no run has been provided.
621 """
622 log.debug("Butler put: %s, dataId=%s, producer=%s, run=%s", datasetRefOrType, dataId, producer, run)
623 if not self.isWriteable():
624 raise TypeError("Butler is read-only.")
625 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
626 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
627 raise ValueError("DatasetRef must not be in registry, must have None id")
629 if run is None:
630 if self.run is None:
631 raise TypeError("No run provided.")
632 run = self.run
633 # No need to check type for run; first thing we do is
634 # insertDatasets, and that will check for us.
636 if tags is None:
637 tags = self.tags
638 else:
639 tags = tuple(tags)
640 for tag in tags:
641 # Check that these are tagged collections up front, because we want
642 # to avoid relying on Datastore transactionality to avoid modifying
643 # the repo if there's an error later.
644 collectionType = self.registry.getCollectionType(tag)
645 if collectionType is not CollectionType.TAGGED:
646 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
647 f"{collectionType.name}.")
649 # Add Registry Dataset entry.
650 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
651 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId],
652 producer=producer)
654 # Add Datastore entry.
655 self.datastore.put(obj, ref)
657 for tag in tags:
658 self.registry.associate(tag, [ref])
660 return ref
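# A ``put`` sketch, assuming a repository at "/path/to/repo" with a registered
# dataset type "calexp" whose dimensions are instrument+visit+detector;
# ``exposure`` stands for the in-memory object being stored (all placeholders).
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
data_id = {"instrument": "HSC", "visit": 903334, "detector": 20}
ref = butler.put(exposure, "calexp", data_id)  # returns a resolved DatasetRef
# Keyword arguments can stand in for (or augment) the data ID dict:
# butler.put(exposure, "calexp", instrument="HSC", visit=903334, detector=20)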
662 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
663 """Retrieve a stored dataset.
665 Unlike `Butler.get`, this method allows datasets outside the Butler's
666 collection to be read as long as the `DatasetRef` that identifies them
667 can be obtained separately.
669 Parameters
670 ----------
671 ref : `DatasetRef`
672 Reference to an already stored dataset.
673 parameters : `dict`
674 Additional StorageClass-defined options to control reading,
675 typically used to efficiently read only a subset of the dataset.
677 Returns
678 -------
679 obj : `object`
680 The dataset.
681 """
682 return self.datastore.get(ref, parameters=parameters)
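# A ``getDirect`` sketch: read via references found by a registry query, which
# need not come from this butler's own ``collections``. The dataset type and
# collection names are placeholders, and ``butler`` is assumed to exist.
for ref in butler.registry.queryDatasets("calexp", collections=["u/bob/DM-49998"]):
    exposure = butler.getDirect(ref)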
684 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
685 dataId: Optional[DataId] = None, *,
686 parameters: Union[dict, None] = None,
687 collections: Any = None,
688 **kwds: Any) -> DeferredDatasetHandle:
689 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
691 Parameters
692 ----------
693 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
694 When `DatasetRef` the `dataId` should be `None`.
695 Otherwise the `DatasetType` or name thereof.
696 dataId : `dict` or `DataCoordinate`, optional
697 A `dict` of `Dimension` link name, value pairs that label the
698 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
699 should be provided as the first argument.
700 parameters : `dict`
701 Additional StorageClass-defined options to control reading,
702 typically used to efficiently read only a subset of the dataset.
703 collections : Any, optional
704 Collections to be searched, overriding ``self.collections``.
705 Can be any of the types supported by the ``collections`` argument
706 to butler construction.
707 kwds
708 Additional keyword arguments used to augment or construct a
709 `DataId`. See `DataId` parameters.
711 Returns
712 -------
713 obj : `DeferredDatasetHandle`
714 A handle which can be used to retrieve a dataset at a later time.
716 Raises
717 ------
718 LookupError
719 Raised if no matching dataset exists in the `Registry` (and
720 ``allowUnresolved is False``).
721 ValueError
722 Raised if a resolved `DatasetRef` was passed as an input, but it
723 differs from the one found in the registry.
724 TypeError
725 Raised if no collections were provided.
726 """
727 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
728 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
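# A deferred-read sketch (placeholder names, ``butler`` assumed to exist):
# ``getDeferred`` resolves the dataset in the registry immediately but delays
# the datastore read until ``get()`` is called on the handle, which helps when
# only a subset of many candidate datasets will actually be read.
handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=20)
# ... later, only if the pixels are really needed ...
exposure = handle.get()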
730 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
731 dataId: Optional[DataId] = None, *,
732 parameters: Optional[Dict[str, Any]] = None,
733 collections: Any = None,
734 **kwds: Any) -> Any:
735 """Retrieve a stored dataset.
737 Parameters
738 ----------
739 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
740 When `DatasetRef` the `dataId` should be `None`.
741 Otherwise the `DatasetType` or name thereof.
742 dataId : `dict` or `DataCoordinate`
743 A `dict` of `Dimension` link name, value pairs that label the
744 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
745 should be provided as the first argument.
746 parameters : `dict`
747 Additional StorageClass-defined options to control reading,
748 typically used to efficiently read only a subset of the dataset.
749 collections : Any, optional
750 Collections to be searched, overriding ``self.collections``.
751 Can be any of the types supported by the ``collections`` argument
752 to butler construction.
753 kwds
754 Additional keyword arguments used to augment or construct a
755 `DataCoordinate`. See `DataCoordinate.standardize`
756 parameters.
758 Returns
759 -------
760 obj : `object`
761 The dataset.
763 Raises
764 ------
765 ValueError
766 Raised if a resolved `DatasetRef` was passed as an input, but it
767 differs from the one found in the registry.
768 LookupError
769 Raised if no matching dataset exists in the `Registry`.
770 TypeError
771 Raised if no collections were provided.
772 """
773 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
774 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
775 return self.getDirect(ref, parameters=parameters)
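# A ``get`` sketch, assuming a repository at "/path/to/repo" containing a
# (placeholder) "calexp" dataset in the listed collection.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
exposure = butler.get("calexp", instrument="HSC", visit=903334, detector=20)
# StorageClass-defined parameters can limit what is read, e.g.
# butler.get("calexp", ..., parameters={"bbox": bbox}) for a cutout; the valid
# parameter names depend on the storage class.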
777 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
778 dataId: Optional[DataId] = None, *,
779 predict: bool = False,
780 collections: Any = None,
781 run: Optional[str] = None,
782 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
783 """Returns the URIs associated with the dataset.
785 Parameters
786 ----------
787 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
788 When `DatasetRef` the `dataId` should be `None`.
789 Otherwise the `DatasetType` or name thereof.
790 dataId : `dict` or `DataCoordinate`
791 A `dict` of `Dimension` link name, value pairs that label the
792 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
793 should be provided as the first argument.
794 predict : `bool`
795 If `True`, allow URIs to be returned of datasets that have not
796 been written.
797 collections : Any, optional
798 Collections to be searched, overriding ``self.collections``.
799 Can be any of the types supported by the ``collections`` argument
800 to butler construction.
801 run : `str`, optional
802 Run to use for predictions, overriding ``self.run``.
803 kwds
804 Additional keyword arguments used to augment or construct a
805 `DataCoordinate`. See `DataCoordinate.standardize`
806 parameters.
808 Returns
809 -------
810 primary : `ButlerURI`
811 The URI to the primary artifact associated with this dataset.
812 If the dataset was disassembled within the datastore this
813 may be `None`.
814 components : `dict`
815 URIs to any components associated with the dataset artifact.
816 Can be empty if there are no components.
817 """
818 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
819 collections=collections, **kwds)
820 if ref.id is None: # only possible if predict is True
821 if run is None:
822 run = self.run
823 if run is None:
824 raise TypeError("Cannot predict location with run=None.")
825 # Lie about ID, because we can't guess it, and only
826 # Datastore.getURIs() will ever see it (and it doesn't use it).
827 ref = ref.resolved(id=0, run=run)
828 return self.datastore.getURIs(ref, predict)
830 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
831 dataId: Optional[DataId] = None, *,
832 predict: bool = False,
833 collections: Any = None,
834 run: Optional[str] = None,
835 **kwds: Any) -> ButlerURI:
836 """Return the URI to the Dataset.
838 Parameters
839 ----------
840 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
841 When `DatasetRef` the `dataId` should be `None`.
842 Otherwise the `DatasetType` or name thereof.
843 dataId : `dict` or `DataCoordinate`
844 A `dict` of `Dimension` link name, value pairs that label the
845 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
846 should be provided as the first argument.
847 predict : `bool`
848 If `True`, allow URIs to be returned of datasets that have not
849 been written.
850 collections : Any, optional
851 Collections to be searched, overriding ``self.collections``.
852 Can be any of the types supported by the ``collections`` argument
853 to butler construction.
854 run : `str`, optional
855 Run to use for predictions, overriding ``self.run``.
856 kwds
857 Additional keyword arguments used to augment or construct a
858 `DataCoordinate`. See `DataCoordinate.standardize`
859 parameters.
861 Returns
862 -------
863 uri : `ButlerURI`
864 URI pointing to the Dataset within the datastore. If the
865 Dataset does not exist in the datastore, and if ``predict`` is
866 `True`, the URI will be a prediction and will include a URI
867 fragment "#predicted".
868 If the datastore does not have entities that relate well
869 to the concept of a URI the returned URI string will be
870 descriptive. The returned URI is not guaranteed to be obtainable.
872 Raises
873 ------
874 LookupError
875 A URI has been requested for a dataset that does not exist and
876 guessing is not allowed.
877 ValueError
878 Raised if a resolved `DatasetRef` was passed as an input, but it
879 differs from the one found in the registry.
880 TypeError
881 Raised if no collections were provided.
882 RuntimeError
883 Raised if a URI is requested for a dataset that consists of
884 multiple artifacts.
885 """
886 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
887 collections=collections, run=run, **kwds)
889 if primary is None or components:
890 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
891 "Use Butler.getURIs() instead.")
892 return primary
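# A URI-lookup sketch (placeholder names, ``butler`` assumed to exist).
# ``getURI`` raises RuntimeError for disassembled datasets; ``getURIs``
# always works, returning the primary URI (possibly None) plus a dict of
# per-component URIs.
uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=20)
primary, components = butler.getURIs("calexp", instrument="HSC",
                                     visit=903334, detector=20)
# Predicting where a not-yet-written dataset would land:
future = butler.getURI("calexp", instrument="HSC", visit=903334, detector=21,
                       predict=True, run="u/alice/DM-50000/a")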
894 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
895 dataId: Optional[DataId] = None, *,
896 collections: Any = None,
897 **kwds: Any) -> bool:
898 """Return True if the Dataset is actually present in the Datastore.
900 Parameters
901 ----------
902 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
903 When `DatasetRef` the `dataId` should be `None`.
904 Otherwise the `DatasetType` or name thereof.
905 dataId : `dict` or `DataCoordinate`
906 A `dict` of `Dimension` link name, value pairs that label the
907 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
908 should be provided as the first argument.
909 collections : Any, optional
910 Collections to be searched, overriding ``self.collections``.
911 Can be any of the types supported by the ``collections`` argument
912 to butler construction.
913 kwds
914 Additional keyword arguments used to augment or construct a
915 `DataCoordinate`. See `DataCoordinate.standardize`
916 parameters.
918 Raises
919 ------
920 LookupError
921 Raised if the dataset is not even present in the Registry.
922 ValueError
923 Raised if a resolved `DatasetRef` was passed as an input, but it
924 differs from the one found in the registry.
925 TypeError
926 Raised if no collections were provided.
927 """
928 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
929 return self.datastore.exists(ref)
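# An existence-check sketch (placeholder names, ``butler`` assumed to exist).
# ``datasetExists`` reports on the datastore only; a dataset missing from the
# registry altogether raises LookupError rather than returning False.
try:
    stored = butler.datasetExists("calexp", instrument="HSC",
                                  visit=903334, detector=20)
except LookupError:
    stored = False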
931 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
932 """Remove a collection and possibly prune datasets within it.
934 Parameters
935 ----------
936 name : `str`
937 Name of the collection to remove. If this is a
938 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
939 datasets within the collection are not modified unless ``unstore``
940 is `True`. If this is a `~CollectionType.RUN` collection,
941 ``purge`` and ``unstore`` must be `True`, and all datasets in it
942 are fully removed from the data repository.
943 purge : `bool`, optional
944 If `True`, permit `~CollectionType.RUN` collections to be removed,
945 fully removing datasets within them. Requires ``unstore=True`` as
946 well as an added precaution against accidental deletion. Must be
947 `False` (default) if the collection is not a ``RUN``.
948 unstore : `bool`, optional
949 If `True`, remove all datasets in the collection from all
950 datastores in which they appear.
952 Raises
953 ------
954 TypeError
955 Raised if the butler is read-only or arguments are mutually
956 inconsistent.
957 """
958 # See pruneDatasets comments for more information about the logic here;
959 # the cases are almost the same, but here we can rely on Registry to
960 # take care of everything but Datastore deletion when we remove the
961 # collection.
962 if not self.isWriteable():
963 raise TypeError("Butler is read-only.")
964 if purge and not unstore:
965 raise TypeError("Cannot pass purge=True without unstore=True.")
966 collectionType = self.registry.getCollectionType(name)
967 if collectionType is CollectionType.RUN and not purge:
968 raise TypeError(f"Cannot prune RUN collection {name} without purge=True.")
969 if collectionType is not CollectionType.RUN and purge:
970 raise TypeError(f"Cannot prune {collectionType.name} collection {name} with purge=True.")
971 with self.registry.transaction():
972 if unstore:
973 for ref in self.registry.queryDatasets(..., collections=name, deduplicate=True):
974 if self.datastore.exists(ref):
975 self.datastore.trash(ref)
976 self.registry.removeCollection(name)
977 if unstore:
978 # Point of no return for removing artifacts
979 self.datastore.emptyTrash()
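# A collection-removal sketch (collection names are placeholders, ``butler``
# assumed to be writeable). TAGGED/CHAINED collections can be removed without
# touching their datasets; RUN collections additionally require purge/unstore.
butler.pruneCollection("u/alice/DM-50000")  # drop a CHAINED or TAGGED collection
butler.pruneCollection("u/alice/DM-50000/a", purge=True, unstore=True)  # delete a RUN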
981 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
982 disassociate: bool = True,
983 unstore: bool = False,
984 tags: Optional[Iterable[str]] = None,
985 purge: bool = False,
986 run: Optional[str] = None):
987 """Remove one or more datasets from a collection and/or storage.
989 Parameters
990 ----------
991 refs : `~collections.abc.Iterable` of `DatasetRef`
992 Datasets to prune. These must be "resolved" references (not just
993 a `DatasetType` and data ID).
994 disassociate : `bool`, optional
995 Disassociate pruned datasets from ``self.tags`` (or the collections
996 given via the ``tags`` argument). Ignored if ``refs`` is ``...``.
997 unstore : `bool`, optional
998 If `True` (`False` is default) remove these datasets from all
999 datastores known to this butler. Note that this will make it
1000 impossible to retrieve these datasets even via other collections.
1001 Datasets that are already not stored are ignored by this option.
1002 tags : `Iterable` [ `str` ], optional
1003 `~CollectionType.TAGGED` collections to disassociate the datasets
1004 from, overriding ``self.tags``. Ignored if ``disassociate`` is
1005 `False` or ``purge`` is `True`.
1006 purge : `bool`, optional
1007 If `True` (`False` is default), completely remove the dataset from
1008 the `Registry`. To prevent accidental deletions, ``purge`` may
1009 only be `True` if all of the following conditions are met:
1011 - All given datasets are in the given run;
1012 - ``disassociate`` is `True`;
1013 - ``unstore`` is `True`.
1015 This mode may remove provenance information from datasets other
1016 than those provided, and should be used with extreme care.
1017 run : `str`, optional
1018 `~CollectionType.RUN` collection to purge from, overriding
1019 ``self.run``. Ignored unless ``purge`` is `True`.
1021 Raises
1022 ------
1023 TypeError
1024 Raised if the butler is read-only, if no collection was provided,
1025 or the conditions for ``purge=True`` were not met.
1026 """
1027 if not self.isWriteable():
1028 raise TypeError("Butler is read-only.")
1029 if purge:
1030 if not disassociate:
1031 raise TypeError("Cannot pass purge=True without disassociate=True.")
1032 if not unstore:
1033 raise TypeError("Cannot pass purge=True without unstore=True.")
1034 if run is None:
1035 run = self.run
1036 if run is None:
1037 raise TypeError("No run provided but purge=True.")
1038 collectionType = self.registry.getCollectionType(run)
1039 if collectionType is not CollectionType.RUN:
1040 raise TypeError(f"Cannot purge from collection '{run}' "
1041 f"of non-RUN type {collectionType.name}.")
1042 elif disassociate:
1043 if tags is None:
1044 tags = self.tags
1045 else:
1046 tags = tuple(tags)
1047 if not tags:
1048 raise TypeError("No tags provided but disassociate=True.")
1049 for tag in tags:
1050 collectionType = self.registry.getCollectionType(tag)
1051 if collectionType is not CollectionType.TAGGED:
1052 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1053 f"of non-TAGGED type {collectionType.name}.")
1054 # Transform possibly-single-pass iterable into something we can iterate
1055 # over multiple times.
1056 refs = list(refs)
1057 # Pruning a component of a DatasetRef makes no sense since registry
1058 # doesn't know about components and datastore might not store
1059 # components in a separate file
1060 for ref in refs:
1061 if ref.datasetType.component():
1062 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1063 # We don't need an unreliable Datastore transaction for this, because
1064 # we've been extra careful to ensure that Datastore.trash only involves
1065 # mutating the Registry (it can _look_ at Datastore-specific things,
1066 # but shouldn't change them), and hence all operations here are
1067 # Registry operations.
1068 with self.registry.transaction():
1069 if unstore:
1070 for ref in refs:
1071 # There is a difference between a concrete composite
1072 # and virtual composite. In a virtual composite the
1073 # datastore is never given the top level DatasetRef. In
1074 # the concrete composite the datastore knows all the
1075 # refs and will clean up itself if asked to remove the
1076 # parent ref. We can not check configuration for this
1077 # since we can not trust that the configuration is the
1078 # same. We therefore have to ask if the ref exists or
1079 # not. This is consistent with the fact that we want
1080 # to ignore already-removed-from-datastore datasets
1081 # anyway.
1082 if self.datastore.exists(ref):
1083 self.datastore.trash(ref)
1084 if purge:
1085 self.registry.removeDatasets(refs)
1086 elif disassociate:
1087 for tag in tags:
1088 self.registry.disassociate(tag, refs)
1089 # We've exited the Registry transaction, and apparently committed.
1090 # (if there was an exception, everything rolled back, and it's as if
1091 # nothing happened - and we never get here).
1092 # Datastore artifacts are not yet gone, but they're clearly marked
1093 # as trash, so if we fail to delete now because of (e.g.) filesystem
1094 # problems we can try again later, and if manual administrative
1095 # intervention is required, it's pretty clear what that should entail:
1096 # deleting everything on disk and in private Datastore tables that is
1097 # in the dataset_location_trash table.
1098 if unstore:
1099 # Point of no return for removing artifacts
1100 self.datastore.emptyTrash()
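# A dataset-pruning sketch (placeholder names, ``butler`` assumed writeable).
# ``Registry.queryDatasets`` is the usual way to build ``refs``; here every
# "calexp" in one RUN collection is removed outright, which requires the full
# purge/unstore combination described above.
refs = butler.registry.queryDatasets("calexp", collections=["u/alice/DM-50000/a"])
butler.pruneDatasets(refs, purge=True, unstore=True, run="u/alice/DM-50000/a")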
1102 @transactional
1103 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None, run: Optional[str] = None,
1104 tags: Optional[Iterable[str]] = None,):
1105 """Store and register one or more datasets that already exist on disk.
1107 Parameters
1108 ----------
1109 datasets : `FileDataset`
1110 Each positional argument is a struct containing information about
1111 a file to be ingested, including its path (either absolute or
1112 relative to the datastore root, if applicable), a `DatasetRef`,
1113 and optionally a formatter class or its fully-qualified string
1114 name. If a formatter is not provided, the formatter that would be
1115 used for `put` is assumed. On successful return, all
1116 `FileDataset.ref` attributes will have their `DatasetRef.id`
1117 attribute populated and all `FileDataset.formatter` attributes will
1118 be set to the formatter class used. `FileDataset.path` attributes
1119 may be modified to put paths in whatever the datastore considers a
1120 standardized form.
1121 transfer : `str`, optional
1122 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1123 'relsymlink' or 'symlink', indicating how to transfer the file.
1124 run : `str`, optional
1125 The name of the run ingested datasets should be added to,
1126 overriding ``self.run``.
1127 tags : `Iterable` [ `str` ], optional
1128 The names of `~CollectionType.TAGGED` collections to associate
1129 the dataset with, overriding ``self.tags``. These collections
1130 must have already been added to the `Registry`.
1132 Raises
1133 ------
1134 TypeError
1135 Raised if the butler is read-only or if no run was provided.
1136 NotImplementedError
1137 Raised if the `Datastore` does not support the given transfer mode.
1138 DatasetTypeNotSupportedError
1139 Raised if one or more files to be ingested have a dataset type that
1140 is not supported by the `Datastore`.
1141 FileNotFoundError
1142 Raised if one of the given files does not exist.
1143 FileExistsError
1144 Raised if transfer is not `None` but the (internal) location the
1145 file would be moved to is already occupied.
1147 Notes
1148 -----
1149 This operation is not fully exception safe: if a database operation
1150 fails, the given `FileDataset` instances may be only partially updated.
1152 It is atomic in terms of database operations (they will either all
1153 succeed or all fail) providing the database engine implements
1154 transactions correctly. It will attempt to be atomic in terms of
1155 filesystem operations as well, but this cannot be implemented
1156 rigorously for most datastores.
1157 """
1158 if not self.isWriteable():
1159 raise TypeError("Butler is read-only.")
1160 if run is None:
1161 if self.run is None:
1162 raise TypeError("No run provided.")
1163 run = self.run
1164 # No need to check run type, since insertDatasets will do that
1165 # (safely) for us.
1166 if tags is None:
1167 tags = self.tags
1168 else:
1169 tags = tuple(tags)
1170 for tag in tags:
1171 # Check that these are tagged collections up front, because we want
1172 # to avoid relying on Datastore transactionality to avoid modifying
1173 # the repo if there's an error later.
1174 collectionType = self.registry.getCollectionType(tag)
1175 if collectionType is not CollectionType.TAGGED:
1176 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1177 f"{collectionType.name}.")
1178 # Reorganize the inputs so they're grouped by DatasetType and then
1179 # data ID. We also include a list of DatasetRefs for each FileDataset
1180 # to hold the resolved DatasetRefs returned by the Registry, before
1181 # it's safe to swap them into FileDataset.refs.
1182 # Some type annotation aliases to make that clearer:
1183 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1184 GroupedData = MutableMapping[DatasetType, GroupForType]
1185 # The actual data structure:
1186 groupedData: GroupedData = defaultdict(dict)
1187 # And the nested loop that populates it:
1188 for dataset in datasets:
1189 # This list is intentionally shared across the inner loop, since it's
1190 # associated with `dataset`.
1191 resolvedRefs = []
1192 for ref in dataset.refs:
1193 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1195 # Now we can bulk-insert into Registry for each DatasetType.
1196 allResolvedRefs = []
1197 for datasetType, groupForType in groupedData.items():
1198 refs = self.registry.insertDatasets(datasetType,
1199 dataIds=groupForType.keys(),
1200 run=run)
1201 # Append those resolved DatasetRefs to the new lists we set up for
1202 # them.
1203 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1204 resolvedRefs.append(ref)
1206 # Go back to the original FileDatasets to replace their refs with the
1207 # new resolved ones, and also build a big list of all refs.
1208 allResolvedRefs = []
1209 for groupForType in groupedData.values():
1210 for dataset, resolvedRefs in groupForType.values():
1211 dataset.refs = resolvedRefs
1212 allResolvedRefs.extend(resolvedRefs)
1214 # Bulk-associate everything with any tagged collections.
1215 for tag in tags:
1216 self.registry.associate(tag, allResolvedRefs)
1218 # Bulk-insert everything into Datastore.
1219 self.datastore.ingest(*datasets, transfer=transfer)
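# An ingest sketch: pair an existing on-disk file with an unresolved
# DatasetRef and register+link it in one call. The file path, the "raw"
# dataset type (assumed already registered), and the data ID values are all
# placeholders; ``butler`` is assumed to be writeable with a default run.
from lsst.daf.butler import DatasetRef, FileDataset

raw_type = butler.registry.getDatasetType("raw")
ref = DatasetRef(raw_type, {"instrument": "HSC", "exposure": 903334,
                            "detector": 20})
butler.ingest(FileDataset(path="/data/HSC-0903334-020.fits", refs=[ref]),
              transfer="symlink")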
1221 @contextlib.contextmanager
1222 def export(self, *, directory: Optional[str] = None,
1223 filename: Optional[str] = None,
1224 format: Optional[str] = None,
1225 transfer: Optional[str] = None) -> ContextManager[RepoExport]:
1226 """Export datasets from the repository represented by this `Butler`.
1228 This method is a context manager that returns a helper object
1229 (`RepoExport`) that is used to indicate what information from the
1230 repository should be exported.
1232 Parameters
1233 ----------
1234 directory : `str`, optional
1235 Directory dataset files should be written to if ``transfer`` is not
1236 `None`.
1237 filename : `str`, optional
1238 Name for the file that will include database information associated
1239 with the exported datasets. If this is not an absolute path and
1240 ``directory`` is not `None`, it will be written to ``directory``
1241 instead of the current working directory. Defaults to
1242 "export.{format}".
1243 format : `str`, optional
1244 File format for the database information file. If `None`, the
1245 extension of ``filename`` will be used.
1246 transfer : `str`, optional
1247 Transfer mode passed to `Datastore.export`.
1249 Raises
1250 ------
1251 TypeError
1252 Raised if the set of arguments passed is inconsistent.
1254 Examples
1255 --------
1256 Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
1257 methods are used to provide the iterables over data IDs and/or datasets
1258 to be exported::
1260 with butler.export(filename="exports.yaml") as export:
1261 # Export all flats, and the calibration_label dimensions
1262 # associated with them.
1263 export.saveDatasets(butler.registry.queryDatasets("flat"),
1264 elements=[butler.registry.dimensions["calibration_label"]])
1265 # Export all datasets that start with "deepCoadd_" and all of
1266 # their associated data ID information.
1267 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1268 """
1269 if directory is None and transfer is not None:
1270 raise TypeError("Cannot transfer without providing a directory.")
1271 if transfer == "move":
1272 raise TypeError("Transfer may not be 'move': export is read-only")
1273 if format is None:
1274 if filename is None:
1275 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1276 else:
1277 _, format = os.path.splitext(filename)
1278 elif filename is None:
1279 filename = f"export.{format}"
1280 if directory is not None:
1281 filename = os.path.join(directory, filename)
1282 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1283 with open(filename, 'w') as stream:
1284 backend = BackendClass(stream)
1285 try:
1286 helper = RepoExport(self.registry, self.datastore, backend=backend,
1287 directory=directory, transfer=transfer)
1288 yield helper
1289 except BaseException:
1290 raise
1291 else:
1292 helper._finish()
1294 def import_(self, *, directory: Optional[str] = None,
1295 filename: Union[str, TextIO, None] = None,
1296 format: Optional[str] = None,
1297 transfer: Optional[str] = None):
1298 """Import datasets exported from a different butler repository.
1300 Parameters
1301 ----------
1302 directory : `str`, optional
1303 Directory containing dataset files. If `None`, all file paths
1304 must be absolute.
1305 filename : `str` or `TextIO`, optional
1306 A stream or name of file that contains database information
1307 associated with the exported datasets. If this a string (name) and
1308 is not an absolute path, does not exist in the current working
1309 directory, and ``directory`` is not `None`, it is assumed to be in
1310 ``directory``. Defaults to "export.{format}".
1311 format : `str`, optional
1312 File format for the database information file. If `None`, the
1313 extension of ``filename`` will be used.
1314 transfer : `str`, optional
1315 Transfer mode passed to `Datastore.export`.
1317 Raises
1318 ------
1319 TypeError
1320 Raised if the set of arguments passed is inconsistent, or if the
1321 butler is read-only.
1322 """
1323 if not self.isWriteable():
1324 raise TypeError("Butler is read-only.")
1325 if format is None:
1326 if filename is None:
1327 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1328 else:
1329 _, format = os.path.splitext(filename)
1330 elif filename is None:
1331 filename = f"export.{format}"
1332 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1333 filename = os.path.join(directory, filename)
1334 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1336 def doImport(importStream):
1337 backend = BackendClass(importStream, self.registry)
1338 backend.register()
1339 with self.transaction():
1340 backend.load(self.datastore, directory=directory, transfer=transfer)
1342 if isinstance(filename, str):
1343 with open(filename, "r") as stream:
1344 doImport(stream)
1345 else:
1346 doImport(filename)
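# An export/import round-trip sketch (paths, collection and dataset type names
# are placeholders; ``butler`` is an existing Butler). ``format`` is passed
# explicitly so both repositories agree on the YAML backend.
from lsst.daf.butler import Butler

with butler.export(directory="/tmp/export", filename="export.yaml",
                   format="yaml", transfer="copy") as export:
    export.saveDatasets(butler.registry.queryDatasets(
        "calexp", collections=["u/alice/DM-50000"]))
other = Butler("/path/to/other_repo", run="imported/run")
other.import_(directory="/tmp/export", filename="export.yaml",
              format="yaml", transfer="symlink")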
1348 def validateConfiguration(self, logFailures: bool = False,
1349 datasetTypeNames: Optional[Iterable[str]] = None,
1350 ignore: Optional[Iterable[str]] = None):
1351 """Validate butler configuration.
1353 Checks that each `DatasetType` can be stored in the `Datastore`.
1355 Parameters
1356 ----------
1357 logFailures : `bool`, optional
1358 If `True`, output a log message for every validation error
1359 detected.
1360 datasetTypeNames : iterable of `str`, optional
1361 The `DatasetType` names that should be checked. This allows
1362 only a subset to be selected.
1363 ignore : iterable of `str`, optional
1364 Names of DatasetTypes to skip over. This can be used to skip
1365 known problems. If a named `DatasetType` corresponds to a
1366 composite, all components of that `DatasetType` will also be
1367 ignored.
1369 Raises
1370 ------
1371 ButlerValidationError
1372 Raised if there is some inconsistency with how this Butler
1373 is configured.
1374 """
1375 if datasetTypeNames:
1376 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1377 else:
1378 entities = list(self.registry.queryDatasetTypes())
1380 # filter out anything from the ignore list
1381 if ignore:
1382 ignore = set(ignore)
1383 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1384 else:
1385 ignore = set()
1387 # Find all the registered instruments
1388 instruments = set(
1389 dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
1390 )
1392 # For each datasetType that has an instrument dimension, create
1393 # a DatasetRef for each defined instrument
1394 datasetRefs = []
1396 for datasetType in entities:
1397 if "instrument" in datasetType.dimensions:
1398 for instrument in instruments:
1399 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1400 datasetRefs.append(datasetRef)
1402 entities.extend(datasetRefs)
1404 datastoreErrorStr = None
1405 try:
1406 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1407 except ValidationError as e:
1408 datastoreErrorStr = str(e)
1410 # Also check that the LookupKeys used by the datastores match
1411 # registry and storage class definitions
1412 keys = self.datastore.getLookupKeys()
1414 failedNames = set()
1415 failedDataId = set()
1416 for key in keys:
1417 datasetType = None
1418 if key.name is not None:
1419 if key.name in ignore:
1420 continue
1422 # skip if specific datasetType names were requested and this
1423 # name does not match
1424 if datasetTypeNames and key.name not in datasetTypeNames:
1425 continue
1427 # See if it is a StorageClass or a DatasetType
1428 if key.name in self.storageClasses:
1429 pass
1430 else:
1431 try:
1432 self.registry.getDatasetType(key.name)
1433 except KeyError:
1434 if logFailures:
1435 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1436 failedNames.add(key)
1437 else:
1438 # Dimensions are checked for consistency when the Butler
1439 # is created and rendezvoused with a universe.
1440 pass
1442 # Check that the instrument is a valid instrument
1443 # Currently only instrument is supported, so check for that
1444 if key.dataId:
1445 dataIdKeys = set(key.dataId)
1446 if set(["instrument"]) != dataIdKeys:
1447 if logFailures:
1448 log.fatal("Key '%s' has unsupported DataId override", key)
1449 failedDataId.add(key)
1450 elif key.dataId["instrument"] not in instruments:
1451 if logFailures:
1452 log.fatal("Key '%s' has unknown instrument", key)
1453 failedDataId.add(key)
1455 messages = []
1457 if datastoreErrorStr:
1458 messages.append(datastoreErrorStr)
1460 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1461 (failedDataId, "Keys with bad DataId entries: ")):
1462 if failed:
1463 msg += ", ".join(str(k) for k in failed)
1464 messages.append(msg)
1466 if messages:
1467 raise ValidationError(";\n".join(messages))
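# A validation sketch: confirm every registered dataset type can be handled by
# the configured datastore, logging individual failures and skipping a
# known-problematic (placeholder) dataset type name.
from lsst.daf.butler import Butler
from lsst.daf.butler.core import ValidationError

butler = Butler("/path/to/repo")
try:
    butler.validateConfiguration(logFailures=True, ignore=["brokenType"])
except ValidationError as err:
    print(f"Repository configuration problems:\n{err}")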
1469 registry: Registry
1470 """The object that manages dataset metadata and relationships (`Registry`).
1472 Most operations that don't involve reading or writing butler datasets are
1473 accessible only via `Registry` methods.
1474 """
1476 datastore: Datastore
1477 """The object that manages actual dataset storage (`Datastore`).
1479 Direct user access to the datastore should rarely be necessary; the primary
1480 exception is the case where a `Datastore` implementation provides extra
1481 functionality beyond what the base class defines.
1482 """
1484 storageClasses: StorageClassFactory
1485 """An object that maps known storage class names to objects that fully
1486 describe them (`StorageClassFactory`).
1487 """
1489 collections: Optional[CollectionSearch]
1490 """The collections to search and any restrictions on the dataset types to
1491 search for within them, in order (`CollectionSearch`).
1492 """
1494 run: Optional[str]
1495 """Name of the run this butler writes outputs to (`str` or `None`).
1496 """
1498 tags: Tuple[str, ...]
1499 """Names of `~CollectionType.TAGGED` collections this butler associates
1500 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1501 (`tuple` [ `str` ]).
1502 """