# python/lsst/daf/butler/_butler.py
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 Tuple,
44 Union,
45)
47try:
48 import boto3
49except ImportError:
50 boto3 = None
52from lsst.utils import doImport
53from .core import (
54 ButlerURI,
55 CompositesMap,
56 Config,
57 ConfigSubset,
58 DataCoordinate,
59 DataId,
60 DatasetRef,
61 DatasetType,
62 Datastore,
63 FileDataset,
64 Quantum,
65 RepoExport,
66 StorageClassFactory,
67 ValidationError,
68)
69from .core.repoRelocation import BUTLER_ROOT_TAG
70from .core.safeFileIo import safeMakeDir
71from .core.utils import transactional, getClassOf
72from .core.s3utils import bucketExists
73from ._deferredDatasetHandle import DeferredDatasetHandle
74from ._butlerConfig import ButlerConfig
75from .registry import Registry, RegistryConfig, CollectionType
76from .registry.wildcards import CollectionSearch
78log = logging.getLogger(__name__)
81class ButlerValidationError(ValidationError):
82 """There is a problem with the Butler configuration."""
83 pass
86class Butler:
87 """Main entry point for the data access system.
89 Parameters
90 ----------
91 config : `ButlerConfig`, `Config` or `str`, optional.
92 Configuration. Anything acceptable to the
93 `ButlerConfig` constructor. If a directory path
94 is given the configuration will be read from a ``butler.yaml`` file in
95 that location. If `None` is given default values will be used.
96 butler : `Butler`, optional.
97 If provided, construct a new Butler that uses the same registry and
98 datastore as the given one, but with the given collection and run.
99 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
100 arguments.
101 collections : `Any`, optional
102 An expression specifying the collections to be searched (in order) when
103 reading datasets, and optionally dataset type restrictions on them.
104 This may be:
105 - a `str` collection name;
106 - a tuple of (collection name, *dataset type restriction*);
107 - an iterable of either of the above;
108 - a mapping from `str` to *dataset type restriction*.
110 See :ref:`daf_butler_collection_expressions` for more information,
111 including the definition of a *dataset type restriction*. All
112 collections must either already exist or be specified to be created
113 by other arguments.
114 run : `str`, optional
115 Name of the run datasets should be output to. If the run
116 does not exist, it will be created. If ``collections`` is `None`, it
117 will be set to ``[run]``. If this is not set (and ``writeable`` is
118 not set either), a read-only butler will be created.
119 tags : `Iterable` [ `str` ], optional
120 A list of `~CollectionType.TAGGED` collections that datasets should be
121 associated with in `put` or `ingest` and disassociated from in
122 `pruneDatasets`. If any of these collections does not exist, it will
123 be created.
124 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
125 A mapping from the names of new `~CollectionType.CHAINED` collections
126 to an expression identifying their child collections (which takes the
127 same form as the ``collections`` argument). Chains may be nested only
128 if children precede their parents in this mapping.
129 searchPaths : `list` of `str`, optional
130 Directory paths to search when calculating the full Butler
131 configuration. Not used if the supplied config is already a
132 `ButlerConfig`.
133 writeable : `bool`, optional
134 Explicitly sets whether the butler supports write operations. If not
135 provided, a read-write butler is created if any of ``run``, ``tags``,
136 or ``chains`` is non-empty.
138 Examples
139 --------
140 While there are many ways to control exactly how a `Butler` interacts with
141 the collections in its `Registry`, the most common cases are still simple.
143 For a read-only `Butler` that searches one collection, do::
145 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
147 For a read-write `Butler` that writes to and reads from a
148 `~CollectionType.RUN` collection::
150 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
152 The `Butler` passed to a ``PipelineTask`` is often much more complex,
153 because we want to write to one `~CollectionType.RUN` collection but read
154 from several others (as well), while defining a new
155 `~CollectionType.CHAINED` collection that combines them all::
157 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
158 collections=["u/alice/DM-50000"],
159 chains={
160 "u/alice/DM-50000": ["u/alice/DM-50000/a",
161 "u/bob/DM-49998",
162 "raw/hsc"]
163 })
165 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
166 they'll also be available from the chained collection ``u/alice/DM-50000``.
167 Datasets will be read first from that run (since it appears first in the
168 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
169 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
170 would be unnecessary. We could also construct a butler that performs
171 exactly the same `put` and `get` operations without actually creating a
172 chained collection, just by passing multiple items in ``collections``::
174 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
175 collections=["u/alice/DM-50000/a",
176 "u/bob/DM-49998",
177 "raw/hsc"])
179 Finally, one can always create a `Butler` with no collections::
181 butler = Butler("/path/to/repo", writeable=True)
183 This can be extremely useful when you just want to use ``butler.registry``,
184 e.g. for inserting dimension data or managing collections, or when the
185 collections you want to use with the butler are not consistent.
186 Passing ``writeable`` explicitly here is only necessary if you want to be
187 able to make changes to the repo; usually the value for ``writeable``
188 can be guessed from the collection arguments provided, but it defaults to
189 `False` when no collection arguments are given.
190 """
191 def __init__(self, config: Union[Config, str, None] = None, *,
192 butler: Optional[Butler] = None,
193 collections: Any = None,
194 run: Optional[str] = None,
195 tags: Iterable[str] = (),
196 chains: Optional[Mapping[str, Any]] = None,
197 searchPaths: Optional[List[str]] = None,
198 writeable: Optional[bool] = None):
199 # Transform any single-pass iterator into an actual sequence so we
200 # can see if it's empty
201 self.tags = tuple(tags)
202 # Load registry, datastore, etc. from config or existing butler.
203 if butler is not None:
204 if config is not None or searchPaths is not None or writeable is not None:
205 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
206 "arguments with 'butler' argument.")
207 self.registry = butler.registry
208 self.datastore = butler.datastore
209 self.storageClasses = butler.storageClasses
210 self._composites = butler._composites
211 self._config = butler._config
212 else:
213 self._config = ButlerConfig(config, searchPaths=searchPaths)
214 if "root" in self._config:
215 butlerRoot = self._config["root"]
216 else:
217 butlerRoot = self._config.configDir
218 if writeable is None:
219 writeable = run is not None or chains is not None or self.tags
220 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
221 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
222 butlerRoot=butlerRoot)
223 self.storageClasses = StorageClassFactory()
224 self.storageClasses.addFromConfig(self._config)
225 self._composites = CompositesMap(self._config, universe=self.registry.dimensions)
226 # Check the many collection arguments for consistency and create any
227 # needed collections that don't exist.
228 if collections is None:
229 if run is not None:
230 collections = (run,)
231 else:
232 collections = ()
233 self.collections = CollectionSearch.fromExpression(collections)
234 if chains is None:
235 chains = {}
236 self.run = run
237 if "run" in self._config or "collection" in self._config:
238 raise ValueError("Passing a run or collection via configuration is no longer supported.")
239 if self.run is not None:
240 self.registry.registerCollection(self.run, type=CollectionType.RUN)
241 for tag in self.tags:
242 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
243 for parent, children in chains.items():
244 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
245 self.registry.setCollectionChain(parent, children)
247 GENERATION: ClassVar[int] = 3
248 """This is a Generation 3 Butler.
250 This attribute may be removed in the future, once the Generation 2 Butler
251 interface has been fully retired; it should only be used in transitional
252 code.
253 """
255 @staticmethod
256 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
257 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
258 forceConfigRoot: bool = True, outfile: Optional[str] = None,
259 overwrite: bool = False) -> Config:
260 """Create an empty data repository by adding a butler.yaml config
261 to a repository root directory.
263 Parameters
264 ----------
265 root : `str` or `ButlerURI`
266 Path or URI to the root location of the new repository. Will be
267 created if it does not exist.
268 config : `Config` or `str`, optional
269 Configuration to write to the repository, after setting any
270 root-dependent Registry or Datastore config options. Can not
271 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
272 configuration will be used. Root-dependent config options
273 specified in this config are overwritten if ``forceConfigRoot``
274 is `True`.
275 standalone : `bool`
276 If True, write all expanded defaults, not just customized or
277 repository-specific settings.
278 This (mostly) decouples the repository from the default
279 configuration, insulating it from changes to the defaults (which
280 may be good or bad, depending on the nature of the changes).
281 Future *additions* to the defaults will still be picked up when
282 initializing `Butlers` to repos created with ``standalone=True``.
283 createRegistry : `bool`, optional
284 If `True` create a new Registry.
285 searchPaths : `list` of `str`, optional
286 Directory paths to search when calculating the full butler
287 configuration.
288 forceConfigRoot : `bool`, optional
289 If `False`, any values present in the supplied ``config`` that
290 would normally be reset are not overridden and will appear
291 directly in the output config. This allows non-standard overrides
292 of the root directory for a datastore or registry to be given.
293 If this parameter is `True` the values for ``root`` will be
294 forced into the resulting config if appropriate.
295 outfile : `str`, optional
296 If not-`None`, the output configuration will be written to this
297 location rather than into the repository itself. Can be a URI
298 string. Can refer to a directory that will be used to write
299 ``butler.yaml``.
300 overwrite : `bool`, optional
301 Create a new configuration file even if one already exists
302 in the specified output location. Default is to raise
303 an exception.
305 Returns
306 -------
307 config : `Config`
308 The updated `Config` instance written to the repo.
310 Raises
311 ------
312 ValueError
313 Raised if a ButlerConfig or ConfigSubset is passed instead of a
314 regular Config (as these subclasses would make it impossible to
315 support ``standalone=False``).
316 FileExistsError
317 Raised if the output config file already exists.
318 os.error
319 Raised if the directory does not exist, exists but is not a
320 directory, or cannot be created.
322 Notes
323 -----
324 Note that when ``standalone=False`` (the default), the configuration
325 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
326 construct the repository should also be used to construct any Butlers
327 to avoid configuration inconsistencies.
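Examples
--------
A minimal sketch, assuming the local path below is purely illustrative::
    config = Butler.makeRepo("/path/to/repo")
    butler = Butler("/path/to/repo", writeable=True)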
328 """
329 if isinstance(config, (ButlerConfig, ConfigSubset)):
330 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
332 # for "file" schemes we are assuming POSIX semantics for paths, for
333 # schemeless URIs we are assuming os.path semantics.
334 uri = ButlerURI(root, forceDirectory=True)
335 if uri.scheme == "file" or not uri.scheme:
336 if not os.path.isdir(uri.ospath):
337 safeMakeDir(uri.ospath)
338 elif uri.scheme == "s3":
339 # bucket must already exist
340 if not bucketExists(uri.netloc):
341 raise ValueError(f"Bucket {uri.netloc} does not exist!")
342 s3 = boto3.client("s3")
343 # don't create S3 key when root is at the top-level of a bucket
344 if not uri.path == "/":
345 s3.put_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
346 else:
347 raise ValueError(f"Unrecognized scheme: {uri.scheme}")
348 config = Config(config)
350 # If we are creating a new repo from scratch with relative roots,
351 # do not propagate an explicit root from the config file
352 if "root" in config:
353 del config["root"]
355 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
356 datastoreClass = doImport(full["datastore", "cls"])
357 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
359 # if key exists in given config, parse it, otherwise parse the defaults
360 # in the expanded config
361 if config.get(("registry", "db")):
362 registryConfig = RegistryConfig(config)
363 else:
364 registryConfig = RegistryConfig(full)
365 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
366 if defaultDatabaseUri is not None:
367 Config.updateParameters(RegistryConfig, config, full,
368 toUpdate={"db": defaultDatabaseUri},
369 overwrite=forceConfigRoot)
370 else:
371 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
372 overwrite=forceConfigRoot)
374 if standalone:
375 config.merge(full)
376 if outfile is not None:
377 # When writing to a separate location we must include
378 # the root of the butler repo in the config else it won't know
379 # where to look.
380 config["root"] = uri.geturl()
381 configURI = outfile
382 else:
383 configURI = uri
384 config.dumpToUri(configURI, overwrite=overwrite)
386 # Create Registry and populate tables
387 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
388 return config
390 @classmethod
391 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
392 tags: Tuple[str, ...], writeable: bool) -> Butler:
393 """Callable used to unpickle a Butler.
395 We prefer not to use ``Butler.__init__`` directly so we can force some
396 of its many arguments to be keyword-only (note that ``__reduce__``
397 can only invoke callables with positional arguments).
399 Parameters
400 ----------
401 config : `ButlerConfig`
402 Butler configuration, already coerced into a true `ButlerConfig`
403 instance (and hence after any search paths for overrides have been
404 utilized).
405 collections : `CollectionSearch`
406 Names of collections to read from.
407 run : `str`, optional
408 Name of `~CollectionType.RUN` collection to write to.
409 tags : `tuple` [`str`]
410 Names of `~CollectionType.TAGGED` collections to associate with.
411 writeable : `bool`
412 Whether the Butler should support write operations.
414 Returns
415 -------
416 butler : `Butler`
417 A new `Butler` instance.
418 """
419 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
421 def __reduce__(self):
422 """Support pickling.
423 """
424 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
425 self.registry.isWriteable()))
427 def __str__(self):
428 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
429 self.collections, self.run, self.tags, self.datastore, self.registry)
431 def isWriteable(self) -> bool:
432 """Return `True` if this `Butler` supports write operations.
433 """
434 return self.registry.isWriteable()
436 @contextlib.contextmanager
437 def transaction(self):
438 """Context manager supporting `Butler` transactions.
440 Transactions can be nested.
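Examples
--------
A sketch; the dataset type names and data ID are illustrative. If any
statement in the block raises, both registry and datastore changes are
rolled back::
    with butler.transaction():
        butler.put(catalog, "src", dataId)
        butler.put(exposure, "calexp", dataId)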
441 """
442 with self.registry.transaction():
443 with self.datastore.transaction():
444 yield
446 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
447 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
448 """Standardize the arguments passed to several Butler APIs.
450 Parameters
451 ----------
452 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
453 When a `DatasetRef` is provided, ``dataId`` should be `None`.
454 Otherwise the `DatasetType` or name thereof.
455 dataId : `dict` or `DataCoordinate`
456 A `dict` of `Dimension` link name, value pairs that label the
457 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
458 should be provided as the first argument.
459 kwds
460 Additional keyword arguments used to augment or construct a
461 `DataCoordinate`. See `DataCoordinate.standardize`
462 parameters.
464 Returns
465 -------
466 datasetType : `DatasetType`
467 A `DatasetType` instance extracted from ``datasetRefOrType``.
468 dataId : `dict` or `DataId`, optional
469 Argument that can be used (along with ``kwds``) to construct a
470 `DataId`.
472 Notes
473 -----
474 Butler APIs that conceptually need a DatasetRef also allow passing a
475 `DatasetType` (or the name of one) and a `DataId` (or a dict and
476 keyword arguments that can be used to construct one) separately. This
477 method accepts those arguments and always returns a true `DatasetType`
478 and a `DataId` or `dict`.
480 Standardization of `dict` vs `DataId` is best handled by passing the
481 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
482 generally similarly flexible.
483 """
484 externalDatasetType = None
485 internalDatasetType = None
486 if isinstance(datasetRefOrType, DatasetRef):
487 if dataId is not None or kwds:
488 raise ValueError("DatasetRef given, cannot use dataId as well")
489 externalDatasetType = datasetRefOrType.datasetType
490 dataId = datasetRefOrType.dataId
491 else:
492 # Don't check whether DataId is provided, because Registry APIs
493 # can usually construct a better error message when it wasn't.
494 if isinstance(datasetRefOrType, DatasetType):
495 externalDatasetType = datasetRefOrType
496 else:
497 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
499 # Check that they are self-consistent
500 if externalDatasetType is not None:
501 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
502 if externalDatasetType != internalDatasetType:
503 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
504 f"registry definition ({internalDatasetType})")
506 return internalDatasetType, dataId
508 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
509 dataId: Optional[DataId] = None, *,
510 collections: Any = None,
511 allowUnresolved: bool = False,
512 **kwds: Any) -> DatasetRef:
513 """Shared logic for methods that start with a search for a dataset in
514 the registry.
516 Parameters
517 ----------
518 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
519 When a `DatasetRef` is provided, ``dataId`` should be `None`.
520 Otherwise the `DatasetType` or name thereof.
521 dataId : `dict` or `DataCoordinate`, optional
522 A `dict` of `Dimension` link name, value pairs that label the
523 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
524 should be provided as the first argument.
525 collections : Any, optional
526 Collections to be searched, overriding ``self.collections``.
527 Can be any of the types supported by the ``collections`` argument
528 to butler construction.
529 allowUnresolved : `bool`, optional
530 If `True`, return an unresolved `DatasetRef` if finding a resolved
531 one in the `Registry` fails. Defaults to `False`.
532 kwds
533 Additional keyword arguments used to augment or construct a
534 `DataId`. See `DataId` parameters.
536 Returns
537 -------
538 ref : `DatasetRef`
539 A reference to the dataset identified by the given arguments.
541 Raises
542 ------
543 LookupError
544 Raised if no matching dataset exists in the `Registry` (and
545 ``allowUnresolved is False``).
546 ValueError
547 Raised if a resolved `DatasetRef` was passed as an input, but it
548 differs from the one found in the registry.
549 TypeError
550 Raised if no collections were provided.
551 """
552 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
553 if isinstance(datasetRefOrType, DatasetRef):
554 idNumber = datasetRefOrType.id
555 else:
556 idNumber = None
557 # Expand the data ID first instead of letting registry.findDataset do
558 # it, so we get the result even if it returns None.
559 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
560 if collections is None:
561 collections = self.collections
562 if not collections:
563 raise TypeError("No input collections provided.")
564 else:
565 collections = CollectionSearch.fromExpression(collections)
566 # Always look up the DatasetRef, even if one is given, to ensure it is
567 # present in the current collection.
568 ref = self.registry.findDataset(datasetType, dataId, collections=collections)
569 if ref is None:
570 if allowUnresolved:
571 return DatasetRef(datasetType, dataId)
572 else:
573 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
574 f"could not be found in collections {collections}.")
575 if idNumber is not None and idNumber != ref.id:
576 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
577 f"id ({ref.id}) in registry in collections {collections}.")
578 return ref
580 @transactional
581 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
582 dataId: Optional[DataId] = None, *,
583 producer: Optional[Quantum] = None,
584 run: Optional[str] = None,
585 tags: Optional[Iterable[str]] = None,
586 **kwds: Any) -> DatasetRef:
587 """Store and register a dataset.
589 Parameters
590 ----------
591 obj : `object`
592 The dataset.
593 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
594 When `DatasetRef` is provided, ``dataId`` should be `None`.
595 Otherwise the `DatasetType` or name thereof.
596 dataId : `dict` or `DataCoordinate`
597 A `dict` of `Dimension` link name, value pairs that label the
598 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
599 should be provided as the second argument.
600 producer : `Quantum`, optional
601 The producer.
602 run : `str`, optional
603 The name of the run the dataset should be added to, overriding
604 ``self.run``.
605 tags : `Iterable` [ `str` ], optional
606 The names of `~CollectionType.TAGGED` collections to associate
607 the dataset with, overriding ``self.tags``. These collections
608 must have already been added to the `Registry`.
609 kwds
610 Additional keyword arguments used to augment or construct a
611 `DataCoordinate`. See `DataCoordinate.standardize`
612 parameters.
614 Returns
615 -------
616 ref : `DatasetRef`
617 A reference to the stored dataset, updated with the correct id if
618 given.
620 Raises
621 ------
622 TypeError
623 Raised if the butler is read-only or if no run has been provided.
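Examples
--------
A minimal sketch, assuming a writeable butler; ``catalog``, the ``src``
dataset type, and the data ID keys below are illustrative and the dataset
type must already be registered::
    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(catalog, "src", instrument="HSC", visit=12345,
                     detector=50)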
624 """
625 log.debug("Butler put: %s, dataId=%s, producer=%s, run=%s", datasetRefOrType, dataId, producer, run)
626 if not self.isWriteable():
627 raise TypeError("Butler is read-only.")
628 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
629 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
630 raise ValueError("DatasetRef must not be in registry, must have None id")
632 if run is None:
633 if self.run is None:
634 raise TypeError("No run provided.")
635 run = self.run
636 # No need to check type for run; first thing we do is
637 # insertDatasets, and that will check for us.
639 if tags is None:
640 tags = self.tags
641 else:
642 tags = tuple(tags)
643 for tag in tags:
644 # Check that these are tagged collections up front, because we want
645 # to avoid relying on Datastore transactionality to avoid modifying
646 # the repo if there's an error later.
647 collectionType = self.registry.getCollectionType(tag)
648 if collectionType is not CollectionType.TAGGED:
649 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
650 f"{collectionType.name}.")
652 # Disable all disassembly at the registry level for now
653 isVirtualComposite = False
655 # Add Registry Dataset entry. If not a virtual composite, add
656 # and attach components at the same time.
657 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
658 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId],
659 producer=producer,
660 # Never write components into
661 # registry
662 recursive=False)
664 # Check to see if this datasetType requires disassembly
665 if isVirtualComposite:
666 components = datasetType.storageClass.assembler().disassemble(obj)
667 componentRefs = {}
668 for component, info in components.items():
669 compTypeName = datasetType.componentTypeName(component)
670 compRef = self.put(info.component, compTypeName, dataId, producer=producer, run=run,
671 collection=False) # We don't need to recursively associate.
672 componentRefs[component] = compRef
673 ref = self.registry.attachComponents(ref, componentRefs)
674 else:
675 # This is an entity without a disassembler.
676 self.datastore.put(obj, ref)
678 for tag in tags:
679 self.registry.associate(tag, [ref]) # this is already recursive by default
681 return ref
683 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
684 """Retrieve a stored dataset.
686 Unlike `Butler.get`, this method allows datasets outside the Butler's
687 collection to be read as long as the `DatasetRef` that identifies them
688 can be obtained separately.
690 Parameters
691 ----------
692 ref : `DatasetRef`
693 Reference to an already stored dataset.
694 parameters : `dict`
695 Additional StorageClass-defined options to control reading,
696 typically used to efficiently read only a subset of the dataset.
698 Returns
699 -------
700 obj : `object`
701 The dataset.
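Examples
--------
A sketch; the dataset type, data ID, and collection name are illustrative.
The resolved reference is obtained directly from the registry and may point
outside ``self.collections``::
    ref = butler.registry.findDataset("calexp", dataId,
                                      collections="u/bob/DM-49998")
    calexp = butler.getDirect(ref)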
702 """
703 # if the ref exists in the store we return it directly
704 if self.datastore.exists(ref):
705 return self.datastore.get(ref, parameters=parameters)
706 elif ref.isComposite() and ref.components:
707 # The presence of components indicates that this dataset
708 # was disassembled at the registry level.
709 # Check that we haven't got any unknown parameters
710 ref.datasetType.storageClass.validateParameters(parameters)
711 # Reconstruct the composite
712 usedParams = set()
713 components = {}
714 for compName, compRef in ref.components.items():
715 # make a dictionary of parameters containing only the subset
716 # supported by the StorageClass of the components
717 compParams = compRef.datasetType.storageClass.filterParameters(parameters)
718 usedParams.update(set(compParams))
719 components[compName] = self.datastore.get(compRef, parameters=compParams)
721 # Any unused parameters will have to be passed to the assembler
722 if parameters:
723 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
724 else:
725 unusedParams = {}
727 # Assemble the components
728 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
729 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
730 parameters=unusedParams)
731 else:
732 # single entity in datastore
733 raise FileNotFoundError(f"Unable to locate dataset '{ref}' in datastore {self.datastore.name}")
735 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
736 dataId: Optional[DataId] = None, *,
737 parameters: Union[dict, None] = None,
738 collections: Any = None,
739 **kwds: Any) -> DeferredDatasetHandle:
740 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
742 Parameters
743 ----------
744 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
745 When a `DatasetRef` is provided, ``dataId`` should be `None`.
746 Otherwise the `DatasetType` or name thereof.
747 dataId : `dict` or `DataCoordinate`, optional
748 A `dict` of `Dimension` link name, value pairs that label the
749 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
750 should be provided as the first argument.
751 parameters : `dict`
752 Additional StorageClass-defined options to control reading,
753 typically used to efficiently read only a subset of the dataset.
754 collections : Any, optional
755 Collections to be searched, overriding ``self.collections``.
756 Can be any of the types supported by the ``collections`` argument
757 to butler construction.
758 kwds
759 Additional keyword arguments used to augment or construct a
760 `DataId`. See `DataId` parameters.
762 Returns
763 -------
764 obj : `DeferredDatasetHandle`
765 A handle which can be used to retrieve a dataset at a later time.
767 Raises
768 ------
769 LookupError
770 Raised if no matching dataset exists in the `Registry` (and
771 ``allowUnresolved is False``).
772 ValueError
773 Raised if a resolved `DatasetRef` was passed as an input, but it
774 differs from the one found in the registry.
775 TypeError
776 Raised if no collections were provided.
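Examples
--------
A sketch; the dataset type name and data ID keys are illustrative, and the
final read is assumed to be performed by `DeferredDatasetHandle.get`::
    handle = butler.getDeferred("calexp", instrument="HSC", visit=12345,
                                detector=50)
    # ... later, only if the pixels turn out to be needed ...
    calexp = handle.get()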
777 """
778 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
779 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
781 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
782 dataId: Optional[DataId] = None, *,
783 parameters: Optional[Dict[str, Any]] = None,
784 collections: Any = None,
785 **kwds: Any) -> Any:
786 """Retrieve a stored dataset.
788 Parameters
789 ----------
790 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
791 When a `DatasetRef` is provided, ``dataId`` should be `None`.
792 Otherwise the `DatasetType` or name thereof.
793 dataId : `dict` or `DataCoordinate`
794 A `dict` of `Dimension` link name, value pairs that label the
795 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
796 should be provided as the first argument.
797 parameters : `dict`
798 Additional StorageClass-defined options to control reading,
799 typically used to efficiently read only a subset of the dataset.
800 collections : Any, optional
801 Collections to be searched, overriding ``self.collections``.
802 Can be any of the types supported by the ``collections`` argument
803 to butler construction.
804 kwds
805 Additional keyword arguments used to augment or construct a
806 `DataCoordinate`. See `DataCoordinate.standardize`
807 parameters.
809 Returns
810 -------
811 obj : `object`
812 The dataset.
814 Raises
815 ------
816 ValueError
817 Raised if a resolved `DatasetRef` was passed as an input, but it
818 differs from the one found in the registry.
819 LookupError
820 Raised if no matching dataset exists in the `Registry`.
821 TypeError
822 Raised if no collections were provided.
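Examples
--------
A minimal sketch; the repository path, dataset type, data ID keys, and the
``bbox`` parameter are illustrative (the parameter must be defined by the
dataset's StorageClass)::
    butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
    calexp = butler.get("calexp", instrument="HSC", visit=12345, detector=50)
    cutout = butler.get("calexp", instrument="HSC", visit=12345, detector=50,
                        parameters={"bbox": bbox})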
823 """
824 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
825 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
826 return self.getDirect(ref, parameters=parameters)
828 def getUri(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
829 dataId: Optional[DataId] = None, *,
830 predict: bool = False,
831 collections: Any = None,
832 run: Optional[str] = None,
833 **kwds: Any) -> str:
834 """Return the URI to the Dataset.
836 Parameters
837 ----------
838 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
839 When a `DatasetRef` is provided, ``dataId`` should be `None`.
840 Otherwise the `DatasetType` or name thereof.
841 dataId : `dict` or `DataCoordinate`
842 A `dict` of `Dimension` link name, value pairs that label the
843 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
844 should be provided as the first argument.
845 predict : `bool`
846 If `True`, allow URIs to be returned of datasets that have not
847 been written.
848 collections : Any, optional
849 Collections to be searched, overriding ``self.collections``.
850 Can be any of the types supported by the ``collections`` argument
851 to butler construction.
852 run : `str`, optional
853 Run to use for predictions, overriding ``self.run``.
854 kwds
855 Additional keyword arguments used to augment or construct a
856 `DataCoordinate`. See `DataCoordinate.standardize`
857 parameters.
859 Returns
860 -------
861 uri : `str`
862 URI string pointing to the Dataset within the datastore. If the
863 Dataset does not exist in the datastore, and if ``predict`` is
864 `True`, the URI will be a prediction and will include a URI
865 fragment "#predicted".
866 If the datastore does not have entities that relate well
867 to the concept of a URI the returned URI string will be
868 descriptive. The returned URI is not guaranteed to be obtainable.
870 Raises
871 ------
872 LookupError
873 Raised if a URI has been requested for a dataset that does not
874 exist and guessing is not allowed.
875 ValueError
876 Raised if a resolved `DatasetRef` was passed as an input, but it
877 differs from the one found in the registry.
878 TypeError
879 Raised if no collections were provided.
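Examples
--------
A sketch; the dataset type, data ID keys, and run name are illustrative::
    uri = butler.getUri("calexp", instrument="HSC", visit=12345, detector=50)
    # Predict where a dataset that has not been written yet would end up:
    uri = butler.getUri("calexp", instrument="HSC", visit=12345, detector=50,
                        predict=True, run="u/alice/DM-50000/a")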
880 """
881 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
882 collections=collections, **kwds)
883 if ref.id is None: # only possible if predict is True
884 if run is None:
885 run = self.run
886 if run is None:
887 raise TypeError("Cannot predict location with run=None.")
888 # Lie about ID, because we can't guess it, and only
889 # Datastore.getUri() will ever see it (and it doesn't use it).
890 ref = ref.resolved(id=0, run=run)
891 return self.datastore.getUri(ref, predict)
893 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
894 dataId: Optional[DataId] = None, *,
895 collections: Any = None,
896 **kwds: Any) -> bool:
897 """Return True if the Dataset is actually present in the Datastore.
899 Parameters
900 ----------
901 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
902 When a `DatasetRef` is provided, ``dataId`` should be `None`.
903 Otherwise the `DatasetType` or name thereof.
904 dataId : `dict` or `DataCoordinate`
905 A `dict` of `Dimension` link name, value pairs that label the
906 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
907 should be provided as the first argument.
908 collections : Any, optional
909 Collections to be searched, overriding ``self.collections``.
910 Can be any of the types supported by the ``collections`` argument
911 to butler construction.
912 kwds
913 Additional keyword arguments used to augment or construct a
914 `DataCoordinate`. See `DataCoordinate.standardize`
915 parameters.
917 Raises
918 ------
919 LookupError
920 Raised if the dataset is not even present in the Registry.
921 ValueError
922 Raised if a resolved `DatasetRef` was passed as an input, but it
923 differs from the one found in the registry.
924 TypeError
925 Raised if no collections were provided.
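Examples
--------
A sketch; the dataset type name and data ID keys are illustrative::
    if butler.datasetExists("calexp", instrument="HSC", visit=12345,
                            detector=50):
        calexp = butler.get("calexp", instrument="HSC", visit=12345,
                            detector=50)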
926 """
927 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
928 return self.datastore.exists(ref)
930 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
931 """Remove a collection and possibly prune datasets within it.
933 Parameters
934 ----------
935 name : `str`
936 Name of the collection to remove. If this is a
937 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
938 datasets within the collection are not modified unless ``unstore``
939 is `True`. If this is a `~CollectionType.RUN` collection,
940 ``purge`` and ``unstore`` must be `True`, and all datasets in it
941 are fully removed from the data repository.
942 purge : `bool`, optional
943 If `True`, permit `~CollectionType.RUN` collections to be removed,
944 fully removing datasets within them. Requires ``unstore=True`` as
945 well as an added precaution against accidental deletion. Must be
946 `False` (default) if the collection is not a ``RUN``.
947 unstore : `bool`, optional
948 If `True`, remove all datasets in the collection from all
949 datastores in which they appear.
951 Raises
952 ------
953 TypeError
954 Raised if the butler is read-only or arguments are mutually
955 inconsistent.
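Examples
--------
A sketch; the collection names are illustrative::
    # Fully remove a RUN collection and delete its datasets everywhere:
    butler.pruneCollection("u/alice/DM-50000/a", purge=True, unstore=True)
    # Remove a TAGGED collection without touching the datasets it contains:
    butler.pruneCollection("u/alice/my-selection")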
956 """
957 # See pruneDatasets comments for more information about the logic here;
958 # the cases are almost the same, but here we can rely on Registry to
959 take care of everything but Datastore deletion when we remove the
960 # collection.
961 if not self.isWriteable():
962 raise TypeError("Butler is read-only.")
963 if purge and not unstore:
964 raise TypeError("Cannot pass purge=True without unstore=True.")
965 collectionType = self.registry.getCollectionType(name)
966 if collectionType is CollectionType.RUN and not purge:
967 raise TypeError(f"Cannot prune RUN collection {name} without purge=True.")
968 if collectionType is not CollectionType.RUN and purge:
969 raise TypeError(f"Cannot prune {collectionType.name} collection {name} with purge=True.")
970 with self.registry.transaction():
971 if unstore:
972 for ref in self.registry.queryDatasets(..., collections=name, deduplicate=True):
973 if self.datastore.exists(ref):
974 self.datastore.trash(ref)
975 self.registry.removeCollection(name)
976 if unstore:
977 # Point of no return for removing artifacts
978 self.datastore.emptyTrash()
980 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
981 disassociate: bool = True,
982 unstore: bool = False,
983 tags: Optional[Iterable[str]] = None,
984 purge: bool = False,
985 run: Optional[str] = None,
986 recursive: bool = True):
987 """Remove one or more datasets from a collection and/or storage.
989 Parameters
990 ----------
991 refs : `~collections.abc.Iterable` of `DatasetRef`
992 Datasets to prune. These must be "resolved" references (not just
993 a `DatasetType` and data ID).
994 disassociate : `bool`, optional
995 Disassociate pruned datasets from ``self.tags`` (or the collections
996 given via the ``tags`` argument). Ignored if ``refs`` is ``...``.
997 unstore : `bool`, optional
998 If `True` (`False` is default) remove these datasets from all
999 datastores known to this butler. Note that this will make it
1000 impossible to retrieve these datasets even via other collections.
1001 Datasets that are already not stored are ignored by this option.
1002 tags : `Iterable` [ `str` ], optional
1003 `~CollectionType.TAGGED` collections to disassociate the datasets
1004 from, overriding ``self.tags``. Ignored if ``disassociate`` is
1005 `False` or ``purge`` is `True`.
1006 purge : `bool`, optional
1007 If `True` (`False` is default), completely remove the dataset from
1008 the `Registry`. To prevent accidental deletions, ``purge`` may
1009 only be `True` if all of the following conditions are met:
1011 - All given datasets are in the given run;
1012 - ``disassociate`` is `True`;
1013 - ``unstore`` is `True`.
1015 This mode may remove provenance information from datasets other
1016 than those provided, and should be used with extreme care.
1017 run : `str`, optional
1018 `~CollectionType.RUN` collection to purge from, overriding
1019 ``self.run``. Ignored unless ``purge`` is `True`.
1020 recursive : `bool`, optional
1021 If `True` (default) also prune component datasets of any given
1022 composite datasets. This will only prune components that are
1023 actually attached to the given `DatasetRef` objects, which may
1024 not reflect what is in the database (especially if they were
1025 obtained from `Registry.queryDatasets`, which does not include
1026 components in its results).
1028 Raises
1029 ------
1030 TypeError
1031 Raised if the butler is read-only, if no collection was provided,
1032 or the conditions for ``purge=True`` were not met.
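Examples
--------
A sketch that fully removes every ``calexp`` in a run; the dataset type
and collection names are illustrative::
    refs = butler.registry.queryDatasets("calexp",
                                         collections="u/alice/DM-50000/a")
    butler.pruneDatasets(refs, disassociate=True, unstore=True,
                         purge=True, run="u/alice/DM-50000/a")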
1033 """
1034 if not self.isWriteable():
1035 raise TypeError("Butler is read-only.")
1036 if purge:
1037 if not disassociate:
1038 raise TypeError("Cannot pass purge=True without disassociate=True.")
1039 if not unstore:
1040 raise TypeError("Cannot pass purge=True without unstore=True.")
1041 if run is None:
1042 run = self.run
1043 if run is None:
1044 raise TypeError("No run provided but purge=True.")
1045 collectionType = self.registry.getCollectionType(run)
1046 if collectionType is not CollectionType.RUN:
1047 raise TypeError(f"Cannot purge from collection '{run}' "
1048 f"of non-RUN type {collectionType.name}.")
1049 elif disassociate:
1050 if tags is None:
1051 tags = self.tags
1052 else:
1053 tags = tuple(tags)
1054 if not tags:
1055 raise TypeError("No tags provided but disassociate=True.")
1056 for tag in tags:
1057 collectionType = self.registry.getCollectionType(tag)
1058 if collectionType is not CollectionType.TAGGED:
1059 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1060 f"of non-TAGGED type {collectionType.name}.")
1061 # Pruning a component of a DatasetRef makes no sense since registry
1062 # doesn't always know about components and datastore might not store
1063 # components in a separate file
1064 for ref in refs:
1065 if ref.datasetType.component():
1066 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1068 if recursive:
1069 refs = list(DatasetRef.flatten(refs))
1070 # We don't need an unreliable Datastore transaction for this, because
1071 # we've been extra careful to ensure that Datastore.trash only involves
1072 # mutating the Registry (it can _look_ at Datastore-specific things,
1073 # but shouldn't change them), and hence all operations here are
1074 # Registry operations.
1075 with self.registry.transaction():
1076 if unstore:
1077 for ref in refs:
1078 # There is a difference between a concrete composite
1079 # and virtual composite. In a virtual composite the
1080 # datastore is never given the top level DatasetRef. In
1081 # the concrete composite the datastore knows all the
1082 # refs and will clean up itself if asked to remove the
1083 # parent ref. We can not check configuration for this
1084 # since we can not trust that the configuration is the
1085 # same. We therefore have to ask if the ref exists or
1086 # not. This is consistent with the fact that we want
1087 # to ignore already-removed-from-datastore datasets
1088 # anyway.
1089 if self.datastore.exists(ref):
1090 self.datastore.trash(ref)
1091 if purge:
1092 self.registry.removeDatasets(refs, recursive=False) # refs is already recursively expanded
1093 elif disassociate:
1094 for tag in tags:
1095 # recursive=False here because refs is already recursive
1096 # if we want it to be.
1097 self.registry.disassociate(tag, refs, recursive=False)
1098 # We've exited the Registry transaction, and apparently committed.
1099 # (if there was an exception, everything rolled back, and it's as if
1100 # nothing happened - and we never get here).
1101 # Datastore artifacts are not yet gone, but they're clearly marked
1102 # as trash, so if we fail to delete now because of (e.g.) filesystem
1103 # problems we can try again later, and if manual administrative
1104 # intervention is required, it's pretty clear what that should entail:
1105 # deleting everything on disk and in private Datastore tables that is
1106 # in the dataset_location_trash table.
1107 if unstore:
1108 # Point of no return for removing artifacts
1109 self.datastore.emptyTrash()
1111 @transactional
1112 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None, run: Optional[str] = None,
1113 tags: Optional[Iterable[str]] = None):
1114 """Store and register one or more datasets that already exist on disk.
1116 Parameters
1117 ----------
1118 datasets : `FileDataset`
1119 Each positional argument is a struct containing information about
1120 a file to be ingested, including its path (either absolute or
1121 relative to the datastore root, if applicable), a `DatasetRef`,
1122 and optionally a formatter class or its fully-qualified string
1123 name. If a formatter is not provided, the formatter that would be
1124 used for `put` is assumed. On successful return, all
1125 `FileDataset.ref` attributes will have their `DatasetRef.id`
1126 attribute populated and all `FileDataset.formatter` attributes will
1127 be set to the formatter class used. `FileDataset.path` attributes
1128 may be modified to put paths in whatever the datastore considers a
1129 standardized form.
1130 transfer : `str`, optional
1131 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1132 'relsymlink' or 'symlink', indicating how to transfer the file.
1133 run : `str`, optional
1134 The name of the run ingested datasets should be added to,
1135 overriding ``self.run``.
1136 tags : `Iterable` [ `str` ], optional
1137 The names of `~CollectionType.TAGGED` collections to associate
1138 the dataset with, overriding ``self.tags``. These collections
1139 must have already been added to the `Registry`.
1141 Raises
1142 ------
1143 TypeError
1144 Raised if the butler is read-only or if no run was provided.
1145 NotImplementedError
1146 Raised if the `Datastore` does not support the given transfer mode.
1147 DatasetTypeNotSupportedError
1148 Raised if one or more files to be ingested have a dataset type that
1149 is not supported by the `Datastore`.
1150 FileNotFoundError
1151 Raised if one of the given files does not exist.
1152 FileExistsError
1153 Raised if transfer is not `None` but the (internal) location the
1154 file would be moved to is already occupied.
1156 Notes
1157 -----
1158 This operation is not fully exception safe: if a database operation
1159 fails, the given `FileDataset` instances may be only partially updated.
1161 It is atomic in terms of database operations (they will either all
1162 succeed or all fail) providing the database engine implements
1163 transactions correctly. It will attempt to be atomic in terms of
1164 filesystem operations as well, but this cannot be implemented
1165 rigorously for most datastores.
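Examples
--------
A minimal sketch; the file path, dataset type, data ID, run name, and
transfer mode are illustrative, and the dataset type must already be
registered::
    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 12345,
                                   "detector": 50})
    butler.ingest(FileDataset(path="/path/to/file.fits", refs=[ref]),
                  transfer="symlink", run="raw/hsc")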
1166 """
1167 if not self.isWriteable():
1168 raise TypeError("Butler is read-only.")
1169 if run is None:
1170 if self.run is None:
1171 raise TypeError("No run provided.")
1172 run = self.run
1173 # No need to check run type, since insertDatasets will do that
1174 # (safely) for us.
1175 if tags is None:
1176 tags = self.tags
1177 else:
1178 tags = tuple(tags)
1179 for tag in tags:
1180 # Check that these are tagged collections up front, because we want
1181 # to avoid relying on Datastore transactionality to avoid modifying
1182 # the repo if there's an error later.
1183 collectionType = self.registry.getCollectionType(tag)
1184 if collectionType is not CollectionType.TAGGED:
1185 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1186 f"{collectionType.name}.")
1187 # Reorganize the inputs so they're grouped by DatasetType and then
1188 # data ID. We also include a list of DatasetRefs for each FileDataset
1189 # to hold the resolved DatasetRefs returned by the Registry, before
1190 # it's safe to swap them into FileDataset.refs.
1191 # Some type annotation aliases to make that clearer:
1192 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1193 GroupedData = MutableMapping[DatasetType, GroupForType]
1194 # The actual data structure:
1195 groupedData: GroupedData = defaultdict(dict)
1196 # And the nested loop that populates it:
1197 for dataset in datasets:
1198 # This list is intentionally shared across the inner loop, since it's
1199 # associated with `dataset`.
1200 resolvedRefs = []
1201 for ref in dataset.refs:
1202 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1204 # Now we can bulk-insert into Registry for each DatasetType.
1205 allResolvedRefs = []
1206 for datasetType, groupForType in groupedData.items():
1207 refs = self.registry.insertDatasets(datasetType,
1208 dataIds=groupForType.keys(),
1209 run=run,
1210 recursive=True)
1211 # Append those resolved DatasetRefs to the new lists we set up for
1212 # them.
1213 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1214 resolvedRefs.append(ref)
1216 # Go back to the original FileDatasets to replace their refs with the
1217 # new resolved ones, and also build a big list of all refs.
1218 allResolvedRefs = []
1219 for groupForType in groupedData.values():
1220 for dataset, resolvedRefs in groupForType.values():
1221 dataset.refs = resolvedRefs
1222 allResolvedRefs.extend(resolvedRefs)
1224 # Bulk-associate everything with any tagged collections.
1225 for tag in tags:
1226 self.registry.associate(tag, allResolvedRefs)
1228 # Bulk-insert everything into Datastore.
1229 self.datastore.ingest(*datasets, transfer=transfer)
1231 @contextlib.contextmanager
1232 def export(self, *, directory: Optional[str] = None,
1233 filename: Optional[str] = None,
1234 format: Optional[str] = None,
1235 transfer: Optional[str] = None) -> ContextManager[RepoExport]:
1236 """Export datasets from the repository represented by this `Butler`.
1238 This method is a context manager that returns a helper object
1239 (`RepoExport`) that is used to indicate what information from the
1240 repository should be exported.
1242 Parameters
1243 ----------
1244 directory : `str`, optional
1245 Directory dataset files should be written to if ``transfer`` is not
1246 `None`.
1247 filename : `str`, optional
1248 Name for the file that will include database information associated
1249 with the exported datasets. If this is not an absolute path and
1250 ``directory`` is not `None`, it will be written to ``directory``
1251 instead of the current working directory. Defaults to
1252 "export.{format}".
1253 format : `str`, optional
1254 File format for the database information file. If `None`, the
1255 extension of ``filename`` will be used.
1256 transfer : `str`, optional
1257 Transfer mode passed to `Datastore.export`.
1259 Raises
1260 ------
1261 TypeError
1262 Raised if the set of arguments passed is inconsistent.
1264 Examples
1265 --------
1266 Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
1267 methods are used to provide the iterables over data IDs and/or datasets
1268 to be exported::
1270 with butler.export(filename="exports.yaml") as export:
1271 # Export all flats, and the calibration_label dimensions
1272 # associated with them.
1273 export.saveDatasets(butler.registry.queryDatasets("flat"),
1274 elements=[butler.registry.dimensions["calibration_label"]])
1275 # Export all datasets that start with "deepCoadd_" and all of
1276 # their associated data ID information.
1277 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1278 """
1279 if directory is None and transfer is not None:
1280 raise TypeError("Cannot transfer without providing a directory.")
1281 if transfer == "move":
1282 raise TypeError("Transfer may not be 'move': export is read-only")
1283 if format is None:
1284 if filename is None:
1285 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1286 else:
1287 _, format = os.path.splitext(filename)
1288 elif filename is None:
1289 filename = f"export.{format}"
1290 if directory is not None:
1291 filename = os.path.join(directory, filename)
1292 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1293 with open(filename, 'w') as stream:
1294 backend = BackendClass(stream)
1295 try:
1296 helper = RepoExport(self.registry, self.datastore, backend=backend,
1297 directory=directory, transfer=transfer)
1298 yield helper
1299 except BaseException:
1300 raise
1301 else:
1302 helper._finish()
1304 def import_(self, *, directory: Optional[str] = None,
1305 filename: Optional[str] = None,
1306 format: Optional[str] = None,
1307 transfer: Optional[str] = None):
1308 """Import datasets exported from a different butler repository.
1310 Parameters
1311 ----------
1312 directory : `str`, optional
1313 Directory containing dataset files. If `None`, all file paths
1314 must be absolute.
1315 filename : `str`, optional
1316 Name for the file containing database information associated
1317 with the exported datasets. If this is not an absolute path, does
1318 not exist in the current working directory, and ``directory`` is
1319 not `None`, it is assumed to be in ``directory``. Defaults to
1320 "export.{format}".
1321 format : `str`, optional
1322 File format for the database information file. If `None`, the
1323 extension of ``filename`` will be used.
1324 transfer : `str`, optional
1325 Transfer mode passed to `Datastore.export`.
1327 Raises
1328 ------
1329 TypeError
1330 Raised if the set of arguments passed is inconsistent, or if the
1331 butler is read-only.
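Examples
--------
A sketch that loads a previously created export; the paths and transfer
mode are illustrative::
    butler.import_(directory="/path/to/export-dir", filename="exports.yaml",
                   transfer="copy")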
1332 """
1333 if not self.isWriteable():
1334 raise TypeError("Butler is read-only.")
1335 if format is None:
1336 if filename is None:
1337 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1338 else:
1339 _, format = os.path.splitext(filename)
1340 elif filename is None:
1341 filename = f"export.{format}"
1342 if directory is not None and not os.path.exists(filename):
1343 filename = os.path.join(directory, filename)
1344 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1345 with open(filename, 'r') as stream:
1346 backend = BackendClass(stream, self.registry)
1347 backend.register()
1348 with self.transaction():
1349 backend.load(self.datastore, directory=directory, transfer=transfer)
1351 def validateConfiguration(self, logFailures: bool = False,
1352 datasetTypeNames: Optional[Iterable[str]] = None,
1353 ignore: Optional[Iterable[str]] = None):
1354 """Validate butler configuration.
1356 Checks that each `DatasetType` can be stored in the `Datastore`.
1358 Parameters
1359 ----------
1360 logFailures : `bool`, optional
1361 If `True`, output a log message for every validation error
1362 detected.
1363 datasetTypeNames : iterable of `str`, optional
1364 The `DatasetType` names that should be checked. This allows
1365 only a subset to be selected.
1366 ignore : iterable of `str`, optional
1367 Names of DatasetTypes to skip over. This can be used to skip
1368 known problems. If a named `DatasetType` corresponds to a
1369 composite, all components of that `DatasetType` will also be
1370 ignored.
1372 Raises
1373 ------
1374 ButlerValidationError
1375 Raised if there is some inconsistency with how this Butler
1376 is configured.
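Examples
--------
A sketch; the dataset type names are illustrative::
    butler.validateConfiguration(logFailures=True,
                                 datasetTypeNames=["calexp", "src"],
                                 ignore=["raw"])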
1377 """
1378 if datasetTypeNames:
1379 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1380 else:
1381 entities = list(self.registry.queryDatasetTypes())
1383 # filter out anything from the ignore list
1384 if ignore:
1385 ignore = set(ignore)
1386 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1387 else:
1388 ignore = set()
1390 # Find all the registered instruments
1391 instruments = set(
1392 dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
1393 )
1395 # For each datasetType that has an instrument dimension, create
1396 # a DatasetRef for each defined instrument
1397 datasetRefs = []
1399 for datasetType in entities:
1400 if "instrument" in datasetType.dimensions:
1401 for instrument in instruments:
1402 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1403 datasetRefs.append(datasetRef)
1405 entities.extend(datasetRefs)
1407 datastoreErrorStr = None
1408 try:
1409 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1410 except ValidationError as e:
1411 datastoreErrorStr = str(e)
1413 # Also check that the LookupKeys used by the datastores match
1414 # registry and storage class definitions
1415 keys = self.datastore.getLookupKeys()
1417 failedNames = set()
1418 failedDataId = set()
1419 for key in keys:
1420 datasetType = None
1421 if key.name is not None:
1422 if key.name in ignore:
1423 continue
1425 # skip if specific datasetType names were requested and this
1426 # name does not match
1427 if datasetTypeNames and key.name not in datasetTypeNames:
1428 continue
1430 # See if it is a StorageClass or a DatasetType
1431 if key.name in self.storageClasses:
1432 pass
1433 else:
1434 try:
1435 self.registry.getDatasetType(key.name)
1436 except KeyError:
1437 if logFailures:
1438 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1439 failedNames.add(key)
1440 else:
1441 # Dimensions are checked for consistency when the Butler
1442 # is created and rendezvoused with a universe.
1443 pass
1445 # Check that the instrument is a valid instrument
1446 # Currently only support instrument so check for that
1447 if key.dataId:
1448 dataIdKeys = set(key.dataId)
1449 if set(["instrument"]) != dataIdKeys:
1450 if logFailures:
1451 log.fatal("Key '%s' has unsupported DataId override", key)
1452 failedDataId.add(key)
1453 elif key.dataId["instrument"] not in instruments:
1454 if logFailures:
1455 log.fatal("Key '%s' has unknown instrument", key)
1456 failedDataId.add(key)
1458 messages = []
1460 if datastoreErrorStr:
1461 messages.append(datastoreErrorStr)
1463 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1464 (failedDataId, "Keys with bad DataId entries: ")):
1465 if failed:
1466 msg += ", ".join(str(k) for k in failed)
1467 messages.append(msg)
1469 if messages:
1470 raise ValidationError(";\n".join(messages))
1472 registry: Registry
1473 """The object that manages dataset metadata and relationships (`Registry`).
1475 Most operations that don't involve reading or writing butler datasets are
1476 accessible only via `Registry` methods.
1477 """
1479 datastore: Datastore
1480 """The object that manages actual dataset storage (`Datastore`).
1482 Direct user access to the datastore should rarely be necessary; the primary
1483 exception is the case where a `Datastore` implementation provides extra
1484 functionality beyond what the base class defines.
1485 """
1487 storageClasses: StorageClassFactory
1488 """An object that maps known storage class names to objects that fully
1489 describe them (`StorageClassFactory`).
1490 """
1492 collections: Optional[CollectionSearch]
1493 """The collections to search and any restrictions on the dataset types to
1494 search for within them, in order (`CollectionSearch`).
1495 """
1497 run: Optional[str]
1498 """Name of the run this butler writes outputs to (`str` or `None`).
1499 """
1501 tags: Tuple[str, ...]
1502 """Names of `~CollectionType.TAGGED` collections this butler associates
1503 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1504 (`tuple` [ `str` ]).
1505 """