Coverage for python/lsst/daf/butler/_butler.py : 8%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 Tuple,
44 Union,
45)
47try:
48 import boto3
49except ImportError:
50 boto3 = None
52from lsst.utils import doImport
53from .core import (
54 ButlerURI,
55 CompositesMap,
56 Config,
57 ConfigSubset,
58 DataCoordinate,
59 DataId,
60 DatasetRef,
61 DatasetType,
62 Datastore,
63 FileDataset,
64 Quantum,
65 RepoExport,
66 StorageClassFactory,
67 ValidationError,
68)
69from .core.repoRelocation import BUTLER_ROOT_TAG
70from .core.safeFileIo import safeMakeDir
71from .core.utils import transactional, getClassOf
72from ._deferredDatasetHandle import DeferredDatasetHandle
73from ._butlerConfig import ButlerConfig
74from .registry import Registry, RegistryConfig, CollectionType
75from .registry.wildcards import CollectionSearch
77log = logging.getLogger(__name__)
80class ButlerValidationError(ValidationError):
81 """There is a problem with the Butler configuration."""
82 pass
85class Butler:
86 """Main entry point for the data access system.
88 Parameters
89 ----------
90 config : `ButlerConfig`, `Config` or `str`, optional.
91 Configuration. Anything acceptable to the
92 `ButlerConfig` constructor. If a directory path
93 is given the configuration will be read from a ``butler.yaml`` file in
94 that location. If `None` is given default values will be used.
95 butler : `Butler`, optional.
96 If provided, construct a new Butler that uses the same registry and
97 datastore as the given one, but with the given collection and run.
98 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
99 arguments.
100 collections : `Any`, optional
101 An expression specifying the collections to be searched (in order) when
102 reading datasets, and optionally dataset type restrictions on them.
103 This may be:
104 - a `str` collection name;
105 - a tuple of (collection name, *dataset type restriction*);
106 - an iterable of either of the above;
107 - a mapping from `str` to *dataset type restriction*.
109 See :ref:`daf_butler_collection_expressions` for more information,
110 including the definition of a *dataset type restriction*. All
111 collections must either already exist or be specified to be created
112 by other arguments.
113 run : `str`, optional
114 Name of the run datasets should be output to. If the run
115 does not exist, it will be created. If ``collections`` is `None`, it
116 will be set to ``[run]``. If this is not set (and ``writeable`` is
117 not set either), a read-only butler will be created.
118 tags : `Iterable` [ `str` ], optional
119 A list of `~CollectionType.TAGGED` collections that datasets should be
120 associated with in `put` or `ingest` and disassociated from in
121 `pruneDatasets`. If any of these collections does not exist, it will
122 be created.
123 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
124 A mapping from the names of new `~CollectionType.CHAINED` collections
125 to an expression identifying their child collections (which takes the
126 same form as the ``collections`` argument). Chains may be nested only
127 if children precede their parents in this mapping.
128 searchPaths : `list` of `str`, optional
129 Directory paths to search when calculating the full Butler
130 configuration. Not used if the supplied config is already a
131 `ButlerConfig`.
132 writeable : `bool`, optional
133 Explicitly sets whether the butler supports write operations. If not
134 provided, a read-write butler is created if any of ``run``, ``tags``,
135 or ``chains`` is non-empty.
137 Examples
138 --------
139 While there are many ways to control exactly how a `Butler` interacts with
140 the collections in its `Registry`, the most common cases are still simple.
142 For a read-only `Butler` that searches one collection, do::
144 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
146 For a read-write `Butler` that writes to and reads from a
147 `~CollectionType.RUN` collection::
149 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
151 The `Butler` passed to a ``PipelineTask`` is often much more complex,
152 because we want to write to one `~CollectionType.RUN` collection but read
153 from several others (as well), while defining a new
154 `~CollectionType.CHAINED` collection that combines them all::
156 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
157 collections=["u/alice/DM-50000"],
158 chains={
159 "u/alice/DM-50000": ["u/alice/DM-50000/a",
160 "u/bob/DM-49998",
161 "raw/hsc"]
162 })
164 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
165 they'll also be available from the chained collection ``u/alice/DM-50000``.
166 Datasets will be read first from that run (since it appears first in the
167 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
168 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
169 would be unnecessary. We could also construct a butler that performs
170 exactly the same `put` and `get` operations without actually creating a
171 chained collection, just by passing multiple items in ``collections``::
173 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
174 collections=["u/alice/DM-50000/a",
175 "u/bob/DM-49998",
176 "raw/hsc"])
178 Finally, one can always create a `Butler` with no collections::
180 butler = Butler("/path/to/repo", writeable=True)
182 This can be extremely useful when you just want to use ``butler.registry``,
183 e.g. for inserting dimension data or managing collections, or when the
184 collections you want to use with the butler are not consistent.
185 Passing ``writeable`` explicitly here is only necessary if you want to be
186 able to make changes to the repo; usually the value for ``writeable``
187 can be guessed from the collection arguments provided, but it defaults to
188 `False` when there are no collection arguments.
189 """
190 def __init__(self, config: Union[Config, str, None] = None, *,
191 butler: Optional[Butler] = None,
192 collections: Any = None,
193 run: Optional[str] = None,
194 tags: Iterable[str] = (),
195 chains: Optional[Mapping[str, Any]] = None,
196 searchPaths: Optional[List[str]] = None,
197 writeable: Optional[bool] = None):
198 # Transform any single-pass iterator into an actual sequence so we
199 # can see if it's empty.
200 self.tags = tuple(tags)
201 # Load registry, datastore, etc. from config or existing butler.
202 if butler is not None:
203 if config is not None or searchPaths is not None or writeable is not None:
204 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
205 "arguments with 'butler' argument.")
206 self.registry = butler.registry
207 self.datastore = butler.datastore
208 self.storageClasses = butler.storageClasses
209 self._composites = butler._composites
210 self._config = butler._config
211 else:
212 self._config = ButlerConfig(config, searchPaths=searchPaths)
213 if "root" in self._config:
214 butlerRoot = self._config["root"]
215 else:
216 butlerRoot = self._config.configDir
217 if writeable is None:
218 writeable = run is not None or chains is not None or self.tags
219 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
220 self.datastore = Datastore.fromConfig(self._config, self.registry, butlerRoot=butlerRoot)
221 self.storageClasses = StorageClassFactory()
222 self.storageClasses.addFromConfig(self._config)
223 self._composites = CompositesMap(self._config, universe=self.registry.dimensions)
224 # Check the many collection arguments for consistency and create any
225 # needed collections that don't exist.
226 if collections is None:
227 if run is not None:
228 collections = (run,)
229 else:
230 collections = ()
231 self.collections = CollectionSearch.fromExpression(collections)
232 if chains is None:
233 chains = {}
234 self.run = run
235 if "run" in self._config or "collection" in self._config:
236 raise ValueError("Passing a run or collection via configuration is no longer supported.")
237 if self.run is not None:
238 self.registry.registerCollection(self.run, type=CollectionType.RUN)
239 for tag in self.tags:
240 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
241 for parent, children in chains.items():
242 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
243 self.registry.setCollectionChain(parent, children)
245 GENERATION: ClassVar[int] = 3
246 """This is a Generation 3 Butler.
248 This attribute may be removed in the future, once the Generation 2 Butler
249 interface has been fully retired; it should only be used in transitional
250 code.
251 """
253 @staticmethod
254 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
255 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
256 forceConfigRoot: bool = True, outfile: Optional[str] = None,
257 overwrite: bool = False) -> Config:
258 """Create an empty data repository by adding a butler.yaml config
259 to a repository root directory.
261 Parameters
262 ----------
263 root : `str`
264 Filesystem path to the root of the new repository. Will be created
265 if it does not exist.
266 config : `Config` or `str`, optional
267 Configuration to write to the repository, after setting any
268 root-dependent Registry or Datastore config options. Can not
269 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
270 configuration will be used. Root-dependent config options
271 specified in this config are overwritten if ``forceConfigRoot``
272 is `True`.
273 standalone : `bool`
274 If True, write all expanded defaults, not just customized or
275 repository-specific settings.
276 This (mostly) decouples the repository from the default
277 configuration, insulating it from changes to the defaults (which
278 may be good or bad, depending on the nature of the changes).
279 Future *additions* to the defaults will still be picked up when
280 initializing `Butlers` to repos created with ``standalone=True``.
281 createRegistry : `bool`, optional
282 If `True` create a new Registry.
283 searchPaths : `list` of `str`, optional
284 Directory paths to search when calculating the full butler
285 configuration.
286 forceConfigRoot : `bool`, optional
287 If `False`, any values present in the supplied ``config`` that
288 would normally be reset are not overridden and will appear
289 directly in the output config. This allows non-standard overrides
290 of the root directory for a datastore or registry to be given.
291 If this parameter is `True` the values for ``root`` will be
292 forced into the resulting config if appropriate.
293 outfile : `str`, optional
294 If not-`None`, the output configuration will be written to this
295 location rather than into the repository itself. Can be a URI
296 string. Can refer to a directory that will be used to write
297 ``butler.yaml``.
298 overwrite : `bool`, optional
299 Create a new configuration file even if one already exists
300 in the specified output location. Default is to raise
301 an exception.
303 Returns
304 -------
305 config : `Config`
306 The updated `Config` instance written to the repo.
308 Raises
309 ------
310 ValueError
311 Raised if a ButlerConfig or ConfigSubset is passed instead of a
312 regular Config (as these subclasses would make it impossible to
313 support ``standalone=False``).
314 FileExistsError
315 Raised if the output config file already exists.
316 os.error
317 Raised if the directory does not exist, exists but is not a
318 directory, or cannot be created.
320 Notes
321 -----
322 Note that when ``standalone=False`` (the default), the configuration
323 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
324 construct the repository should also be used to construct any Butlers
325 to avoid configuration inconsistencies.
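Examples
--------
A minimal sketch of creating a repository and then constructing a
`Butler` against it; the path is illustrative::
    Butler.makeRepo("/path/to/newrepo")
    butler = Butler("/path/to/newrepo", writeable=True)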
326 """
327 if isinstance(config, (ButlerConfig, ConfigSubset)):
328 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
330 # for "file" schemes we are assuming POSIX semantics for paths, for
331 # schemeless URIs we are assuming os.path semantics.
332 uri = ButlerURI(root)
333 if uri.scheme == "file" or not uri.scheme:
334 if not os.path.isdir(uri.ospath):
335 safeMakeDir(uri.ospath)
336 elif uri.scheme == "s3":
337 s3 = boto3.resource("s3")
338 # this assumes the bucket exists; if not, another level of checks is needed
339 bucket = s3.Bucket(uri.netloc)
340 bucket.put_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
341 else:
342 raise ValueError(f"Unrecognized scheme: {uri.scheme}")
343 config = Config(config)
345 # If we are creating a new repo from scratch with relative roots,
346 # do not propagate an explicit root from the config file
347 if "root" in config:
348 del config["root"]
350 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
351 datastoreClass = doImport(full["datastore", "cls"])
352 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
354 # if key exists in given config, parse it, otherwise parse the defaults
355 # in the expanded config
356 if config.get(("registry", "db")):
357 registryConfig = RegistryConfig(config)
358 else:
359 registryConfig = RegistryConfig(full)
360 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
361 if defaultDatabaseUri is not None:
362 Config.updateParameters(RegistryConfig, config, full,
363 toUpdate={"db": defaultDatabaseUri},
364 overwrite=forceConfigRoot)
365 else:
366 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
367 overwrite=forceConfigRoot)
369 if standalone:
370 config.merge(full)
371 if outfile is not None:
372 # When writing to a separate location we must include
373 # the root of the butler repo in the config else it won't know
374 # where to look.
375 config["root"] = uri.geturl()
376 configURI = outfile
377 else:
378 configURI = uri
379 config.dumpToUri(configURI, overwrite=overwrite)
381 # Create Registry and populate tables
382 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
383 return config
385 @classmethod
386 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
387 tags: Tuple[str, ...], writeable: bool) -> Butler:
388 """Callable used to unpickle a Butler.
390 We prefer not to use ``Butler.__init__`` directly so we can force some
391 of its many arguments to be keyword-only (note that ``__reduce__``
392 can only invoke callables with positional arguments).
394 Parameters
395 ----------
396 config : `ButlerConfig`
397 Butler configuration, already coerced into a true `ButlerConfig`
398 instance (and hence after any search paths for overrides have been
399 utilized).
400 collections : `CollectionSearch`
401 Names of collections to read from.
402 run : `str`, optional
403 Name of `~CollectionType.RUN` collection to write to.
404 tags : `tuple` [`str`]
405 Names of `~CollectionType.TAGGED` collections to associate with.
406 writeable : `bool`
407 Whether the Butler should support write operations.
409 Returns
410 -------
411 butler : `Butler`
412 A new `Butler` instance.
413 """
414 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
416 def __reduce__(self):
417 """Support pickling.
418 """
419 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
420 self.registry.isWriteable()))
422 def __str__(self):
423 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
424 self.collections, self.run, self.tags, self.datastore, self.registry)
426 def isWriteable(self) -> bool:
427 """Return `True` if this `Butler` supports write operations.
428 """
429 return self.registry.isWriteable()
431 @contextlib.contextmanager
432 def transaction(self):
433 """Context manager supporting `Butler` transactions.
435 Transactions can be nested.
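Examples
--------
A sketch of grouping several writes so that they are committed or rolled
back together; the dataset type and data IDs are illustrative::
    with butler.transaction():
        butler.put(catalog1, "src", dataId1)
        butler.put(catalog2, "src", dataId2)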
436 """
437 with self.registry.transaction():
438 with self.datastore.transaction():
439 yield
441 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
442 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
443 """Standardize the arguments passed to several Butler APIs.
445 Parameters
446 ----------
447 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
448 When `DatasetRef` the `dataId` should be `None`.
449 Otherwise the `DatasetType` or name thereof.
450 dataId : `dict` or `DataCoordinate`
451 A `dict` of `Dimension` link name, value pairs that label the
452 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
453 should be provided as the first argument.
454 kwds
455 Additional keyword arguments used to augment or construct a
456 `DataCoordinate`. See `DataCoordinate.standardize`
457 parameters.
459 Returns
460 -------
461 datasetType : `DatasetType`
462 A `DatasetType` instance extracted from ``datasetRefOrType``.
463 dataId : `dict` or `DataId`, optional
464 Argument that can be used (along with ``kwds``) to construct a
465 `DataId`.
467 Notes
468 -----
469 Butler APIs that conceptually need a DatasetRef also allow passing a
470 `DatasetType` (or the name of one) and a `DataId` (or a dict and
471 keyword arguments that can be used to construct one) separately. This
472 method accepts those arguments and always returns a true `DatasetType`
473 and a `DataId` or `dict`.
475 Standardization of `dict` vs `DataId` is best handled by passing the
476 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
477 generally similarly flexible.
478 """
479 externalDatasetType = None
480 internalDatasetType = None
481 if isinstance(datasetRefOrType, DatasetRef):
482 if dataId is not None or kwds:
483 raise ValueError("DatasetRef given, cannot use dataId as well")
484 externalDatasetType = datasetRefOrType.datasetType
485 dataId = datasetRefOrType.dataId
486 else:
487 # Don't check whether DataId is provided, because Registry APIs
488 # can usually construct a better error message when it wasn't.
489 if isinstance(datasetRefOrType, DatasetType):
490 externalDatasetType = datasetRefOrType
491 else:
492 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
494 # Check that they are self-consistent
495 if externalDatasetType is not None:
496 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
497 if externalDatasetType != internalDatasetType:
498 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
499 f"registry definition ({internalDatasetType})")
501 return internalDatasetType, dataId
503 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
504 dataId: Optional[DataId] = None, *,
505 collections: Any = None,
506 allowUnresolved: bool = False,
507 **kwds: Any) -> DatasetRef:
508 """Shared logic for methods that start with a search for a dataset in
509 the registry.
511 Parameters
512 ----------
513 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
514 When `DatasetRef` the `dataId` should be `None`.
515 Otherwise the `DatasetType` or name thereof.
516 dataId : `dict` or `DataCoordinate`, optional
517 A `dict` of `Dimension` link name, value pairs that label the
518 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
519 should be provided as the first argument.
520 collections : Any, optional
521 Collections to be searched, overriding ``self.collections``.
522 Can be any of the types supported by the ``collections`` argument
523 to butler construction.
524 allowUnresolved : `bool`, optional
525 If `True`, return an unresolved `DatasetRef` if finding a resolved
526 one in the `Registry` fails. Defaults to `False`.
527 kwds
528 Additional keyword arguments used to augment or construct a
529 `DataId`. See `DataId` parameters.
531 Returns
532 -------
533 ref : `DatasetRef`
534 A reference to the dataset identified by the given arguments.
536 Raises
537 ------
538 LookupError
539 Raised if no matching dataset exists in the `Registry` (and
540 ``allowUnresolved is False``).
541 ValueError
542 Raised if a resolved `DatasetRef` was passed as an input, but it
543 differs from the one found in the registry.
544 TypeError
545 Raised if no collections were provided.
546 """
547 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
548 if isinstance(datasetRefOrType, DatasetRef):
549 idNumber = datasetRefOrType.id
550 else:
551 idNumber = None
552 # Expand the data ID first instead of letting registry.findDataset do
553 # it, so we get the result even if it returns None.
554 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
555 if collections is None:
556 collections = self.collections
557 if not collections:
558 raise TypeError("No input collections provided.")
559 else:
560 collections = CollectionSearch.fromExpression(collections)
561 # Always lookup the DatasetRef, even if one is given, to ensure it is
562 # present in the current collection.
563 ref = self.registry.findDataset(datasetType, dataId, collections=collections)
564 if ref is None:
565 if allowUnresolved:
566 return DatasetRef(datasetType, dataId)
567 else:
568 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
569 f"could not be found in collections {collections}.")
570 if idNumber is not None and idNumber != ref.id:
571 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
572 f"id ({ref.id}) in registry in collections {collections}.")
573 return ref
575 @transactional
576 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
577 dataId: Optional[DataId] = None, *,
578 producer: Optional[Quantum] = None,
579 run: Optional[str] = None,
580 tags: Optional[Iterable[str]] = None,
581 **kwds: Any) -> DatasetRef:
582 """Store and register a dataset.
584 Parameters
585 ----------
586 obj : `object`
587 The dataset.
588 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
589 When `DatasetRef` is provided, ``dataId`` should be `None`.
590 Otherwise the `DatasetType` or name thereof.
591 dataId : `dict` or `DataCoordinate`
592 A `dict` of `Dimension` link name, value pairs that label the
593 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
594 should be provided as the second argument.
595 producer : `Quantum`, optional
596 The producer.
597 run : `str`, optional
598 The name of the run the dataset should be added to, overriding
599 ``self.run``.
600 tags : `Iterable` [ `str` ], optional
601 The names of `~CollectionType.TAGGED` collections to associate
602 the dataset with, overriding ``self.tags``. These collections
603 must have already been added to the `Registry`.
604 kwds
605 Additional keyword arguments used to augment or construct a
606 `DataCoordinate`. See `DataCoordinate.standardize`
607 parameters.
609 Returns
610 -------
611 ref : `DatasetRef`
612 A reference to the stored dataset, updated with the correct id if
613 given.
615 Raises
616 ------
617 TypeError
618 Raised if the butler is read-only or if no run has been provided.
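Examples
--------
A minimal sketch; the dataset type name, data ID keys, and run name are
illustrative and assume matching registry entries exist::
    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(catalog, "src", instrument="HSC", visit=903334,
                     detector=16)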
619 """
620 log.debug("Butler put: %s, dataId=%s, producer=%s, run=%s", datasetRefOrType, dataId, producer, run)
621 if not self.isWriteable():
622 raise TypeError("Butler is read-only.")
623 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
624 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
625 raise ValueError("DatasetRef must not be in registry, must have None id")
627 if run is None:
628 if self.run is None:
629 raise TypeError("No run provided.")
630 run = self.run
631 # No need to check type for run; first thing we do is
632 # insertDatasets, and that will check for us.
634 if tags is None:
635 tags = self.tags
636 else:
637 tags = tuple(tags)
638 for tag in tags:
639 # Check that these are tagged collections up front, because we want
640 # to avoid relying on Datastore transactionality to avoid modifying
641 # the repo if there's an error later.
642 collectionType = self.registry.getCollectionType(tag)
643 if collectionType is not CollectionType.TAGGED:
644 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
645 f"{collectionType.name}.")
647 isVirtualComposite = self._composites.shouldBeDisassembled(datasetType)
649 # Add Registry Dataset entry. If not a virtual composite, add
650 # and attach components at the same time.
651 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
652 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId],
653 producer=producer, recursive=not isVirtualComposite)
655 # Check to see if this datasetType requires disassembly
656 if isVirtualComposite:
657 components = datasetType.storageClass.assembler().disassemble(obj)
658 componentRefs = {}
659 for component, info in components.items():
660 compTypeName = datasetType.componentTypeName(component)
661 compRef = self.put(info.component, compTypeName, dataId, producer=producer, run=run,
662 tags=()) # We don't need to recursively associate.
663 componentRefs[component] = compRef
664 ref = self.registry.attachComponents(ref, componentRefs)
665 else:
666 # This is an entity without a disassembler.
667 self.datastore.put(obj, ref)
669 for tag in tags:
670 self.registry.associate(tag, [ref]) # this is already recursive by default
672 return ref
674 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
675 """Retrieve a stored dataset.
677 Unlike `Butler.get`, this method allows datasets outside the Butler's
678 collection to be read as long as the `DatasetRef` that identifies them
679 can be obtained separately.
681 Parameters
682 ----------
683 ref : `DatasetRef`
684 Reference to an already stored dataset.
685 parameters : `dict`
686 Additional StorageClass-defined options to control reading,
687 typically used to efficiently read only a subset of the dataset.
689 Returns
690 -------
691 obj : `object`
692 The dataset.
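Examples
--------
A sketch of reading a dataset from a resolved reference obtained from a
registry query; the dataset type and collection names are illustrative::
    refs = butler.registry.queryDatasets("calexp",
                                         collections=["u/alice/DM-50000"])
    exposure = butler.getDirect(next(iter(refs)))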
693 """
694 # if the ref exists in the store we return it directly
695 if self.datastore.exists(ref):
696 return self.datastore.get(ref, parameters=parameters)
697 elif ref.isComposite():
698 # Check that we haven't got any unknown parameters
699 ref.datasetType.storageClass.validateParameters(parameters)
700 # Reconstruct the composite
701 usedParams = set()
702 components = {}
703 for compName, compRef in ref.components.items():
704 # make a dictionary of parameters containing only the subset
705 # supported by the StorageClass of the components
706 compParams = compRef.datasetType.storageClass.filterParameters(parameters)
707 usedParams.update(set(compParams))
708 components[compName] = self.datastore.get(compRef, parameters=compParams)
710 # Any unused parameters will have to be passed to the assembler
711 if parameters:
712 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
713 else:
714 unusedParams = {}
716 # Assemble the components
717 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
718 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
719 parameters=unusedParams)
720 else:
721 # single entity in datastore
722 raise FileNotFoundError(f"Unable to locate dataset '{ref}' in datastore {self.datastore.name}")
724 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
725 dataId: Optional[DataId] = None, *,
726 parameters: Union[dict, None] = None,
727 collections: Any = None,
728 **kwds: Any) -> DeferredDatasetHandle:
729 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
731 Parameters
732 ----------
733 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
734 When `DatasetRef` the `dataId` should be `None`.
735 Otherwise the `DatasetType` or name thereof.
736 dataId : `dict` or `DataCoordinate`, optional
737 A `dict` of `Dimension` link name, value pairs that label the
738 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
739 should be provided as the first argument.
740 parameters : `dict`
741 Additional StorageClass-defined options to control reading,
742 typically used to efficiently read only a subset of the dataset.
743 collections : Any, optional
744 Collections to be searched, overriding ``self.collections``.
745 Can be any of the types supported by the ``collections`` argument
746 to butler construction.
747 kwds
748 Additional keyword arguments used to augment or construct a
749 `DataId`. See `DataId` parameters.
751 Returns
752 -------
753 obj : `DeferredDatasetHandle`
754 A handle which can be used to retrieve a dataset at a later time.
756 Raises
757 ------
758 LookupError
759 Raised if no matching dataset exists in the `Registry` (and
760 ``allowUnresolved is False``).
761 ValueError
762 Raised if a resolved `DatasetRef` was passed as an input, but it
763 differs from the one found in the registry.
764 TypeError
765 Raised if no collections were provided.
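Examples
--------
A sketch of deferring the read until the data are needed; the dataset
type and data ID are illustrative, and the handle's ``get`` method is
used to perform the actual read::
    handle = butler.getDeferred("calexp", instrument="HSC", visit=903334,
                                detector=16)
    exposure = handle.get()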
766 """
767 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
768 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
770 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
771 dataId: Optional[DataId] = None, *,
772 parameters: Optional[Dict[str, Any]] = None,
773 collections: Any = None,
774 **kwds: Any) -> Any:
775 """Retrieve a stored dataset.
777 Parameters
778 ----------
779 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
780 When `DatasetRef` the `dataId` should be `None`.
781 Otherwise the `DatasetType` or name thereof.
782 dataId : `dict` or `DataCoordinate`
783 A `dict` of `Dimension` link name, value pairs that label the
784 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
785 should be provided as the first argument.
786 parameters : `dict`
787 Additional StorageClass-defined options to control reading,
788 typically used to efficiently read only a subset of the dataset.
789 collections : Any, optional
790 Collections to be searched, overriding ``self.collections``.
791 Can be any of the types supported by the ``collections`` argument
792 to butler construction.
793 kwds
794 Additional keyword arguments used to augment or construct a
795 `DataCoordinate`. See `DataCoordinate.standardize`
796 parameters.
798 Returns
799 -------
800 obj : `object`
801 The dataset.
803 Raises
804 ------
805 ValueError
806 Raised if a resolved `DatasetRef` was passed as an input, but it
807 differs from the one found in the registry.
808 LookupError
809 Raised if no matching dataset exists in the `Registry`.
810 TypeError
811 Raised if no collections were provided.
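Examples
--------
A minimal sketch; the collection, dataset type, and data ID values are
illustrative::
    butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
    exposure = butler.get("calexp", instrument="HSC", visit=903334,
                          detector=16)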
812 """
813 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
814 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
815 return self.getDirect(ref, parameters=parameters)
817 def getUri(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
818 dataId: Optional[DataId] = None, *,
819 predict: bool = False,
820 collections: Any = None,
821 run: Optional[str] = None,
822 **kwds: Any) -> str:
823 """Return the URI to the Dataset.
825 Parameters
826 ----------
827 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
828 When `DatasetRef` the `dataId` should be `None`.
829 Otherwise the `DatasetType` or name thereof.
830 dataId : `dict` or `DataCoordinate`
831 A `dict` of `Dimension` link name, value pairs that label the
832 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
833 should be provided as the first argument.
834 predict : `bool`
835 If `True`, allow URIs to be returned of datasets that have not
836 been written.
837 collections : Any, optional
838 Collections to be searched, overriding ``self.collections``.
839 Can be any of the types supported by the ``collections`` argument
840 to butler construction.
841 run : `str`, optional
842 Run to use for predictions, overriding ``self.run``.
843 kwds
844 Additional keyword arguments used to augment or construct a
845 `DataCoordinate`. See `DataCoordinate.standardize`
846 parameters.
848 Returns
849 -------
850 uri : `str`
851 URI string pointing to the Dataset within the datastore. If the
852 Dataset does not exist in the datastore, and if ``predict`` is
853 `True`, the URI will be a prediction and will include a URI
854 fragment "#predicted".
855 If the datastore does not have entities that relate well
856 to the concept of a URI the returned URI string will be
857 descriptive. The returned URI is not guaranteed to be obtainable.
859 Raises
860 ------
861 LookupError
862 Raised if a URI has been requested for a dataset that does not exist
863 and guessing is not allowed.
864 ValueError
865 Raised if a resolved `DatasetRef` was passed as an input, but it
866 differs from the one found in the registry.
867 TypeError
868 Raised if no collections were provided.
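Examples
--------
A sketch; names are illustrative. Passing ``predict=True`` returns a
guessed URI for a dataset that has not been written yet::
    uri = butler.getUri("calexp", instrument="HSC", visit=903334,
                        detector=16)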
869 """
870 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
871 collections=collections, **kwds)
872 if ref.id is None: # only possible if predict is True
873 if run is None:
874 run = self.run
875 if run is None:
876 raise TypeError("Cannot predict location with run=None.")
877 # Lie about ID, because we can't guess it, and only
878 # Datastore.getUri() will ever see it (and it doesn't use it).
879 ref = ref.resolved(id=0, run=run)
880 return self.datastore.getUri(ref, predict)
882 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
883 dataId: Optional[DataId] = None, *,
884 collections: Any = None,
885 **kwds: Any) -> bool:
886 """Return True if the Dataset is actually present in the Datastore.
888 Parameters
889 ----------
890 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
891 When `DatasetRef` the `dataId` should be `None`.
892 Otherwise the `DatasetType` or name thereof.
893 dataId : `dict` or `DataCoordinate`
894 A `dict` of `Dimension` link name, value pairs that label the
895 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
896 should be provided as the first argument.
897 collections : Any, optional
898 Collections to be searched, overriding ``self.collections``.
899 Can be any of the types supported by the ``collections`` argument
900 to butler construction.
901 kwds
902 Additional keyword arguments used to augment or construct a
903 `DataCoordinate`. See `DataCoordinate.standardize`
904 parameters.
906 Raises
907 ------
908 LookupError
909 Raised if the dataset is not even present in the Registry.
910 ValueError
911 Raised if a resolved `DatasetRef` was passed as an input, but it
912 differs from the one found in the registry.
913 TypeError
914 Raised if no collections were provided.
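Examples
--------
A sketch; the dataset type and data ID are illustrative::
    if butler.datasetExists("calexp", instrument="HSC", visit=903334,
                            detector=16):
        exposure = butler.get("calexp", instrument="HSC", visit=903334,
                              detector=16)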
915 """
916 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
917 return self.datastore.exists(ref)
919 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
920 """Remove a collection and possibly prune datasets within it.
922 Parameters
923 ----------
924 name : `str`
925 Name of the collection to remove. If this is a
926 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
927 datasets within the collection are not modified unless ``unstore``
928 is `True`. If this is a `~CollectionType.RUN` collection,
929 ``purge`` and ``unstore`` must be `True`, and all datasets in it
930 are fully removed from the data repository.
931 purge : `bool`, optional
932 If `True`, permit `~CollectionType.RUN` collections to be removed,
933 fully removing datasets within them. Requires ``unstore=True`` as
934 well as an added precaution against accidental deletion. Must be
935 `False` (default) if the collection is not a ``RUN``.
936 unstore : `bool`, optional
937 If `True`, remove all datasets in the collection from all
938 datastores in which they appear.
940 Raises
941 ------
942 TypeError
943 Raised if the butler is read-only or arguments are mutually
944 inconsistent.
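Examples
--------
A sketch; the collection name is illustrative. Removing a RUN collection
and fully deleting its datasets requires both flags::
    butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)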
945 """
946 # See pruneDatasets comments for more information about the logic here;
947 # the cases are almost the same, but here we can rely on Registry to
948 # take care of everything but Datastore deletion when we remove the
949 # collection.
950 if not self.isWriteable():
951 raise TypeError("Butler is read-only.")
952 if purge and not unstore:
953 raise TypeError("Cannot pass purge=True without unstore=True.")
954 collectionType = self.registry.getCollectionType(name)
955 if collectionType is CollectionType.RUN and not purge:
956 raise TypeError(f"Cannot prune RUN collection {name} without purge=True.")
957 if collectionType is not CollectionType.RUN and purge:
958 raise TypeError(f"Cannot prune {collectionType.name} collection {name} with purge=True.")
959 with self.registry.transaction():
960 if unstore:
961 for ref in self.registry.queryDatasets(..., collections=name, deduplicate=True):
962 if self.datastore.exists(ref):
963 self.datastore.trash(ref)
964 self.registry.removeCollection(name)
965 if unstore:
966 # Point of no return for removing artifacts
967 self.datastore.emptyTrash()
969 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
970 disassociate: bool = True,
971 unstore: bool = False,
972 tags: Optional[Iterable[str]] = None,
973 purge: bool = False,
974 run: Optional[str] = None,
975 recursive: bool = True):
976 """Remove one or more datasets from a collection and/or storage.
978 Parameters
979 ----------
980 refs : `~collections.abc.Iterable` of `DatasetRef`
981 Datasets to prune. These must be "resolved" references (not just
982 a `DatasetType` and data ID).
983 disassociate : `bool`, optional
984 Disassociate pruned datasets from ``self.tags`` (or the collections
985 given via the ``tags`` argument). Ignored if ``refs`` is ``...``.
986 unstore : `bool`, optional
987 If `True` (`False` is default) remove these datasets from all
988 datastores known to this butler. Note that this will make it
989 impossible to retrieve these datasets even via other collections.
990 Datasets that are already not stored are ignored by this option.
991 tags : `Iterable` [ `str` ], optional
992 `~CollectionType.TAGGED` collections to disassociate the datasets
993 from, overriding ``self.tags``. Ignored if ``disassociate`` is
994 `False` or ``purge`` is `True`.
995 purge : `bool`, optional
996 If `True` (`False` is default), completely remove the dataset from
997 the `Registry`. To prevent accidental deletions, ``purge`` may
998 only be `True` if all of the following conditions are met:
1000 - All given datasets are in the given run.
1001 - ``disassociate`` is `True`;
1002 - ``unstore`` is `True`.
1004 This mode may remove provenance information from datasets other
1005 than those provided, and should be used with extreme care.
1006 run : `str`, optional
1007 `~CollectionType.RUN` collection to purge from, overriding
1008 ``self.run``. Ignored unless ``purge`` is `True`.
1009 recursive : `bool`, optional
1010 If `True` (default) also prune component datasets of any given
1011 composite datasets. This will only prune components that are
1012 actually attached to the given `DatasetRef` objects, which may
1013 not reflect what is in the database (especially if they were
1014 obtained from `Registry.queryDatasets`, which does not include
1015 components in its results).
1017 Raises
1018 ------
1019 TypeError
1020 Raised if the butler is read-only, if no collection was provided,
1021 or the conditions for ``purge=True`` were not met.
1022 """
1023 if not self.isWriteable():
1024 raise TypeError("Butler is read-only.")
1025 if purge:
1026 if not disassociate:
1027 raise TypeError("Cannot pass purge=True without disassociate=True.")
1028 if not unstore:
1029 raise TypeError("Cannot pass purge=True without unstore=True.")
1030 if run is None:
1031 run = self.run
1032 if run is None:
1033 raise TypeError("No run provided but purge=True.")
1034 collectionType = self.registry.getCollectionType(run)
1035 if collectionType is not CollectionType.RUN:
1036 raise TypeError(f"Cannot purge from collection '{run}' "
1037 f"of non-RUN type {collectionType.name}.")
1038 elif disassociate:
1039 if tags is None:
1040 tags = self.tags
1041 else:
1042 tags = tuple(tags)
1043 if not tags:
1044 raise TypeError("No tags provided but disassociate=True.")
1045 for tag in tags:
1046 collectionType = self.registry.getCollectionType(tag)
1047 if collectionType is not CollectionType.TAGGED:
1048 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1049 f"of non-TAGGED type {collectionType.name}.")
1050 if recursive:
1051 refs = list(DatasetRef.flatten(refs))
1052 # We don't need an unreliable Datastore transaction for this, because
1053 # we've been extra careful to ensure that Datastore.trash only involves
1054 # mutating the Registry (it can _look_ at Datastore-specific things,
1055 # but shouldn't change them), and hence all operations here are
1056 # Registry operations.
1057 with self.registry.transaction():
1058 if unstore:
1059 for ref in refs:
1060 # There is a difference between a concrete composite
1061 # and virtual composite. In a virtual composite the
1062 # datastore is never given the top level DatasetRef. In
1063 # the concrete composite the datastore knows all the
1064 # refs and will clean up itself if asked to remove the
1065 # parent ref. We can not check configuration for this
1066 # since we can not trust that the configuration is the
1067 # same. We therefore have to ask if the ref exists or
1068 # not. This is consistent with the fact that we want
1069 # to ignore already-removed-from-datastore datasets
1070 # anyway.
1071 if self.datastore.exists(ref):
1072 self.datastore.trash(ref)
1073 if purge:
1074 self.registry.removeDatasets(refs, recursive=False) # refs is already recursively expanded
1075 elif disassociate:
1076 for tag in tags:
1077 # recursive=False here because refs is already recursive
1078 # if we want it to be.
1079 self.registry.disassociate(tag, refs, recursive=False)
1080 # We've exited the Registry transaction, and apparently committed.
1081 # (if there was an exception, everything rolled back, and it's as if
1082 # nothing happened - and we never get here).
1083 # Datastore artifacts are not yet gone, but they're clearly marked
1084 # as trash, so if we fail to delete now because of (e.g.) filesystem
1085 # problems we can try again later, and if manual administrative
1086 # intervention is required, it's pretty clear what that should entail:
1087 # deleting everything on disk and in private Datastore tables that is
1088 # in the dataset_location_trash table.
1089 if unstore:
1090 # Point of no return for removing artifacts
1091 self.datastore.emptyTrash()
1093 @transactional
1094 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None, run: Optional[str] = None,
1095 tags: Optional[Iterable[str]] = None):
1096 """Store and register one or more datasets that already exist on disk.
1098 Parameters
1099 ----------
1100 datasets : `FileDataset`
1101 Each positional argument is a struct containing information about
1102 a file to be ingested, including its path (either absolute or
1103 relative to the datastore root, if applicable), a `DatasetRef`,
1104 and optionally a formatter class or its fully-qualified string
1105 name. If a formatter is not provided, the formatter that would be
1106 used for `put` is assumed. On successful return, all
1107 `FileDataset.refs` attributes will have their `DatasetRef.id`
1108 attribute populated and all `FileDataset.formatter` attributes will
1109 be set to the formatter class used. `FileDataset.path` attributes
1110 may be modified to put paths in whatever the datastore considers a
1111 standardized form.
1112 transfer : `str`, optional
1113 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1114 'relsymlink' or 'symlink', indicating how to transfer the file.
1115 run : `str`, optional
1116 The name of the run ingested datasets should be added to,
1117 overriding ``self.run``.
1118 tags : `Iterable` [ `str` ], optional
1119 The names of `~CollectionType.TAGGED` collections to associate
1120 the dataset with, overriding ``self.tags``. These collections
1121 must have already been added to the `Registry`.
1123 Raises
1124 ------
1125 TypeError
1126 Raised if the butler is read-only or if no run was provided.
1127 NotImplementedError
1128 Raised if the `Datastore` does not support the given transfer mode.
1129 DatasetTypeNotSupportedError
1130 Raised if one or more files to be ingested have a dataset type that
1131 is not supported by the `Datastore`.
1132 FileNotFoundError
1133 Raised if one of the given files does not exist.
1134 FileExistsError
1135 Raised if transfer is not `None` but the (internal) location the
1136 file would be moved to is already occupied.
1138 Notes
1139 -----
1140 This operation is not fully exception safe: if a database operation
1141 fails, the given `FileDataset` instances may be only partially updated.
1143 It is atomic in terms of database operations (they will either all
1144 succeed or all fail), provided that the database engine implements
1145 transactions correctly. It will attempt to be atomic in terms of
1146 filesystem operations as well, but this cannot be implemented
1147 rigorously for most datastores.
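Examples
--------
A sketch of ingesting a single existing file; the path, dataset type
name, and data ID are illustrative, and `FileDataset` is assumed to
accept ``path`` and ``refs`` arguments matching the attributes used in
this method::
    ref = DatasetRef(butler.registry.getDatasetType("raw"),
                     {"instrument": "HSC", "exposure": 903334,
                      "detector": 16})
    butler.ingest(FileDataset(path="/data/HSC/raw.fits", refs=[ref]),
                  transfer="symlink")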
1148 """
1149 if not self.isWriteable():
1150 raise TypeError("Butler is read-only.")
1151 if run is None:
1152 if self.run is None:
1153 raise TypeError("No run provided.")
1154 run = self.run
1155 # No need to check run type, since insertDatasets will do that
1156 # (safely) for us.
1157 if tags is None:
1158 tags = self.tags
1159 else:
1160 tags = tuple(tags)
1161 for tag in tags:
1162 # Check that these are tagged collections up front, because we want
1163 # to avoid relying on Datastore transactionality to avoid modifying
1164 # the repo if there's an error later.
1165 collectionType = self.registry.getCollectionType(tag)
1166 if collectionType is not CollectionType.TAGGED:
1167 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1168 f"{collectionType.name}.")
1169 # Reorganize the inputs so they're grouped by DatasetType and then
1170 # data ID. We also include a list of DatasetRefs for each FileDataset
1171 # to hold the resolved DatasetRefs returned by the Registry, before
1172 # it's safe to swap them into FileDataset.refs.
1173 # Some type annotation aliases to make that clearer:
1174 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1175 GroupedData = MutableMapping[DatasetType, GroupForType]
1176 # The actual data structure:
1177 groupedData: GroupedData = defaultdict(dict)
1178 # And the nested loop that populates it:
1179 for dataset in datasets:
1180 # This list is intentionally shared across the inner loop, since it's
1181 # associated with `dataset`.
1182 resolvedRefs = []
1183 for ref in dataset.refs:
1184 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1186 # Now we can bulk-insert into Registry for each DatasetType.
1187 allResolvedRefs = []
1188 for datasetType, groupForType in groupedData.items():
1189 refs = self.registry.insertDatasets(datasetType,
1190 dataIds=groupForType.keys(),
1191 run=run,
1192 recursive=True)
1193 # Append those resolved DatasetRefs to the new lists we set up for
1194 # them.
1195 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1196 resolvedRefs.append(ref)
1198 # Go back to the original FileDatasets to replace their refs with the
1199 # new resolved ones, and also build a big list of all refs.
1200 allResolvedRefs = []
1201 for groupForType in groupedData.values():
1202 for dataset, resolvedRefs in groupForType.values():
1203 dataset.refs = resolvedRefs
1204 allResolvedRefs.extend(resolvedRefs)
1206 # Bulk-associate everything with any tagged collections.
1207 for tag in tags:
1208 self.registry.associate(tag, allResolvedRefs)
1210 # Bulk-insert everything into Datastore.
1211 self.datastore.ingest(*datasets, transfer=transfer)
1213 @contextlib.contextmanager
1214 def export(self, *, directory: Optional[str] = None,
1215 filename: Optional[str] = None,
1216 format: Optional[str] = None,
1217 transfer: Optional[str] = None) -> ContextManager[RepoExport]:
1218 """Export datasets from the repository represented by this `Butler`.
1220 This method is a context manager that returns a helper object
1221 (`RepoExport`) that is used to indicate what information from the
1222 repository should be exported.
1224 Parameters
1225 ----------
1226 directory : `str`, optional
1227 Directory dataset files should be written to if ``transfer`` is not
1228 `None`.
1229 filename : `str`, optional
1230 Name for the file that will include database information associated
1231 with the exported datasets. If this is not an absolute path and
1232 ``directory`` is not `None`, it will be written to ``directory``
1233 instead of the current working directory. Defaults to
1234 "export.{format}".
1235 format : `str`, optional
1236 File format for the database information file. If `None`, the
1237 extension of ``filename`` will be used.
1238 transfer : `str`, optional
1239 Transfer mode passed to `Datastore.export`.
1241 Raises
1242 ------
1243 TypeError
1244 Raised if the set of arguments passed is inconsistent.
1246 Examples
1247 --------
1248 Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
1249 methods are used to provide the iterables over data IDs and/or datasets
1250 to be exported::
1252 with butler.export("exports.yaml") as export:
1253 # Export all flats, and the calibration_label dimensions
1254 # associated with them.
1255 export.saveDatasets(butler.registry.queryDatasets("flat"),
1256 elements=[butler.registry.dimensions["calibration_label"]])
1257 # Export all datasets that start with "deepCoadd_" and all of
1258 # their associated data ID information.
1259 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1260 """
1261 if directory is None and transfer is not None:
1262 raise TypeError("Cannot transfer without providing a directory.")
1263 if transfer == "move":
1264 raise TypeError("Transfer may not be 'move': export is read-only")
1265 if format is None:
1266 if filename is None:
1267 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1268 else:
1269 _, format = os.path.splitext(filename)
1270 elif filename is None:
1271 filename = f"export.{format}"
1272 if directory is not None:
1273 filename = os.path.join(directory, filename)
1274 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1275 with open(filename, 'w') as stream:
1276 backend = BackendClass(stream)
1277 try:
1278 helper = RepoExport(self.registry, self.datastore, backend=backend,
1279 directory=directory, transfer=transfer)
1280 yield helper
1281 except BaseException:
1282 raise
1283 else:
1284 helper._finish()
1286 def import_(self, *, directory: Optional[str] = None,
1287 filename: Optional[str] = None,
1288 format: Optional[str] = None,
1289 transfer: Optional[str] = None):
1290 """Import datasets exported from a different butler repository.
1292 Parameters
1293 ----------
1294 directory : `str`, optional
1295 Directory containing dataset files. If `None`, all file paths
1296 must be absolute.
1297 filename : `str`, optional
1298 Name of the file containing database information associated
1299 with the exported datasets. If this is not an absolute path, does
1300 not exist in the current working directory, and ``directory`` is
1301 not `None`, it is assumed to be in ``directory``. Defaults to
1302 "export.{format}".
1303 format : `str`, optional
1304 File format for the database information file. If `None`, the
1305 extension of ``filename`` will be used.
1306 transfer : `str`, optional
1307 Transfer mode passed to `Datastore.export`.
1309 Raises
1310 ------
1311 TypeError
1312 Raised if the set of arguments passed is inconsistent, or if the
1313 butler is read-only.
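Examples
--------
A sketch of loading a previously exported repository subset; the paths
are illustrative::
    butler.import_(directory="/path/to/exportdir", filename="exports.yaml",
                   transfer="symlink")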
1314 """
1315 if not self.isWriteable():
1316 raise TypeError("Butler is read-only.")
1317 if format is None:
1318 if filename is None:
1319 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1320 else:
1321 _, format = os.path.splitext(filename)
1322 elif filename is None:
1323 filename = f"export.{format}"
1324 if directory is not None and not os.path.exists(filename):
1325 filename = os.path.join(directory, filename)
1326 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1327 with open(filename, 'r') as stream:
1328 backend = BackendClass(stream, self.registry)
1329 backend.register()
1330 with self.transaction():
1331 backend.load(self.datastore, directory=directory, transfer=transfer)
1333 def validateConfiguration(self, logFailures: bool = False,
1334 datasetTypeNames: Optional[Iterable[str]] = None,
1335 ignore: Optional[Iterable[str]] = None):
1336 """Validate butler configuration.
1338 Checks that each `DatasetType` can be stored in the `Datastore`.
1340 Parameters
1341 ----------
1342 logFailures : `bool`, optional
1343 If `True`, output a log message for every validation error
1344 detected.
1345 datasetTypeNames : iterable of `str`, optional
1346 The `DatasetType` names that should be checked. This allows
1347 only a subset to be selected.
1348 ignore : iterable of `str`, optional
1349 Names of DatasetTypes to skip over. This can be used to skip
1350 known problems. If a named `DatasetType` corresponds to a
1351 composite, all components of that `DatasetType` will also be
1352 ignored.
1354 Raises
1355 ------
1356 ButlerValidationError
1357 Raised if there is some inconsistency with how this Butler
1358 is configured.
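Examples
--------
A sketch of checking only a subset of dataset types; the names are
illustrative::
    butler.validateConfiguration(logFailures=True,
                                 datasetTypeNames=["calexp", "src"])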
1359 """
1360 if datasetTypeNames:
1361 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1362 else:
1363 entities = list(self.registry.queryDatasetTypes())
1365 # filter out anything from the ignore list
1366 if ignore:
1367 ignore = set(ignore)
1368 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1369 else:
1370 ignore = set()
1372 # Find all the registered instruments
1373 instruments = set(
1374 dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
1375 )
1377 # For each datasetType that has an instrument dimension, create
1378 # a DatasetRef for each defined instrument
1379 datasetRefs = []
1381 for datasetType in entities:
1382 if "instrument" in datasetType.dimensions:
1383 for instrument in instruments:
1384 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1385 datasetRefs.append(datasetRef)
1387 entities.extend(datasetRefs)
1389 datastoreErrorStr = None
1390 try:
1391 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1392 except ValidationError as e:
1393 datastoreErrorStr = str(e)
1395 # Also check that the LookupKeys used by the datastores match
1396 # registry and storage class definitions
1397 keys = self.datastore.getLookupKeys()
1399 failedNames = set()
1400 failedDataId = set()
1401 for key in keys:
1402 datasetType = None
1403 if key.name is not None:
1404 if key.name in ignore:
1405 continue
1407 # skip if specific datasetType names were requested and this
1408 # name does not match
1409 if datasetTypeNames and key.name not in datasetTypeNames:
1410 continue
1412 # See if it is a StorageClass or a DatasetType
1413 if key.name in self.storageClasses:
1414 pass
1415 else:
1416 try:
1417 self.registry.getDatasetType(key.name)
1418 except KeyError:
1419 if logFailures:
1420 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1421 failedNames.add(key)
1422 else:
1423 # Dimensions are checked for consistency when the Butler
1424 # is created and rendezvoused with a universe.
1425 pass
1427 # Check that the instrument is a valid instrument
1428 # Currently only instrument-level overrides are supported, so check for that
1429 if key.dataId:
1430 dataIdKeys = set(key.dataId)
1431 if set(["instrument"]) != dataIdKeys:
1432 if logFailures:
1433 log.fatal("Key '%s' has unsupported DataId override", key)
1434 failedDataId.add(key)
1435 elif key.dataId["instrument"] not in instruments:
1436 if logFailures:
1437 log.fatal("Key '%s' has unknown instrument", key)
1438 failedDataId.add(key)
1440 messages = []
1442 if datastoreErrorStr:
1443 messages.append(datastoreErrorStr)
1445 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1446 (failedDataId, "Keys with bad DataId entries: ")):
1447 if failed:
1448 msg += ", ".join(str(k) for k in failed)
1449 messages.append(msg)
1451 if messages:
1452 raise ValidationError(";\n".join(messages))
1454 registry: Registry
1455 """The object that manages dataset metadata and relationships (`Registry`).
1457 Most operations that don't involve reading or writing butler datasets are
1458 accessible only via `Registry` methods.
1459 """
1461 datastore: Datastore
1462 """The object that manages actual dataset storage (`Datastore`).
1464 Direct user access to the datastore should rarely be necessary; the primary
1465 exception is the case where a `Datastore` implementation provides extra
1466 functionality beyond what the base class defines.
1467 """
1469 storageClasses: StorageClassFactory
1470 """An object that maps known storage class names to objects that fully
1471 describe them (`StorageClassFactory`).
1472 """
1474 collections: Optional[CollectionSearch]
1475 """The collections to search and any restrictions on the dataset types to
1476 search for within them, in order (`CollectionSearch`).
1477 """
1479 run: Optional[str]
1480 """Name of the run this butler writes outputs to (`str` or `None`).
1481 """
1483 tags: Tuple[str, ...]
1484 """Names of `~CollectionType.TAGGED` collections this butler associates
1485 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1486 (`tuple` [ `str` ]).
1487 """