Coverage for python/lsst/daf/butler/_butler.py : 8%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 Tuple,
44 Union,
45)
47try:
48 import boto3
49except ImportError:
50 boto3 = None
52from lsst.utils import doImport
53from .core import (
54 ButlerURI,
55 CompositesMap,
56 Config,
57 ConfigSubset,
58 DataCoordinate,
59 DataId,
60 DatasetRef,
61 DatasetType,
62 Datastore,
63 FileDataset,
64 Quantum,
65 RepoExport,
66 StorageClassFactory,
67 ValidationError,
68)
69from .core.repoRelocation import BUTLER_ROOT_TAG
70from .core.safeFileIo import safeMakeDir
71from .core.utils import transactional, getClassOf
72from ._deferredDatasetHandle import DeferredDatasetHandle
73from ._butlerConfig import ButlerConfig
74from .registry import Registry, RegistryConfig, CollectionType
75from .registry.wildcards import CollectionSearch
77log = logging.getLogger(__name__)
80class ButlerValidationError(ValidationError):
81 """There is a problem with the Butler configuration."""
82 pass
85class Butler:
86 """Main entry point for the data access system.
88 Parameters
89 ----------
90 config : `ButlerConfig`, `Config` or `str`, optional
91 Configuration. Anything acceptable to the
92 `ButlerConfig` constructor. If a directory path
93 is given the configuration will be read from a ``butler.yaml`` file in
94 that location. If `None` is given default values will be used.
95 butler : `Butler`, optional
96 If provided, construct a new Butler that uses the same registry and
97 datastore as the given one, but with the given collection and run.
98 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
99 arguments.
100 collections : `Any`, optional
101 An expression specifying the collections to be searched (in order) when
102 reading datasets, and optionally dataset type restrictions on them.
103 This may be:
104 - a `str` collection name;
105 - a tuple of (collection name, *dataset type restriction*);
106 - an iterable of either of the above;
107 - a mapping from `str` to *dataset type restriction*.
109 See :ref:`daf_butler_collection_expressions` for more information,
110 including the definition of a *dataset type restriction*. All
111 collections must either already exist or be specified to be created
112 by other arguments.
113 run : `str`, optional
114 Name of the run datasets should be output to. If the run
115 does not exist, it will be created. If ``collections`` is `None`, it
116 will be set to ``[run]``. If this is not set (and ``writeable`` is
117 not set either), a read-only butler will be created.
118 tags : `Iterable` [ `str` ], optional
119 A list of `~CollectionType.TAGGED` collections that datasets should be
120 associated with in `put` or `ingest` and disassociated from in `prune`.
121 If any of these collections does not exist, it will be created.
122 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
123 A mapping from the names of new `~CollectionType.CHAINED` collections
124 to an expression identifying their child collections (which takes the
125 same form as the ``collections`` argument). Chains may be nested only
126 if children precede their parents in this mapping.
127 searchPaths : `list` of `str`, optional
128 Directory paths to search when calculating the full Butler
129 configuration. Not used if the supplied config is already a
130 `ButlerConfig`.
131 writeable : `bool`, optional
132 Explicitly sets whether the butler supports write operations. If not
133 provided, a read-write butler is created if any of ``run``, ``tags``,
134 or ``chains`` is non-empty.
136 Examples
137 --------
138 While there are many ways to control exactly how a `Butler` interacts with
139 the collections in its `Registry`, the most common cases are still simple.
141 For a read-only `Butler` that searches one collection, do::
143 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
145 For a read-write `Butler` that writes to and reads from a
146 `~CollectionType.RUN` collection::
148 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
150 The `Butler` passed to a ``PipelineTask`` is often much more complex,
151 because we want to write to one `~CollectionType.RUN` collection but read
152 from several others (as well), while defining a new
153 `~CollectionType.CHAINED` collection that combines them all::
155 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
156 collections=["u/alice/DM-50000"],
157 chains={
158 "u/alice/DM-50000": ["u/alice/DM-50000/a",
159 "u/bob/DM-49998",
160 "raw/hsc"]
161 })
163 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
164 they'll also be available from the chained collection ``u/alice/DM-50000``.
165 Datasets will be read first from that run (since it appears first in the
166 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
167 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
168 would be unnecessary. We could also construct a butler that performs
169 exactly the same `put` and `get` operations without actually creating a
170 chained collection, just by passing multiple items in ``collections``::
172 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
173 collections=["u/alice/DM-50000/a",
174 "u/bob/DM-49998",
175 "raw/hsc"])
177 Finally, one can always create a `Butler` with no collections::
179 butler = Butler("/path/to/repo", writeable=True)
181 This can be extremely useful when you just want to use ``butler.registry``,
182 e.g. for inserting dimension data or managing collections, or when the
183 collections you want to use with the butler are not consistent.
184 Passing ``writeable`` explicitly here is only necessary if you want to be
185 able to make changes to the repo; usually the value for ``writeable``
186 can be guessed from the collection arguments provided, but it defaults to
187 `False` when there are no collection arguments.
188 """
189 def __init__(self, config: Union[Config, str, None] = None, *,
190 butler: Optional[Butler] = None,
191 collections: Any = None,
192 run: Optional[str] = None,
193 tags: Iterable[str] = (),
194 chains: Optional[Mapping[str, Any]] = None,
195 searchPaths: Optional[List[str]] = None,
196 writeable: Optional[bool] = None):
197 # Transform any single-pass iterator into an actual sequence so we
198 # can see if it's empty
199 self.tags = tuple(tags)
200 # Load registry, datastore, etc. from config or existing butler.
201 if butler is not None:
202 if config is not None or searchPaths is not None or writeable is not None:
203 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
204 "arguments with 'butler' argument.")
205 self.registry = butler.registry
206 self.datastore = butler.datastore
207 self.storageClasses = butler.storageClasses
208 self._composites = butler._composites
209 self._config = butler._config
210 else:
211 self._config = ButlerConfig(config, searchPaths=searchPaths)
212 if "root" in self._config:
213 butlerRoot = self._config["root"]
214 else:
215 butlerRoot = self._config.configDir
216 if writeable is None:
217 writeable = run is not None or chains is not None or self.tags
218 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
219 self.datastore = Datastore.fromConfig(self._config, self.registry, butlerRoot=butlerRoot)
220 self.storageClasses = StorageClassFactory()
221 self.storageClasses.addFromConfig(self._config)
222 self._composites = CompositesMap(self._config, universe=self.registry.dimensions)
223 # Check the many collection arguments for consistency and create any
224 # needed collections that don't exist.
225 if collections is None:
226 if run is not None:
227 collections = (run,)
228 else:
229 collections = ()
230 self.collections = CollectionSearch.fromExpression(collections)
231 if chains is None:
232 chains = {}
233 self.run = run
234 if "run" in self._config or "collection" in self._config:
235 raise ValueError("Passing a run or collection via configuration is no longer supported.")
236 if self.run is not None:
237 self.registry.registerCollection(self.run, type=CollectionType.RUN)
238 for tag in self.tags:
239 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
240 for parent, children in chains.items():
241 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
242 self.registry.setCollectionChain(parent, children)
244 GENERATION: ClassVar[int] = 3
245 """This is a Generation 3 Butler.
247 This attribute may be removed in the future, once the Generation 2 Butler
248 interface has been fully retired; it should only be used in transitional
249 code.
250 """
252 @staticmethod
253 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
254 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
255 forceConfigRoot: bool = True, outfile: Optional[str] = None,
256 overwrite: bool = False) -> Config:
257 """Create an empty data repository by adding a butler.yaml config
258 to a repository root directory.
260 Parameters
261 ----------
262 root : `str`
263 Filesystem path to the root of the new repository. Will be created
264 if it does not exist.
265 config : `Config` or `str`, optional
266 Configuration to write to the repository, after setting any
267 root-dependent Registry or Datastore config options. Can not
268 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
269 configuration will be used. Root-dependent config options
270 specified in this config are overwritten if ``forceConfigRoot``
271 is `True`.
272 standalone : `bool`
273 If True, write all expanded defaults, not just customized or
274 repository-specific settings.
275 This (mostly) decouples the repository from the default
276 configuration, insulating it from changes to the defaults (which
277 may be good or bad, depending on the nature of the changes).
278 Future *additions* to the defaults will still be picked up when
279 initializing `Butlers` to repos created with ``standalone=True``.
280 createRegistry : `bool`, optional
281 If `True` create a new Registry.
282 searchPaths : `list` of `str`, optional
283 Directory paths to search when calculating the full butler
284 configuration.
285 forceConfigRoot : `bool`, optional
286 If `False`, any values present in the supplied ``config`` that
287 would normally be reset are not overridden and will appear
288 directly in the output config. This allows non-standard overrides
289 of the root directory for a datastore or registry to be given.
290 If this parameter is `True` the values for ``root`` will be
291 forced into the resulting config if appropriate.
292 outfile : `str`, optional
293 If not-`None`, the output configuration will be written to this
294 location rather than into the repository itself. Can be a URI
295 string. Can refer to a directory that will be used to write
296 ``butler.yaml``.
297 overwrite : `bool`, optional
298 Create a new configuration file even if one already exists
299 in the specified output location. Default is to raise
300 an exception.
302 Returns
303 -------
304 config : `Config`
305 The updated `Config` instance written to the repo.
307 Raises
308 ------
309 ValueError
310 Raised if a ButlerConfig or ConfigSubset is passed instead of a
311 regular Config (as these subclasses would make it impossible to
312 support ``standalone=False``).
313 FileExistsError
314 Raised if the output config file already exists.
315 os.error
316 Raised if the directory does not exist, exists but is not a
317 directory, or cannot be created.
319 Notes
320 -----
321 Note that when ``standalone=False`` (the default), the configuration
322 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
323 construct the repository should also be used to construct any Butlers
324 to avoid configuration inconsistencies.
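 Examples
 --------
 A minimal sketch of creating a repository and then constructing a
 `Butler` against it; the paths and run name are hypothetical::

     config = Butler.makeRepo("/path/to/new/repo")
     butler = Butler("/path/to/new/repo", run="ingest/run")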
325 """
326 if isinstance(config, (ButlerConfig, ConfigSubset)):
327 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
329 # for "file" schemes we are assuming POSIX semantics for paths, for
330 # schemeless URIs we are assuming os.path semantics.
331 uri = ButlerURI(root)
332 if uri.scheme == "file" or not uri.scheme:
333 if not os.path.isdir(uri.ospath):
334 safeMakeDir(uri.ospath)
335 elif uri.scheme == "s3":
336 s3 = boto3.resource("s3")
337 # this assumes the bucket already exists; if it does not, another level of checks is needed
338 bucket = s3.Bucket(uri.netloc)
339 bucket.put_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
340 else:
341 raise ValueError(f"Unrecognized scheme: {uri.scheme}")
342 config = Config(config)
344 # If we are creating a new repo from scratch with relative roots,
345 # do not propagate an explicit root from the config file
346 if "root" in config:
347 del config["root"]
349 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
350 datastoreClass = doImport(full["datastore", "cls"])
351 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
353 # if key exists in given config, parse it, otherwise parse the defaults
354 # in the expanded config
355 if config.get(("registry", "db")):
356 registryConfig = RegistryConfig(config)
357 else:
358 registryConfig = RegistryConfig(full)
359 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
360 if defaultDatabaseUri is not None:
361 Config.updateParameters(RegistryConfig, config, full,
362 toUpdate={"db": defaultDatabaseUri},
363 overwrite=forceConfigRoot)
364 else:
365 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
366 overwrite=forceConfigRoot)
368 if standalone:
369 config.merge(full)
370 if outfile is not None:
371 # When writing to a separate location we must include
372 # the root of the butler repo in the config else it won't know
373 # where to look.
374 config["root"] = uri.geturl()
375 configURI = outfile
376 else:
377 configURI = uri
378 config.dumpToUri(configURI, overwrite=overwrite)
380 # Create Registry and populate tables
381 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
382 return config
384 @classmethod
385 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
386 tags: Tuple[str, ...], writeable: bool) -> Butler:
387 """Callable used to unpickle a Butler.
389 We prefer not to use ``Butler.__init__`` directly so we can force some
390 of its many arguments to be keyword-only (note that ``__reduce__``
391 can only invoke callables with positional arguments).
393 Parameters
394 ----------
395 config : `ButlerConfig`
396 Butler configuration, already coerced into a true `ButlerConfig`
397 instance (and hence after any search paths for overrides have been
398 utilized).
399 collections : `CollectionSearch`
400 Names of collections to read from.
401 run : `str`, optional
402 Name of `~CollectionType.RUN` collection to write to.
403 tags : `tuple` [`str`]
404 Names of `~CollectionType.TAGGED` collections to associate with.
405 writeable : `bool`
406 Whether the Butler should support write operations.
408 Returns
409 -------
410 butler : `Butler`
411 A new `Butler` instance.
412 """
413 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
415 def __reduce__(self):
416 """Support pickling.
417 """
418 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
419 self.registry.isWriteable()))
421 def __str__(self):
422 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
423 self.collections, self.run, self.tags, self.datastore, self.registry)
425 def isWriteable(self) -> bool:
426 """Return `True` if this `Butler` supports write operations.
427 """
428 return self.registry.isWriteable()
430 @contextlib.contextmanager
431 def transaction(self):
432 """Context manager supporting `Butler` transactions.
434 Transactions can be nested.
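 Examples
 --------
 A sketch of grouping writes so they succeed or fail together; the
 in-memory object, dataset type, and data ID are hypothetical::

     with butler.transaction():
         butler.put(calexp, "calexp", instrument="HSC", visit=42, detector=50)
         # An exception raised before the block exits rolls back both the
         # registry insert and the datastore write.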
435 """
436 with self.registry.transaction():
437 with self.datastore.transaction():
438 yield
440 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
441 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
442 """Standardize the arguments passed to several Butler APIs.
444 Parameters
445 ----------
446 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
447 When `DatasetRef` the `dataId` should be `None`.
448 Otherwise the `DatasetType` or name thereof.
449 dataId : `dict` or `DataCoordinate`
450 A `dict` of `Dimension` link name, value pairs that label the
451 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
452 should be provided as the first argument.
453 kwds
454 Additional keyword arguments used to augment or construct a
455 `DataCoordinate`. See `DataCoordinate.standardize`
456 parameters.
458 Returns
459 -------
460 datasetType : `DatasetType`
461 A `DatasetType` instance extracted from ``datasetRefOrType``.
462 dataId : `dict` or `DataId`, optional
463 Argument that can be used (along with ``kwds``) to construct a
464 `DataId`.
466 Notes
467 -----
468 Butler APIs that conceptually need a DatasetRef also allow passing a
469 `DatasetType` (or the name of one) and a `DataId` (or a dict and
470 keyword arguments that can be used to construct one) separately. This
471 method accepts those arguments and always returns a true `DatasetType`
472 and a `DataId` or `dict`.
474 Standardization of `dict` vs `DataId` is best handled by passing the
475 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
476 generally similarly flexible.
477 """
478 externalDatasetType = None
479 internalDatasetType = None
480 if isinstance(datasetRefOrType, DatasetRef):
481 if dataId is not None or kwds:
482 raise ValueError("DatasetRef given, cannot use dataId as well")
483 externalDatasetType = datasetRefOrType.datasetType
484 dataId = datasetRefOrType.dataId
485 else:
486 # Don't check whether DataId is provided, because Registry APIs
487 # can usually construct a better error message when it wasn't.
488 if isinstance(datasetRefOrType, DatasetType):
489 externalDatasetType = datasetRefOrType
490 else:
491 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
493 # Check that they are self-consistent
494 if externalDatasetType is not None:
495 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
496 if externalDatasetType != internalDatasetType:
497 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
498 f"registry definition ({internalDatasetType})")
500 return internalDatasetType, dataId
502 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
503 dataId: Optional[DataId] = None, *,
504 collections: Any = None,
505 allowUnresolved: bool = False,
506 **kwds: Any) -> DatasetRef:
507 """Shared logic for methods that start with a search for a dataset in
508 the registry.
510 Parameters
511 ----------
512 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
513 When `DatasetRef` the `dataId` should be `None`.
514 Otherwise the `DatasetType` or name thereof.
515 dataId : `dict` or `DataCoordinate`, optional
516 A `dict` of `Dimension` link name, value pairs that label the
517 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
518 should be provided as the first argument.
519 collections : Any, optional
520 Collections to be searched, overriding ``self.collections``.
521 Can be any of the types supported by the ``collections`` argument
522 to butler construction.
523 allowUnresolved : `bool`, optional
524 If `True`, return an unresolved `DatasetRef` if finding a resolved
525 one in the `Registry` fails. Defaults to `False`.
526 kwds
527 Additional keyword arguments used to augment or construct a
528 `DataId`. See `DataId` parameters.
530 Returns
531 -------
532 ref : `DatasetRef`
533 A reference to the dataset identified by the given arguments.
535 Raises
536 ------
537 LookupError
538 Raised if no matching dataset exists in the `Registry` (and
539 ``allowUnresolved is False``).
540 ValueError
541 Raised if a resolved `DatasetRef` was passed as an input, but it
542 differs from the one found in the registry.
543 TypeError
544 Raised if no collections were provided.
545 """
546 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
547 if isinstance(datasetRefOrType, DatasetRef):
548 idNumber = datasetRefOrType.id
549 else:
550 idNumber = None
551 # Expand the data ID first instead of letting registry.findDataset do
552 # it, so we get the result even if it returns None.
553 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
554 if collections is None:
555 collections = self.collections
556 if not collections:
557 raise TypeError("No input collections provided.")
558 else:
559 collections = CollectionSearch.fromExpression(collections)
560 # Always lookup the DatasetRef, even if one is given, to ensure it is
561 # present in the current collection.
562 ref = self.registry.findDataset(datasetType, dataId, collections=collections)
563 if ref is None:
564 if allowUnresolved:
565 return DatasetRef(datasetType, dataId)
566 else:
567 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
568 f"could not be found in collections {collections}.")
569 if idNumber is not None and idNumber != ref.id:
570 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
571 f"id ({ref.id}) in registry in collections {collections}.")
572 return ref
574 @transactional
575 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
576 dataId: Optional[DataId] = None, *,
577 producer: Optional[Quantum] = None,
578 run: Optional[str] = None,
579 tags: Optional[Iterable[str]] = None,
580 **kwds: Any) -> DatasetRef:
581 """Store and register a dataset.
583 Parameters
584 ----------
585 obj : `object`
586 The dataset.
587 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
588 When `DatasetRef` is provided, ``dataId`` should be `None`.
589 Otherwise the `DatasetType` or name thereof.
590 dataId : `dict` or `DataCoordinate`
591 A `dict` of `Dimension` link name, value pairs that label the
592 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
593 should be provided as the second argument.
594 producer : `Quantum`, optional
595 The producer.
596 run : `str`, optional
597 The name of the run the dataset should be added to, overriding
598 ``self.run``.
599 tags : `Iterable` [ `str` ], optional
600 The names of `~CollectionType.TAGGED` collections to associate
601 the dataset with, overriding ``self.tags``. These collections
602 must have already been added to the `Registry`.
603 kwds
604 Additional keyword arguments used to augment or construct a
605 `DataCoordinate`. See `DataCoordinate.standardize`
606 parameters.
608 Returns
609 -------
610 ref : `DatasetRef`
611 A reference to the stored dataset, updated with the correct id if
612 given.
614 Raises
615 ------
616 TypeError
617 Raised if the butler is read-only or if no run has been provided.
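 Examples
 --------
 A sketch, assuming the dataset type and dimension values shown are
 already registered (all names are hypothetical)::

     butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
     ref = butler.put(calexp, "calexp", instrument="HSC", visit=42, detector=50)
     # ``ref`` is resolved: its ``id`` is the registry-assigned dataset ID.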
618 """
619 log.debug("Butler put: %s, dataId=%s, producer=%s, run=%s", datasetRefOrType, dataId, producer, run)
620 if not self.isWriteable():
621 raise TypeError("Butler is read-only.")
622 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
623 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
624 raise ValueError("DatasetRef must not be in registry, must have None id")
626 if run is None:
627 if self.run is None:
628 raise TypeError("No run provided.")
629 run = self.run
630 # No need to check type for run; first thing we do is
631 # insertDatasets, and that will check for us.
633 if tags is None:
634 tags = self.tags
635 else:
636 tags = tuple(tags)
637 for tag in tags:
638 # Check that these are tagged collections up front, because we want
639 # to avoid relying on Datastore transactionality to avoid modifying
640 # the repo if there's an error later.
641 collectionType = self.registry.getCollectionType(tag)
642 if collectionType is not CollectionType.TAGGED:
643 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
644 f"{collectionType.name}.")
646 isVirtualComposite = self._composites.shouldBeDisassembled(datasetType)
648 # Add Registry Dataset entry. If not a virtual composite, add
649 # and attach components at the same time.
650 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
651 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId],
652 producer=producer, recursive=not isVirtualComposite)
654 # Check to see if this datasetType requires disassembly
655 if isVirtualComposite:
656 components = datasetType.storageClass.assembler().disassemble(obj)
657 for component, info in components.items():
658 compTypeName = datasetType.componentTypeName(component)
659 compRef = self.put(info.component, compTypeName, dataId, producer=producer, run=run,
660 tags=()) # Components do not need to be associated with tags here.
661 self.registry.attachComponent(component, ref, compRef)
662 else:
663 # This is an entity without a disassembler.
664 self.datastore.put(obj, ref)
666 for tag in tags:
667 self.registry.associate(tag, [ref]) # this is already recursive by default
669 return ref
671 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
672 """Retrieve a stored dataset.
674 Unlike `Butler.get`, this method allows datasets outside the Butler's
675 collection to be read as long as the `DatasetRef` that identifies them
676 can be obtained separately.
678 Parameters
679 ----------
680 ref : `DatasetRef`
681 Reference to an already stored dataset.
682 parameters : `dict`
683 Additional StorageClass-defined options to control reading,
684 typically used to efficiently read only a subset of the dataset.
686 Returns
687 -------
688 obj : `object`
689 The dataset.
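 Examples
 --------
 A sketch of reading a dataset located through the registry rather than
 through this butler's own collections; all names are hypothetical::

     datasetType = butler.registry.getDatasetType("calexp")
     ref = butler.registry.findDataset(datasetType,
                                       {"instrument": "HSC", "visit": 42, "detector": 50},
                                       collections=["u/bob/DM-49998"])
     if ref is not None:
         calexp = butler.getDirect(ref)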
690 """
691 # if the ref exists in the store we return it directly
692 if self.datastore.exists(ref):
693 return self.datastore.get(ref, parameters=parameters)
694 elif ref.isComposite():
695 # Check that we haven't got any unknown parameters
696 ref.datasetType.storageClass.validateParameters(parameters)
697 # Reconstruct the composite
698 usedParams = set()
699 components = {}
700 for compName, compRef in ref.components.items():
701 # make a dictionary of parameters containing only the subset
702 # supported by the StorageClass of the components
703 compParams = compRef.datasetType.storageClass.filterParameters(parameters)
704 usedParams.update(set(compParams))
705 components[compName] = self.datastore.get(compRef, parameters=compParams)
707 # Any unused parameters will have to be passed to the assembler
708 if parameters:
709 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
710 else:
711 unusedParams = {}
713 # Assemble the components
714 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
715 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
716 parameters=unusedParams)
717 else:
718 # single entity in datastore
719 raise FileNotFoundError(f"Unable to locate dataset '{ref}' in datastore {self.datastore.name}")
721 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
722 dataId: Optional[DataId] = None, *,
723 parameters: Union[dict, None] = None,
724 collections: Any = None,
725 **kwds: Any) -> DeferredDatasetHandle:
726 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
728 Parameters
729 ----------
730 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
731 When `DatasetRef` the `dataId` should be `None`.
732 Otherwise the `DatasetType` or name thereof.
733 dataId : `dict` or `DataCoordinate`, optional
734 A `dict` of `Dimension` link name, value pairs that label the
735 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
736 should be provided as the first argument.
737 parameters : `dict`
738 Additional StorageClass-defined options to control reading,
739 typically used to efficiently read only a subset of the dataset.
740 collections : Any, optional
741 Collections to be searched, overriding ``self.collections``.
742 Can be any of the types supported by the ``collections`` argument
743 to butler construction.
744 kwds
745 Additional keyword arguments used to augment or construct a
746 `DataId`. See `DataId` parameters.
748 Returns
749 -------
750 obj : `DeferredDatasetHandle`
751 A handle which can be used to retrieve a dataset at a later time.
753 Raises
754 ------
755 LookupError
756 Raised if no matching dataset exists in the `Registry`.
758 ValueError
759 Raised if a resolved `DatasetRef` was passed as an input, but it
760 differs from the one found in the registry.
761 TypeError
762 Raised if no collections were provided.
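 Examples
 --------
 A sketch; the names are hypothetical, and the ``get`` call on the
 returned handle is assumed from the `DeferredDatasetHandle` API::

     handle = butler.getDeferred("calexp", instrument="HSC", visit=42, detector=50)
     # The dataset is only read from the datastore when it is requested:
     calexp = handle.get()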
763 """
764 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
765 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
767 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
768 dataId: Optional[DataId] = None, *,
769 parameters: Optional[Dict[str, Any]] = None,
770 collections: Any = None,
771 **kwds: Any) -> Any:
772 """Retrieve a stored dataset.
774 Parameters
775 ----------
776 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
777 When `DatasetRef` the `dataId` should be `None`.
778 Otherwise the `DatasetType` or name thereof.
779 dataId : `dict` or `DataCoordinate`
780 A `dict` of `Dimension` link name, value pairs that label the
781 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
782 should be provided as the first argument.
783 parameters : `dict`
784 Additional StorageClass-defined options to control reading,
785 typically used to efficiently read only a subset of the dataset.
786 collections : Any, optional
787 Collections to be searched, overriding ``self.collections``.
788 Can be any of the types supported by the ``collections`` argument
789 to butler construction.
790 kwds
791 Additional keyword arguments used to augment or construct a
792 `DataCoordinate`. See `DataCoordinate.standardize`
793 parameters.
795 Returns
796 -------
797 obj : `object`
798 The dataset.
800 Raises
801 ------
802 ValueError
803 Raised if a resolved `DatasetRef` was passed as an input, but it
804 differs from the one found in the registry.
805 LookupError
806 Raised if no matching dataset exists in the `Registry`.
807 TypeError
808 Raised if no collections were provided.
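 Examples
 --------
 A sketch; the dataset type, data ID values, and collection are
 hypothetical::

     butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
     calexp = butler.get("calexp", instrument="HSC", visit=42, detector=50)
     # Equivalent form with an explicit data ID mapping:
     calexp = butler.get("calexp", {"instrument": "HSC", "visit": 42, "detector": 50})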
809 """
810 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
811 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
812 return self.getDirect(ref, parameters=parameters)
814 def getUri(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
815 dataId: Optional[DataId] = None, *,
816 predict: bool = False,
817 collections: Any = None,
818 run: Optional[str] = None,
819 **kwds: Any) -> str:
820 """Return the URI to the Dataset.
822 Parameters
823 ----------
824 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
825 When `DatasetRef` the `dataId` should be `None`.
826 Otherwise the `DatasetType` or name thereof.
827 dataId : `dict` or `DataCoordinate`
828 A `dict` of `Dimension` link name, value pairs that label the
829 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
830 should be provided as the first argument.
831 predict : `bool`
832 If `True`, allow URIs to be returned of datasets that have not
833 been written.
834 collections : Any, optional
835 Collections to be searched, overriding ``self.collections``.
836 Can be any of the types supported by the ``collections`` argument
837 to butler construction.
838 run : `str`, optional
839 Run to use for predictions, overriding ``self.run``.
840 kwds
841 Additional keyword arguments used to augment or construct a
842 `DataCoordinate`. See `DataCoordinate.standardize`
843 parameters.
845 Returns
846 -------
847 uri : `str`
848 URI string pointing to the Dataset within the datastore. If the
849 Dataset does not exist in the datastore, and if ``predict`` is
850 `True`, the URI will be a prediction and will include a URI
851 fragment "#predicted".
852 If the datastore does not have entities that relate well
853 to the concept of a URI, the returned URI string will be
854 descriptive. The returned URI is not guaranteed to be obtainable.
856 Raises
857 ------
858 LookupError
859 Raised if a URI has been requested for a dataset that does not exist
860 and guessing is not allowed.
861 ValueError
862 Raised if a resolved `DatasetRef` was passed as an input, but it
863 differs from the one found in the registry.
864 TypeError
865 Raised if no collections were provided.
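 Examples
 --------
 A sketch; all names are hypothetical::

     uri = butler.getUri("calexp", instrument="HSC", visit=42, detector=50)
     # Predict where a not-yet-written dataset would land:
     predicted = butler.getUri("calexp", instrument="HSC", visit=43, detector=50,
                               predict=True, run="u/alice/DM-50000/a")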
866 """
867 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
868 collections=collections, **kwds)
869 if ref.id is None: # only possible if predict is True
870 if run is None:
871 run = self.run
872 if run is None:
873 raise TypeError("Cannot predict location with run=None.")
874 # Lie about ID, because we can't guess it, and only
875 # Datastore.getUri() will ever see it (and it doesn't use it).
876 ref = ref.resolved(id=0, run=run)
877 return self.datastore.getUri(ref, predict)
879 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
880 dataId: Optional[DataId] = None, *,
881 collections: Any = None,
882 **kwds: Any) -> bool:
883 """Return True if the Dataset is actually present in the Datastore.
885 Parameters
886 ----------
887 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
888 When `DatasetRef` the `dataId` should be `None`.
889 Otherwise the `DatasetType` or name thereof.
890 dataId : `dict` or `DataCoordinate`
891 A `dict` of `Dimension` link name, value pairs that label the
892 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
893 should be provided as the first argument.
894 collections : Any, optional
895 Collections to be searched, overriding ``self.collections``.
896 Can be any of the types supported by the ``collections`` argument
897 to butler construction.
898 kwds
899 Additional keyword arguments used to augment or construct a
900 `DataCoordinate`. See `DataCoordinate.standardize`
901 parameters.
903 Raises
904 ------
905 LookupError
906 Raised if the dataset is not even present in the Registry.
907 ValueError
908 Raised if a resolved `DatasetRef` was passed as an input, but it
909 differs from the one found in the registry.
910 TypeError
911 Raised if no collections were provided.
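 Examples
 --------
 A sketch; all names are hypothetical::

     if butler.datasetExists("calexp", instrument="HSC", visit=42, detector=50):
         calexp = butler.get("calexp", instrument="HSC", visit=42, detector=50)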
912 """
913 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
914 return self.datastore.exists(ref)
916 def prune(self, refs: Iterable[DatasetRef], *,
917 disassociate: bool = True,
918 unstore: bool = False,
919 tags: Optional[Iterable[str]] = None,
920 purge: bool = False,
921 run: Optional[str] = None,
922 recursive: bool = True):
923 """Remove one or more datasets from a collection and/or storage.
925 Parameters
926 ----------
927 refs : `~collections.abc.Iterable` of `DatasetRef`
928 Datasets to prune. These must be "resolved" references (not just
929 a `DatasetType` and data ID).
930 disassociate : `bool`, optional
931 Disassociate pruned datasets from ``self.tags`` (or the
932 collections given via the ``tags`` argument). Datasets that are
933 not in these collections are ignored, unless ``purge`` is `True`.
934 unstore : `bool`, optional
935 If `True` (`False` is default) remove these datasets from all
936 datastores known to this butler. Note that this will make it
937 impossible to retrieve these datasets even via other collections.
938 Datasets that are already not stored are ignored by this option.
939 tags : `Iterable` [ `str` ], optional
940 `~CollectionType.TAGGED` collections to disassociate the datasets
941 from, overriding ``self.tags``. Ignored if ``disassociate`` is
942 `False` or ``purge`` is `True`.
943 purge : `bool`, optional
944 If `True` (`False` is default), completely remove the dataset from
945 the `Registry`. To prevent accidental deletions, ``purge`` may
946 only be `True` if all of the following conditions are met:
948 - all given datasets are in the given run;
949 - ``disassociate`` is `True`;
950 - ``unstore`` is `True`;
951 - none of the given datasets are components of some other dataset.
953 This mode may remove provenance information from datasets other
954 than those provided, and should be used with extreme care.
955 run : `str`, optional
956 `~CollectionType.RUN` collection to purge from, overriding
957 ``self.run``. Ignored unless ``purge`` is `True`.
958 recursive : `bool`, optional
959 If `True` (default) also prune component datasets of any given
960 composite datasets. This will only prune components that are
961 actually attached to the given `DatasetRef` objects, which may
962 not reflect what is in the database (especially if they were
963 obtained from `Registry.queryDatasets`, which does not include
964 components in its results).
966 Raises
967 ------
968 TypeError
969 Raised if the butler is read-only, if no collection was provided,
970 or the conditions for ``purge=True`` were not met.
971 IOError
972 Raised if an incomplete deletion may have left the repository in an
973 inconsistent state. Only possible if ``unstore=True``, and always
974 accompanied by a chained exception describing the lower-level
975 error.
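 Examples
 --------
 A sketch of deleting everything of one dataset type from a scratch run,
 in both registry and datastore; the names are hypothetical and it is
 assumed that `Registry.queryDatasets` accepts a ``collections``
 argument::

     refs = list(butler.registry.queryDatasets("raw", collections=["u/alice/scratch"]))
     butler.prune(refs, unstore=True, purge=True, run="u/alice/scratch")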
976 """
977 #
978 # TODO: this method can easily leave the repository in an inconsistent
979 # state if unstore=True in the most common configuration, because
980 # PosixDatastore.remove isn't exception safe. Even if we can't make
981 # file deletion and database deletion truly atomic, we should refactor
982 # the Datastore deletion interface to make it possible to only try to
983 # delete files after we've checked for all of errors we can. I imagine
984 # that would look something like
985 #
986 # with self.datastore.removing(refs):
987 # self.registry.<do some deleting>
988 #
989 # where starting the context manager:
990 # - starts a registry transaction;
991 # - checks that the given refs are known to the datastore;
992 # - computes a list of files (or whatever) to delete later;
993 # - does what it can to predict whether it will have trouble deleting
994 # any of them;
995 # - removes DatasetLocation records from registry, freeing up other
996 # Registry records to be removed without violating FK constraints.
997 #
998 # When the context manager ends, it:
999 # - closes the transaction (aborting and doing nothing if that
1000 # raises);
1001 # - attempts to actually delete all of the files, maybe retrying on
1002 # failure, and raising really scary exceptions if it can't.
1003 #
1004 if not self.isWriteable():
1005 raise TypeError("Butler is read-only.")
1006 if purge:
1007 if run is None:
1008 run = self.run
1009 if run is None:
1010 raise TypeError("No run provided but purge=True.")
1011 collectionType = self.registry.getCollectionType(run)
1012 if collectionType is not CollectionType.RUN:
1013 raise TypeError(f"Cannot purge from collection '{run}' "
1014 f"of non-RUN type {collectionType.name}.")
1015 elif disassociate:
1016 if tags is None:
1017 tags = self.tags
1018 else:
1019 tags = tuple(tags)
1020 if not tags:
1021 raise TypeError("No tags provided but disassociate=True.")
1022 for tag in tags:
1023 collectionType = self.registry.getCollectionType(tag)
1024 if collectionType is not CollectionType.TAGGED:
1025 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1026 f"of non-TAGGED type {collectionType.name}.")
1027 if purge:
1028 if not disassociate:
1029 raise TypeError("Cannot pass purge=True without disassociate=True.")
1030 if not unstore:
1031 raise TypeError("Cannot pass purge=True without unstore=True.")
1032 refs = list(refs)
1033 for ref in refs:
1034 # isComponent isn't actually reliable, because the dataset type
1035 # name isn't the whole story, and that's mildly concerning
1036 # because the repo can be corrupted if we try to delete a
1037 # component from the Registry before its parent, but this
1038 # should be good enough for now.
1039 if ref.isComponent():
1040 raise TypeError("Cannot pass purge=True with component DatasetRefs.")
1041 if recursive:
1042 refs = list(DatasetRef.flatten(refs))
1043 with self.transaction():
1044 if purge:
1045 # For now, just do some checks; we can't actually remove
1046 # datasets from registry until their datastore-managed registry
1047 # entries are gone. But we want to catch as many problems as
1048 # we can before the point of no return.
1049 for ref in refs:
1050 if ref.run != run:
1051 raise ValueError(f"Cannot purge '{ref}' because it is not in '{run}'.")
1052 elif disassociate:
1053 # If we're disassociating but not purging, we can do that
1054 # before we try to delete, and it will roll back if deletion
1055 # fails. That will at least do the right thing if deletion
1056 # fails because the files couldn't actually be deleted (e.g.
1057 # due to lack of permissions).
1058 for tag in tags:
1059 # recursive=False here because refs is already recursive
1060 # if we want it to be.
1061 self.registry.disassociate(tag, refs, recursive=False)
1062 if unstore:
1063 try:
1064 for ref in refs:
1065 # There is a difference between a concrete composite
1066 # and virtual composite. In a virtual composite the
1067 # datastore is never given the top level DatasetRef. In
1068 # the concrete composite the datastore knows all the
1069 # refs and will clean up itself if asked to remove the
1070 # parent ref. We can not check configuration for this
1071 # since we can not trust that the configuration is the
1072 # same. We therefore have to ask if the ref exists or
1073 # not. This is consistent with the fact that we want
1074 # to ignore already-removed-from-datastore datasets
1075 # anyway.
1076 if self.datastore.exists(ref):
1077 self.datastore.trash(ref)
1078 if purge:
1079 self.registry.removeDataset(ref)
1081 # Point of no return for removing artifacts
1082 self.datastore.emptyTrash()
1083 except BaseException as err:
1084 raise IOError("WARNING: an incomplete deletion may have put "
1085 f"the repository in a corrupted state: {err}") from err
1087 @transactional
1088 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None, run: Optional[str] = None,
1089 tags: Optional[Iterable[str]] = None,):
1090 """Store and register one or more datasets that already exist on disk.
1092 Parameters
1093 ----------
1094 datasets : `FileDataset`
1095 Each positional argument is a struct containing information about
1096 a file to be ingested, including its path (either absolute or
1097 relative to the datastore root, if applicable), a `DatasetRef`,
1098 and optionally a formatter class or its fully-qualified string
1099 name. If a formatter is not provided, the formatter that would be
1100 used for `put` is assumed. On successful return, all
1101 `FileDataset.ref` attributes will have their `DatasetRef.id`
1102 attribute populated and all `FileDataset.formatter` attributes will
1103 be set to the formatter class used. `FileDataset.path` attributes
1104 may be modified to put paths in whatever the datastore considers a
1105 standardized form.
1106 transfer : `str`, optional
1107 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1108 'relsymlink' or 'symlink', indicating how to transfer the file.
1109 run : `str`, optional
1110 The name of the run ingested datasets should be added to,
1111 overriding ``self.run``.
1112 tags : `Iterable` [ `str` ], optional
1113 The names of `~CollectionType.TAGGED` collections to associate
1114 the dataset with, overriding ``self.tags``. These collections
1115 must have already been added to the `Registry`.
1117 Raises
1118 ------
1119 TypeError
1120 Raised if the butler is read-only or if no run was provided.
1121 NotImplementedError
1122 Raised if the `Datastore` does not support the given transfer mode.
1123 DatasetTypeNotSupportedError
1124 Raised if one or more files to be ingested have a dataset type that
1125 is not supported by the `Datastore`.
1126 FileNotFoundError
1127 Raised if one of the given files does not exist.
1128 FileExistsError
1129 Raised if transfer is not `None` but the (internal) location the
1130 file would be moved to is already occupied.
1132 Notes
1133 -----
1134 This operation is not fully exception safe: if a database operation
1135 fails, the given `FileDataset` instances may be only partially updated.
1137 It is atomic in terms of database operations (they will either all
1138 succeed or all fail) providing the database engine implements
1139 transactions correctly. It will attempt to be atomic in terms of
1140 filesystem operations as well, but this cannot be implemented
1141 rigorously for most datastores.
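 Examples
 --------
 A sketch; the file path, dataset type, and data ID are hypothetical,
 and `FileDataset` is assumed to accept ``path`` and ``refs`` keyword
 arguments matching its attributes::

     datasetType = butler.registry.getDatasetType("raw")
     ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 12345, "detector": 50})
     butler.ingest(FileDataset(path="/data/HSC-12345-50.fits", refs=[ref]),
                   transfer="symlink")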
1142 """
1143 if not self.isWriteable():
1144 raise TypeError("Butler is read-only.")
1145 if run is None:
1146 if self.run is None:
1147 raise TypeError("No run provided.")
1148 run = self.run
1149 # No need to check run type, since insertDatasets will do that
1150 # (safely) for us.
1151 if tags is None:
1152 tags = self.tags
1153 else:
1154 tags = tuple(tags)
1155 for tag in tags:
1156 # Check that these are tagged collections up front, because we want
1157 # to avoid relying on Datastore transactionality to avoid modifying
1158 # the repo if there's an error later.
1159 collectionType = self.registry.getCollectionType(tag)
1160 if collectionType is not CollectionType.TAGGED:
1161 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1162 f"{collectionType.name}.")
1163 # Reorganize the inputs so they're grouped by DatasetType and then
1164 # data ID. We also include a list of DatasetRefs for each FileDataset
1165 # to hold the resolved DatasetRefs returned by the Registry, before
1166 # it's safe to swap them into FileDataset.refs.
1167 # Some type annotation aliases to make that clearer:
1168 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1169 GroupedData = MutableMapping[DatasetType, GroupForType]
1170 # The actual data structure:
1171 groupedData: GroupedData = defaultdict(dict)
1172 # And the nested loop that populates it:
1173 for dataset in datasets:
1174 # This list is intentionally shared across the inner loop, since it's
1175 # associated with `dataset`.
1176 resolvedRefs = []
1177 for ref in dataset.refs:
1178 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1180 # Now we can bulk-insert into Registry for each DatasetType.
1181 allResolvedRefs = []
1182 for datasetType, groupForType in groupedData.items():
1183 refs = self.registry.insertDatasets(datasetType,
1184 dataIds=groupForType.keys(),
1185 run=run,
1186 recursive=True)
1187 # Append those resolved DatasetRefs to the new lists we set up for
1188 # them.
1189 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1190 resolvedRefs.append(ref)
1192 # Go back to the original FileDatasets to replace their refs with the
1193 # new resolved ones, and also build a big list of all refs.
1194 allResolvedRefs = []
1195 for groupForType in groupedData.values():
1196 for dataset, resolvedRefs in groupForType.values():
1197 dataset.refs = resolvedRefs
1198 allResolvedRefs.extend(resolvedRefs)
1200 # Bulk-associate everything with any tagged collections.
1201 for tag in tags:
1202 self.registry.associate(tag, allResolvedRefs)
1204 # Bulk-insert everything into Datastore.
1205 self.datastore.ingest(*datasets, transfer=transfer)
1207 @contextlib.contextmanager
1208 def export(self, *, directory: Optional[str] = None,
1209 filename: Optional[str] = None,
1210 format: Optional[str] = None,
1211 transfer: Optional[str] = None) -> ContextManager[RepoExport]:
1212 """Export datasets from the repository represented by this `Butler`.
1214 This method is a context manager that returns a helper object
1215 (`RepoExport`) that is used to indicate what information from the
1216 repository should be exported.
1218 Parameters
1219 ----------
1220 directory : `str`, optional
1221 Directory dataset files should be written to if ``transfer`` is not
1222 `None`.
1223 filename : `str`, optional
1224 Name for the file that will include database information associated
1225 with the exported datasets. If this is not an absolute path and
1226 ``directory`` is not `None`, it will be written to ``directory``
1227 instead of the current working directory. Defaults to
1228 "export.{format}".
1229 format : `str`, optional
1230 File format for the database information file. If `None`, the
1231 extension of ``filename`` will be used.
1232 transfer : `str`, optional
1233 Transfer mode passed to `Datastore.export`.
1235 Raises
1236 ------
1237 TypeError
1238 Raised if the set of arguments passed is inconsistent.
1240 Examples
1241 --------
1242 Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
1243 methods are used to provide the iterables over data IDs and/or datasets
1244 to be exported::
1246 with butler.export("exports.yaml") as export:
1247 # Export all flats, and the calibration_label dimensions
1248 # associated with them.
1249 export.saveDatasets(butler.registry.queryDatasets("flat"),
1250 elements=[butler.registry.dimensions["calibration_label"]])
1251 # Export all datasets that start with "deepCoadd_" and all of
1252 # their associated data ID information.
1253 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1254 """
1255 if directory is None and transfer is not None:
1256 raise TypeError("Cannot transfer without providing a directory.")
1257 if transfer == "move":
1258 raise TypeError("Transfer may not be 'move': export is read-only")
1259 if format is None:
1260 if filename is None:
1261 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1262 else:
1263 _, format = os.path.splitext(filename)
1264 elif filename is None:
1265 filename = f"export.{format}"
1266 if directory is not None:
1267 filename = os.path.join(directory, filename)
1268 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1269 with open(filename, 'w') as stream:
1270 backend = BackendClass(stream)
1271 try:
1272 helper = RepoExport(self.registry, self.datastore, backend=backend,
1273 directory=directory, transfer=transfer)
1274 yield helper
1275 except BaseException:
1276 raise
1277 else:
1278 helper._finish()
1280 def import_(self, *, directory: Optional[str] = None,
1281 filename: Optional[str] = None,
1282 format: Optional[str] = None,
1283 transfer: Optional[str] = None):
1284 """Import datasets exported from a different butler repository.
1286 Parameters
1287 ----------
1288 directory : `str`, optional
1289 Directory containing dataset files. If `None`, all file paths
1290 must be absolute.
1291 filename : `str`, optional
1292 Name for the file containing database information associated
1293 with the exported datasets. If this is not an absolute path, does
1294 not exist in the current working directory, and ``directory`` is
1295 not `None`, it is assumed to be in ``directory``. Defaults to
1296 "export.{format}".
1297 format : `str`, optional
1298 File format for the database information file. If `None`, the
1299 extension of ``filename`` will be used.
1300 transfer : `str`, optional
1301 Transfer mode used when ingesting the exported files into the datastore.
1303 Raises
1304 ------
1305 TypeError
1306 Raised if the set of arguments passed is inconsistent, or if the
1307 butler is read-only.
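 Examples
 --------
 A sketch of loading an export file written by `Butler.export`; the
 paths are hypothetical::

     butler.import_(directory="/path/to/exported/files", filename="exports.yaml",
                    transfer="copy")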
1308 """
1309 if not self.isWriteable():
1310 raise TypeError("Butler is read-only.")
1311 if format is None:
1312 if filename is None:
1313 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1314 else:
1315 _, format = os.path.splitext(filename)
1316 elif filename is None:
1317 filename = f"export.{format}"
1318 if directory is not None and not os.path.exists(filename):
1319 filename = os.path.join(directory, filename)
1320 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1321 with open(filename, 'r') as stream:
1322 backend = BackendClass(stream, self.registry)
1323 backend.register()
1324 with self.transaction():
1325 backend.load(self.datastore, directory=directory, transfer=transfer)
1327 def validateConfiguration(self, logFailures: bool = False,
1328 datasetTypeNames: Optional[Iterable[str]] = None,
1329 ignore: Optional[Iterable[str]] = None):
1330 """Validate butler configuration.
1332 Checks that each `DatasetType` can be stored in the `Datastore`.
1334 Parameters
1335 ----------
1336 logFailures : `bool`, optional
1337 If `True`, output a log message for every validation error
1338 detected.
1339 datasetTypeNames : iterable of `str`, optional
1340 The `DatasetType` names that should be checked. This allows
1341 only a subset to be selected.
1342 ignore : iterable of `str`, optional
1343 Names of DatasetTypes to skip over. This can be used to skip
1344 known problems. If a named `DatasetType` corresponds to a
1345 composite, all components of that `DatasetType` will also be
1346 ignored.
1348 Raises
1349 ------
1350 ButlerValidationError
1351 Raised if there is some inconsistency with how this Butler
1352 is configured.
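 Examples
 --------
 A sketch; the ignored dataset type name is hypothetical::

     # Raises ButlerValidationError if any checked DatasetType cannot be
     # stored by the configured Datastore.
     butler.validateConfiguration(logFailures=True, ignore=["packages"])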
1353 """
1354 if datasetTypeNames:
1355 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1356 else:
1357 entities = list(self.registry.queryDatasetTypes())
1359 # filter out anything from the ignore list
1360 if ignore:
1361 ignore = set(ignore)
1362 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1363 else:
1364 ignore = set()
1366 # Find all the registered instruments
1367 instruments = set(
1368 dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
1369 )
1371 # For each datasetType that has an instrument dimension, create
1372 # a DatasetRef for each defined instrument
1373 datasetRefs = []
1375 for datasetType in entities:
1376 if "instrument" in datasetType.dimensions:
1377 for instrument in instruments:
1378 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1379 datasetRefs.append(datasetRef)
1381 entities.extend(datasetRefs)
1383 datastoreErrorStr = None
1384 try:
1385 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1386 except ValidationError as e:
1387 datastoreErrorStr = str(e)
1389 # Also check that the LookupKeys used by the datastores match
1390 # registry and storage class definitions
1391 keys = self.datastore.getLookupKeys()
1393 failedNames = set()
1394 failedDataId = set()
1395 for key in keys:
1396 datasetType = None
1397 if key.name is not None:
1398 if key.name in ignore:
1399 continue
1401 # skip if specific datasetType names were requested and this
1402 # name does not match
1403 if datasetTypeNames and key.name not in datasetTypeNames:
1404 continue
1406 # See if it is a StorageClass or a DatasetType
1407 if key.name in self.storageClasses:
1408 pass
1409 else:
1410 try:
1411 self.registry.getDatasetType(key.name)
1412 except KeyError:
1413 if logFailures:
1414 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1415 failedNames.add(key)
1416 else:
1417 # Dimensions are checked for consistency when the Butler
1418 # is created and rendezvoused with a universe.
1419 pass
1421 # Check that the instrument is a valid instrument
1422 # Currently only support instrument so check for that
1423 if key.dataId:
1424 dataIdKeys = set(key.dataId)
1425 if set(["instrument"]) != dataIdKeys:
1426 if logFailures:
1427 log.fatal("Key '%s' has unsupported DataId override", key)
1428 failedDataId.add(key)
1429 elif key.dataId["instrument"] not in instruments:
1430 if logFailures:
1431 log.fatal("Key '%s' has unknown instrument", key)
1432 failedDataId.add(key)
1434 messages = []
1436 if datastoreErrorStr:
1437 messages.append(datastoreErrorStr)
1439 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1440 (failedDataId, "Keys with bad DataId entries: ")):
1441 if failed:
1442 msg += ", ".join(str(k) for k in failed)
1443 messages.append(msg)
1445 if messages:
1446 raise ValidationError(";\n".join(messages))
1448 registry: Registry
1449 """The object that manages dataset metadata and relationships (`Registry`).
1451 Most operations that don't involve reading or writing butler datasets are
1452 accessible only via `Registry` methods.
1453 """
1455 datastore: Datastore
1456 """The object that manages actual dataset storage (`Datastore`).
1458 Direct user access to the datastore should rarely be necessary; the primary
1459 exception is the case where a `Datastore` implementation provides extra
1460 functionality beyond what the base class defines.
1461 """
1463 storageClasses: StorageClassFactory
1464 """An object that maps known storage class names to objects that fully
1465 describe them (`StorageClassFactory`).
1466 """
1468 collections: Optional[CollectionSearch]
1469 """The collections to search and any restrictions on the dataset types to
1470 search for within them, in order (`CollectionSearch`).
1471 """
1473 run: Optional[str]
1474 """Name of the run this butler writes outputs to (`str` or `None`).
1475 """
1477 tags: Tuple[str, ...]
1478 """Names of `~CollectionType.TAGGED` collections this butler associates
1479 with in `put` and `ingest`, and disassociates from in `prune` (`tuple` of
1480 `str`).
1481 """