1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 TextIO,
44 Tuple,
45 Union,
46)
48try:
49 import boto3
50except ImportError:
51 boto3 = None
53from lsst.utils import doImport
54from .core import (
55 ButlerURI,
56 Config,
57 ConfigSubset,
58 DataCoordinate,
59 DataId,
60 DatasetRef,
61 DatasetType,
62 Datastore,
63 FileDataset,
64 RepoExport,
65 StorageClassFactory,
66 ValidationError,
67)
68from .core.repoRelocation import BUTLER_ROOT_TAG
69from .core.utils import transactional, getClassOf
70from ._deferredDatasetHandle import DeferredDatasetHandle
71from ._butlerConfig import ButlerConfig
72from .registry import Registry, RegistryConfig, CollectionType
73from .registry.wildcards import CollectionSearch
75log = logging.getLogger(__name__)
78class ButlerValidationError(ValidationError):
79 """There is a problem with the Butler configuration."""
80 pass
83class Butler:
84 """Main entry point for the data access system.
86 Parameters
87 ----------
88 config : `ButlerConfig`, `Config` or `str`, optional
89 Configuration. Anything acceptable to the
90 `ButlerConfig` constructor. If a directory path
91 is given the configuration will be read from a ``butler.yaml`` file in
92 that location. If `None` is given, default values will be used.
93 butler : `Butler`, optional
94 If provided, construct a new Butler that uses the same registry and
95 datastore as the given one, but with the given collection and run.
96 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
97 arguments.
98 collections : `Any`, optional
99 An expression specifying the collections to be searched (in order) when
100 reading datasets, and optionally dataset type restrictions on them.
101 This may be:
102 - a `str` collection name;
103 - a tuple of (collection name, *dataset type restriction*);
104 - an iterable of either of the above;
105 - a mapping from `str` to *dataset type restriction*.
107 See :ref:`daf_butler_collection_expressions` for more information,
108 including the definition of a *dataset type restriction*. All
109 collections must either already exist or be specified to be created
110 by other arguments.
111 run : `str`, optional
112 Name of the run datasets should be output to. If the run
113 does not exist, it will be created. If ``collections`` is `None`, it
114 will be set to ``[run]``. If this is not set (and ``writeable`` is
115 not set either), a read-only butler will be created.
116 tags : `Iterable` [ `str` ], optional
117 A list of `~CollectionType.TAGGED` collections that datasets should be
118 associated with in `put` or `ingest` and disassociated from in
119 `pruneDatasets`. If any of these collections does not exist, it will
120 be created.
121 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
122 A mapping from the names of new `~CollectionType.CHAINED` collections
123 to an expression identifying their child collections (which takes the
124 same form as the ``collections`` argument). Chains may be nested only
125 if children precede their parents in this mapping.
126 searchPaths : `list` of `str`, optional
127 Directory paths to search when calculating the full Butler
128 configuration. Not used if the supplied config is already a
129 `ButlerConfig`.
130 writeable : `bool`, optional
131 Explicitly sets whether the butler supports write operations. If not
132 provided, a read-write butler is created if any of ``run``, ``tags``,
133 or ``chains`` is non-empty.
135 Examples
136 --------
137 While there are many ways to control exactly how a `Butler` interacts with
138 the collections in its `Registry`, the most common cases are still simple.
140 For a read-only `Butler` that searches one collection, do::
142 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
144 For a read-write `Butler` that writes to and reads from a
145 `~CollectionType.RUN` collection::
147 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
149 The `Butler` passed to a ``PipelineTask`` is often much more complex,
150 because we want to write to one `~CollectionType.RUN` collection but read
151 from several others (as well), while defining a new
152 `~CollectionType.CHAINED` collection that combines them all::
154 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
155 collections=["u/alice/DM-50000"],
156 chains={
157 "u/alice/DM-50000": ["u/alice/DM-50000/a",
158 "u/bob/DM-49998",
159 "raw/hsc"]
160 })
162 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
163 they'll also be available from the chained collection ``u/alice/DM-50000``.
164 Datasets will be read first from that run (since it appears first in the
165 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
166 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
167 would be unnecessary. We could also construct a butler that performs
168 exactly the same `put` and `get` operations without actually creating a
169 chained collection, just by passing multiple items in ``collections``::
171 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
172 collections=["u/alice/DM-50000/a",
173 "u/bob/DM-49998",
174 "raw/hsc"])
176 Finally, one can always create a `Butler` with no collections::
178 butler = Butler("/path/to/repo", writeable=True)
180 This can be extremely useful when you just want to use ``butler.registry``,
181 e.g. for inserting dimension data or managing collections, or when the
182 collections you want to use with the butler are not consistent.
183 Passing ``writeable`` explicitly here is only necessary if you want to be
184 able to make changes to the repo - usually the value for ``writeable``
185 can be guessed from the collection arguments provided, but it defaults to
186 `False` when there are no collection arguments.
187 """
188 def __init__(self, config: Union[Config, str, None] = None, *,
189 butler: Optional[Butler] = None,
190 collections: Any = None,
191 run: Optional[str] = None,
192 tags: Iterable[str] = (),
193 chains: Optional[Mapping[str, Any]] = None,
194 searchPaths: Optional[List[str]] = None,
195 writeable: Optional[bool] = None):
196 # Transform any single-pass iterator into an actual sequence so we
197 # can see if it's empty.
198 self.tags = tuple(tags)
199 # Load registry, datastore, etc. from config or existing butler.
200 if butler is not None:
201 if config is not None or searchPaths is not None or writeable is not None:
202 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
203 "arguments with 'butler' argument.")
204 self.registry = butler.registry
205 self.datastore = butler.datastore
206 self.storageClasses = butler.storageClasses
207 self._config = butler._config
208 else:
209 self._config = ButlerConfig(config, searchPaths=searchPaths)
210 if "root" in self._config:
211 butlerRoot = self._config["root"]
212 else:
213 butlerRoot = self._config.configDir
214 if writeable is None:
215 writeable = run is not None or chains is not None or self.tags
216 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
217 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
218 butlerRoot=butlerRoot)
219 self.storageClasses = StorageClassFactory()
220 self.storageClasses.addFromConfig(self._config)
221 # Check the many collection arguments for consistency and create any
222 # needed collections that don't exist.
223 if collections is None:
224 if run is not None:
225 collections = (run,)
226 else:
227 collections = ()
228 self.collections = CollectionSearch.fromExpression(collections)
229 if chains is None:
230 chains = {}
231 self.run = run
232 if "run" in self._config or "collection" in self._config:
233 raise ValueError("Passing a run or collection via configuration is no longer supported.")
234 if self.run is not None:
235 self.registry.registerCollection(self.run, type=CollectionType.RUN)
236 for tag in self.tags:
237 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
238 for parent, children in chains.items():
239 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
240 self.registry.setCollectionChain(parent, children)
242 GENERATION: ClassVar[int] = 3
243 """This is a Generation 3 Butler.
245 This attribute may be removed in the future, once the Generation 2 Butler
246 interface has been fully retired; it should only be used in transitional
247 code.
248 """
250 @staticmethod
251 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
252 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
253 forceConfigRoot: bool = True, outfile: Optional[str] = None,
254 overwrite: bool = False) -> Config:
255 """Create an empty data repository by adding a butler.yaml config
256 to a repository root directory.
258 Parameters
259 ----------
260 root : `str` or `ButlerURI`
261 Path or URI to the root location of the new repository. Will be
262 created if it does not exist.
263 config : `Config` or `str`, optional
264 Configuration to write to the repository, after setting any
265 root-dependent Registry or Datastore config options. Can not
266 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
267 configuration will be used. Root-dependent config options
268 specified in this config are overwritten if ``forceConfigRoot``
269 is `True`.
270 standalone : `bool`
271 If `True`, write all expanded defaults, not just customized or
272 repository-specific settings.
273 This (mostly) decouples the repository from the default
274 configuration, insulating it from changes to the defaults (which
275 may be good or bad, depending on the nature of the changes).
276 Future *additions* to the defaults will still be picked up when
277 initializing `Butler` instances for repos created with ``standalone=True``.
278 createRegistry : `bool`, optional
279 If `True` create a new Registry.
280 searchPaths : `list` of `str`, optional
281 Directory paths to search when calculating the full butler
282 configuration.
283 forceConfigRoot : `bool`, optional
284 If `False`, any values present in the supplied ``config`` that
285 would normally be reset are not overridden and will appear
286 directly in the output config. This allows non-standard overrides
287 of the root directory for a datastore or registry to be given.
288 If this parameter is `True` the values for ``root`` will be
289 forced into the resulting config if appropriate.
290 outfile : `str`, optional
291 If not `None`, the output configuration will be written to this
292 location rather than into the repository itself. Can be a URI
293 string. Can refer to a directory that will be used to write
294 ``butler.yaml``.
295 overwrite : `bool`, optional
296 Create a new configuration file even if one already exists
297 in the specified output location. Default is to raise
298 an exception.
300 Returns
301 -------
302 config : `Config`
303 The updated `Config` instance written to the repo.
305 Raises
306 ------
307 ValueError
308 Raised if a `ButlerConfig` or `ConfigSubset` is passed instead of a
309 regular `Config` (as these subclasses would make it impossible to
310 support ``standalone=False``).
311 FileExistsError
312 Raised if the output config file already exists.
313 os.error
314 Raised if the directory does not exist, exists but is not a
315 directory, or cannot be created.
317 Notes
318 -----
319 Note that when ``standalone=False`` (the default), the configuration
320 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
321 construct the repository should also be used to construct any Butlers
322 to avoid configuration inconsistencies.
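Examples
--------
A minimal illustrative sketch; the repository path is a placeholder::

    config = Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)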
323 """
324 if isinstance(config, (ButlerConfig, ConfigSubset)):
325 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
327 # Ensure that the root of the repository exists or can be made
328 uri = ButlerURI(root, forceDirectory=True)
329 uri.mkdir()
331 config = Config(config)
333 # If we are creating a new repo from scratch with relative roots,
334 # do not propagate an explicit root from the config file
335 if "root" in config:
336 del config["root"]
338 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
339 datastoreClass = doImport(full["datastore", "cls"])
340 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
342 # if key exists in given config, parse it, otherwise parse the defaults
343 # in the expanded config
344 if config.get(("registry", "db")):
345 registryConfig = RegistryConfig(config)
346 else:
347 registryConfig = RegistryConfig(full)
348 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
349 if defaultDatabaseUri is not None:
350 Config.updateParameters(RegistryConfig, config, full,
351 toUpdate={"db": defaultDatabaseUri},
352 overwrite=forceConfigRoot)
353 else:
354 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
355 overwrite=forceConfigRoot)
357 if standalone:
358 config.merge(full)
359 if outfile is not None:
360 # When writing to a separate location we must include
361 # the root of the butler repo in the config else it won't know
362 # where to look.
363 config["root"] = uri.geturl()
364 configURI = outfile
365 else:
366 configURI = uri
367 config.dumpToUri(configURI, overwrite=overwrite)
369 # Create Registry and populate tables
370 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
371 return config
373 @classmethod
374 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
375 tags: Tuple[str, ...], writeable: bool) -> Butler:
376 """Callable used to unpickle a Butler.
378 We prefer not to use ``Butler.__init__`` directly so we can force some
379 of its many arguments to be keyword-only (note that ``__reduce__``
380 can only invoke callables with positional arguments).
382 Parameters
383 ----------
384 config : `ButlerConfig`
385 Butler configuration, already coerced into a true `ButlerConfig`
386 instance (and hence after any search paths for overrides have been
387 utilized).
388 collections : `CollectionSearch`
389 Names of collections to read from.
390 run : `str`, optional
391 Name of `~CollectionType.RUN` collection to write to.
392 tags : `tuple` [`str`]
393 Names of `~CollectionType.TAGGED` collections to associate with.
394 writeable : `bool`
395 Whether the Butler should support write operations.
397 Returns
398 -------
399 butler : `Butler`
400 A new `Butler` instance.
401 """
402 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
404 def __reduce__(self):
405 """Support pickling.
406 """
407 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
408 self.registry.isWriteable()))
410 def __str__(self):
411 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
412 self.collections, self.run, self.tags, self.datastore, self.registry)
414 def isWriteable(self) -> bool:
415 """Return `True` if this `Butler` supports write operations.
416 """
417 return self.registry.isWriteable()
419 @contextlib.contextmanager
420 def transaction(self):
421 """Context manager supporting `Butler` transactions.
423 Transactions can be nested.
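A short illustrative sketch (the dataset type names and data ID values
are placeholders); both writes are committed together or not at all::

    with butler.transaction():
        butler.put(catalog, "src", visit=42, instrument="HSC")
        butler.put(summary, "srcSummary", visit=42, instrument="HSC")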
424 """
425 with self.registry.transaction():
426 with self.datastore.transaction():
427 yield
429 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
430 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
431 """Standardize the arguments passed to several Butler APIs.
433 Parameters
434 ----------
435 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
436 When `DatasetRef` the `dataId` should be `None`.
437 Otherwise the `DatasetType` or name thereof.
438 dataId : `dict` or `DataCoordinate`
439 A `dict` of `Dimension` link name, value pairs that label the
440 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
441 should be provided as the first argument.
442 kwds
443 Additional keyword arguments used to augment or construct a
444 `DataCoordinate`. See `DataCoordinate.standardize`
445 parameters.
447 Returns
448 -------
449 datasetType : `DatasetType`
450 A `DatasetType` instance extracted from ``datasetRefOrType``.
451 dataId : `dict` or `DataId`, optional
452 Argument that can be used (along with ``kwds``) to construct a
453 `DataId`.
455 Notes
456 -----
457 Butler APIs that conceptually need a DatasetRef also allow passing a
458 `DatasetType` (or the name of one) and a `DataId` (or a dict and
459 keyword arguments that can be used to construct one) separately. This
460 method accepts those arguments and always returns a true `DatasetType`
461 and a `DataId` or `dict`.
463 Standardization of `dict` vs `DataId` is best handled by passing the
464 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
465 generally similarly flexible.
466 """
467 externalDatasetType = None
468 internalDatasetType = None
469 if isinstance(datasetRefOrType, DatasetRef):
470 if dataId is not None or kwds:
471 raise ValueError("DatasetRef given, cannot use dataId as well")
472 externalDatasetType = datasetRefOrType.datasetType
473 dataId = datasetRefOrType.dataId
474 else:
475 # Don't check whether DataId is provided, because Registry APIs
476 # can usually construct a better error message when it wasn't.
477 if isinstance(datasetRefOrType, DatasetType):
478 externalDatasetType = datasetRefOrType
479 else:
480 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
482 # Check that they are self-consistent
483 if externalDatasetType is not None:
484 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
485 if externalDatasetType != internalDatasetType:
486 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
487 f"registry definition ({internalDatasetType})")
489 return internalDatasetType, dataId
491 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
492 dataId: Optional[DataId] = None, *,
493 collections: Any = None,
494 allowUnresolved: bool = False,
495 **kwds: Any) -> DatasetRef:
496 """Shared logic for methods that start with a search for a dataset in
497 the registry.
499 Parameters
500 ----------
501 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
502 When `DatasetRef` the `dataId` should be `None`.
503 Otherwise the `DatasetType` or name thereof.
504 dataId : `dict` or `DataCoordinate`, optional
505 A `dict` of `Dimension` link name, value pairs that label the
506 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
507 should be provided as the first argument.
508 collections : Any, optional
509 Collections to be searched, overriding ``self.collections``.
510 Can be any of the types supported by the ``collections`` argument
511 to butler construction.
512 allowUnresolved : `bool`, optional
513 If `True`, return an unresolved `DatasetRef` if finding a resolved
514 one in the `Registry` fails. Defaults to `False`.
515 kwds
516 Additional keyword arguments used to augment or construct a
517 `DataId`. See `DataId` parameters.
519 Returns
520 -------
521 ref : `DatasetRef`
522 A reference to the dataset identified by the given arguments.
524 Raises
525 ------
526 LookupError
527 Raised if no matching dataset exists in the `Registry` (and
528 ``allowUnresolved`` is `False`).
529 ValueError
530 Raised if a resolved `DatasetRef` was passed as an input, but it
531 differs from the one found in the registry.
532 TypeError
533 Raised if no collections were provided.
534 """
535 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
536 if isinstance(datasetRefOrType, DatasetRef):
537 idNumber = datasetRefOrType.id
538 else:
539 idNumber = None
540 # Standardize the data ID first instead of letting registry.findDataset
541 # do it, so we get the result even if no dataset is found.
542 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, **kwds)
543 if collections is None:
544 collections = self.collections
545 if not collections:
546 raise TypeError("No input collections provided.")
547 else:
548 collections = CollectionSearch.fromExpression(collections)
549 # Always look up the DatasetRef, even if one is given, to ensure it is
550 # present in the current collection.
551 ref = self.registry.findDataset(datasetType, dataId, collections=collections)
552 if ref is None:
553 if allowUnresolved:
554 return DatasetRef(datasetType, dataId)
555 else:
556 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
557 f"could not be found in collections {collections}.")
558 if idNumber is not None and idNumber != ref.id:
559 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
560 f"id ({ref.id}) in registry in collections {collections}.")
561 return ref
563 @transactional
564 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
565 dataId: Optional[DataId] = None, *,
566 run: Optional[str] = None,
567 tags: Optional[Iterable[str]] = None,
568 **kwds: Any) -> DatasetRef:
569 """Store and register a dataset.
571 Parameters
572 ----------
573 obj : `object`
574 The dataset.
575 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
576 When `DatasetRef` is provided, ``dataId`` should be `None`.
577 Otherwise the `DatasetType` or name thereof.
578 dataId : `dict` or `DataCoordinate`
579 A `dict` of `Dimension` link name, value pairs that label the
580 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
581 should be provided as the second argument.
582 run : `str`, optional
583 The name of the run the dataset should be added to, overriding
584 ``self.run``.
585 tags : `Iterable` [ `str` ], optional
586 The names of `~CollectionType.TAGGED` collections to associate
587 the dataset with, overriding ``self.tags``. These collections
588 must have already been added to the `Registry`.
589 kwds
590 Additional keyword arguments used to augment or construct a
591 `DataCoordinate`. See `DataCoordinate.standardize`
592 parameters.
594 Returns
595 -------
596 ref : `DatasetRef`
597 A reference to the stored dataset, updated with the correct id if
598 given.
600 Raises
601 ------
602 TypeError
603 Raised if the butler is read-only or if no run has been provided.
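Examples
--------
A minimal usage sketch; the repository path, run name, dataset type name,
and data ID values are illustrative placeholders::

    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(catalog, "src", visit=42, instrument="HSC")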
604 """
605 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
606 if not self.isWriteable():
607 raise TypeError("Butler is read-only.")
608 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
609 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
610 raise ValueError("DatasetRef must not be in registry, must have None id")
612 if run is None:
613 if self.run is None:
614 raise TypeError("No run provided.")
615 run = self.run
616 # No need to check type for run; first thing we do is
617 # insertDatasets, and that will check for us.
619 if tags is None:
620 tags = self.tags
621 else:
622 tags = tuple(tags)
623 for tag in tags:
624 # Check that these are tagged collections up front, because we want
625 # to avoid relying on Datastore transactionality to avoid modifying
626 # the repo if there's an error later.
627 collectionType = self.registry.getCollectionType(tag)
628 if collectionType is not CollectionType.TAGGED:
629 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
630 f"{collectionType.name}.")
632 # Add Registry Dataset entry.
633 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
634 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
636 # Add Datastore entry.
637 self.datastore.put(obj, ref)
639 for tag in tags:
640 self.registry.associate(tag, [ref])
642 return ref
644 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
645 """Retrieve a stored dataset.
647 Unlike `Butler.get`, this method allows datasets outside the Butler's
648 collection to be read as long as the `DatasetRef` that identifies them
649 can be obtained separately.
651 Parameters
652 ----------
653 ref : `DatasetRef`
654 Reference to an already stored dataset.
655 parameters : `dict`
656 Additional StorageClass-defined options to control reading,
657 typically used to efficiently read only a subset of the dataset.
659 Returns
660 -------
661 obj : `object`
662 The dataset.
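Examples
--------
An illustrative sketch; the dataset type and collection names are
placeholders::

    query = butler.registry.queryDatasets("src", collections="u/bob/DM-49998")
    for ref in query:
        catalog = butler.getDirect(ref)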
663 """
664 return self.datastore.get(ref, parameters=parameters)
666 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
667 dataId: Optional[DataId] = None, *,
668 parameters: Union[dict, None] = None,
669 collections: Any = None,
670 **kwds: Any) -> DeferredDatasetHandle:
671 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
673 Parameters
674 ----------
675 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
676 When `DatasetRef` the `dataId` should be `None`.
677 Otherwise the `DatasetType` or name thereof.
678 dataId : `dict` or `DataCoordinate`, optional
679 A `dict` of `Dimension` link name, value pairs that label the
680 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
681 should be provided as the first argument.
682 parameters : `dict`
683 Additional StorageClass-defined options to control reading,
684 typically used to efficiently read only a subset of the dataset.
685 collections : Any, optional
686 Collections to be searched, overriding ``self.collections``.
687 Can be any of the types supported by the ``collections`` argument
688 to butler construction.
689 kwds
690 Additional keyword arguments used to augment or construct a
691 `DataId`. See `DataId` parameters.
693 Returns
694 -------
695 obj : `DeferredDatasetHandle`
696 A handle which can be used to retrieve a dataset at a later time.
698 Raises
699 ------
700 LookupError
701 Raised if no matching dataset exists in the `Registry`.
703 ValueError
704 Raised if a resolved `DatasetRef` was passed as an input, but it
705 differs from the one found in the registry.
706 TypeError
707 Raised if no collections were provided.
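Examples
--------
An illustrative sketch; the dataset type name and data ID values are
placeholders::

    handle = butler.getDeferred("src", visit=42, instrument="HSC")
    # No I/O happens until the handle is actually read.
    catalog = handle.get()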
708 """
709 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
710 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
712 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
713 dataId: Optional[DataId] = None, *,
714 parameters: Optional[Dict[str, Any]] = None,
715 collections: Any = None,
716 **kwds: Any) -> Any:
717 """Retrieve a stored dataset.
719 Parameters
720 ----------
721 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
722 When `DatasetRef` the `dataId` should be `None`.
723 Otherwise the `DatasetType` or name thereof.
724 dataId : `dict` or `DataCoordinate`
725 A `dict` of `Dimension` link name, value pairs that label the
726 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
727 should be provided as the first argument.
728 parameters : `dict`
729 Additional StorageClass-defined options to control reading,
730 typically used to efficiently read only a subset of the dataset.
731 collections : Any, optional
732 Collections to be searched, overriding ``self.collections``.
733 Can be any of the types supported by the ``collections`` argument
734 to butler construction.
735 kwds
736 Additional keyword arguments used to augment or construct a
737 `DataCoordinate`. See `DataCoordinate.standardize`
738 parameters.
740 Returns
741 -------
742 obj : `object`
743 The dataset.
745 Raises
746 ------
747 ValueError
748 Raised if a resolved `DatasetRef` was passed as an input, but it
749 differs from the one found in the registry.
750 LookupError
751 Raised if no matching dataset exists in the `Registry`.
752 TypeError
753 Raised if no collections were provided.
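Examples
--------
A minimal usage sketch; the repository path, collection name, dataset type
name, and data ID values are illustrative placeholders::

    butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
    catalog = butler.get("src", visit=42, instrument="HSC")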
754 """
755 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
756 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
757 return self.getDirect(ref, parameters=parameters)
759 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
760 dataId: Optional[DataId] = None, *,
761 predict: bool = False,
762 collections: Any = None,
763 run: Optional[str] = None,
764 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
765 """Returns the URIs associated with the dataset.
767 Parameters
768 ----------
769 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
770 When `DatasetRef` the `dataId` should be `None`.
771 Otherwise the `DatasetType` or name thereof.
772 dataId : `dict` or `DataCoordinate`
773 A `dict` of `Dimension` link name, value pairs that label the
774 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
775 should be provided as the first argument.
776 predict : `bool`
777 If `True`, allow URIs to be returned for datasets that have not
778 yet been written.
779 collections : Any, optional
780 Collections to be searched, overriding ``self.collections``.
781 Can be any of the types supported by the ``collections`` argument
782 to butler construction.
783 run : `str`, optional
784 Run to use for predictions, overriding ``self.run``.
785 kwds
786 Additional keyword arguments used to augment or construct a
787 `DataCoordinate`. See `DataCoordinate.standardize`
788 parameters.
790 Returns
791 -------
792 primary : `ButlerURI`
793 The URI to the primary artifact associated with this dataset.
794 If the dataset was disassembled within the datastore this
795 may be `None`.
796 components : `dict`
797 URIs to any components associated with the dataset artifact.
798 Can be empty if there are no components.
799 """
800 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
801 collections=collections, **kwds)
802 if ref.id is None: # only possible if predict is True
803 if run is None:
804 run = self.run
805 if run is None:
806 raise TypeError("Cannot predict location with run=None.")
807 # Lie about ID, because we can't guess it, and only
808 # Datastore.getURIs() will ever see it (and it doesn't use it).
809 ref = ref.resolved(id=0, run=run)
810 return self.datastore.getURIs(ref, predict)
812 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
813 dataId: Optional[DataId] = None, *,
814 predict: bool = False,
815 collections: Any = None,
816 run: Optional[str] = None,
817 **kwds: Any) -> ButlerURI:
818 """Return the URI to the Dataset.
820 Parameters
821 ----------
822 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
823 When `DatasetRef` the `dataId` should be `None`.
824 Otherwise the `DatasetType` or name thereof.
825 dataId : `dict` or `DataCoordinate`
826 A `dict` of `Dimension` link name, value pairs that label the
827 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
828 should be provided as the first argument.
829 predict : `bool`
830 If `True`, allow URIs to be returned for datasets that have not
831 yet been written.
832 collections : Any, optional
833 Collections to be searched, overriding ``self.collections``.
834 Can be any of the types supported by the ``collections`` argument
835 to butler construction.
836 run : `str`, optional
837 Run to use for predictions, overriding ``self.run``.
838 kwds
839 Additional keyword arguments used to augment or construct a
840 `DataCoordinate`. See `DataCoordinate.standardize`
841 parameters.
843 Returns
844 -------
845 uri : `ButlerURI`
846 URI pointing to the Dataset within the datastore. If the
847 Dataset does not exist in the datastore, and if ``predict`` is
848 `True`, the URI will be a prediction and will include a URI
849 fragment "#predicted".
850 If the datastore does not have entities that relate well
851 to the concept of a URI the returned URI string will be
852 descriptive. The returned URI is not guaranteed to be obtainable.
854 Raises
855 ------
856 LookupError
857 Raised if a URI has been requested for a dataset that does not
858 exist and guessing is not allowed.
859 ValueError
860 Raised if a resolved `DatasetRef` was passed as an input, but it
861 differs from the one found in the registry.
862 TypeError
863 Raised if no collections were provided.
864 RuntimeError
865 Raised if a URI is requested for a dataset that consists of
866 multiple artifacts.
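Examples
--------
An illustrative sketch; the dataset type name and data ID values are
placeholders::

    uri = butler.getURI("src", visit=42, instrument="HSC")
    print(uri.geturl())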
867 """
868 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
869 collections=collections, run=run, **kwds)
871 if primary is None or components:
872 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
873 "Use Butler.getURIs() instead.")
874 return primary
876 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
877 dataId: Optional[DataId] = None, *,
878 collections: Any = None,
879 **kwds: Any) -> bool:
880 """Return True if the Dataset is actually present in the Datastore.
882 Parameters
883 ----------
884 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
885 When `DatasetRef` the `dataId` should be `None`.
886 Otherwise the `DatasetType` or name thereof.
887 dataId : `dict` or `DataCoordinate`
888 A `dict` of `Dimension` link name, value pairs that label the
889 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
890 should be provided as the first argument.
891 collections : Any, optional
892 Collections to be searched, overriding ``self.collections``.
893 Can be any of the types supported by the ``collections`` argument
894 to butler construction.
895 kwds
896 Additional keyword arguments used to augment or construct a
897 `DataCoordinate`. See `DataCoordinate.standardize`
898 parameters.
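Returns
-------
exists : `bool`
    `True` if the dataset is present in the datastore, `False` otherwise.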
900 Raises
901 ------
902 LookupError
903 Raised if the dataset is not even present in the Registry.
904 ValueError
905 Raised if a resolved `DatasetRef` was passed as an input, but it
906 differs from the one found in the registry.
907 TypeError
908 Raised if no collections were provided.
909 """
910 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
911 return self.datastore.exists(ref)
913 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
914 """Remove a collection and possibly prune datasets within it.
916 Parameters
917 ----------
918 name : `str`
919 Name of the collection to remove. If this is a
920 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
921 datasets within the collection are not modified unless ``unstore``
922 is `True`. If this is a `~CollectionType.RUN` collection,
923 ``purge`` and ``unstore`` must be `True`, and all datasets in it
924 are fully removed from the data repository.
925 purge : `bool`, optional
926 If `True`, permit `~CollectionType.RUN` collections to be removed,
927 fully removing datasets within them. Requires ``unstore=True`` as
928 well, as an added precaution against accidental deletion. Must be
929 `False` (default) if the collection is not a ``RUN``.
930 unstore : `bool`, optional
931 If `True`, remove all datasets in the collection from all
932 datastores in which they appear.
934 Raises
935 ------
936 TypeError
937 Raised if the butler is read-only or arguments are mutually
938 inconsistent.
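Examples
--------
An illustrative sketch; the collection name is a placeholder::

    # Fully remove a RUN collection, including the stored artifacts of
    # all datasets within it.
    butler.pruneCollection("u/alice/DM-50000/a", purge=True, unstore=True)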
939 """
940 # See pruneDatasets comments for more information about the logic here;
941 # the cases are almost the same, but here we can rely on Registry to
942 # take care of everything but Datastore deletion when we remove the
943 # collection.
944 if not self.isWriteable():
945 raise TypeError("Butler is read-only.")
946 if purge and not unstore:
947 raise TypeError("Cannot pass purge=True without unstore=True.")
948 collectionType = self.registry.getCollectionType(name)
949 if collectionType is CollectionType.RUN and not purge:
950 raise TypeError(f"Cannot prune RUN collection {name} without purge=True.")
951 if collectionType is not CollectionType.RUN and purge:
952 raise TypeError(f"Cannot prune {collectionType.name} collection {name} with purge=True.")
953 with self.registry.transaction():
954 if unstore:
955 for ref in self.registry.queryDatasets(..., collections=name, deduplicate=True):
956 if self.datastore.exists(ref):
957 self.datastore.trash(ref)
958 self.registry.removeCollection(name)
959 if unstore:
960 # Point of no return for removing artifacts
961 self.datastore.emptyTrash()
963 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
964 disassociate: bool = True,
965 unstore: bool = False,
966 tags: Optional[Iterable[str]] = None,
967 purge: bool = False,
968 run: Optional[str] = None):
969 """Remove one or more datasets from a collection and/or storage.
971 Parameters
972 ----------
973 refs : `~collections.abc.Iterable` of `DatasetRef`
974 Datasets to prune. These must be "resolved" references (not just
975 a `DatasetType` and data ID).
976 disassociate : `bool`, optional
977 Disassociate pruned datasets from ``self.tags`` (or the collections
978 given via the ``tags`` argument). Ignored if ``refs`` is ``...``.
979 unstore : `bool`, optional
980 If `True` (`False` is default) remove these datasets from all
981 datastores known to this butler. Note that this will make it
982 impossible to retrieve these datasets even via other collections.
983 Datasets that are already not stored are ignored by this option.
984 tags : `Iterable` [ `str` ], optional
985 `~CollectionType.TAGGED` collections to disassociate the datasets
986 from, overriding ``self.tags``. Ignored if ``disassociate`` is
987 `False` or ``purge`` is `True`.
988 purge : `bool`, optional
989 If `True` (`False` is default), completely remove the dataset from
990 the `Registry`. To prevent accidental deletions, ``purge`` may
991 only be `True` if all of the following conditions are met:
993 - All given datasets are in the given run;
994 - ``disassociate`` is `True`;
995 - ``unstore`` is `True`.
997 This mode may remove provenance information from datasets other
998 than those provided, and should be used with extreme care.
999 run : `str`, optional
1000 `~CollectionType.RUN` collection to purge from, overriding
1001 ``self.run``. Ignored unless ``purge`` is `True`.
1003 Raises
1004 ------
1005 TypeError
1006 Raised if the butler is read-only, if no collection was provided,
1007 or the conditions for ``purge=True`` were not met.
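Examples
--------
An illustrative sketch; the dataset type and collection names are
placeholders::

    refs = butler.registry.queryDatasets("src", collections="u/alice/DM-50000/a")
    # Remove the stored artifacts but keep the Registry records.
    butler.pruneDatasets(refs, disassociate=False, unstore=True)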
1008 """
1009 if not self.isWriteable():
1010 raise TypeError("Butler is read-only.")
1011 if purge:
1012 if not disassociate:
1013 raise TypeError("Cannot pass purge=True without disassociate=True.")
1014 if not unstore:
1015 raise TypeError("Cannot pass purge=True without unstore=True.")
1016 if run is None:
1017 run = self.run
1018 if run is None:
1019 raise TypeError("No run provided but purge=True.")
1020 collectionType = self.registry.getCollectionType(run)
1021 if collectionType is not CollectionType.RUN:
1022 raise TypeError(f"Cannot purge from collection '{run}' "
1023 f"of non-RUN type {collectionType.name}.")
1024 elif disassociate:
1025 if tags is None:
1026 tags = self.tags
1027 else:
1028 tags = tuple(tags)
1029 if not tags:
1030 raise TypeError("No tags provided but disassociate=True.")
1031 for tag in tags:
1032 collectionType = self.registry.getCollectionType(tag)
1033 if collectionType is not CollectionType.TAGGED:
1034 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1035 f"of non-TAGGED type {collectionType.name}.")
1036 # Transform possibly-single-pass iterable into something we can iterate
1037 # over multiple times.
1038 refs = list(refs)
1039 # Pruning a component of a DatasetRef makes no sense since registry
1040 # doesn't know about components and datastore might not store
1041 # components in a separate file
1042 for ref in refs:
1043 if ref.datasetType.component():
1044 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1045 # We don't need an unreliable Datastore transaction for this, because
1046 # we've been extra careful to ensure that Datastore.trash only involves
1047 # mutating the Registry (it can _look_ at Datastore-specific things,
1048 # but shouldn't change them), and hence all operations here are
1049 # Registry operations.
1050 with self.registry.transaction():
1051 if unstore:
1052 for ref in refs:
1053 # There is a difference between a concrete composite
1054 # and virtual composite. In a virtual composite the
1055 # datastore is never given the top level DatasetRef. In
1056 # the concrete composite the datastore knows all the
1057 # refs and will clean up itself if asked to remove the
1058 # parent ref. We can not check configuration for this
1059 # since we can not trust that the configuration is the
1060 # same. We therefore have to ask if the ref exists or
1061 # not. This is consistent with the fact that we want
1062 # to ignore already-removed-from-datastore datasets
1063 # anyway.
1064 if self.datastore.exists(ref):
1065 self.datastore.trash(ref)
1066 if purge:
1067 self.registry.removeDatasets(refs)
1068 elif disassociate:
1069 for tag in tags:
1070 self.registry.disassociate(tag, refs)
1071 # We've exited the Registry transaction, and apparently committed.
1072 # (if there was an exception, everything rolled back, and it's as if
1073 # nothing happened - and we never get here).
1074 # Datastore artifacts are not yet gone, but they're clearly marked
1075 # as trash, so if we fail to delete now because of (e.g.) filesystem
1076 # problems we can try again later, and if manual administrative
1077 # intervention is required, it's pretty clear what that should entail:
1078 # deleting everything on disk and in private Datastore tables that is
1079 # in the dataset_location_trash table.
1080 if unstore:
1081 # Point of no return for removing artifacts
1082 self.datastore.emptyTrash()
1084 @transactional
1085 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1086 tags: Optional[Iterable[str]] = None):
1087 """Store and register one or more datasets that already exist on disk.
1089 Parameters
1090 ----------
1091 datasets : `FileDataset`
1092 Each positional argument is a struct containing information about
1093 a file to be ingested, including its path (either absolute or
1094 relative to the datastore root, if applicable), a `DatasetRef`,
1095 and optionally a formatter class or its fully-qualified string
1096 name. If a formatter is not provided, the formatter that would be
1097 used for `put` is assumed. On successful return, all
1098 `FileDataset.refs` attributes will have their `DatasetRef.id`
1099 attribute populated and all `FileDataset.formatter` attributes will
1100 be set to the formatter class used. `FileDataset.path` attributes
1101 may be modified to put paths in whatever the datastore considers a
1102 standardized form.
1103 transfer : `str`, optional
1104 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1105 'relsymlink' or 'symlink', indicating how to transfer the file.
1106 run : `str`, optional
1107 The name of the run ingested datasets should be added to,
1108 overriding ``self.run``.
1109 tags : `Iterable` [ `str` ], optional
1110 The names of `~CollectionType.TAGGED` collections to associate
1111 the datasets with, overriding ``self.tags``. These collections
1112 must have already been added to the `Registry`.
1114 Raises
1115 ------
1116 TypeError
1117 Raised if the butler is read-only or if no run was provided.
1118 NotImplementedError
1119 Raised if the `Datastore` does not support the given transfer mode.
1120 DatasetTypeNotSupportedError
1121 Raised if one or more files to be ingested have a dataset type that
1122 is not supported by the `Datastore`.
1123 FileNotFoundError
1124 Raised if one of the given files does not exist.
1125 FileExistsError
1126 Raised if transfer is not `None` but the (internal) location the
1127 file would be moved to is already occupied.
1129 Notes
1130 -----
1131 This operation is not fully exception safe: if a database operation
1132 fails, the given `FileDataset` instances may be only partially updated.
1134 It is atomic in terms of database operations (they will either all
1135 succeed or all fail) providing the database engine implements
1136 transactions correctly. It will attempt to be atomic in terms of
1137 filesystem operations as well, but this cannot be implemented
1138 rigorously for most datastores.
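Examples
--------
An illustrative sketch; the data ID, file path, run name, and transfer
mode are placeholders, and ``rawType`` is assumed to be a `DatasetType`
already registered with the `Registry`::

    ref = DatasetRef(rawType, {"instrument": "HSC", "exposure": 12345})
    butler.ingest(FileDataset(path="/data/HSC-12345.fits", refs=[ref]),
                  transfer="copy", run="raw/hsc")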
1139 """
1140 if not self.isWriteable():
1141 raise TypeError("Butler is read-only.")
1142 if run is None:
1143 if self.run is None:
1144 raise TypeError("No run provided.")
1145 run = self.run
1146 # No need to check run type, since insertDatasets will do that
1147 # (safely) for us.
1148 if tags is None:
1149 tags = self.tags
1150 else:
1151 tags = tuple(tags)
1152 for tag in tags:
1153 # Check that these are tagged collections up front, because we want
1154 # to avoid relying on Datastore transactionality to avoid modifying
1155 # the repo if there's an error later.
1156 collectionType = self.registry.getCollectionType(tag)
1157 if collectionType is not CollectionType.TAGGED:
1158 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1159 f"{collectionType.name}.")
1160 # Reorganize the inputs so they're grouped by DatasetType and then
1161 # data ID. We also include a list of DatasetRefs for each FileDataset
1162 # to hold the resolved DatasetRefs returned by the Registry, before
1163 # it's safe to swap them into FileDataset.refs.
1164 # Some type annotation aliases to make that clearer:
1165 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1166 GroupedData = MutableMapping[DatasetType, GroupForType]
1167 # The actual data structure:
1168 groupedData: GroupedData = defaultdict(dict)
1169 # And the nested loop that populates it:
1170 for dataset in datasets:
1171 # This list is intentionally shared across the inner loop, since it's
1172 # associated with `dataset`.
1173 resolvedRefs = []
1174 for ref in dataset.refs:
1175 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1177 # Now we can bulk-insert into Registry for each DatasetType.
1178 allResolvedRefs = []
1179 for datasetType, groupForType in groupedData.items():
1180 refs = self.registry.insertDatasets(datasetType,
1181 dataIds=groupForType.keys(),
1182 run=run)
1183 # Append those resolved DatasetRefs to the new lists we set up for
1184 # them.
1185 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1186 resolvedRefs.append(ref)
1188 # Go back to the original FileDatasets to replace their refs with the
1189 # new resolved ones, and also build a big list of all refs.
1190 allResolvedRefs = []
1191 for groupForType in groupedData.values():
1192 for dataset, resolvedRefs in groupForType.values():
1193 dataset.refs = resolvedRefs
1194 allResolvedRefs.extend(resolvedRefs)
1196 # Bulk-associate everything with any tagged collections.
1197 for tag in tags:
1198 self.registry.associate(tag, allResolvedRefs)
1200 # Bulk-insert everything into Datastore.
1201 self.datastore.ingest(*datasets, transfer=transfer)
1203 @contextlib.contextmanager
1204 def export(self, *, directory: Optional[str] = None,
1205 filename: Optional[str] = None,
1206 format: Optional[str] = None,
1207 transfer: Optional[str] = None) -> ContextManager[RepoExport]:
1208 """Export datasets from the repository represented by this `Butler`.
1210 This method is a context manager that returns a helper object
1211 (`RepoExport`) that is used to indicate what information from the
1212 repository should be exported.
1214 Parameters
1215 ----------
1216 directory : `str`, optional
1217 Directory dataset files should be written to if ``transfer`` is not
1218 `None`.
1219 filename : `str`, optional
1220 Name for the file that will include database information associated
1221 with the exported datasets. If this is not an absolute path and
1222 ``directory`` is not `None`, it will be written to ``directory``
1223 instead of the current working directory. Defaults to
1224 "export.{format}".
1225 format : `str`, optional
1226 File format for the database information file. If `None`, the
1227 extension of ``filename`` will be used.
1228 transfer : `str`, optional
1229 Transfer mode passed to `Datastore.export`.
1231 Raises
1232 ------
1233 TypeError
1234 Raised if the set of arguments passed is inconsistent.
1236 Examples
1237 --------
1238 Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
1239 methods are used to provide the iterables over data IDs and/or datasets
1240 to be exported::
1242 with butler.export(filename="exports.yaml") as export:
1243 # Export all flats, and the calibration_label dimensions
1244 # associated with them.
1245 export.saveDatasets(butler.registry.queryDatasets("flat"),
1246 elements=[butler.registry.dimensions["calibration_label"]])
1247 # Export all datasets that start with "deepCoadd_" and all of
1248 # their associated data ID information.
1249 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1250 """
1251 if directory is None and transfer is not None:
1252 raise TypeError("Cannot transfer without providing a directory.")
1253 if transfer == "move":
1254 raise TypeError("Transfer may not be 'move': export is read-only")
1255 if format is None:
1256 if filename is None:
1257 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1258 else:
1259 _, format = os.path.splitext(filename)
1260 elif filename is None:
1261 filename = f"export.{format}"
1262 if directory is not None:
1263 filename = os.path.join(directory, filename)
1264 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1265 with open(filename, 'w') as stream:
1266 backend = BackendClass(stream)
1267 try:
1268 helper = RepoExport(self.registry, self.datastore, backend=backend,
1269 directory=directory, transfer=transfer)
1270 yield helper
1271 except BaseException:
1272 raise
1273 else:
1274 helper._finish()
1276 def import_(self, *, directory: Optional[str] = None,
1277 filename: Union[str, TextIO, None] = None,
1278 format: Optional[str] = None,
1279 transfer: Optional[str] = None):
1280 """Import datasets exported from a different butler repository.
1282 Parameters
1283 ----------
1284 directory : `str`, optional
1285 Directory containing dataset files. If `None`, all file paths
1286 must be absolute.
1287 filename : `str` or `TextIO`, optional
1288 A stream or name of file that contains database information
1289 associated with the exported datasets. If this a string (name) and
1290 is not an absolute path, does not exist in the current working
1291 directory, and ``directory`` is not `None`, it is assumed to be in
1292 ``directory``. Defaults to "export.{format}".
1293 format : `str`, optional
1294 File format for the database information file. If `None`, the
1295 extension of ``filename`` will be used.
1296 transfer : `str`, optional
1297 Transfer mode passed to `Datastore.ingest`.
1299 Raises
1300 ------
1301 TypeError
1302 Raised if the set of arguments passed is inconsistent, or if the
1303 butler is read-only.
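Examples
--------
An illustrative sketch; the directory and file names are placeholders::

    butler.import_(directory="/path/to/exported", filename="export.yaml",
                   transfer="symlink")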
1304 """
1305 if not self.isWriteable():
1306 raise TypeError("Butler is read-only.")
1307 if format is None:
1308 if filename is None:
1309 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1310 else:
1311 _, format = os.path.splitext(filename)
1312 elif filename is None:
1313 filename = f"export.{format}"
1314 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1315 filename = os.path.join(directory, filename)
1316 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1318 def doImport(importStream):
1319 backend = BackendClass(importStream, self.registry)
1320 backend.register()
1321 with self.transaction():
1322 backend.load(self.datastore, directory=directory, transfer=transfer)
1324 if isinstance(filename, str):
1325 with open(filename, "r") as stream:
1326 doImport(stream)
1327 else:
1328 doImport(filename)
1330 def validateConfiguration(self, logFailures: bool = False,
1331 datasetTypeNames: Optional[Iterable[str]] = None,
1332 ignore: Optional[Iterable[str]] = None):
1333 """Validate butler configuration.
1335 Checks that each `DatasetType` can be stored in the `Datastore`.
1337 Parameters
1338 ----------
1339 logFailures : `bool`, optional
1340 If `True`, output a log message for every validation error
1341 detected.
1342 datasetTypeNames : iterable of `str`, optional
1343 The `DatasetType` names that should be checked. This allows
1344 only a subset to be selected.
1345 ignore : iterable of `str`, optional
1346 Names of DatasetTypes to skip over. This can be used to skip
1347 known problems. If a named `DatasetType` corresponds to a
1348 composite, all components of that `DatasetType` will also be
1349 ignored.
1351 Raises
1352 ------
1353 ButlerValidationError
1354 Raised if there is some inconsistency with how this Butler
1355 is configured.
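Examples
--------
An illustrative sketch; the ignored dataset type name is a placeholder::

    butler.validateConfiguration(logFailures=True, ignore=["packages"])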
1356 """
1357 if datasetTypeNames:
1358 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1359 else:
1360 entities = list(self.registry.queryDatasetTypes())
1362 # filter out anything from the ignore list
1363 if ignore:
1364 ignore = set(ignore)
1365 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1366 else:
1367 ignore = set()
1369 # Find all the registered instruments
1370 instruments = set(
1371 dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
1372 )
1374 # For each datasetType that has an instrument dimension, create
1375 # a DatasetRef for each defined instrument
1376 datasetRefs = []
1378 for datasetType in entities:
1379 if "instrument" in datasetType.dimensions:
1380 for instrument in instruments:
1381 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1382 datasetRefs.append(datasetRef)
1384 entities.extend(datasetRefs)
1386 datastoreErrorStr = None
1387 try:
1388 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1389 except ValidationError as e:
1390 datastoreErrorStr = str(e)
1392 # Also check that the LookupKeys used by the datastores match
1393 # registry and storage class definitions
1394 keys = self.datastore.getLookupKeys()
1396 failedNames = set()
1397 failedDataId = set()
1398 for key in keys:
1399 datasetType = None
1400 if key.name is not None:
1401 if key.name in ignore:
1402 continue
1404 # skip if specific datasetType names were requested and this
1405 # name does not match
1406 if datasetTypeNames and key.name not in datasetTypeNames:
1407 continue
1409 # See if it is a StorageClass or a DatasetType
1410 if key.name in self.storageClasses:
1411 pass
1412 else:
1413 try:
1414 self.registry.getDatasetType(key.name)
1415 except KeyError:
1416 if logFailures:
1417 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1418 failedNames.add(key)
1419 else:
1420 # Dimensions are checked for consistency when the Butler
1421 # is created and rendezvoused with a universe.
1422 pass
1424 # Check that the instrument is a valid instrument
1425 # Currently only support instrument so check for that
1426 if key.dataId:
1427 dataIdKeys = set(key.dataId)
1428 if set(["instrument"]) != dataIdKeys:
1429 if logFailures:
1430 log.fatal("Key '%s' has unsupported DataId override", key)
1431 failedDataId.add(key)
1432 elif key.dataId["instrument"] not in instruments:
1433 if logFailures:
1434 log.fatal("Key '%s' has unknown instrument", key)
1435 failedDataId.add(key)
1437 messages = []
1439 if datastoreErrorStr:
1440 messages.append(datastoreErrorStr)
1442 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1443 (failedDataId, "Keys with bad DataId entries: ")):
1444 if failed:
1445 msg += ", ".join(str(k) for k in failed)
1446 messages.append(msg)
1448 if messages:
1449 raise ValidationError(";\n".join(messages))
1451 registry: Registry
1452 """The object that manages dataset metadata and relationships (`Registry`).
1454 Most operations that don't involve reading or writing butler datasets are
1455 accessible only via `Registry` methods.
1456 """
1458 datastore: Datastore
1459 """The object that manages actual dataset storage (`Datastore`).
1461 Direct user access to the datastore should rarely be necessary; the primary
1462 exception is the case where a `Datastore` implementation provides extra
1463 functionality beyond what the base class defines.
1464 """
1466 storageClasses: StorageClassFactory
1467 """An object that maps known storage class names to objects that fully
1468 describe them (`StorageClassFactory`).
1469 """
1471 collections: Optional[CollectionSearch]
1472 """The collections to search and any restrictions on the dataset types to
1473 search for within them, in order (`CollectionSearch`).
1474 """
1476 run: Optional[str]
1477 """Name of the run this butler writes outputs to (`str` or `None`).
1478 """
1480 tags: Tuple[str, ...]
1481 """Names of `~CollectionType.TAGGED` collections this butler associates
1482 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1483 (`tuple` [ `str` ]).
1484 """