Coverage for python/lsst/daf/butler/_butler.py : 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 Set,
44 TextIO,
45 Tuple,
46 Union,
47)
49try:
50 import boto3
51except ImportError:
52 boto3 = None
54from lsst.utils import doImport
55from .core import (
56 ButlerURI,
57 Config,
58 ConfigSubset,
59 DataCoordinate,
60 DataId,
61 DatasetRef,
62 DatasetType,
63 Datastore,
64 FileDataset,
65 RepoExport,
66 StorageClassFactory,
67 ValidationError,
68)
69from .core.repoRelocation import BUTLER_ROOT_TAG
70from .core.utils import transactional, getClassOf
71from ._deferredDatasetHandle import DeferredDatasetHandle
72from ._butlerConfig import ButlerConfig
73from .registry import Registry, RegistryConfig, CollectionType
74from .registry.wildcards import CollectionSearch
76log = logging.getLogger(__name__)
79class ButlerValidationError(ValidationError):
80 """There is a problem with the Butler configuration."""
81 pass
84class Butler:
85 """Main entry point for the data access system.
87 Parameters
88 ----------
89 config : `ButlerConfig`, `Config` or `str`, optional.
90 Configuration. Anything acceptable to the
91 `ButlerConfig` constructor. If a directory path
92 is given the configuration will be read from a ``butler.yaml`` file in
93 that location. If `None` is given default values will be used.
94 butler : `Butler`, optional.
95 If provided, construct a new Butler that uses the same registry and
96 datastore as the given one, but with the given collection and run.
97 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
98 arguments.
99 collections : `Any`, optional
100 An expression specifying the collections to be searched (in order) when
101 reading datasets, and optionally dataset type restrictions on them.
102 This may be:
103 - a `str` collection name;
104 - a tuple of (collection name, *dataset type restriction*);
105 - an iterable of either of the above;
106 - a mapping from `str` to *dataset type restriction*.
108 See :ref:`daf_butler_collection_expressions` for more information,
109 including the definition of a *dataset type restriction*. All
110 collections must either already exist or be specified to be created
111 by other arguments.
112 run : `str`, optional
113 Name of the run datasets should be output to. If the run
114 does not exist, it will be created. If ``collections`` is `None`, it
115 will be set to ``[run]``. If this is not set (and ``writeable`` is
116 not set either), a read-only butler will be created.
117 tags : `Iterable` [ `str` ], optional
118 A list of `~CollectionType.TAGGED` collections that datasets should be
119 associated with in `put` or `ingest` and disassociated from in
120 `pruneDatasets`. If any of these collections does not exist, it will
121 be created.
122 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
123 A mapping from the names of new `~CollectionType.CHAINED` collections
124 to an expression identifying their child collections (which takes the
125 same form as the ``collections`` argument). Chains may be nested only
126 if children precede their parents in this mapping.
127 searchPaths : `list` of `str`, optional
128 Directory paths to search when calculating the full Butler
129 configuration. Not used if the supplied config is already a
130 `ButlerConfig`.
131 writeable : `bool`, optional
132 Explicitly sets whether the butler supports write operations. If not
133 provided, a read-write butler is created if any of ``run``, ``tags``,
134 or ``chains`` is non-empty.
136 Examples
137 --------
138 While there are many ways to control exactly how a `Butler` interacts with
139 the collections in its `Registry`, the most common cases are still simple.
141 For a read-only `Butler` that searches one collection, do::
143 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
145 For a read-write `Butler` that writes to and reads from a
146 `~CollectionType.RUN` collection::
148 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
150 The `Butler` passed to a ``PipelineTask`` is often much more complex,
151 because we want to write to one `~CollectionType.RUN` collection but read
152 from several others (as well), while defining a new
153 `~CollectionType.CHAINED` collection that combines them all::
155 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
156 collections=["u/alice/DM-50000"],
157 chains={
158 "u/alice/DM-50000": ["u/alice/DM-50000/a",
159 "u/bob/DM-49998",
160 "raw/hsc"]
161 })
163 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
164 they'll also be available from the chained collection ``u/alice/DM-50000``.
165 Datasets will be read first from that run (since it appears first in the
166 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
167 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
168 would be unnecessary. We could also construct a butler that performs
169 exactly the same `put` and `get` operations without actually creating a
170 chained collection, just by passing multiple items in ``collections``::
172 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
173 collections=["u/alice/DM-50000/a",
174 "u/bob/DM-49998",
175 "raw/hsc"])
177 Finally, one can always create a `Butler` with no collections::
179 butler = Butler("/path/to/repo", writeable=True)
181 This can be extremely useful when you just want to use ``butler.registry``,
182 e.g. for inserting dimension data or managing collections, or when the
183 collections you want to use with the butler are not consistent.
184 Passing ``writeable`` explicitly here is only necessary if you want to be
185 able to make changes to the repo; usually the value for ``writeable``
186 can be guessed from the collection arguments provided, but it defaults to
187 `False` when no collection arguments are given.
188 """
189 def __init__(self, config: Union[Config, str, None] = None, *,
190 butler: Optional[Butler] = None,
191 collections: Any = None,
192 run: Optional[str] = None,
193 tags: Iterable[str] = (),
194 chains: Optional[Mapping[str, Any]] = None,
195 searchPaths: Optional[List[str]] = None,
196 writeable: Optional[bool] = None):
197 # Transform any single-pass iterator into an actual sequence so we
198 # can see if it's empty
199 self.tags = tuple(tags)
200 # Load registry, datastore, etc. from config or existing butler.
201 if butler is not None:
202 if config is not None or searchPaths is not None or writeable is not None:
203 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
204 "arguments with 'butler' argument.")
205 self.registry = butler.registry
206 self.datastore = butler.datastore
207 self.storageClasses = butler.storageClasses
208 self._config = butler._config
209 else:
210 self._config = ButlerConfig(config, searchPaths=searchPaths)
211 if "root" in self._config:
212 butlerRoot = self._config["root"]
213 else:
214 butlerRoot = self._config.configDir
215 if writeable is None:
216 writeable = run is not None or chains is not None or self.tags
217 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
218 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
219 butlerRoot=butlerRoot)
220 self.storageClasses = StorageClassFactory()
221 self.storageClasses.addFromConfig(self._config)
222 # Check the many collection arguments for consistency and create any
223 # needed collections that don't exist.
224 if collections is None:
225 if run is not None:
226 collections = (run,)
227 else:
228 collections = ()
229 self.collections = CollectionSearch.fromExpression(collections)
230 if chains is None:
231 chains = {}
232 self.run = run
233 if "run" in self._config or "collection" in self._config:
234 raise ValueError("Passing a run or collection via configuration is no longer supported.")
235 if self.run is not None:
236 self.registry.registerCollection(self.run, type=CollectionType.RUN)
237 for tag in self.tags:
238 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
239 for parent, children in chains.items():
240 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
241 self.registry.setCollectionChain(parent, children)
243 GENERATION: ClassVar[int] = 3
244 """This is a Generation 3 Butler.
246 This attribute may be removed in the future, once the Generation 2 Butler
247 interface has been fully retired; it should only be used in transitional
248 code.
249 """
251 @staticmethod
252 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
253 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
254 forceConfigRoot: bool = True, outfile: Optional[str] = None,
255 overwrite: bool = False) -> Config:
256 """Create an empty data repository by adding a butler.yaml config
257 to a repository root directory.
259 Parameters
260 ----------
261 root : `str` or `ButlerURI`
262 Path or URI to the root location of the new repository. Will be
263 created if it does not exist.
264 config : `Config` or `str`, optional
265 Configuration to write to the repository, after setting any
266 root-dependent Registry or Datastore config options. Can not
267 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
268 configuration will be used. Root-dependent config options
269 specified in this config are overwritten if ``forceConfigRoot``
270 is `True`.
271 standalone : `bool`
272 If True, write all expanded defaults, not just customized or
273 repository-specific settings.
274 This (mostly) decouples the repository from the default
275 configuration, insulating it from changes to the defaults (which
276 may be good or bad, depending on the nature of the changes).
277 Future *additions* to the defaults will still be picked up when
278 initializing `Butlers` to repos created with ``standalone=True``.
279 createRegistry : `bool`, optional
280 If `True` create a new Registry.
281 searchPaths : `list` of `str`, optional
282 Directory paths to search when calculating the full butler
283 configuration.
284 forceConfigRoot : `bool`, optional
285 If `False`, any values present in the supplied ``config`` that
286 would normally be reset are not overridden and will appear
287 directly in the output config. This allows non-standard overrides
288 of the root directory for a datastore or registry to be given.
289 If this parameter is `True` the values for ``root`` will be
290 forced into the resulting config if appropriate.
291 outfile : `str`, optional
292 If not-`None`, the output configuration will be written to this
293 location rather than into the repository itself. Can be a URI
294 string. Can refer to a directory that will be used to write
295 ``butler.yaml``.
296 overwrite : `bool`, optional
297 Create a new configuration file even if one already exists
298 in the specified output location. Default is to raise
299 an exception.
301 Returns
302 -------
303 config : `Config`
304 The updated `Config` instance written to the repo.
306 Raises
307 ------
308 ValueError
309 Raised if a ButlerConfig or ConfigSubset is passed instead of a
310 regular Config (as these subclasses would make it impossible to
311 support ``standalone=False``).
312 FileExistsError
313 Raised if the output config file already exists.
314 os.error
315 Raised if the directory does not exist, exists but is not a
316 directory, or cannot be created.
318 Notes
319 -----
320 Note that when ``standalone=False`` (the default), the configuration
321 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
322 construct the repository should also be used to construct any Butlers
323 to avoid configuration inconsistencies.
324 """
325 if isinstance(config, (ButlerConfig, ConfigSubset)):
326 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
328 # Ensure that the root of the repository exists or can be made
329 uri = ButlerURI(root, forceDirectory=True)
330 uri.mkdir()
332 config = Config(config)
334 # If we are creating a new repo from scratch with relative roots,
335 # do not propagate an explicit root from the config file
336 if "root" in config:
337 del config["root"]
339 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
340 datastoreClass = doImport(full["datastore", "cls"])
341 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
343 # if key exists in given config, parse it, otherwise parse the defaults
344 # in the expanded config
345 if config.get(("registry", "db")):
346 registryConfig = RegistryConfig(config)
347 else:
348 registryConfig = RegistryConfig(full)
349 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
350 if defaultDatabaseUri is not None:
351 Config.updateParameters(RegistryConfig, config, full,
352 toUpdate={"db": defaultDatabaseUri},
353 overwrite=forceConfigRoot)
354 else:
355 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
356 overwrite=forceConfigRoot)
358 if standalone:
359 config.merge(full)
360 if outfile is not None:
361 # When writing to a separate location we must include
362 # the root of the butler repo in the config else it won't know
363 # where to look.
364 config["root"] = uri.geturl()
365 configURI = outfile
366 else:
367 configURI = uri
368 config.dumpToUri(configURI, overwrite=overwrite)
370 # Create Registry and populate tables
371 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
372 return config
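# A minimal usage sketch, assuming the package-level import and a
# hypothetical path and run name: create an empty repository with
# `makeRepo`, then construct a writeable `Butler` against it.
#
#     from lsst.daf.butler import Butler
#
#     Butler.makeRepo("/tmp/example_repo")
#     butler = Butler("/tmp/example_repo", run="u/example/ingest")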
374 @classmethod
375 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
376 tags: Tuple[str, ...], writeable: bool) -> Butler:
377 """Callable used to unpickle a Butler.
379 We prefer not to use ``Butler.__init__`` directly so we can force some
380 of its many arguments to be keyword-only (note that ``__reduce__``
381 can only invoke callables with positional arguments).
383 Parameters
384 ----------
385 config : `ButlerConfig`
386 Butler configuration, already coerced into a true `ButlerConfig`
387 instance (and hence after any search paths for overrides have been
388 utilized).
389 collections : `CollectionSearch`
390 Names of collections to read from.
391 run : `str`, optional
392 Name of `~CollectionType.RUN` collection to write to.
393 tags : `tuple` [`str`]
394 Names of `~CollectionType.TAGGED` collections to associate with.
395 writeable : `bool`
396 Whether the Butler should support write operations.
398 Returns
399 -------
400 butler : `Butler`
401 A new `Butler` instance.
402 """
403 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
405 def __reduce__(self):
406 """Support pickling.
407 """
408 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
409 self.registry.isWriteable()))
411 def __str__(self):
412 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
413 self.collections, self.run, self.tags, self.datastore, self.registry)
415 def isWriteable(self) -> bool:
416 """Return `True` if this `Butler` supports write operations.
417 """
418 return self.registry.isWriteable()
420 @contextlib.contextmanager
421 def transaction(self):
422 """Context manager supporting `Butler` transactions.
424 Transactions can be nested.
425 """
426 with self.registry.transaction():
427 with self.datastore.transaction():
428 yield
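# A sketch of using the transaction context manager (dataset type names,
# data IDs, and objects are hypothetical): if any call inside the block
# raises, the registry and datastore changes are rolled back together.
#
#     with butler.transaction():
#         butler.put(catalog, "src", visit=42, detector=10)
#         butler.put(summary, "srcSummary", visit=42, detector=10)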
430 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
431 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
432 """Standardize the arguments passed to several Butler APIs.
434 Parameters
435 ----------
436 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
437 When `DatasetRef` the `dataId` should be `None`.
438 Otherwise the `DatasetType` or name thereof.
439 dataId : `dict` or `DataCoordinate`
440 A `dict` of `Dimension` link name, value pairs that label the
441 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
442 should be provided as the first argument.
443 kwds
444 Additional keyword arguments used to augment or construct a
445 `DataCoordinate`. See `DataCoordinate.standardize`
446 parameters.
448 Returns
449 -------
450 datasetType : `DatasetType`
451 A `DatasetType` instance extracted from ``datasetRefOrType``.
452 dataId : `dict` or `DataId`, optional
453 Argument that can be used (along with ``kwds``) to construct a
454 `DataId`.
456 Notes
457 -----
458 Butler APIs that conceptually need a DatasetRef also allow passing a
459 `DatasetType` (or the name of one) and a `DataId` (or a dict and
460 keyword arguments that can be used to construct one) separately. This
461 method accepts those arguments and always returns a true `DatasetType`
462 and a `DataId` or `dict`.
464 Standardization of `dict` vs `DataId` is best handled by passing the
465 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
466 generally similarly flexible.
467 """
468 externalDatasetType = None
469 internalDatasetType = None
470 if isinstance(datasetRefOrType, DatasetRef):
471 if dataId is not None or kwds:
472 raise ValueError("DatasetRef given, cannot use dataId as well")
473 externalDatasetType = datasetRefOrType.datasetType
474 dataId = datasetRefOrType.dataId
475 else:
476 # Don't check whether DataId is provided, because Registry APIs
477 # can usually construct a better error message when it wasn't.
478 if isinstance(datasetRefOrType, DatasetType):
479 externalDatasetType = datasetRefOrType
480 else:
481 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
483 # Check that they are self-consistent
484 if externalDatasetType is not None:
485 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
486 if externalDatasetType != internalDatasetType:
487 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
488 f"registry definition ({internalDatasetType})")
490 return internalDatasetType, dataId
492 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
493 dataId: Optional[DataId] = None, *,
494 collections: Any = None,
495 allowUnresolved: bool = False,
496 **kwds: Any) -> DatasetRef:
497 """Shared logic for methods that start with a search for a dataset in
498 the registry.
500 Parameters
501 ----------
502 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
503 When `DatasetRef` the `dataId` should be `None`.
504 Otherwise the `DatasetType` or name thereof.
505 dataId : `dict` or `DataCoordinate`, optional
506 A `dict` of `Dimension` link name, value pairs that label the
507 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
508 should be provided as the first argument.
509 collections : Any, optional
510 Collections to be searched, overriding ``self.collections``.
511 Can be any of the types supported by the ``collections`` argument
512 to butler construction.
513 allowUnresolved : `bool`, optional
514 If `True`, return an unresolved `DatasetRef` if finding a resolved
515 one in the `Registry` fails. Defaults to `False`.
516 kwds
517 Additional keyword arguments used to augment or construct a
518 `DataId`. See `DataId` parameters.
520 Returns
521 -------
522 ref : `DatasetRef`
523 A reference to the dataset identified by the given arguments.
525 Raises
526 ------
527 LookupError
528 Raised if no matching dataset exists in the `Registry` (and
529 ``allowUnresolved is False``).
530 ValueError
531 Raised if a resolved `DatasetRef` was passed as an input, but it
532 differs from the one found in the registry.
533 TypeError
534 Raised if no collections were provided.
535 """
536 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
537 if isinstance(datasetRefOrType, DatasetRef):
538 idNumber = datasetRefOrType.id
539 else:
540 idNumber = None
541 # Standardize the data ID first instead of letting registry.findDataset
542 # do it, so we get the result even if no dataset is found.
543 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, **kwds)
544 if collections is None:
545 collections = self.collections
546 if not collections:
547 raise TypeError("No input collections provided.")
548 else:
549 collections = CollectionSearch.fromExpression(collections)
550 # Always look up the DatasetRef, even if one is given, to ensure it is
551 # present in the current collection.
552 ref = self.registry.findDataset(datasetType, dataId, collections=collections)
553 if ref is None:
554 if allowUnresolved:
555 return DatasetRef(datasetType, dataId)
556 else:
557 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
558 f"could not be found in collections {collections}.")
559 if idNumber is not None and idNumber != ref.id:
560 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
561 f"id ({ref.id}) in registry in collections {collections}.")
562 return ref
564 @transactional
565 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
566 dataId: Optional[DataId] = None, *,
567 run: Optional[str] = None,
568 tags: Optional[Iterable[str]] = None,
569 **kwds: Any) -> DatasetRef:
570 """Store and register a dataset.
572 Parameters
573 ----------
574 obj : `object`
575 The dataset.
576 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
577 When `DatasetRef` is provided, ``dataId`` should be `None`.
578 Otherwise the `DatasetType` or name thereof.
579 dataId : `dict` or `DataCoordinate`
580 A `dict` of `Dimension` link name, value pairs that label the
581 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
582 should be provided as the second argument.
583 run : `str`, optional
584 The name of the run the dataset should be added to, overriding
585 ``self.run``.
586 tags : `Iterable` [ `str` ], optional
587 The names of `~CollectionType.TAGGED` collections to associate
588 the dataset with, overriding ``self.tags``. These collections
589 must have already been added to the `Registry`.
590 kwds
591 Additional keyword arguments used to augment or construct a
592 `DataCoordinate`. See `DataCoordinate.standardize`
593 parameters.
595 Returns
596 -------
597 ref : `DatasetRef`
598 A reference to the stored dataset, updated with the correct id if
599 given.
601 Raises
602 ------
603 TypeError
604 Raised if the butler is read-only or if no run has been provided.
605 """
606 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
607 if not self.isWriteable():
608 raise TypeError("Butler is read-only.")
609 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
610 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
611 raise ValueError("DatasetRef must not be in registry, must have None id")
613 if run is None:
614 if self.run is None:
615 raise TypeError("No run provided.")
616 run = self.run
617 # No need to check type for run; first thing we do is
618 # insertDatasets, and that will check for us.
620 if tags is None:
621 tags = self.tags
622 else:
623 tags = tuple(tags)
624 for tag in tags:
625 # Check that these are tagged collections up front, because we want
626 # to avoid relying on Datastore transactionality to avoid modifying
627 # the repo if there's an error later.
628 collectionType = self.registry.getCollectionType(tag)
629 if collectionType is not CollectionType.TAGGED:
630 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
631 f"{collectionType.name}.")
633 # Add Registry Dataset entry.
634 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
635 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
637 # Add Datastore entry.
638 self.datastore.put(obj, ref)
640 for tag in tags:
641 self.registry.associate(tag, [ref])
643 return ref
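# A sketch of equivalent ways to call `put` (the dataset type name,
# dimension names, and object are hypothetical); the data ID can be given
# as a dict or as keyword arguments.
#
#     ref = butler.put(exposure, "calexp", {"visit": 42, "detector": 10})
#     ref = butler.put(exposure, "calexp", visit=42, detector=10)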
645 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
646 """Retrieve a stored dataset.
648 Unlike `Butler.get`, this method allows datasets outside the Butler's
649 collection to be read as long as the `DatasetRef` that identifies them
650 can be obtained separately.
652 Parameters
653 ----------
654 ref : `DatasetRef`
655 Reference to an already stored dataset.
656 parameters : `dict`
657 Additional StorageClass-defined options to control reading,
658 typically used to efficiently read only a subset of the dataset.
660 Returns
661 -------
662 obj : `object`
663 The dataset.
664 """
665 return self.datastore.get(ref, parameters=parameters)
667 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
668 dataId: Optional[DataId] = None, *,
669 parameters: Union[dict, None] = None,
670 collections: Any = None,
671 **kwds: Any) -> DeferredDatasetHandle:
672 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
674 Parameters
675 ----------
676 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
677 When `DatasetRef` the `dataId` should be `None`.
678 Otherwise the `DatasetType` or name thereof.
679 dataId : `dict` or `DataCoordinate`, optional
680 A `dict` of `Dimension` link name, value pairs that label the
681 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
682 should be provided as the first argument.
683 parameters : `dict`
684 Additional StorageClass-defined options to control reading,
685 typically used to efficiently read only a subset of the dataset.
686 collections : Any, optional
687 Collections to be searched, overriding ``self.collections``.
688 Can be any of the types supported by the ``collections`` argument
689 to butler construction.
690 kwds
691 Additional keyword arguments used to augment or construct a
692 `DataId`. See `DataId` parameters.
694 Returns
695 -------
696 obj : `DeferredDatasetHandle`
697 A handle which can be used to retrieve a dataset at a later time.
699 Raises
700 ------
701 LookupError
702 Raised if no matching dataset exists in the `Registry`.
704 ValueError
705 Raised if a resolved `DatasetRef` was passed as an input, but it
706 differs from the one found in the registry.
707 TypeError
708 Raised if no collections were provided.
709 """
710 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
711 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
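# A sketch of deferred retrieval (hypothetical names): the registry lookup
# happens immediately, but the datastore read is postponed until `get` is
# called on the returned handle.
#
#     handle = butler.getDeferred("calexp", visit=42, detector=10)
#     exposure = handle.get()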
713 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
714 dataId: Optional[DataId] = None, *,
715 parameters: Optional[Dict[str, Any]] = None,
716 collections: Any = None,
717 **kwds: Any) -> Any:
718 """Retrieve a stored dataset.
720 Parameters
721 ----------
722 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
723 When `DatasetRef` the `dataId` should be `None`.
724 Otherwise the `DatasetType` or name thereof.
725 dataId : `dict` or `DataCoordinate`
726 A `dict` of `Dimension` link name, value pairs that label the
727 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
728 should be provided as the first argument.
729 parameters : `dict`
730 Additional StorageClass-defined options to control reading,
731 typically used to efficiently read only a subset of the dataset.
732 collections : Any, optional
733 Collections to be searched, overriding ``self.collections``.
734 Can be any of the types supported by the ``collections`` argument
735 to butler construction.
736 kwds
737 Additional keyword arguments used to augment or construct a
738 `DataCoordinate`. See `DataCoordinate.standardize`
739 parameters.
741 Returns
742 -------
743 obj : `object`
744 The dataset.
746 Raises
747 ------
748 ValueError
749 Raised if a resolved `DatasetRef` was passed as an input, but it
750 differs from the one found in the registry.
751 LookupError
752 Raised if no matching dataset exists in the `Registry`.
753 TypeError
754 Raised if no collections were provided.
755 """
756 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
757 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
758 return self.getDirect(ref, parameters=parameters)
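# A sketch of `get` calls (hypothetical names); `parameters` are
# interpreted by the dataset's StorageClass, typically to read only a
# subset of the stored object.
#
#     calexp = butler.get("calexp", visit=42, detector=10)
#     cutout = butler.get("calexp", visit=42, detector=10,
#                         parameters={"bbox": bbox})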
760 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
761 dataId: Optional[DataId] = None, *,
762 predict: bool = False,
763 collections: Any = None,
764 run: Optional[str] = None,
765 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
766 """Returns the URIs associated with the dataset.
768 Parameters
769 ----------
770 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
771 When `DatasetRef` the `dataId` should be `None`.
772 Otherwise the `DatasetType` or name thereof.
773 dataId : `dict` or `DataCoordinate`
774 A `dict` of `Dimension` link name, value pairs that label the
775 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
776 should be provided as the first argument.
777 predict : `bool`
778 If `True`, allow URIs to be returned for datasets that have not
779 yet been written.
780 collections : Any, optional
781 Collections to be searched, overriding ``self.collections``.
782 Can be any of the types supported by the ``collections`` argument
783 to butler construction.
784 run : `str`, optional
785 Run to use for predictions, overriding ``self.run``.
786 kwds
787 Additional keyword arguments used to augment or construct a
788 `DataCoordinate`. See `DataCoordinate.standardize`
789 parameters.
791 Returns
792 -------
793 primary : `ButlerURI`
794 The URI to the primary artifact associated with this dataset.
795 If the dataset was disassembled within the datastore this
796 may be `None`.
797 components : `dict`
798 URIs to any components associated with the dataset artifact.
799 Can be empty if there are no components.
800 """
801 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
802 collections=collections, **kwds)
803 if ref.id is None: # only possible if predict is True
804 if run is None:
805 run = self.run
806 if run is None:
807 raise TypeError("Cannot predict location with run=None.")
808 # Lie about ID, because we can't guess it, and only
809 # Datastore.getURIs() will ever see it (and it doesn't use it).
810 ref = ref.resolved(id=0, run=run)
811 return self.datastore.getURIs(ref, predict)
813 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
814 dataId: Optional[DataId] = None, *,
815 predict: bool = False,
816 collections: Any = None,
817 run: Optional[str] = None,
818 **kwds: Any) -> ButlerURI:
819 """Return the URI to the Dataset.
821 Parameters
822 ----------
823 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
824 When `DatasetRef` the `dataId` should be `None`.
825 Otherwise the `DatasetType` or name thereof.
826 dataId : `dict` or `DataCoordinate`
827 A `dict` of `Dimension` link name, value pairs that label the
828 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
829 should be provided as the first argument.
830 predict : `bool`
831 If `True`, allow URIs to be returned for datasets that have not
832 yet been written.
833 collections : Any, optional
834 Collections to be searched, overriding ``self.collections``.
835 Can be any of the types supported by the ``collections`` argument
836 to butler construction.
837 run : `str`, optional
838 Run to use for predictions, overriding ``self.run``.
839 kwds
840 Additional keyword arguments used to augment or construct a
841 `DataCoordinate`. See `DataCoordinate.standardize`
842 parameters.
844 Returns
845 -------
846 uri : `ButlerURI`
847 URI pointing to the Dataset within the datastore. If the
848 Dataset does not exist in the datastore, and if ``predict`` is
849 `True`, the URI will be a prediction and will include a URI
850 fragment "#predicted".
851 If the datastore does not have entities that relate well
852 to the concept of a URI the returned URI string will be
853 descriptive. The returned URI is not guaranteed to be obtainable.
855 Raises
856 ------
857 LookupError
858 Raised if a URI is requested for a dataset that does not exist and
859 guessing is not allowed.
860 ValueError
861 Raised if a resolved `DatasetRef` was passed as an input, but it
862 differs from the one found in the registry.
863 TypeError
864 Raised if no collections were provided.
865 RuntimeError
866 Raised if a URI is requested for a dataset that consists of
867 multiple artifacts.
868 """
869 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
870 collections=collections, run=run, **kwds)
872 if primary is None or components:
873 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
874 "Use Butler.getURIs() instead.")
875 return primary
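# A sketch of URI lookup (hypothetical names): with ``predict=True`` a
# plausible URI is returned even before the dataset has been written,
# carrying a "#predicted" fragment.
#
#     uri = butler.getURI("calexp", visit=42, detector=10)
#     future_uri = butler.getURI("calexp", visit=42, detector=10,
#                                predict=True, run="u/example/run")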
877 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
878 dataId: Optional[DataId] = None, *,
879 collections: Any = None,
880 **kwds: Any) -> bool:
881 """Return True if the Dataset is actually present in the Datastore.
883 Parameters
884 ----------
885 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
886 When `DatasetRef` the `dataId` should be `None`.
887 Otherwise the `DatasetType` or name thereof.
888 dataId : `dict` or `DataCoordinate`
889 A `dict` of `Dimension` link name, value pairs that label the
890 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
891 should be provided as the first argument.
892 collections : Any, optional
893 Collections to be searched, overriding ``self.collections``.
894 Can be any of the types supported by the ``collections`` argument
895 to butler construction.
896 kwds
897 Additional keyword arguments used to augment or construct a
898 `DataCoordinate`. See `DataCoordinate.standardize`
899 parameters.
901 Raises
902 ------
903 LookupError
904 Raised if the dataset is not even present in the Registry.
905 ValueError
906 Raised if a resolved `DatasetRef` was passed as an input, but it
907 differs from the one found in the registry.
908 TypeError
909 Raised if no collections were provided.
910 """
911 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
912 return self.datastore.exists(ref)
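# A sketch of checking datastore presence before reading (hypothetical
# names); note that `datasetExists` raises LookupError if the dataset is
# not known to the registry at all.
#
#     if butler.datasetExists("calexp", visit=42, detector=10):
#         calexp = butler.get("calexp", visit=42, detector=10)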
914 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
915 """Remove a collection and possibly prune datasets within it.
917 Parameters
918 ----------
919 name : `str`
920 Name of the collection to remove. If this is a
921 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
922 datasets within the collection are not modified unless ``unstore``
923 is `True`. If this is a `~CollectionType.RUN` collection,
924 ``purge`` and ``unstore`` must be `True`, and all datasets in it
925 are fully removed from the data repository.
926 purge : `bool`, optional
927 If `True`, permit `~CollectionType.RUN` collections to be removed,
928 fully removing datasets within them. Requires ``unstore=True`` as
929 well as an added precaution against accidental deletion. Must be
930 `False` (default) if the collection is not a ``RUN``.
931 unstore : `bool`, optional
932 If `True`, remove all datasets in the collection from all
933 datastores in which they appear.
935 Raises
936 ------
937 TypeError
938 Raised if the butler is read-only or arguments are mutually
939 inconsistent.
940 """
941 # See pruneDatasets comments for more information about the logic here;
942 # the cases are almost the same, but here we can rely on Registry to
943 # take care of everything but Datastore deletion when we remove the
944 # collection.
945 if not self.isWriteable():
946 raise TypeError("Butler is read-only.")
947 if purge and not unstore:
948 raise TypeError("Cannot pass purge=True without unstore=True.")
949 collectionType = self.registry.getCollectionType(name)
950 if collectionType is CollectionType.RUN and not purge:
951 raise TypeError(f"Cannot prune RUN collection {name} without purge=True.")
952 if collectionType is not CollectionType.RUN and purge:
953 raise TypeError(f"Cannot prune {collectionType.name} collection {name} with purge=True.")
954 with self.registry.transaction():
955 if unstore:
956 for ref in self.registry.queryDatasets(..., collections=name, deduplicate=True):
957 if self.datastore.exists(ref):
958 self.datastore.trash(ref)
959 self.registry.removeCollection(name)
960 if unstore:
961 # Point of no return for removing artifacts
962 self.datastore.emptyTrash()
964 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
965 disassociate: bool = True,
966 unstore: bool = False,
967 tags: Optional[Iterable[str]] = None,
968 purge: bool = False,
969 run: Optional[str] = None):
970 """Remove one or more datasets from a collection and/or storage.
972 Parameters
973 ----------
974 refs : `~collections.abc.Iterable` of `DatasetRef`
975 Datasets to prune. These must be "resolved" references (not just
976 a `DatasetType` and data ID).
977 disassociate : `bool`, optional
978 Disassociate pruned datasets from ``self.tags`` (or the collections
979 given via the ``tags`` argument). Ignored if ``refs`` is ``...``.
980 unstore : `bool`, optional
981 If `True` (`False` is default) remove these datasets from all
982 datastores known to this butler. Note that this will make it
983 impossible to retrieve these datasets even via other collections.
984 Datasets that are already not stored are ignored by this option.
985 tags : `Iterable` [ `str` ], optional
986 `~CollectionType.TAGGED` collections to disassociate the datasets
987 from, overriding ``self.tags``. Ignored if ``disassociate`` is
988 `False` or ``purge`` is `True`.
989 purge : `bool`, optional
990 If `True` (`False` is default), completely remove the dataset from
991 the `Registry`. To prevent accidental deletions, ``purge`` may
992 only be `True` if all of the following conditions are met:
994 - All given datasets are in the given run;
995 - ``disassociate`` is `True`;
996 - ``unstore`` is `True`.
998 This mode may remove provenance information from datasets other
999 than those provided, and should be used with extreme care.
1000 run : `str`, optional
1001 `~CollectionType.RUN` collection to purge from, overriding
1002 ``self.run``. Ignored unless ``purge`` is `True`.
1004 Raises
1005 ------
1006 TypeError
1007 Raised if the butler is read-only, if no collection was provided,
1008 or the conditions for ``purge=True`` were not met.
1009 """
1010 if not self.isWriteable():
1011 raise TypeError("Butler is read-only.")
1012 if purge:
1013 if not disassociate:
1014 raise TypeError("Cannot pass purge=True without disassociate=True.")
1015 if not unstore:
1016 raise TypeError("Cannot pass purge=True without unstore=True.")
1017 if run is None:
1018 run = self.run
1019 if run is None:
1020 raise TypeError("No run provided but purge=True.")
1021 collectionType = self.registry.getCollectionType(run)
1022 if collectionType is not CollectionType.RUN:
1023 raise TypeError(f"Cannot purge from collection '{run}' "
1024 f"of non-RUN type {collectionType.name}.")
1025 elif disassociate:
1026 if tags is None:
1027 tags = self.tags
1028 else:
1029 tags = tuple(tags)
1030 if not tags:
1031 raise TypeError("No tags provided but disassociate=True.")
1032 for tag in tags:
1033 collectionType = self.registry.getCollectionType(tag)
1034 if collectionType is not CollectionType.TAGGED:
1035 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1036 f"of non-TAGGED type {collectionType.name}.")
1037 # Transform possibly-single-pass iterable into something we can iterate
1038 # over multiple times.
1039 refs = list(refs)
1040 # Pruning a component of a DatasetRef makes no sense since registry
1041 # doesn't know about components and datastore might not store
1042 # components in a separate file
1043 for ref in refs:
1044 if ref.datasetType.component():
1045 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1046 # We don't need an unreliable Datastore transaction for this, because
1047 # we've been extra careful to ensure that Datastore.trash only involves
1048 # mutating the Registry (it can _look_ at Datastore-specific things,
1049 # but shouldn't change them), and hence all operations here are
1050 # Registry operations.
1051 with self.registry.transaction():
1052 if unstore:
1053 for ref in refs:
1054 # There is a difference between a concrete composite
1055 # and virtual composite. In a virtual composite the
1056 # datastore is never given the top level DatasetRef. In
1057 # the concrete composite the datastore knows all the
1058 # refs and will clean up itself if asked to remove the
1059 # parent ref. We can not check configuration for this
1060 # since we can not trust that the configuration is the
1061 # same. We therefore have to ask if the ref exists or
1062 # not. This is consistent with the fact that we want
1063 # to ignore already-removed-from-datastore datasets
1064 # anyway.
1065 if self.datastore.exists(ref):
1066 self.datastore.trash(ref)
1067 if purge:
1068 self.registry.removeDatasets(refs)
1069 elif disassociate:
1070 for tag in tags:
1071 self.registry.disassociate(tag, refs)
1072 # We've exited the Registry transaction, and apparently committed.
1073 # (if there was an exception, everything rolled back, and it's as if
1074 # nothing happened - and we never get here).
1075 # Datastore artifacts are not yet gone, but they're clearly marked
1076 # as trash, so if we fail to delete now because of (e.g.) filesystem
1077 # problems we can try again later, and if manual administrative
1078 # intervention is required, it's pretty clear what that should entail:
1079 # deleting everything on disk and in private Datastore tables that is
1080 # in the dataset_location_trash table.
1081 if unstore:
1082 # Point of no return for removing artifacts
1083 self.datastore.emptyTrash()
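# A sketch of removing stored artifacts while keeping registry entries
# (hypothetical dataset type and collection): query for the refs, then
# unstore them without disassociating or purging.
#
#     refs = butler.registry.queryDatasets("calexp",
#                                          collections="u/example/run")
#     butler.pruneDatasets(refs, disassociate=False, unstore=True)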
1085 @transactional
1086 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1087 tags: Optional[Iterable[str]] = None,):
1088 """Store and register one or more datasets that already exist on disk.
1090 Parameters
1091 ----------
1092 datasets : `FileDataset`
1093 Each positional argument is a struct containing information about
1094 a file to be ingested, including its path (either absolute or
1095 relative to the datastore root, if applicable), a `DatasetRef`,
1096 and optionally a formatter class or its fully-qualified string
1097 name. If a formatter is not provided, the formatter that would be
1098 used for `put` is assumed. On successful return, all
1099 `FileDataset.ref` attributes will have their `DatasetRef.id`
1100 attribute populated and all `FileDataset.formatter` attributes will
1101 be set to the formatter class used. `FileDataset.path` attributes
1102 may be modified to put paths in whatever the datastore considers a
1103 standardized form.
1104 transfer : `str`, optional
1105 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1106 'relsymlink' or 'symlink', indicating how to transfer the file.
1107 run : `str`, optional
1108 The name of the run ingested datasets should be added to,
1109 overriding ``self.run``.
1110 tags : `Iterable` [ `str` ], optional
1111 The names of `~CollectionType.TAGGED` collections to associate
1112 the dataset with, overriding ``self.tags``. These collections
1113 must have already been added to the `Registry`.
1115 Raises
1116 ------
1117 TypeError
1118 Raised if the butler is read-only or if no run was provided.
1119 NotImplementedError
1120 Raised if the `Datastore` does not support the given transfer mode.
1121 DatasetTypeNotSupportedError
1122 Raised if one or more files to be ingested have a dataset type that
1123 is not supported by the `Datastore`.
1124 FileNotFoundError
1125 Raised if one of the given files does not exist.
1126 FileExistsError
1127 Raised if transfer is not `None` but the (internal) location the
1128 file would be moved to is already occupied.
1130 Notes
1131 -----
1132 This operation is not fully exception safe: if a database operation
1133 fails, the given `FileDataset` instances may be only partially updated.
1135 It is atomic in terms of database operations (they will either all
1136 succeed or all fail) providing the database engine implements
1137 transactions correctly. It will attempt to be atomic in terms of
1138 filesystem operations as well, but this cannot be implemented
1139 rigorously for most datastores.
1140 """
1141 if not self.isWriteable():
1142 raise TypeError("Butler is read-only.")
1143 if run is None:
1144 if self.run is None:
1145 raise TypeError("No run provided.")
1146 run = self.run
1147 # No need to check run type, since insertDatasets will do that
1148 # (safely) for us.
1149 if tags is None:
1150 tags = self.tags
1151 else:
1152 tags = tuple(tags)
1153 for tag in tags:
1154 # Check that these are tagged collections up front, because we want
1155 # to avoid relying on Datastore transactionality to avoid modifying
1156 # the repo if there's an error later.
1157 collectionType = self.registry.getCollectionType(tag)
1158 if collectionType is not CollectionType.TAGGED:
1159 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1160 f"{collectionType.name}.")
1161 # Reorganize the inputs so they're grouped by DatasetType and then
1162 # data ID. We also include a list of DatasetRefs for each FileDataset
1163 # to hold the resolved DatasetRefs returned by the Registry, before
1164 # it's safe to swap them into FileDataset.refs.
1165 # Some type annotation aliases to make that clearer:
1166 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1167 GroupedData = MutableMapping[DatasetType, GroupForType]
1168 # The actual data structure:
1169 groupedData: GroupedData = defaultdict(dict)
1170 # And the nested loop that populates it:
1171 for dataset in datasets:
1172 # This list is intentionally shared across the inner loop, since it's
1173 # associated with `dataset`.
1174 resolvedRefs = []
1175 for ref in dataset.refs:
1176 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1178 # Now we can bulk-insert into Registry for each DatasetType.
1179 allResolvedRefs = []
1180 for datasetType, groupForType in groupedData.items():
1181 refs = self.registry.insertDatasets(datasetType,
1182 dataIds=groupForType.keys(),
1183 run=run)
1184 # Append those resolved DatasetRefs to the new lists we set up for
1185 # them.
1186 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1187 resolvedRefs.append(ref)
1189 # Go back to the original FileDatasets to replace their refs with the
1190 # new resolved ones, and also build a big list of all refs.
1191 allResolvedRefs = []
1192 for groupForType in groupedData.values():
1193 for dataset, resolvedRefs in groupForType.values():
1194 dataset.refs = resolvedRefs
1195 allResolvedRefs.extend(resolvedRefs)
1197 # Bulk-associate everything with any tagged collections.
1198 for tag in tags:
1199 self.registry.associate(tag, allResolvedRefs)
1201 # Bulk-insert everything into Datastore.
1202 self.datastore.ingest(*datasets, transfer=transfer)
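# A sketch of ingesting a file that already exists on disk (path, dataset
# type name, and data ID are hypothetical; the dataset type must already
# be registered and the data ID must match its dimensions):
#
#     datasetType = butler.registry.getDatasetType("raw")
#     ref = DatasetRef(datasetType, {"instrument": "Cam", "exposure": 42,
#                                    "detector": 10})
#     butler.ingest(FileDataset(path="/data/raw/file.fits", refs=[ref]),
#                   transfer="copy", run="u/example/raw")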
1204 @contextlib.contextmanager
1205 def export(self, *, directory: Optional[str] = None,
1206 filename: Optional[str] = None,
1207 format: Optional[str] = None,
1208 transfer: Optional[str] = None) -> ContextManager[RepoExport]:
1209 """Export datasets from the repository represented by this `Butler`.
1211 This method is a context manager that returns a helper object
1212 (`RepoExport`) that is used to indicate what information from the
1213 repository should be exported.
1215 Parameters
1216 ----------
1217 directory : `str`, optional
1218 Directory dataset files should be written to if ``transfer`` is not
1219 `None`.
1220 filename : `str`, optional
1221 Name for the file that will include database information associated
1222 with the exported datasets. If this is not an absolute path and
1223 ``directory`` is not `None`, it will be written to ``directory``
1224 instead of the current working directory. Defaults to
1225 "export.{format}".
1226 format : `str`, optional
1227 File format for the database information file. If `None`, the
1228 extension of ``filename`` will be used.
1229 transfer : `str`, optional
1230 Transfer mode passed to `Datastore.export`.
1232 Raises
1233 ------
1234 TypeError
1235 Raised if the set of arguments passed is inconsistent.
1237 Examples
1238 --------
1239 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1240 methods are used to provide the iterables over data IDs and/or datasets
1241 to be exported::
1243 with butler.export(filename="exports.yaml") as export:
1244 # Export all flats, and the calibration_label dimensions
1245 # associated with them.
1246 export.saveDatasets(butler.registry.queryDatasets("flat"),
1247 elements=[butler.registry.dimensions["calibration_label"]])
1248 # Export all datasets that start with "deepCoadd_" and all of
1249 # their associated data ID information.
1250 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1251 """
1252 if directory is None and transfer is not None:
1253 raise TypeError("Cannot transfer without providing a directory.")
1254 if transfer == "move":
1255 raise TypeError("Transfer may not be 'move': export is read-only")
1256 if format is None:
1257 if filename is None:
1258 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1259 else:
1260 _, format = os.path.splitext(filename)
1261 elif filename is None:
1262 filename = f"export.{format}"
1263 if directory is not None:
1264 filename = os.path.join(directory, filename)
1265 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1266 with open(filename, 'w') as stream:
1267 backend = BackendClass(stream)
1268 try:
1269 helper = RepoExport(self.registry, self.datastore, backend=backend,
1270 directory=directory, transfer=transfer)
1271 yield helper
1272 except BaseException:
1273 raise
1274 else:
1275 helper._finish()
1277 def import_(self, *, directory: Optional[str] = None,
1278 filename: Union[str, TextIO, None] = None,
1279 format: Optional[str] = None,
1280 transfer: Optional[str] = None,
1281 skip_dimensions: Optional[Set] = None):
1282 """Import datasets exported from a different butler repository.
1284 Parameters
1285 ----------
1286 directory : `str`, optional
1287 Directory containing dataset files. If `None`, all file paths
1288 must be absolute.
1289 filename : `str` or `TextIO`, optional
1290 A stream or name of file that contains database information
1291 associated with the exported datasets. If this is a string (name) and
1292 is not an absolute path, does not exist in the current working
1293 directory, and ``directory`` is not `None`, it is assumed to be in
1294 ``directory``. Defaults to "export.{format}".
1295 format : `str`, optional
1296 File format for the database information file. If `None`, the
1297 extension of ``filename`` will be used.
1298 transfer : `str`, optional
1299 Transfer mode passed to `Datastore.ingest`.
1300 skip_dimensions : `set`, optional
1301 Names of dimensions that should be skipped and not imported.
1303 Raises
1304 ------
1305 TypeError
1306 Raised if the set of arguments passed is inconsistent, or if the
1307 butler is read-only.
1308 """
1309 if not self.isWriteable():
1310 raise TypeError("Butler is read-only.")
1311 if format is None:
1312 if filename is None:
1313 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1314 else:
1315 _, format = os.path.splitext(filename)
1316 elif filename is None:
1317 filename = f"export.{format}"
1318 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1319 filename = os.path.join(directory, filename)
1320 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1322 def doImport(importStream):
1323 backend = BackendClass(importStream, self.registry)
1324 backend.register()
1325 with self.transaction():
1326 backend.load(self.datastore, directory=directory, transfer=transfer,
1327 skip_dimensions=skip_dimensions)
1329 if isinstance(filename, str):
1330 with open(filename, "r") as stream:
1331 doImport(stream)
1332 else:
1333 doImport(filename)
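# A sketch of a round trip between two repositories (file names, paths,
# and collections are hypothetical): export selected datasets from one
# butler, then import them into another.
#
#     with src_butler.export(filename="transfer.yaml", directory="/tmp/xfer",
#                            transfer="copy") as export:
#         export.saveDatasets(src_butler.registry.queryDatasets(
#             "calexp", collections="u/example/run"))
#     dst_butler.import_(filename="transfer.yaml", directory="/tmp/xfer",
#                        transfer="copy")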
1335 def validateConfiguration(self, logFailures: bool = False,
1336 datasetTypeNames: Optional[Iterable[str]] = None,
1337 ignore: Optional[Iterable[str]] = None):
1338 """Validate butler configuration.
1340 Checks that each `DatasetType` can be stored in the `Datastore`.
1342 Parameters
1343 ----------
1344 logFailures : `bool`, optional
1345 If `True`, output a log message for every validation error
1346 detected.
1347 datasetTypeNames : iterable of `str`, optional
1348 The `DatasetType` names that should be checked. This allows
1349 only a subset to be selected.
1350 ignore : iterable of `str`, optional
1351 Names of DatasetTypes to skip over. This can be used to skip
1352 known problems. If a named `DatasetType` corresponds to a
1353 composite, all components of that `DatasetType` will also be
1354 ignored.
1356 Raises
1357 ------
1358 ButlerValidationError
1359 Raised if there is some inconsistency with how this Butler
1360 is configured.
1361 """
1362 if datasetTypeNames:
1363 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1364 else:
1365 entities = list(self.registry.queryDatasetTypes())
1367 # filter out anything from the ignore list
1368 if ignore:
1369 ignore = set(ignore)
1370 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1371 else:
1372 ignore = set()
1374 # Find all the registered instruments
1375 instruments = set(
1376 record.name for record in self.registry.queryDimensionRecords("instrument")
1377 )
1379 # For each datasetType that has an instrument dimension, create
1380 # a DatasetRef for each defined instrument
1381 datasetRefs = []
1383 for datasetType in entities:
1384 if "instrument" in datasetType.dimensions:
1385 for instrument in instruments:
1386 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1387 datasetRefs.append(datasetRef)
1389 entities.extend(datasetRefs)
1391 datastoreErrorStr = None
1392 try:
1393 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1394 except ValidationError as e:
1395 datastoreErrorStr = str(e)
1397 # Also check that the LookupKeys used by the datastores match
1398 # registry and storage class definitions
1399 keys = self.datastore.getLookupKeys()
1401 failedNames = set()
1402 failedDataId = set()
1403 for key in keys:
1404 datasetType = None
1405 if key.name is not None:
1406 if key.name in ignore:
1407 continue
1409 # skip if specific datasetType names were requested and this
1410 # name does not match
1411 if datasetTypeNames and key.name not in datasetTypeNames:
1412 continue
1414 # See if it is a StorageClass or a DatasetType
1415 if key.name in self.storageClasses:
1416 pass
1417 else:
1418 try:
1419 self.registry.getDatasetType(key.name)
1420 except KeyError:
1421 if logFailures:
1422 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1423 failedNames.add(key)
1424 else:
1425 # Dimensions are checked for consistency when the Butler
1426 # is created and rendezvoused with a universe.
1427 pass
1429 # Check that the data ID override refers to a valid instrument.
1430 # Currently only 'instrument' overrides are supported, so check for that.
1431 if key.dataId:
1432 dataIdKeys = set(key.dataId)
1433 if set(["instrument"]) != dataIdKeys:
1434 if logFailures:
1435 log.fatal("Key '%s' has unsupported DataId override", key)
1436 failedDataId.add(key)
1437 elif key.dataId["instrument"] not in instruments:
1438 if logFailures:
1439 log.fatal("Key '%s' has unknown instrument", key)
1440 failedDataId.add(key)
1442 messages = []
1444 if datastoreErrorStr:
1445 messages.append(datastoreErrorStr)
1447 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1448 (failedDataId, "Keys with bad DataId entries: ")):
1449 if failed:
1450 msg += ", ".join(str(k) for k in failed)
1451 messages.append(msg)
1453 if messages:
1454 raise ValidationError(";\n".join(messages))
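# A sketch of validating a subset of dataset types while skipping known
# problem cases (the names are hypothetical):
#
#     butler.validateConfiguration(logFailures=True,
#                                  datasetTypeNames=["calexp", "src"],
#                                  ignore=["raw"])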
1456 registry: Registry
1457 """The object that manages dataset metadata and relationships (`Registry`).
1459 Most operations that don't involve reading or writing butler datasets are
1460 accessible only via `Registry` methods.
1461 """
1463 datastore: Datastore
1464 """The object that manages actual dataset storage (`Datastore`).
1466 Direct user access to the datastore should rarely be necessary; the primary
1467 exception is the case where a `Datastore` implementation provides extra
1468 functionality beyond what the base class defines.
1469 """
1471 storageClasses: StorageClassFactory
1472 """An object that maps known storage class names to objects that fully
1473 describe them (`StorageClassFactory`).
1474 """
1476 collections: Optional[CollectionSearch]
1477 """The collections to search and any restrictions on the dataset types to
1478 search for within them, in order (`CollectionSearch`).
1479 """
1481 run: Optional[str]
1482 """Name of the run this butler writes outputs to (`str` or `None`).
1483 """
1485 tags: Tuple[str, ...]
1486 """Names of `~CollectionType.TAGGED` collections this butler associates
1487 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1488 (`tuple` [ `str` ]).
1489 """