Coverage for python/lsst/daf/butler/_butler.py : 9%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 Mapping,
41 MutableMapping,
42 Optional,
43 TextIO,
44 Tuple,
45 Union,
46)
48try:
49 import boto3
50except ImportError:
51 boto3 = None
53from lsst.utils import doImport
54from .core import (
55 ButlerURI,
56 Config,
57 ConfigSubset,
58 DataCoordinate,
59 DataId,
60 DatasetRef,
61 DatasetType,
62 Datastore,
63 FileDataset,
64 RepoExport,
65 StorageClassFactory,
66 ValidationError,
67)
68from .core.repoRelocation import BUTLER_ROOT_TAG
69from .core.utils import transactional, getClassOf, safeMakeDir
70from .core.s3utils import bucketExists
71from ._deferredDatasetHandle import DeferredDatasetHandle
72from ._butlerConfig import ButlerConfig
73from .registry import Registry, RegistryConfig, CollectionType
74from .registry.wildcards import CollectionSearch
76log = logging.getLogger(__name__)
79class ButlerValidationError(ValidationError):
80 """There is a problem with the Butler configuration."""
81 pass
84class Butler:
85 """Main entry point for the data access system.
87 Parameters
88 ----------
89 config : `ButlerConfig`, `Config` or `str`, optional
90 Configuration. Anything acceptable to the
91 `ButlerConfig` constructor. If a directory path
92 is given the configuration will be read from a ``butler.yaml`` file in
93 that location. If `None` is given default values will be used.
94 butler : `Butler`, optional
95 If provided, construct a new Butler that uses the same registry and
96 datastore as the given one, but with the given collection and run.
97 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
98 arguments.
99 collections : `Any`, optional
100 An expression specifying the collections to be searched (in order) when
101 reading datasets, and optionally dataset type restrictions on them.
102 This may be:
103 - a `str` collection name;
104 - a tuple of (collection name, *dataset type restriction*);
105 - an iterable of either of the above;
106 - a mapping from `str` to *dataset type restriction*.
108 See :ref:`daf_butler_collection_expressions` for more information,
109 including the definition of a *dataset type restriction*. All
110 collections must either already exist or be specified to be created
111 by other arguments.
112 run : `str`, optional
113 Name of the run datasets should be output to. If the run
114 does not exist, it will be created. If ``collections`` is `None`, it
115 will be set to ``[run]``. If this is not set (and ``writeable`` is
116 not set either), a read-only butler will be created.
117 tags : `Iterable` [ `str` ], optional
118 A list of `~CollectionType.TAGGED` collections that datasets should be
119 associated with in `put` or `ingest` and disassociated from in
120 `pruneDatasets`. If any of these collections does not exist, it will
121 be created.
122 chains : `Mapping` [ `str`, `Iterable` [ `str` ] ], optional
123 A mapping from the names of new `~CollectionType.CHAINED` collections
124 to an expression identifying their child collections (which takes the
125 same form as the ``collections`` argument). Chains may be nested only
126 if children precede their parents in this mapping.
127 searchPaths : `list` of `str`, optional
128 Directory paths to search when calculating the full Butler
129 configuration. Not used if the supplied config is already a
130 `ButlerConfig`.
131 writeable : `bool`, optional
132 Explicitly sets whether the butler supports write operations. If not
133 provided, a read-write butler is created if any of ``run``, ``tags``,
134 or ``chains`` is non-empty.
136 Examples
137 --------
138 While there are many ways to control exactly how a `Butler` interacts with
139 the collections in its `Registry`, the most common cases are still simple.
141 For a read-only `Butler` that searches one collection, do::
143 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
145 For a read-write `Butler` that writes to and reads from a
146 `~CollectionType.RUN` collection::
148 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
150 The `Butler` passed to a ``PipelineTask`` is often much more complex,
151 because we want to write to one `~CollectionType.RUN` collection but read
152 from several others (as well), while defining a new
153 `~CollectionType.CHAINED` collection that combines them all::
155 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
156 collections=["u/alice/DM-50000"],
157 chains={
158 "u/alice/DM-50000": ["u/alice/DM-50000/a",
159 "u/bob/DM-49998",
160 "raw/hsc"]
161 })
163 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``, but
164 they'll also be available from the chained collection ``u/alice/DM-50000``.
165 Datasets will be read first from that run (since it appears first in the
166 chain), and then from ``u/bob/DM-49998`` and finally ``raw/hsc``.
167 If ``u/alice/DM-50000`` had already been defined, the ``chains`` argument
168 would be unnecessary. We could also construct a butler that performs
169 exactly the same `put` and `get` operations without actually creating a
170 chained collection, just by passing multiple items in ``collections``::
172 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
173 collections=["u/alice/DM-50000/a",
174 "u/bob/DM-49998",
175 "raw/hsc"])
177 Finally, one can always create a `Butler` with no collections::
179 butler = Butler("/path/to/repo", writeable=True)
181 This can be extremely useful when you just want to use ``butler.registry``,
182 e.g. for inserting dimension data or managing collections, or when the
183 collections you want to use with the butler are not consistent.
184 Passing ``writeable`` explicitly here is only necessary if you want to be
185 able to make changes to the repo; usually the value for ``writeable``
186 can be guessed from the collection arguments provided, but it defaults to
187 `False` when no collection arguments are given.
188 """
189 def __init__(self, config: Union[Config, str, None] = None, *,
190 butler: Optional[Butler] = None,
191 collections: Any = None,
192 run: Optional[str] = None,
193 tags: Iterable[str] = (),
194 chains: Optional[Mapping[str, Any]] = None,
195 searchPaths: Optional[List[str]] = None,
196 writeable: Optional[bool] = None):
197 # Transform any single-pass iterator into an actual sequence so we
198 # can see if it's empty.
199 self.tags = tuple(tags)
200 # Load registry, datastore, etc. from config or existing butler.
201 if butler is not None:
202 if config is not None or searchPaths is not None or writeable is not None:
203 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
204 "arguments with 'butler' argument.")
205 self.registry = butler.registry
206 self.datastore = butler.datastore
207 self.storageClasses = butler.storageClasses
208 self._config = butler._config
209 else:
210 self._config = ButlerConfig(config, searchPaths=searchPaths)
211 if "root" in self._config:
212 butlerRoot = self._config["root"]
213 else:
214 butlerRoot = self._config.configDir
215 if writeable is None:
216 writeable = run is not None or chains is not None or self.tags
217 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
218 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
219 butlerRoot=butlerRoot)
220 self.storageClasses = StorageClassFactory()
221 self.storageClasses.addFromConfig(self._config)
222 # Check the many collection arguments for consistency and create any
223 # needed collections that don't exist.
224 if collections is None:
225 if run is not None:
226 collections = (run,)
227 else:
228 collections = ()
229 self.collections = CollectionSearch.fromExpression(collections)
230 if chains is None:
231 chains = {}
232 self.run = run
233 if "run" in self._config or "collection" in self._config:
234 raise ValueError("Passing a run or collection via configuration is no longer supported.")
235 if self.run is not None:
236 self.registry.registerCollection(self.run, type=CollectionType.RUN)
237 for tag in self.tags:
238 self.registry.registerCollection(tag, type=CollectionType.TAGGED)
239 for parent, children in chains.items():
240 self.registry.registerCollection(parent, type=CollectionType.CHAINED)
241 self.registry.setCollectionChain(parent, children)
243 GENERATION: ClassVar[int] = 3
244 """This is a Generation 3 Butler.
246 This attribute may be removed in the future, once the Generation 2 Butler
247 interface has been fully retired; it should only be used in transitional
248 code.
249 """
251 @staticmethod
252 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
253 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
254 forceConfigRoot: bool = True, outfile: Optional[str] = None,
255 overwrite: bool = False) -> Config:
256 """Create an empty data repository by adding a butler.yaml config
257 to a repository root directory.
259 Parameters
260 ----------
261 root : `str` or `ButlerURI`
262 Path or URI to the root location of the new repository. Will be
263 created if it does not exist.
264 config : `Config` or `str`, optional
265 Configuration to write to the repository, after setting any
266 root-dependent Registry or Datastore config options. Can not
267 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
268 configuration will be used. Root-dependent config options
269 specified in this config are overwritten if ``forceConfigRoot``
270 is `True`.
271 standalone : `bool`
272 If `True`, write all expanded defaults, not just customized or
273 repository-specific settings.
274 This (mostly) decouples the repository from the default
275 configuration, insulating it from changes to the defaults (which
276 may be good or bad, depending on the nature of the changes).
277 Future *additions* to the defaults will still be picked up when
278 initializing `Butlers` to repos created with ``standalone=True``.
279 createRegistry : `bool`, optional
280 If `True` create a new Registry.
281 searchPaths : `list` of `str`, optional
282 Directory paths to search when calculating the full butler
283 configuration.
284 forceConfigRoot : `bool`, optional
285 If `False`, any values present in the supplied ``config`` that
286 would normally be reset are not overridden and will appear
287 directly in the output config. This allows non-standard overrides
288 of the root directory for a datastore or registry to be given.
289 If this parameter is `True` the values for ``root`` will be
290 forced into the resulting config if appropriate.
291 outfile : `str`, optional
292 If not `None`, the output configuration will be written to this
293 location rather than into the repository itself. Can be a URI
294 string. Can refer to a directory that will be used to write
295 ``butler.yaml``.
296 overwrite : `bool`, optional
297 Create a new configuration file even if one already exists
298 in the specified output location. Default is to raise
299 an exception.
301 Returns
302 -------
303 config : `Config`
304 The updated `Config` instance written to the repo.
306 Raises
307 ------
308 ValueError
309 Raised if a ButlerConfig or ConfigSubset is passed instead of a
310 regular Config (as these subclasses would make it impossible to
311 support ``standalone=False``).
312 FileExistsError
313 Raised if the output config file already exists.
314 os.error
315 Raised if the directory does not exist, exists but is not a
316 directory, or cannot be created.
318 Notes
319 -----
320 Note that when ``standalone=False`` (the default), the configuration
321 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
322 construct the repository should also be used to construct any Butlers
323 to avoid configuration inconsistencies.
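Examples
--------
A minimal sketch of creating a new repository with the default
configuration and then constructing a `Butler` against it (the
repository path and run name are illustrative)::

    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", run="u/alice/ingest")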
324 """
325 if isinstance(config, (ButlerConfig, ConfigSubset)):
326 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
328 # for "file" schemes we are assuming POSIX semantics for paths, for
329 # schemeless URIs we are assuming os.path semantics.
330 uri = ButlerURI(root, forceDirectory=True)
331 if uri.scheme == "file" or not uri.scheme:
332 if not os.path.isdir(uri.ospath):
333 safeMakeDir(uri.ospath)
334 elif uri.scheme == "s3":
335 # bucket must already exist
336 if not bucketExists(uri.netloc):
337 raise ValueError(f"Bucket {uri.netloc} does not exist!")
338 s3 = boto3.client("s3")
340 # don't create S3 key when root is at the top-level of a bucket
340 if not uri.path == "/":
341 s3.put_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
342 else:
343 raise ValueError(f"Unrecognized scheme: {uri.scheme}")
344 config = Config(config)
346 # If we are creating a new repo from scratch with relative roots,
347 # do not propagate an explicit root from the config file
348 if "root" in config:
349 del config["root"]
351 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
352 datastoreClass = doImport(full["datastore", "cls"])
353 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
355 # if key exists in given config, parse it, otherwise parse the defaults
356 # in the expanded config
357 if config.get(("registry", "db")):
358 registryConfig = RegistryConfig(config)
359 else:
360 registryConfig = RegistryConfig(full)
361 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
362 if defaultDatabaseUri is not None:
363 Config.updateParameters(RegistryConfig, config, full,
364 toUpdate={"db": defaultDatabaseUri},
365 overwrite=forceConfigRoot)
366 else:
367 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
368 overwrite=forceConfigRoot)
370 if standalone:
371 config.merge(full)
372 if outfile is not None:
373 # When writing to a separate location we must include
374 # the root of the butler repo in the config, else it won't know
375 # where to look.
376 config["root"] = uri.geturl()
377 configURI = outfile
378 else:
379 configURI = uri
380 config.dumpToUri(configURI, overwrite=overwrite)
382 # Create Registry and populate tables
383 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
384 return config
386 @classmethod
387 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
388 tags: Tuple[str, ...], writeable: bool) -> Butler:
389 """Callable used to unpickle a Butler.
391 We prefer not to use ``Butler.__init__`` directly so we can force some
392 of its many arguments to be keyword-only (note that ``__reduce__``
393 can only invoke callables with positional arguments).
395 Parameters
396 ----------
397 config : `ButlerConfig`
398 Butler configuration, already coerced into a true `ButlerConfig`
399 instance (and hence after any search paths for overrides have been
400 utilized).
401 collections : `CollectionSearch`
402 Names of collections to read from.
403 run : `str`, optional
404 Name of `~CollectionType.RUN` collection to write to.
405 tags : `tuple` [`str`]
406 Names of `~CollectionType.TAGGED` collections to associate with.
407 writeable : `bool`
408 Whether the Butler should support write operations.
410 Returns
411 -------
412 butler : `Butler`
413 A new `Butler` instance.
414 """
415 return cls(config=config, collections=collections, run=run, tags=tags, writeable=writeable)
417 def __reduce__(self):
418 """Support pickling.
419 """
420 return (Butler._unpickle, (self._config, self.collections, self.run, self.tags,
421 self.registry.isWriteable()))
423 def __str__(self):
424 return "Butler(collections={}, run={}, tags={}, datastore='{}', registry='{}')".format(
425 self.collections, self.run, self.tags, self.datastore, self.registry)
427 def isWriteable(self) -> bool:
428 """Return `True` if this `Butler` supports write operations.
429 """
430 return self.registry.isWriteable()
432 @contextlib.contextmanager
433 def transaction(self):
434 """Context manager supporting `Butler` transactions.
436 Transactions can be nested.
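For example, to make two related writes atomic (a sketch; ``calexp``,
``background``, and ``dataId`` stand in for objects and a data ID
defined elsewhere)::

    with butler.transaction():
        butler.put(calexp, "calexp", dataId)
        butler.put(background, "calexpBackground", dataId)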
437 """
438 with self.registry.transaction():
439 with self.datastore.transaction():
440 yield
442 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
443 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
444 """Standardize the arguments passed to several Butler APIs.
446 Parameters
447 ----------
448 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
449 When `DatasetRef` is provided, ``dataId`` should be `None`.
450 Otherwise the `DatasetType` or name thereof.
451 dataId : `dict` or `DataCoordinate`
452 A `dict` of `Dimension` link name, value pairs that label the
453 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
454 should be provided as the first argument.
455 kwds
456 Additional keyword arguments used to augment or construct a
457 `DataCoordinate`. See `DataCoordinate.standardize`
458 parameters.
460 Returns
461 -------
462 datasetType : `DatasetType`
463 A `DatasetType` instance extracted from ``datasetRefOrType``.
464 dataId : `dict` or `DataId`, optional
465 Argument that can be used (along with ``kwds``) to construct a
466 `DataId`.
468 Notes
469 -----
470 Butler APIs that conceptually need a DatasetRef also allow passing a
471 `DatasetType` (or the name of one) and a `DataId` (or a dict and
472 keyword arguments that can be used to construct one) separately. This
473 method accepts those arguments and always returns a true `DatasetType`
474 and a `DataId` or `dict`.
476 Standardization of `dict` vs `DataId` is best handled by passing the
477 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
478 generally similarly flexible.
479 """
480 externalDatasetType = None
481 internalDatasetType = None
482 if isinstance(datasetRefOrType, DatasetRef):
483 if dataId is not None or kwds:
484 raise ValueError("DatasetRef given, cannot use dataId as well")
485 externalDatasetType = datasetRefOrType.datasetType
486 dataId = datasetRefOrType.dataId
487 else:
488 # Don't check whether DataId is provided, because Registry APIs
489 # can usually construct a better error message when it wasn't.
490 if isinstance(datasetRefOrType, DatasetType):
491 externalDatasetType = datasetRefOrType
492 else:
493 internalDatasetType = self.registry.getDatasetType(datasetRefOrType)
495 # Check that they are self-consistent
496 if externalDatasetType is not None:
497 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
498 if externalDatasetType != internalDatasetType:
499 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
500 f"registry definition ({internalDatasetType})")
502 return internalDatasetType, dataId
504 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
505 dataId: Optional[DataId] = None, *,
506 collections: Any = None,
507 allowUnresolved: bool = False,
508 **kwds: Any) -> DatasetRef:
509 """Shared logic for methods that start with a search for a dataset in
510 the registry.
512 Parameters
513 ----------
514 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
515 When `DatasetRef` is provided, ``dataId`` should be `None`.
516 Otherwise the `DatasetType` or name thereof.
517 dataId : `dict` or `DataCoordinate`, optional
518 A `dict` of `Dimension` link name, value pairs that label the
519 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
520 should be provided as the first argument.
521 collections : Any, optional
522 Collections to be searched, overriding ``self.collections``.
523 Can be any of the types supported by the ``collections`` argument
524 to butler construction.
525 allowUnresolved : `bool`, optional
526 If `True`, return an unresolved `DatasetRef` if finding a resolved
527 one in the `Registry` fails. Defaults to `False`.
528 kwds
529 Additional keyword arguments used to augment or construct a
530 `DataId`. See `DataId` parameters.
532 Returns
533 -------
534 ref : `DatasetRef`
535 A reference to the dataset identified by the given arguments.
537 Raises
538 ------
539 LookupError
540 Raised if no matching dataset exists in the `Registry` (and
541 ``allowUnresolved is False``).
542 ValueError
543 Raised if a resolved `DatasetRef` was passed as an input, but it
544 differs from the one found in the registry.
545 TypeError
546 Raised if no collections were provided.
547 """
548 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
549 if isinstance(datasetRefOrType, DatasetRef):
550 idNumber = datasetRefOrType.id
551 else:
552 idNumber = None
553 # Expand the data ID first instead of letting registry.findDataset do
554 # it, so we get the result even if it returns None.
555 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
556 if collections is None:
557 collections = self.collections
558 if not collections:
559 raise TypeError("No input collections provided.")
560 else:
561 collections = CollectionSearch.fromExpression(collections)
562 # Always look up the DatasetRef, even if one is given, to ensure it is
563 # present in the current collection.
564 ref = self.registry.findDataset(datasetType, dataId, collections=collections)
565 if ref is None:
566 if allowUnresolved:
567 return DatasetRef(datasetType, dataId)
568 else:
569 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
570 f"could not be found in collections {collections}.")
571 if idNumber is not None and idNumber != ref.id:
572 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
573 f"id ({ref.id}) in registry in collections {collections}.")
574 return ref
576 @transactional
577 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
578 dataId: Optional[DataId] = None, *,
579 run: Optional[str] = None,
580 tags: Optional[Iterable[str]] = None,
581 **kwds: Any) -> DatasetRef:
582 """Store and register a dataset.
584 Parameters
585 ----------
586 obj : `object`
587 The dataset.
588 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
589 When `DatasetRef` is provided, ``dataId`` should be `None`.
590 Otherwise the `DatasetType` or name thereof.
591 dataId : `dict` or `DataCoordinate`
592 A `dict` of `Dimension` link name, value pairs that label the
593 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
594 should be provided as the second argument.
595 run : `str`, optional
596 The name of the run the dataset should be added to, overriding
597 ``self.run``.
598 tags : `Iterable` [ `str` ], optional
599 The names of `~CollectionType.TAGGED` collections to associate
600 the dataset with, overriding ``self.tags``. These collections
601 must have already been added to the `Registry`.
602 kwds
603 Additional keyword arguments used to augment or construct a
604 `DataCoordinate`. See `DataCoordinate.standardize`
605 parameters.
607 Returns
608 -------
609 ref : `DatasetRef`
610 A reference to the stored dataset, updated with the correct id if
611 given.
613 Raises
614 ------
615 TypeError
616 Raised if the butler is read-only or if no run has been provided.
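Examples
--------
A minimal sketch; ``exposure`` stands in for an in-memory object
matching the dataset type's storage class, and the dataset type name
and data ID values are illustrative::

    ref = butler.put(exposure, "calexp",
                     instrument="HSC", visit=903334, detector=20)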
617 """
618 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
619 if not self.isWriteable():
620 raise TypeError("Butler is read-only.")
621 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
622 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
623 raise ValueError("DatasetRef must not be in registry, must have None id")
625 if run is None:
626 if self.run is None:
627 raise TypeError("No run provided.")
628 run = self.run
629 # No need to check type for run; first thing we do is
630 # insertDatasets, and that will check for us.
632 if tags is None:
633 tags = self.tags
634 else:
635 tags = tuple(tags)
636 for tag in tags:
637 # Check that these are tagged collections up front, because we want
638 # to avoid relying on Datastore transactionality to avoid modifying
639 # the repo if there's an error later.
640 collectionType = self.registry.getCollectionType(tag)
641 if collectionType is not CollectionType.TAGGED:
642 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
643 f"{collectionType.name}.")
645 # Add Registry Dataset entry.
646 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
647 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
649 # Add Datastore entry.
650 self.datastore.put(obj, ref)
652 for tag in tags:
653 self.registry.associate(tag, [ref])
655 return ref
657 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
658 """Retrieve a stored dataset.
660 Unlike `Butler.get`, this method allows datasets outside the Butler's
661 collection to be read as long as the `DatasetRef` that identifies them
662 can be obtained separately.
664 Parameters
665 ----------
666 ref : `DatasetRef`
667 Reference to an already stored dataset.
668 parameters : `dict`
669 Additional StorageClass-defined options to control reading,
670 typically used to efficiently read only a subset of the dataset.
672 Returns
673 -------
674 obj : `object`
675 The dataset.
676 """
677 return self.datastore.get(ref, parameters=parameters)
679 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
680 dataId: Optional[DataId] = None, *,
681 parameters: Union[dict, None] = None,
682 collections: Any = None,
683 **kwds: Any) -> DeferredDatasetHandle:
684 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
686 Parameters
687 ----------
688 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
689 When `DatasetRef` is provided, ``dataId`` should be `None`.
690 Otherwise the `DatasetType` or name thereof.
691 dataId : `dict` or `DataCoordinate`, optional
692 A `dict` of `Dimension` link name, value pairs that label the
693 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
694 should be provided as the first argument.
695 parameters : `dict`
696 Additional StorageClass-defined options to control reading,
697 typically used to efficiently read only a subset of the dataset.
698 collections : Any, optional
699 Collections to be searched, overriding ``self.collections``.
700 Can be any of the types supported by the ``collections`` argument
701 to butler construction.
702 kwds
703 Additional keyword arguments used to augment or construct a
704 `DataId`. See `DataId` parameters.
706 Returns
707 -------
708 obj : `DeferredDatasetHandle`
709 A handle which can be used to retrieve a dataset at a later time.
711 Raises
712 ------
713 LookupError
714 Raised if no matching dataset exists in the `Registry`.
716 ValueError
717 Raised if a resolved `DatasetRef` was passed as an input, but it
718 differs from the one found in the registry.
719 TypeError
720 Raised if no collections were provided.
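Examples
--------
A minimal sketch (dataset type name and data ID values are
illustrative); the dataset is only read when the handle's ``get``
method is called::

    handle = butler.getDeferred("calexp",
                                instrument="HSC", visit=903334,
                                detector=20)
    # ... later ...
    calexp = handle.get()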
721 """
722 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
723 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
725 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
726 dataId: Optional[DataId] = None, *,
727 parameters: Optional[Dict[str, Any]] = None,
728 collections: Any = None,
729 **kwds: Any) -> Any:
730 """Retrieve a stored dataset.
732 Parameters
733 ----------
734 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
735 When `DatasetRef` is provided, ``dataId`` should be `None`.
736 Otherwise the `DatasetType` or name thereof.
737 dataId : `dict` or `DataCoordinate`
738 A `dict` of `Dimension` link name, value pairs that label the
739 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
740 should be provided as the first argument.
741 parameters : `dict`
742 Additional StorageClass-defined options to control reading,
743 typically used to efficiently read only a subset of the dataset.
744 collections : Any, optional
745 Collections to be searched, overriding ``self.collections``.
746 Can be any of the types supported by the ``collections`` argument
747 to butler construction.
748 kwds
749 Additional keyword arguments used to augment or construct a
750 `DataCoordinate`. See `DataCoordinate.standardize`
751 parameters.
753 Returns
754 -------
755 obj : `object`
756 The dataset.
758 Raises
759 ------
760 ValueError
761 Raised if a resolved `DatasetRef` was passed as an input, but it
762 differs from the one found in the registry.
763 LookupError
764 Raised if no matching dataset exists in the `Registry`.
765 TypeError
766 Raised if no collections were provided.
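Examples
--------
A minimal sketch (dataset type name and data ID values are
illustrative and must match datasets in the searched collections)::

    calexp = butler.get("calexp",
                        instrument="HSC", visit=903334, detector=20)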
767 """
768 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
769 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
770 return self.getDirect(ref, parameters=parameters)
772 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
773 dataId: Optional[DataId] = None, *,
774 predict: bool = False,
775 collections: Any = None,
776 run: Optional[str] = None,
777 **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
778 """Return the URIs associated with the dataset.
780 Parameters
781 ----------
782 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
783 When `DatasetRef` is provided, ``dataId`` should be `None`.
784 Otherwise the `DatasetType` or name thereof.
785 dataId : `dict` or `DataCoordinate`
786 A `dict` of `Dimension` link name, value pairs that label the
787 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
788 should be provided as the first argument.
789 predict : `bool`
790 If `True`, allow URIs to be returned for datasets that have not
791 been written.
792 collections : Any, optional
793 Collections to be searched, overriding ``self.collections``.
794 Can be any of the types supported by the ``collections`` argument
795 to butler construction.
796 run : `str`, optional
797 Run to use for predictions, overriding ``self.run``.
798 kwds
799 Additional keyword arguments used to augment or construct a
800 `DataCoordinate`. See `DataCoordinate.standardize`
801 parameters.
803 Returns
804 -------
805 primary : `ButlerURI`
806 The URI to the primary artifact associated with this dataset.
807 If the dataset was disassembled within the datastore this
808 may be `None`.
809 components : `dict`
810 URIs to any components associated with the dataset artifact.
811 Can be empty if there are no components.
812 """
813 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
814 collections=collections, **kwds)
815 if ref.id is None: # only possible if predict is True
816 if run is None:
817 run = self.run
818 if run is None:
819 raise TypeError("Cannot predict location with run=None.")
820 # Lie about ID, because we can't guess it, and only
821 # Datastore.getURIs() will ever see it (and it doesn't use it).
822 ref = ref.resolved(id=0, run=run)
823 return self.datastore.getURIs(ref, predict)
825 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
826 dataId: Optional[DataId] = None, *,
827 predict: bool = False,
828 collections: Any = None,
829 run: Optional[str] = None,
830 **kwds: Any) -> ButlerURI:
831 """Return the URI to the Dataset.
833 Parameters
834 ----------
835 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
836 When `DatasetRef` is provided, ``dataId`` should be `None`.
837 Otherwise the `DatasetType` or name thereof.
838 dataId : `dict` or `DataCoordinate`
839 A `dict` of `Dimension` link name, value pairs that label the
840 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
841 should be provided as the first argument.
842 predict : `bool`
843 If `True`, allow URIs to be returned for datasets that have not
844 been written.
845 collections : Any, optional
846 Collections to be searched, overriding ``self.collections``.
847 Can be any of the types supported by the ``collections`` argument
848 to butler construction.
849 run : `str`, optional
850 Run to use for predictions, overriding ``self.run``.
851 kwds
852 Additional keyword arguments used to augment or construct a
853 `DataCoordinate`. See `DataCoordinate.standardize`
854 parameters.
856 Returns
857 -------
858 uri : `ButlerURI`
859 URI pointing to the Dataset within the datastore. If the
860 Dataset does not exist in the datastore, and if ``predict`` is
861 `True`, the URI will be a prediction and will include a URI
862 fragment "#predicted".
863 If the datastore does not have entities that relate well
864 to the concept of a URI the returned URI string will be
865 descriptive. The returned URI is not guaranteed to be obtainable.
867 Raises
868 ------
869 LookupError
870 Raised if a URI has been requested for a dataset that does not
871 exist and ``predict`` is `False`.
872 ValueError
873 Raised if a resolved `DatasetRef` was passed as an input, but it
874 differs from the one found in the registry.
875 TypeError
876 Raised if no collections were provided.
877 RuntimeError
878 Raised if a URI is requested for a dataset that consists of
879 multiple artifacts.
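Examples
--------
A minimal sketch, assuming ``dataId`` is a data ID defined elsewhere;
the dataset type and run name are illustrative. With ``predict=True``
a location can be guessed for a dataset that has not been written
yet::

    uri = butler.getURI("calexp", dataId)
    predicted = butler.getURI("calexp", dataId, predict=True,
                              run="u/alice/DM-50000/a")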
880 """
881 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
882 collections=collections, run=run, **kwds)
884 if primary is None or components:
885 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
886 "Use Butler.getURIs() instead.")
887 return primary
889 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
890 dataId: Optional[DataId] = None, *,
891 collections: Any = None,
892 **kwds: Any) -> bool:
893 """Return `True` if the Dataset is actually present in the Datastore.
895 Parameters
896 ----------
897 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
898 When `DatasetRef` is provided, ``dataId`` should be `None`.
899 Otherwise the `DatasetType` or name thereof.
900 dataId : `dict` or `DataCoordinate`
901 A `dict` of `Dimension` link name, value pairs that label the
902 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
903 should be provided as the first argument.
904 collections : Any, optional
905 Collections to be searched, overriding ``self.collections``.
906 Can be any of the types supported by the ``collections`` argument
907 to butler construction.
908 kwds
909 Additional keyword arguments used to augment or construct a
910 `DataCoordinate`. See `DataCoordinate.standardize`
911 parameters.
913 Raises
914 ------
915 LookupError
916 Raised if the dataset is not even present in the Registry.
917 ValueError
918 Raised if a resolved `DatasetRef` was passed as an input, but it
919 differs from the one found in the registry.
920 TypeError
921 Raised if no collections were provided.
922 """
923 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
924 return self.datastore.exists(ref)
926 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False):
927 """Remove a collection and possibly prune datasets within it.
929 Parameters
930 ----------
931 name : `str`
932 Name of the collection to remove. If this is a
933 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
934 datasets within the collection are not modified unless ``unstore``
935 is `True`. If this is a `~CollectionType.RUN` collection,
936 ``purge`` and ``unstore`` must be `True`, and all datasets in it
937 are fully removed from the data repository.
938 purge : `bool`, optional
939 If `True`, permit `~CollectionType.RUN` collections to be removed,
940 fully removing datasets within them. Requires ``unstore=True`` as
941 well, as an added precaution against accidental deletion. Must be
942 `False` (default) if the collection is not a ``RUN``.
943 unstore : `bool`, optional
944 If `True`, remove all datasets in the collection from all
945 datastores in which they appear.
947 Raises
948 ------
949 TypeError
950 Raised if the butler is read-only or arguments are mutually
951 inconsistent.
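Examples
--------
A minimal sketch (the collection name is illustrative); removing a
`~CollectionType.RUN` collection and all of its datasets requires both
``purge`` and ``unstore``::

    butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)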
952 """
953 # See pruneDatasets comments for more information about the logic here;
954 # the cases are almost the same, but here we can rely on Registry to
955 # take care of everything but Datastore deletion when we remove the
956 # collection.
957 if not self.isWriteable():
958 raise TypeError("Butler is read-only.")
959 if purge and not unstore:
960 raise TypeError("Cannot pass purge=True without unstore=True.")
961 collectionType = self.registry.getCollectionType(name)
962 if collectionType is CollectionType.RUN and not purge:
963 raise TypeError(f"Cannot prune RUN collection {name} without purge=True.")
964 if collectionType is not CollectionType.RUN and purge:
965 raise TypeError(f"Cannot prune {collectionType.name} collection {name} with purge=True.")
966 with self.registry.transaction():
967 if unstore:
968 for ref in self.registry.queryDatasets(..., collections=name, deduplicate=True):
969 if self.datastore.exists(ref):
970 self.datastore.trash(ref)
971 self.registry.removeCollection(name)
972 if unstore:
973 # Point of no return for removing artifacts
974 self.datastore.emptyTrash()
976 def pruneDatasets(self, refs: Iterable[DatasetRef], *,
977 disassociate: bool = True,
978 unstore: bool = False,
979 tags: Optional[Iterable[str]] = None,
980 purge: bool = False,
981 run: Optional[str] = None):
982 """Remove one or more datasets from a collection and/or storage.
984 Parameters
985 ----------
986 refs : `~collections.abc.Iterable` of `DatasetRef`
987 Datasets to prune. These must be "resolved" references (not just
988 a `DatasetType` and data ID).
989 disassociate : `bool`, optional
990 Disassociate pruned datasets from ``self.tags`` (or the collections
991 given via the ``tags`` argument). Ignored if ``refs`` is ``...``.
992 unstore : `bool`, optional
993 If `True` (`False` is default) remove these datasets from all
994 datastores known to this butler. Note that this will make it
995 impossible to retrieve these datasets even via other collections.
996 Datasets that are already not stored are ignored by this option.
997 tags : `Iterable` [ `str` ], optional
998 `~CollectionType.TAGGED` collections to disassociate the datasets
999 from, overriding ``self.tags``. Ignored if ``disassociate`` is
1000 `False` or ``purge`` is `True`.
1001 purge : `bool`, optional
1002 If `True` (`False` is default), completely remove the dataset from
1003 the `Registry`. To prevent accidental deletions, ``purge`` may
1004 only be `True` if all of the following conditions are met:
1006 - All given datasets are in the given run;
1007 - ``disassociate`` is `True`;
1008 - ``unstore`` is `True`.
1010 This mode may remove provenance information from datasets other
1011 than those provided, and should be used with extreme care.
1012 run : `str`, optional
1013 `~CollectionType.RUN` collection to purge from, overriding
1014 ``self.run``. Ignored unless ``purge`` is `True`.
1016 Raises
1017 ------
1018 TypeError
1019 Raised if the butler is read-only, if no collection was provided,
1020 or the conditions for ``purge=True`` were not met.
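Examples
--------
A minimal sketch (dataset type expression and collection/run names are
illustrative); this fully removes the matching datasets from the given
run::

    refs = butler.registry.queryDatasets("calexp",
                                         collections="u/alice/scratch")
    butler.pruneDatasets(refs, purge=True, unstore=True,
                         run="u/alice/scratch")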
1021 """
1022 if not self.isWriteable():
1023 raise TypeError("Butler is read-only.")
1024 if purge:
1025 if not disassociate:
1026 raise TypeError("Cannot pass purge=True without disassociate=True.")
1027 if not unstore:
1028 raise TypeError("Cannot pass purge=True without unstore=True.")
1029 if run is None:
1030 run = self.run
1031 if run is None:
1032 raise TypeError("No run provided but purge=True.")
1033 collectionType = self.registry.getCollectionType(run)
1034 if collectionType is not CollectionType.RUN:
1035 raise TypeError(f"Cannot purge from collection '{run}' "
1036 f"of non-RUN type {collectionType.name}.")
1037 elif disassociate:
1038 if tags is None:
1039 tags = self.tags
1040 else:
1041 tags = tuple(tags)
1042 if not tags:
1043 raise TypeError("No tags provided but disassociate=True.")
1044 for tag in tags:
1045 collectionType = self.registry.getCollectionType(tag)
1046 if collectionType is not CollectionType.TAGGED:
1047 raise TypeError(f"Cannot disassociate from collection '{tag}' "
1048 f"of non-TAGGED type {collectionType.name}.")
1049 # Transform possibly-single-pass iterable into something we can iterate
1050 # over multiple times.
1051 refs = list(refs)
1052 # Pruning a component of a DatasetRef makes no sense since registry
1053 # doesn't know about components and datastore might not store
1054 # components in a separate file
1055 for ref in refs:
1056 if ref.datasetType.component():
1057 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
1058 # We don't need an unreliable Datastore transaction for this, because
1059 # we've been extra careful to ensure that Datastore.trash only involves
1060 # mutating the Registry (it can _look_ at Datastore-specific things,
1061 # but shouldn't change them), and hence all operations here are
1062 # Registry operations.
1063 with self.registry.transaction():
1064 if unstore:
1065 for ref in refs:
1066 # There is a difference between a concrete composite
1067 # and virtual composite. In a virtual composite the
1068 # datastore is never given the top level DatasetRef. In
1069 # the concrete composite the datastore knows all the
1070 # refs and will clean up itself if asked to remove the
1071 # parent ref. We can not check configuration for this
1072 # since we can not trust that the configuration is the
1073 # same. We therefore have to ask if the ref exists or
1074 # not. This is consistent with the fact that we want
1075 # to ignore already-removed-from-datastore datasets
1076 # anyway.
1077 if self.datastore.exists(ref):
1078 self.datastore.trash(ref)
1079 if purge:
1080 self.registry.removeDatasets(refs)
1081 elif disassociate:
1082 for tag in tags:
1083 self.registry.disassociate(tag, refs)
1084 # We've exited the Registry transaction, and apparently committed.
1085 # (if there was an exception, everything rolled back, and it's as if
1086 # nothing happened - and we never get here).
1087 # Datastore artifacts are not yet gone, but they're clearly marked
1088 # as trash, so if we fail to delete now because of (e.g.) filesystem
1089 # problems we can try again later, and if manual administrative
1090 # intervention is required, it's pretty clear what that should entail:
1091 # deleting everything on disk and in private Datastore tables that is
1092 # in the dataset_location_trash table.
1093 if unstore:
1094 # Point of no return for removing artifacts
1095 self.datastore.emptyTrash()
1097 @transactional
1098 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None,
1099 tags: Optional[Iterable[str]] = None,):
1100 """Store and register one or more datasets that already exist on disk.
1102 Parameters
1103 ----------
1104 datasets : `FileDataset`
1105 Each positional argument is a struct containing information about
1106 a file to be ingested, including its path (either absolute or
1107 relative to the datastore root, if applicable), a `DatasetRef`,
1108 and optionally a formatter class or its fully-qualified string
1109 name. If a formatter is not provided, the formatter that would be
1110 used for `put` is assumed. On successful return, all
1111 `FileDataset.ref` attributes will have their `DatasetRef.id`
1112 attribute populated and all `FileDataset.formatter` attributes will
1113 be set to the formatter class used. `FileDataset.path` attributes
1114 may be modified to put paths in whatever the datastore considers a
1115 standardized form.
1116 transfer : `str`, optional
1117 If not `None`, must be one of 'auto', 'move', 'copy', 'hardlink',
1118 'relsymlink' or 'symlink', indicating how to transfer the file.
1119 run : `str`, optional
1120 The name of the run ingested datasets should be added to,
1121 overriding ``self.run``.
1122 tags : `Iterable` [ `str` ], optional
1123 The names of `~CollectionType.TAGGED` collections to associate
1124 the dataset with, overriding ``self.tags``. These collections
1125 must have already been added to the `Registry`.
1127 Raises
1128 ------
1129 TypeError
1130 Raised if the butler is read-only or if no run was provided.
1131 NotImplementedError
1132 Raised if the `Datastore` does not support the given transfer mode.
1133 DatasetTypeNotSupportedError
1134 Raised if one or more files to be ingested have a dataset type that
1135 is not supported by the `Datastore`.
1136 FileNotFoundError
1137 Raised if one of the given files does not exist.
1138 FileExistsError
1139 Raised if transfer is not `None` but the (internal) location the
1140 file would be moved to is already occupied.
1142 Notes
1143 -----
1144 This operation is not fully exception safe: if a database operation
1145 fails, the given `FileDataset` instances may be only partially updated.
1147 It is atomic in terms of database operations (they will either all
1148 succeed or all fail), provided the database engine implements
1149 transactions correctly. It will attempt to be atomic in terms of
1150 filesystem operations as well, but this cannot be implemented
1151 rigorously for most datastores.
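Examples
--------
A schematic sketch, assuming ``datasetType`` and ``dataId`` have
already been obtained from the registry; the file path and run name
are illustrative::

    dataset = FileDataset(path="/data/raw/file.fits",
                          refs=[DatasetRef(datasetType, dataId)])
    butler.ingest(dataset, transfer="copy", run="u/alice/raw")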
1152 """
1153 if not self.isWriteable():
1154 raise TypeError("Butler is read-only.")
1155 if run is None:
1156 if self.run is None:
1157 raise TypeError("No run provided.")
1158 run = self.run
1159 # No need to check run type, since insertDatasets will do that
1160 # (safely) for us.
1161 if tags is None:
1162 tags = self.tags
1163 else:
1164 tags = tuple(tags)
1165 for tag in tags:
1166 # Check that these are tagged collections up front, because we want
1167 # to avoid relying on Datastore transactionality to avoid modifying
1168 # the repo if there's an error later.
1169 collectionType = self.registry.getCollectionType(tag)
1170 if collectionType is not CollectionType.TAGGED:
1171 raise TypeError(f"Cannot associate into collection '{tag}' of non-TAGGED type "
1172 f"{collectionType.name}.")
1173 # Reorganize the inputs so they're grouped by DatasetType and then
1174 # data ID. We also include a list of DatasetRefs for each FileDataset
1175 # to hold the resolved DatasetRefs returned by the Registry, before
1176 # it's safe to swap them into FileDataset.refs.
1177 # Some type annotation aliases to make that clearer:
1178 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
1179 GroupedData = MutableMapping[DatasetType, GroupForType]
1180 # The actual data structure:
1181 groupedData: GroupedData = defaultdict(dict)
1182 # And the nested loop that populates it:
1183 for dataset in datasets:
1184 # This list is intentionally shared across the inner loop, since it's
1185 # associated with `dataset`.
1186 resolvedRefs = []
1187 for ref in dataset.refs:
1188 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
1190 # Now we can bulk-insert into Registry for each DatasetType.
1192 for datasetType, groupForType in groupedData.items():
1193 refs = self.registry.insertDatasets(datasetType,
1194 dataIds=groupForType.keys(),
1195 run=run)
1196 # Append those resolved DatasetRefs to the new lists we set up for
1197 # them.
1198 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
1199 resolvedRefs.append(ref)
1201 # Go back to the original FileDatasets to replace their refs with the
1202 # new resolved ones, and also build a big list of all refs.
1203 allResolvedRefs = []
1204 for groupForType in groupedData.values():
1205 for dataset, resolvedRefs in groupForType.values():
1206 dataset.refs = resolvedRefs
1207 allResolvedRefs.extend(resolvedRefs)
1209 # Bulk-associate everything with any tagged collections.
1210 for tag in tags:
1211 self.registry.associate(tag, allResolvedRefs)
1213 # Bulk-insert everything into Datastore.
1214 self.datastore.ingest(*datasets, transfer=transfer)
1216 @contextlib.contextmanager
1217 def export(self, *, directory: Optional[str] = None,
1218 filename: Optional[str] = None,
1219 format: Optional[str] = None,
1220 transfer: Optional[str] = None) -> ContextManager[RepoExport]:
1221 """Export datasets from the repository represented by this `Butler`.
1223 This method is a context manager that returns a helper object
1224 (`RepoExport`) that is used to indicate what information from the
1225 repository should be exported.
1227 Parameters
1228 ----------
1229 directory : `str`, optional
1230 Directory dataset files should be written to if ``transfer`` is not
1231 `None`.
1232 filename : `str`, optional
1233 Name for the file that will include database information associated
1234 with the exported datasets. If this is not an absolute path and
1235 ``directory`` is not `None`, it will be written to ``directory``
1236 instead of the current working directory. Defaults to
1237 "export.{format}".
1238 format : `str`, optional
1239 File format for the database information file. If `None`, the
1240 extension of ``filename`` will be used.
1241 transfer : `str`, optional
1242 Transfer mode passed to `Datastore.export`.
1244 Raises
1245 ------
1246 TypeError
1247 Raised if the set of arguments passed is inconsistent.
1249 Examples
1250 --------
1251 Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
1252 methods are used to provide the iterables over data IDs and/or datasets
1253 to be exported::
1255 with butler.export(filename="exports.yaml") as export:
1256 # Export all flats, and the calibration_label dimensions
1257 # associated with them.
1258 export.saveDatasets(butler.registry.queryDatasets("flat"),
1259 elements=[butler.registry.dimensions["calibration_label"]])
1260 # Export all datasets that start with "deepCoadd_" and all of
1261 # their associated data ID information.
1262 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1263 """
1264 if directory is None and transfer is not None:
1265 raise TypeError("Cannot transfer without providing a directory.")
1266 if transfer == "move":
1267 raise TypeError("Transfer may not be 'move': export is read-only")
1268 if format is None:
1269 if filename is None:
1270 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1271 else:
1272 _, format = os.path.splitext(filename)
1273 elif filename is None:
1274 filename = f"export.{format}"
1275 if directory is not None:
1276 filename = os.path.join(directory, filename)
1277 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1278 with open(filename, 'w') as stream:
1279 backend = BackendClass(stream)
1280 try:
1281 helper = RepoExport(self.registry, self.datastore, backend=backend,
1282 directory=directory, transfer=transfer)
1283 yield helper
1284 except BaseException:
1285 raise
1286 else:
1287 helper._finish()
1289 def import_(self, *, directory: Optional[str] = None,
1290 filename: Union[str, TextIO, None] = None,
1291 format: Optional[str] = None,
1292 transfer: Optional[str] = None):
1293 """Import datasets exported from a different butler repository.
1295 Parameters
1296 ----------
1297 directory : `str`, optional
1298 Directory containing dataset files. If `None`, all file paths
1299 must be absolute.
1300 filename : `str` or `TextIO`, optional
1301 A stream or name of file that contains database information
1302 associated with the exported datasets. If this is a string (name) and
1303 is not an absolute path, does not exist in the current working
1304 directory, and ``directory`` is not `None`, it is assumed to be in
1305 ``directory``. Defaults to "export.{format}".
1306 format : `str`, optional
1307 File format for the database information file. If `None`, the
1308 extension of ``filename`` will be used.
1309 transfer : `str`, optional
1310 Transfer mode used when ingesting the exported dataset files into the datastore.
1312 Raises
1313 ------
1314 TypeError
1315 Raised if the set of arguments passed is inconsistent, or if the
1316 butler is read-only.
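Examples
--------
A minimal sketch (directory and file names are illustrative); this
loads an export file produced by `Butler.export` and ingests the
referenced files by symlinking them::

    butler.import_(directory="/path/to/exported",
                   filename="export.yaml",
                   transfer="symlink")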
1317 """
1318 if not self.isWriteable():
1319 raise TypeError("Butler is read-only.")
1320 if format is None:
1321 if filename is None:
1322 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1323 else:
1324 _, format = os.path.splitext(filename)
1325 elif filename is None:
1326 filename = f"export.{format}"
1327 if isinstance(filename, str) and directory is not None and not os.path.exists(filename):
1328 filename = os.path.join(directory, filename)
1329 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1331 def doImport(importStream):
1332 backend = BackendClass(importStream, self.registry)
1333 backend.register()
1334 with self.transaction():
1335 backend.load(self.datastore, directory=directory, transfer=transfer)
1337 if isinstance(filename, str):
1338 with open(filename, "r") as stream:
1339 doImport(stream)
1340 else:
1341 doImport(filename)
1343 def validateConfiguration(self, logFailures: bool = False,
1344 datasetTypeNames: Optional[Iterable[str]] = None,
1345 ignore: Optional[Iterable[str]] = None):
1346 """Validate butler configuration.
1348 Checks that each `DatasetType` can be stored in the `Datastore`.
1350 Parameters
1351 ----------
1352 logFailures : `bool`, optional
1353 If `True`, output a log message for every validation error
1354 detected.
1355 datasetTypeNames : iterable of `str`, optional
1356 The `DatasetType` names that should be checked. This allows
1357 only a subset to be selected.
1358 ignore : iterable of `str`, optional
1359 Names of DatasetTypes to skip over. This can be used to skip
1360 known problems. If a named `DatasetType` corresponds to a
1361 composite, all components of that `DatasetType` will also be
1362 ignored.
1364 Raises
1365 ------
1366 ButlerValidationError
1367 Raised if there is some inconsistency with how this Butler
1368 is configured.
1369 """
1370 if datasetTypeNames:
1371 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1372 else:
1373 entities = list(self.registry.queryDatasetTypes())
1375 # filter out anything from the ignore list
1376 if ignore:
1377 ignore = set(ignore)
1378 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1379 else:
1380 ignore = set()
1382 # Find all the registered instruments
1383 instruments = set(
1384 dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
1385 )
1387 # For each datasetType that has an instrument dimension, create
1388 # a DatasetRef for each defined instrument
1389 datasetRefs = []
1391 for datasetType in entities:
1392 if "instrument" in datasetType.dimensions:
1393 for instrument in instruments:
1394 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1395 datasetRefs.append(datasetRef)
1397 entities.extend(datasetRefs)
1399 datastoreErrorStr = None
1400 try:
1401 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1402 except ValidationError as e:
1403 datastoreErrorStr = str(e)
1405 # Also check that the LookupKeys used by the datastores match
1406 # registry and storage class definitions
1407 keys = self.datastore.getLookupKeys()
1409 failedNames = set()
1410 failedDataId = set()
1411 for key in keys:
1412 datasetType = None
1413 if key.name is not None:
1414 if key.name in ignore:
1415 continue
1417 # skip if specific datasetType names were requested and this
1418 # name does not match
1419 if datasetTypeNames and key.name not in datasetTypeNames:
1420 continue
1422 # See if it is a StorageClass or a DatasetType
1423 if key.name in self.storageClasses:
1424 pass
1425 else:
1426 try:
1427 self.registry.getDatasetType(key.name)
1428 except KeyError:
1429 if logFailures:
1430 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1431 failedNames.add(key)
1432 else:
1433 # Dimensions are checked for consistency when the Butler
1434 # is created and rendezvoused with a universe.
1435 pass
1437 # Check that the instrument is a valid instrument.
1438 # Currently we only support instrument data ID keys, so check for that.
1439 if key.dataId:
1440 dataIdKeys = set(key.dataId)
1441 if set(["instrument"]) != dataIdKeys:
1442 if logFailures:
1443 log.fatal("Key '%s' has unsupported DataId override", key)
1444 failedDataId.add(key)
1445 elif key.dataId["instrument"] not in instruments:
1446 if logFailures:
1447 log.fatal("Key '%s' has unknown instrument", key)
1448 failedDataId.add(key)
1450 messages = []
1452 if datastoreErrorStr:
1453 messages.append(datastoreErrorStr)
1455 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1456 (failedDataId, "Keys with bad DataId entries: ")):
1457 if failed:
1458 msg += ", ".join(str(k) for k in failed)
1459 messages.append(msg)
1461 if messages:
1462 raise ValidationError(";\n".join(messages))
1464 registry: Registry
1465 """The object that manages dataset metadata and relationships (`Registry`).
1467 Most operations that don't involve reading or writing butler datasets are
1468 accessible only via `Registry` methods.
1469 """
1471 datastore: Datastore
1472 """The object that manages actual dataset storage (`Datastore`).
1474 Direct user access to the datastore should rarely be necessary; the primary
1475 exception is the case where a `Datastore` implementation provides extra
1476 functionality beyond what the base class defines.
1477 """
1479 storageClasses: StorageClassFactory
1480 """An object that maps known storage class names to objects that fully
1481 describe them (`StorageClassFactory`).
1482 """
1484 collections: Optional[CollectionSearch]
1485 """The collections to search and any restrictions on the dataset types to
1486 search for within them, in order (`CollectionSearch`).
1487 """
1489 run: Optional[str]
1490 """Name of the run this butler writes outputs to (`str` or `None`).
1491 """
1493 tags: Tuple[str, ...]
1494 """Names of `~CollectionType.TAGGED` collections this butler associates
1495 with in `put` and `ingest`, and disassociates from in `pruneDatasets`
1496 (`tuple` [ `str` ]).
1497 """