Coverage for python/lsst/daf/butler/_butler.py : 10%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
Butler top level classes.
"""
from __future__ import annotations

__all__ = ("Butler", "ButlerValidationError")

import os
from collections import defaultdict
import contextlib
import logging
from typing import (
    Any,
    ClassVar,
    ContextManager,
    Dict,
    Iterable,
    List,
    MutableMapping,
    Optional,
    Tuple,
    Union,
)

try:
    import boto3
except ImportError:
    boto3 = None

from lsst.utils import doImport
from .core import (
    ButlerURI,
    CompositesMap,
    Config,
    ConfigSubset,
    DataCoordinate,
    DataId,
    DatasetRef,
    DatasetType,
    Datastore,
    FileDataset,
    Quantum,
    RepoExport,
    StorageClassFactory,
    ValidationError,
)
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.safeFileIo import safeMakeDir
from .core.utils import transactional, getClassOf
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._butlerConfig import ButlerConfig
from .registry import Registry, RegistryConfig

log = logging.getLogger(__name__)


class ButlerValidationError(ValidationError):
    """There is a problem with the Butler configuration."""
    pass


class Butler:
    """Main entry point for the data access system.

    Attributes
    ----------
    config : `str`, `ButlerConfig` or `Config`, optional
        (filename to) configuration. If this is not a `ButlerConfig`, defaults
        will be read. If a `str`, may be the path to a directory containing
        a "butler.yaml" file.
    datastore : `Datastore`
        Datastore to use for storage.
    registry : `Registry`
        Registry to use for lookups.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration. Anything acceptable to the `ButlerConfig` constructor.
        If a directory path is given the configuration will be read from a
        ``butler.yaml`` file in that location. If `None` is given default
        values will be used.
    butler : `Butler`, optional
        If provided, construct a new Butler that uses the same registry and
        datastore as the given one, but with the given collection and run.
        Incompatible with the ``config``, ``searchPaths``, and ``writeable``
        arguments.
    collection : `str`, optional
        Collection to use for all input lookups. May be `None` to either use
        the value passed to ``run``, or to defer passing a collection until
        the methods that require one are called.
    run : `str`, optional
        Name of the run datasets should be output to; also used as a tagged
        collection name these datasets will be associated with. If the run
        does not exist, it will be created. If ``collection`` is `None`, this
        collection will be used for input lookups as well; if not, it must
        have the same value as ``run``.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration. Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations. If not
        provided, a read-only butler is created unless ``run`` is passed.

    Raises
    ------
    ValueError
        Raised if neither "collection" nor "run" are provided by argument or
        config, or if both are provided and are inconsistent.
    """
    def __init__(self, config: Union[Config, str, None] = None, *,
                 butler: Optional[Butler] = None,
                 collection: Optional[str] = None,
                 run: Optional[str] = None,
                 searchPaths: Optional[List[str]] = None,
                 writeable: Optional[bool] = None):
        if butler is not None:
            if config is not None or searchPaths is not None or writeable is not None:
                raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
                                "arguments with 'butler' argument.")
            self.registry = butler.registry
            self.datastore = butler.datastore
            self.storageClasses = butler.storageClasses
            self._composites = butler._composites
            self._config = butler._config
        else:
            self._config = ButlerConfig(config, searchPaths=searchPaths)
            if "root" in self._config:
                butlerRoot = self._config["root"]
            else:
                butlerRoot = self._config.configDir
            if writeable is None:
                writeable = run is not None
            self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
            self.datastore = Datastore.fromConfig(self._config, self.registry, butlerRoot=butlerRoot)
            self.storageClasses = StorageClassFactory()
            self.storageClasses.addFromConfig(self._config)
            self._composites = CompositesMap(self._config, universe=self.registry.dimensions)
        if "run" in self._config or "collection" in self._config:
            raise ValueError("Passing a run or collection via configuration is no longer supported.")
        if run is not None and writeable is False:
            raise ValueError(f"Butler initialized with run='{run}', "
                             f"but is read-only; use collection='{run}' instead.")
        self.run = run
        if collection is None and run is not None:
            collection = run
        if self.run is not None and collection != self.run:
            raise ValueError(
                "Run ({}) and collection ({}) are inconsistent.".format(self.run, collection)
            )
        self.collection = collection
        if self.run is not None:
            self.registry.registerRun(self.run)
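
    # Illustrative construction sketch (added for exposition, not part of the
    # original source); the repository path and run/collection names below are
    # assumptions:
    #
    #     butler = Butler("/path/to/repo", run="u/someone/processing")
    #     read_only = Butler("/path/to/repo", collection="u/someone/processing")
    #     shared = Butler(butler=butler, collection="calib")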

    GENERATION: ClassVar[int] = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

    @staticmethod
    def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
                 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
                 forceConfigRoot: bool = True, outfile: Optional[str] = None) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the new repository. Will be created
            if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options. Can not
            be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
            configuration will be used. Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        standalone : `bool`
            If `True`, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        createRegistry : `bool`, optional
            If `True` create a new Registry.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config. This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `str`, optional
            If not-`None`, the output configuration will be written to this
            location rather than into the repository itself. Can be a URI
            string. Can refer to a directory that will be used to write
            ``butler.yaml``.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
        """
        if isinstance(config, (ButlerConfig, ConfigSubset)):
            raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

        # For "file" schemes we are assuming POSIX semantics for paths; for
        # schemeless URIs we are assuming os.path semantics.
        uri = ButlerURI(root)
        if uri.scheme == "file" or not uri.scheme:
            if not os.path.isdir(uri.ospath):
                safeMakeDir(uri.ospath)
        elif uri.scheme == "s3":
            s3 = boto3.resource("s3")
            # This assumes the bucket already exists; if not, another level
            # of checks would be needed here.
            bucket = s3.Bucket(uri.netloc)
            bucket.put_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
        else:
            raise ValueError(f"Unrecognized scheme: {uri.scheme}")
        config = Config(config)

        # If we are creating a new repo from scratch with relative roots,
        # do not propagate an explicit root from the config file
        if "root" in config:
            del config["root"]

        full = ButlerConfig(config, searchPaths=searchPaths)  # this applies defaults
        datastoreClass = doImport(full["datastore", "cls"])
        datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)

        # If the key exists in the given config, parse it; otherwise parse the
        # defaults in the expanded config.
        if config.get(("registry", "db")):
            registryConfig = RegistryConfig(config)
        else:
            registryConfig = RegistryConfig(full)
        defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
        if defaultDatabaseUri is not None:
            Config.updateParameters(RegistryConfig, config, full,
                                    toUpdate={"db": defaultDatabaseUri},
                                    overwrite=forceConfigRoot)
        else:
            Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
                                    overwrite=forceConfigRoot)

        if standalone:
            config.merge(full)
        if outfile is not None:
            # When writing to a separate location we must include
            # the root of the butler repo in the config else it won't know
            # where to look.
            config["root"] = uri.geturl()
            configURI = outfile
        else:
            configURI = uri
        config.dumpToUri(configURI)

        # Create Registry and populate tables
        Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
        return config
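
    # Illustrative repository-creation sketch (added for exposition, not part
    # of the original source); the paths and run name below are assumptions:
    #
    #     Butler.makeRepo("/path/to/new/repo")
    #     butler = Butler("/path/to/new/repo", run="ingest")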

    @classmethod
    def _unpickle(cls, config: ButlerConfig, collection: str, run: Optional[str], writeable: bool) -> Butler:
        """Callable used to unpickle a Butler.

        We prefer not to use ``Butler.__init__`` directly so we can force some
        of its many arguments to be keyword-only (note that ``__reduce__``
        can only invoke callables with positional arguments).

        Parameters
        ----------
        config : `ButlerConfig`
            Butler configuration, already coerced into a true `ButlerConfig`
            instance (and hence after any search paths for overrides have been
            utilized).
        collection : `str`
            String name of a collection to use for read operations.
        run : `str`, optional
            String name of a run to use for write operations, or `None` for a
            read-only butler.
        writeable : `bool`
            Whether the new Butler should support write operations.

        Returns
        -------
        butler : `Butler`
            A new `Butler` instance.
        """
        return cls(config=config, collection=collection, run=run, writeable=writeable)

    def __reduce__(self):
        """Support pickling.
        """
        return (Butler._unpickle, (self._config, self.collection, self.run, self.registry.isWriteable()))

    def __str__(self):
        return "Butler(collection='{}', datastore='{}', registry='{}')".format(
            self.collection, self.datastore, self.registry)

    def isWriteable(self) -> bool:
        """Return `True` if this `Butler` supports write operations.
        """
        return self.registry.isWriteable()

    @contextlib.contextmanager
    def transaction(self):
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
        """
        with self.registry.transaction():
            with self.datastore.transaction():
                yield
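
    # Illustrative transaction sketch (added for exposition, not part of the
    # original source); the dataset type name and data IDs are assumptions:
    #
    #     with butler.transaction():
    #         butler.put(image1, "calexp", dataId1)
    #         butler.put(image2, "calexp", dataId2)
    #     # If either put raises, the registry and datastore changes from
    #     # both calls are rolled back together.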

    def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                         dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
        """Standardize the arguments passed to several Butler APIs.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        datasetType : `DatasetType`
            A `DatasetType` instance extracted from ``datasetRefOrType``.
        dataId : `dict` or `DataId`, optional
            Argument that can be used (along with ``kwds``) to construct a
            `DataId`.

        Notes
        -----
        Butler APIs that conceptually need a DatasetRef also allow passing a
        `DatasetType` (or the name of one) and a `DataId` (or a dict and
        keyword arguments that can be used to construct one) separately. This
        method accepts those arguments and always returns a true `DatasetType`
        and a `DataId` or `dict`.

        Standardization of `dict` vs `DataId` is best handled by passing the
        returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
        generally similarly flexible.
        """
        externalDatasetType = None
        internalDatasetType = None
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None or kwds:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            externalDatasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
        else:
            # Don't check whether DataId is provided, because Registry APIs
            # can usually construct a better error message when it wasn't.
            if isinstance(datasetRefOrType, DatasetType):
                externalDatasetType = datasetRefOrType
            else:
                internalDatasetType = self.registry.getDatasetType(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
                                 f"registry definition ({internalDatasetType})")

        return internalDatasetType, dataId

    def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                        dataId: Optional[DataId] = None, *,
                        collection: Optional[str] = None,
                        allowUnresolved: bool = False,
                        **kwds: Any) -> DatasetRef:
        """Shared logic for methods that start with a search for a dataset in
        the registry.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collection : `str`, optional
            Name of the collection to search, overriding ``self.collection``.
        allowUnresolved : `bool`, optional
            If `True`, return an unresolved `DatasetRef` if finding a resolved
            one in the `Registry` fails. Defaults to `False`.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset identified by the given arguments.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
        """
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef):
            idNumber = datasetRefOrType.id
        else:
            idNumber = None
        # Expand the data ID first instead of letting registry.find do it, so
        # we get the result even if it returns None.
        dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
        if collection is None:
            collection = self.collection
            if collection is None:
                raise TypeError("No collection provided.")
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self.registry.find(collection, datasetType, dataId)
        if ref is None:
            if allowUnresolved:
                return DatasetRef(datasetType, dataId)
            else:
                raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
                                  f"could not be found in collection '{collection}'.")
        if idNumber is not None and idNumber != ref.id:
            raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
                             f"id ({ref.id}) in registry in collection '{collection}'.")
        return ref

    @transactional
    def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            producer: Optional[Quantum] = None,
            run: Optional[str] = None,
            **kwds: Any) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        producer : `Quantum`, optional
            The producer.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
        """
        log.debug("Butler put: %s, dataId=%s, producer=%s, run=%s", datasetRefOrType, dataId, producer, run)
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
            raise ValueError("DatasetRef must not be in registry, must have None id")

        if run is None:
            if self.run is None:
                raise TypeError("No run provided.")
            run = self.run

        isVirtualComposite = self._composites.shouldBeDisassembled(datasetType)

        # Add Registry Dataset entry.  If not a virtual composite, add
        # and attach components at the same time.
        dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
        ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId],
                                            producer=producer, recursive=not isVirtualComposite)

        # Check to see if this datasetType requires disassembly
        if isVirtualComposite:
            components = datasetType.storageClass.assembler().disassemble(obj)
            for component, info in components.items():
                compTypeName = datasetType.componentTypeName(component)
                compRef = self.put(info.component, compTypeName, dataId, producer=producer, run=run)
                self.registry.attachComponent(component, ref, compRef)
        else:
            # This is an entity without a disassembler.
            self.datastore.put(obj, ref)

        return ref
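
    # Illustrative put sketch (added for exposition, not part of the original
    # source); the dataset type name and data ID keys are assumptions:
    #
    #     dataId = {"instrument": "HSC", "visit": 903334, "detector": 16}
    #     ref = butler.put(catalog, "src", dataId)
    #     # or equivalently, passing the data ID as keyword arguments:
    #     ref = butler.put(catalog, "src", instrument="HSC", visit=903334, detector=16)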

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
        """Retrieve a stored dataset.

        Unlike `Butler.get`, this method allows datasets outside the Butler's
        collection to be read as long as the `DatasetRef` that identifies them
        can be obtained separately.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.
        """
        # if the ref exists in the store we return it directly
        if self.datastore.exists(ref):
            return self.datastore.get(ref, parameters=parameters)
        elif ref.isComposite():
            # Check that we haven't got any unknown parameters
            ref.datasetType.storageClass.validateParameters(parameters)
            # Reconstruct the composite
            usedParams = set()
            components = {}
            for compName, compRef in ref.components.items():
                # make a dictionary of parameters containing only the subset
                # supported by the StorageClass of the components
                compParams = compRef.datasetType.storageClass.filterParameters(parameters)
                usedParams.update(set(compParams))
                components[compName] = self.datastore.get(compRef, parameters=compParams)

            # Any unused parameters will have to be passed to the assembler
            if parameters:
                unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
            else:
                unusedParams = {}

            # Assemble the components
            inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
            return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
                                                                             parameters=unusedParams)
        else:
            # single entity in datastore
            raise FileNotFoundError(f"Unable to locate dataset '{ref}' in datastore {self.datastore.name}")

    def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                    dataId: Optional[DataId] = None, *,
                    parameters: Union[dict, None] = None,
                    collection: Optional[str] = None,
                    **kwds: Any) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collection : `str`, optional
            Name of the collection to search, overriding ``self.collection``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
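
    # Illustrative deferred-read sketch (added for exposition, not part of the
    # original source); the dataset type name and data ID are assumptions:
    #
    #     handle = butler.getDeferred("calexp", dataId)
    #     ...  # decide later whether (and how) to read it
    #     exposure = handle.get()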

    def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            parameters: Optional[Dict[str, Any]] = None,
            collection: Optional[str] = None,
            **kwds: Any) -> Any:
        """Retrieve a stored dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collection : `str`, optional
            Collection to search, overriding ``self.collection``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
        """
        log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
        ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
        return self.getDirect(ref, parameters=parameters)
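
    # Illustrative get sketch (added for exposition, not part of the original
    # source); the dataset type name, data ID, and ``bbox`` parameter are
    # assumptions that depend on the StorageClass in use:
    #
    #     calexp = butler.get("calexp", dataId)
    #     cutout = butler.get("calexp", dataId, parameters={"bbox": bbox})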

    def getUri(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
               dataId: Optional[DataId] = None, *,
               predict: bool = False,
               collection: Optional[str] = None,
               run: Optional[str] = None,
               **kwds: Any) -> str:
        """Return the URI to the Dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collection : `str`, optional
            Collection to search, overriding ``self.collection``.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI, the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        LookupError
            Raised if a URI has been requested for a dataset that does not
            exist and guessing is not allowed.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, collection=collection,
                                   **kwds)
        if ref.id is None:  # only possible if predict is True
            if run is None:
                run = self.run
                if run is None:
                    raise TypeError("Cannot predict location with run=None.")
            # Lie about ID, because we can't guess it, and only
            # Datastore.getUri() will ever see it (and it doesn't use it).
            ref = ref.resolved(id=0, run=run)
        return self.datastore.getUri(ref, predict)
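
    # Illustrative URI lookup sketch (added for exposition, not part of the
    # original source); the dataset type name and run name are assumptions:
    #
    #     uri = butler.getUri("calexp", dataId)
    #     future_uri = butler.getUri("calexp", dataId, predict=True, run="u/someone/processing")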

    def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                      dataId: Optional[DataId] = None, *,
                      collection: Optional[str] = None,
                      **kwds: Any) -> bool:
        """Return True if the Dataset is actually present in the Datastore.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collection : `str`, optional
            Collection to search, overriding ``self.collection``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Raises
        ------
        LookupError
            Raised if the dataset is not even present in the Registry.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
        return self.datastore.exists(ref)

    def remove(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
               dataId: Optional[DataId] = None, *,
               delete: bool = True, remember: bool = True, collection: Optional[str] = None, **kwds: Any):
        """Remove a dataset from the collection and possibly the repository.

        The identified dataset is always at least removed from the Butler's
        collection. By default it is also deleted from the Datastore (e.g.
        files are actually deleted), but the dataset is "remembered" by
        retaining its row in the dataset and provenance tables in the registry.

        If the dataset is a composite, all components will also be removed.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataId`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        delete : `bool`
            If `True` (default) actually delete the dataset from the
            Datastore (i.e. actually remove files).
        remember : `bool`
            If `True` (default), retain dataset and provenance records in
            the `Registry` for this dataset.
        collection : `str`, optional
            Collection to search, overriding ``self.collection``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Raises
        ------
        TypeError
            Raised if the butler is read-only, if no collection was provided,
            or if ``delete`` and ``remember`` are both `False`; a dataset
            cannot remain in a `Datastore` if its `Registry` entry is
            removed.
        OrphanedRecordError
            Raised if ``remember`` is `False` but the dataset is still present
            in a `Datastore` not recognized by this `Butler` client.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
        if delete:
            # There is a difference between a concrete composite and virtual
            # composite.  In a virtual composite the datastore is never
            # given the top level DatasetRef.  In the concrete composite
            # the datastore knows all the refs and will clean up itself
            # if asked to remove the parent ref.
            # We can not check configuration for this since we can not trust
            # that the configuration is the same.  We therefore have to ask
            # if the ref exists or not.
            if self.datastore.exists(ref):
                self.datastore.remove(ref)
            elif ref.isComposite():
                datastoreNames = set(self.datastore.names)
                for r in ref.components.values():
                    # If a dataset was removed previously but remembered
                    # in registry, skip the removal in the datastore.
                    datastoreLocations = self.registry.getDatasetLocations(r)
                    if datastoreLocations & datastoreNames:
                        self.datastore.remove(r)
            else:
                raise FileNotFoundError(f"Dataset {ref} not known to datastore")
        elif not remember:
            raise ValueError("Cannot retain dataset in Datastore without keeping Registry dataset record.")
        if remember:
            self.registry.disassociate(self.collection, [ref])
        else:
            # This also implicitly disassociates.
            self.registry.removeDataset(ref)
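
    # Illustrative removal sketch (added for exposition, not part of the
    # original source); the dataset type name and data ID are assumptions:
    #
    #     butler.remove("calexp", dataId)                  # delete files, keep registry records
    #     butler.remove("calexp", dataId, remember=False)  # delete files and registry records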

    @transactional
    def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None, run: Optional[str] = None):
        """Store and register one or more datasets that already exist on disk.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a `DatasetRef`,
            and optionally a formatter class or its fully-qualified string
            name.  If a formatter is not provided, the formatter that would be
            used for `put` is assumed.  On successful return, all
            `FileDataset.ref` attributes will have their `DatasetRef.id`
            attribute populated and all `FileDataset.formatter` attributes will
            be set to the formatter class used.  `FileDataset.path` attributes
            may be modified to put paths in whatever the datastore considers a
            standardized form.
        transfer : `str`, optional
            If not `None`, must be one of 'move', 'copy', 'hardlink', or
            'symlink', indicating how to transfer the file.
        run : `str`, optional
            The name of the run ingested datasets should be added to,
            overriding ``self.run``.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run was provided.
        NotImplementedError
            Raised if the `Datastore` does not support the given transfer mode.
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type that
            is not supported by the `Datastore`.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This operation is not fully exception safe: if a database operation
        fails, the given `FileDataset` instances may be only partially updated.

        It is atomic in terms of database operations (they will either all
        succeed or all fail) providing the database engine implements
        transactions correctly.  It will attempt to be atomic in terms of
        filesystem operations as well, but this cannot be implemented
        rigorously for most datastores.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        if run is None:
            if self.run is None:
                raise TypeError("No run provided.")
            run = self.run

        # Reorganize the inputs so they're grouped by DatasetType and then
        # data ID.  We also include a list of DatasetRefs for each FileDataset
        # to hold the resolved DatasetRefs returned by the Registry, before
        # it's safe to swap them into FileDataset.refs.
        # Some type annotation aliases to make that clearer:
        GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
        GroupedData = MutableMapping[DatasetType, GroupForType]
        # The actual data structure:
        groupedData: GroupedData = defaultdict(dict)
        # And the nested loop that populates it:
        for dataset in datasets:
            # This list intentionally shared across the inner loop, since it's
            # associated with `dataset`.
            resolvedRefs = []
            for ref in dataset.refs:
                groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)

        # Now we can bulk-insert into Registry for each DatasetType.
        for datasetType, groupForType in groupedData.items():
            refs = self.registry.insertDatasets(datasetType,
                                                dataIds=groupForType.keys(),
                                                run=run,
                                                recursive=True)
            # Append those resolved DatasetRefs to the new lists we set up for
            # them.
            for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
                resolvedRefs.append(ref)

        # Go back to the original FileDatasets to replace their refs with the
        # new resolved ones.
        for groupForType in groupedData.values():
            for dataset, resolvedRefs in groupForType.values():
                dataset.refs = resolvedRefs

        # Bulk-insert everything into Datastore.
        self.datastore.ingest(*datasets, transfer=transfer)
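
    # Illustrative ingest sketch (added for exposition, not part of the
    # original source); the file path, dataset type, run name, and data ID are
    # assumptions:
    #
    #     ref = DatasetRef(rawDatasetType, {"instrument": "HSC", "exposure": 903334, "detector": 16})
    #     butler.ingest(FileDataset(path="raw/HSC-903334-016.fits", refs=[ref]),
    #                   transfer="symlink", run="raw/hsc")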

    @contextlib.contextmanager
    def export(self, *, directory: Optional[str] = None,
               filename: Optional[str] = None,
               format: Optional[str] = None,
               transfer: Optional[str] = None) -> ContextManager[RepoExport]:
        """Export datasets from the repository represented by this `Butler`.

        This method is a context manager that returns a helper object
        (`RepoExport`) that is used to indicate what information from the
        repository should be exported.

        Parameters
        ----------
        directory : `str`, optional
            Directory dataset files should be written to if ``transfer`` is not
            `None`.
        filename : `str`, optional
            Name for the file that will include database information associated
            with the exported datasets.  If this is not an absolute path and
            ``directory`` is not `None`, it will be written to ``directory``
            instead of the current working directory.  Defaults to
            "export.{format}".
        format : `str`, optional
            File format for the database information file.  If `None`, the
            extension of ``filename`` will be used.
        transfer : `str`, optional
            Transfer mode passed to `Datastore.export`.

        Raises
        ------
        TypeError
            Raised if the set of arguments passed is inconsistent.

        Examples
        --------
        Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
        methods are used to provide the iterables over data IDs and/or datasets
        to be exported::

            with butler.export(filename="exports.yaml") as export:
                # Export all flats, and the calibration_label dimensions
                # associated with them.
                export.saveDatasets(butler.registry.queryDatasets("flat"),
                                    elements=[butler.registry.dimensions["calibration_label"]])
                # Export all datasets that start with "deepCoadd_" and all of
                # their associated data ID information.
                export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
        """
        if directory is None and transfer is not None:
            raise TypeError("Cannot transfer without providing a directory.")
        if transfer == "move":
            raise TypeError("Transfer may not be 'move': export is read-only")
        if format is None:
            if filename is None:
                raise TypeError("At least one of 'filename' or 'format' must be provided.")
            else:
                _, format = os.path.splitext(filename)
        elif filename is None:
            filename = f"export.{format}"
        if directory is not None:
            filename = os.path.join(directory, filename)
        BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
        with open(filename, 'w') as stream:
            backend = BackendClass(stream)
            try:
                helper = RepoExport(self.registry, self.datastore, backend=backend,
                                    directory=directory, transfer=transfer)
                yield helper
            except BaseException:
                raise
            else:
                helper._finish()

    def import_(self, *, directory: Optional[str] = None,
                filename: Optional[str] = None,
                format: Optional[str] = None,
                transfer: Optional[str] = None):
        """Import datasets exported from a different butler repository.

        Parameters
        ----------
        directory : `str`, optional
            Directory containing dataset files.  If `None`, all file paths
            must be absolute.
        filename : `str`, optional
            Name for the file containing database information associated
            with the exported datasets.  If this is not an absolute path, does
            not exist in the current working directory, and ``directory`` is
            not `None`, it is assumed to be in ``directory``.  Defaults to
            "export.{format}".
        format : `str`, optional
            File format for the database information file.  If `None`, the
            extension of ``filename`` will be used.
        transfer : `str`, optional
            Transfer mode passed to `Datastore.ingest`.

        Raises
        ------
        TypeError
            Raised if the set of arguments passed is inconsistent, or if the
            butler is read-only.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        if format is None:
            if filename is None:
                raise TypeError("At least one of 'filename' or 'format' must be provided.")
            else:
                _, format = os.path.splitext(filename)
        elif filename is None:
            filename = f"export.{format}"
        if directory is not None and not os.path.exists(filename):
            filename = os.path.join(directory, filename)
        BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
        with open(filename, 'r') as stream:
            backend = BackendClass(stream, self.registry)
            backend.register()
            with self.transaction():
                backend.load(self.datastore, directory=directory, transfer=transfer)
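
    # Illustrative export/import round-trip sketch (added for exposition, not
    # part of the original source); the directory, filename, and dataset type
    # below are assumptions:
    #
    #     with source_butler.export(directory="exports", transfer="copy") as export:
    #         export.saveDatasets(source_butler.registry.queryDatasets("flat"))
    #     target_butler.import_(directory="exports", filename="export.yaml", transfer="symlink")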

    def validateConfiguration(self, logFailures: bool = False,
                              datasetTypeNames: Optional[Iterable[str]] = None,
                              ignore: Optional[Iterable[str]] = None):
        """Validate butler configuration.

        Checks that each `DatasetType` can be stored in the `Datastore`.

        Parameters
        ----------
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.
        datasetTypeNames : iterable of `str`, optional
            The `DatasetType` names that should be checked.  This allows
            only a subset to be selected.
        ignore : iterable of `str`, optional
            Names of DatasetTypes to skip over.  This can be used to skip
            known problems.  If a named `DatasetType` corresponds to a
            composite, all components of that `DatasetType` will also be
            ignored.

        Raises
        ------
        ButlerValidationError
            Raised if there is some inconsistency with how this Butler
            is configured.
        """
        if datasetTypeNames:
            entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
        else:
            entities = list(self.registry.getAllDatasetTypes())

        # filter out anything from the ignore list
        if ignore:
            ignore = set(ignore)
            entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
        else:
            ignore = set()

        # Find all the registered instruments
        instruments = set(
            dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
        )

        # For each datasetType that has an instrument dimension, create
        # a DatasetRef for each defined instrument
        datasetRefs = []

        for datasetType in entities:
            if "instrument" in datasetType.dimensions:
                for instrument in instruments:
                    datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
                    datasetRefs.append(datasetRef)

        entities.extend(datasetRefs)

        datastoreErrorStr = None
        try:
            self.datastore.validateConfiguration(entities, logFailures=logFailures)
        except ValidationError as e:
            datastoreErrorStr = str(e)

        # Also check that the LookupKeys used by the datastores match
        # registry and storage class definitions
        keys = self.datastore.getLookupKeys()

        failedNames = set()
        failedDataId = set()
        for key in keys:
            datasetType = None
            if key.name is not None:
                if key.name in ignore:
                    continue

                # skip if specific datasetType names were requested and this
                # name does not match
                if datasetTypeNames and key.name not in datasetTypeNames:
                    continue

                # See if it is a StorageClass or a DatasetType
                if key.name in self.storageClasses:
                    pass
                else:
                    try:
                        self.registry.getDatasetType(key.name)
                    except KeyError:
                        if logFailures:
                            log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
                        failedNames.add(key)
            else:
                # Dimensions are checked for consistency when the Butler
                # is created and rendezvoused with a universe.
                pass

            # Check that the instrument is a valid instrument
            # Currently only support instrument so check for that
            if key.dataId:
                dataIdKeys = set(key.dataId)
                if set(["instrument"]) != dataIdKeys:
                    if logFailures:
                        log.fatal("Key '%s' has unsupported DataId override", key)
                    failedDataId.add(key)
                elif key.dataId["instrument"] not in instruments:
                    if logFailures:
                        log.fatal("Key '%s' has unknown instrument", key)
                    failedDataId.add(key)

        messages = []

        if datastoreErrorStr:
            messages.append(datastoreErrorStr)

        for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
                            (failedDataId, "Keys with bad DataId entries: ")):
            if failed:
                msg += ", ".join(str(k) for k in failed)
                messages.append(msg)

        if messages:
            raise ValidationError(";\n".join(messages))

    registry: Registry
    """The object that manages dataset metadata and relationships (`Registry`).

    Most operations that don't involve reading or writing butler datasets are
    accessible only via `Registry` methods.
    """

    datastore: Datastore
    """The object that manages actual dataset storage (`Datastore`).

    Direct user access to the datastore should rarely be necessary; the primary
    exception is the case where a `Datastore` implementation provides extra
    functionality beyond what the base class defines.
    """

    storageClasses: StorageClassFactory
    """An object that maps known storage class names to objects that fully
    describe them (`StorageClassFactory`).
    """

    run: Optional[str]
    """Name of the run this butler writes outputs to (`str` or `None`).
    """

    collection: Optional[str]
    """Name of the collection this butler searches for datasets (`str` or
    `None`).
    """