Coverage for python/lsst/daf/butler/_butler.py : 10%

1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22"""
23Butler top level classes.
24"""
25from __future__ import annotations
27__all__ = ("Butler", "ButlerValidationError")
29import os
30from collections import defaultdict
31import contextlib
32import logging
33from typing import (
34 Any,
35 ClassVar,
36 ContextManager,
37 Dict,
38 Iterable,
39 List,
40 MutableMapping,
41 Optional,
42 Tuple,
43 Union,
44)
46try:
47 import boto3
48except ImportError:
49 boto3 = None
51from lsst.utils import doImport
52from .core import (
53 ButlerURI,
54 CompositesMap,
55 Config,
56 ConfigSubset,
57 DataCoordinate,
58 DataId,
59 DatasetRef,
60 DatasetType,
61 Datastore,
62 FileDataset,
63 Quantum,
64 RepoExport,
65 StorageClassFactory,
66 ValidationError,
67)
68from .core.repoRelocation import BUTLER_ROOT_TAG
69from .core.safeFileIo import safeMakeDir
70from .core.utils import transactional, getClassOf
71from ._deferredDatasetHandle import DeferredDatasetHandle
72from ._butlerConfig import ButlerConfig
73from .registry import Registry, RegistryConfig
75log = logging.getLogger(__name__)
78class ButlerValidationError(ValidationError):
79 """There is a problem with the Butler configuration."""
80 pass
83class Butler:
84 """Main entry point for the data access system.
86 Attributes
87 ----------
88 config : `str`, `ButlerConfig` or `Config`, optional
 89 Configuration (or the path to one). If this is not a `ButlerConfig`,
 90 defaults will be read. If a `str`, it may be the path to a directory
 91 containing a "butler.yaml" file.
92 datastore : `Datastore`
93 Datastore to use for storage.
94 registry : `Registry`
95 Registry to use for lookups.
97 Parameters
98 ----------
99 config : `ButlerConfig`, `Config` or `str`, optional.
100 Configuration. Anything acceptable to the
101 `ButlerConfig` constructor. If a directory path
102 is given the configuration will be read from a ``butler.yaml`` file in
103 that location. If `None` is given default values will be used.
104 butler : `Butler`, optional.
105 If provided, construct a new Butler that uses the same registry and
106 datastore as the given one, but with the given collection and run.
107 Incompatible with the ``config``, ``searchPaths``, and ``writeable``
108 arguments.
109 collection : `str`, optional
110 Collection to use for all input lookups. May be `None` to either use
111 the value passed to ``run``, or to defer passing a collection until
112 the methods that require one are called.
113 run : `str`, optional
114 Name of the run datasets should be output to; also used as a tagged
 115 collection name these datasets will be associated with. If the run
116 does not exist, it will be created. If ``collection`` is `None`, this
117 collection will be used for input lookups as well; if not, it must have
118 the same value as ``run``.
119 searchPaths : `list` of `str`, optional
120 Directory paths to search when calculating the full Butler
121 configuration. Not used if the supplied config is already a
122 `ButlerConfig`.
123 writeable : `bool`, optional
124 Explicitly sets whether the butler supports write operations. If not
125 provided, a read-only butler is created unless ``run`` is passed.
127 Raises
128 ------
129 ValueError
 130 Raised if neither ``collection`` nor ``run`` is provided by argument or
131 config, or if both are provided and are inconsistent.
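 Examples
 --------
 A minimal construction sketch; the repository path, collection, and run
 names below are illustrative, not taken from this module::

     # Read-only butler searching an existing collection.
     butler = Butler("/path/to/repo", collection="shared/example")

     # Read-write butler whose outputs go to (and are read back from) a run.
     butler = Butler("/path/to/repo", run="u/someone/processing")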
132 """
133 def __init__(self, config: Union[Config, str, None] = None, *,
134 butler: Optional[Butler] = None,
135 collection: Optional[str] = None,
136 run: Optional[str] = None,
137 searchPaths: Optional[List[str]] = None,
138 writeable: Optional[bool] = None):
139 if butler is not None:
140 if config is not None or searchPaths is not None or writeable is not None:
141 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
142 "arguments with 'butler' argument.")
143 self.registry = butler.registry
144 self.datastore = butler.datastore
145 self.storageClasses = butler.storageClasses
146 self._composites = butler._composites
147 self._config = butler._config
148 else:
149 self._config = ButlerConfig(config, searchPaths=searchPaths)
150 if "root" in self._config:
151 butlerRoot = self._config["root"]
152 else:
153 butlerRoot = self._config.configDir
154 if writeable is None:
155 writeable = run is not None
156 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
157 self.datastore = Datastore.fromConfig(self._config, self.registry, butlerRoot=butlerRoot)
158 self.storageClasses = StorageClassFactory()
159 self.storageClasses.addFromConfig(self._config)
160 self._composites = CompositesMap(self._config, universe=self.registry.dimensions)
161 if "run" in self._config or "collection" in self._config:
162 raise ValueError("Passing a run or collection via configuration is no longer supported.")
163 if run is not None and writeable is False:
164 raise ValueError(f"Butler initialized with run='{run}', "
165 f"but is read-only; use collection='{run}' instead.")
166 self.run = run
167 if collection is None and run is not None:
168 collection = run
169 if self.run is not None and collection != self.run:
170 raise ValueError(
171 "Run ({}) and collection ({}) are inconsistent.".format(self.run, collection)
172 )
173 self.collection = collection
174 if self.run is not None:
175 self.registry.registerRun(self.run)
177 GENERATION: ClassVar[int] = 3
178 """This is a Generation 3 Butler.
180 This attribute may be removed in the future, once the Generation 2 Butler
181 interface has been fully retired; it should only be used in transitional
182 code.
183 """
185 @staticmethod
186 def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
187 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
188 forceConfigRoot: bool = True, outfile: Optional[str] = None) -> Config:
189 """Create an empty data repository by adding a butler.yaml config
190 to a repository root directory.
192 Parameters
193 ----------
194 root : `str`
195 Filesystem path to the root of the new repository. Will be created
196 if it does not exist.
197 config : `Config` or `str`, optional
198 Configuration to write to the repository, after setting any
 199 root-dependent Registry or Datastore config options. Cannot
200 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
201 configuration will be used. Root-dependent config options
202 specified in this config are overwritten if ``forceConfigRoot``
203 is `True`.
204 standalone : `bool`
 205 If `True`, write all expanded defaults, not just customized or
206 repository-specific settings.
207 This (mostly) decouples the repository from the default
208 configuration, insulating it from changes to the defaults (which
209 may be good or bad, depending on the nature of the changes).
210 Future *additions* to the defaults will still be picked up when
211 initializing `Butlers` to repos created with ``standalone=True``.
212 createRegistry : `bool`, optional
213 If `True` create a new Registry.
214 searchPaths : `list` of `str`, optional
215 Directory paths to search when calculating the full butler
216 configuration.
217 forceConfigRoot : `bool`, optional
218 If `False`, any values present in the supplied ``config`` that
219 would normally be reset are not overridden and will appear
220 directly in the output config. This allows non-standard overrides
221 of the root directory for a datastore or registry to be given.
222 If this parameter is `True` the values for ``root`` will be
223 forced into the resulting config if appropriate.
224 outfile : `str`, optional
 225 If not `None`, the output configuration will be written to this
226 location rather than into the repository itself. Can be a URI
227 string. Can refer to a directory that will be used to write
228 ``butler.yaml``.
230 Returns
231 -------
232 config : `Config`
233 The updated `Config` instance written to the repo.
235 Raises
236 ------
237 ValueError
238 Raised if a ButlerConfig or ConfigSubset is passed instead of a
239 regular Config (as these subclasses would make it impossible to
240 support ``standalone=False``).
241 os.error
242 Raised if the directory does not exist, exists but is not a
243 directory, or cannot be created.
245 Notes
246 -----
247 Note that when ``standalone=False`` (the default), the configuration
248 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
249 construct the repository should also be used to construct any Butlers
250 to avoid configuration inconsistencies.
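 Examples
 --------
 A minimal sketch of creating and then using a new repository; the root
 path and run name are illustrative::

     config = Butler.makeRepo("/path/to/new/repo")
     butler = Butler("/path/to/new/repo", run="ingest/run")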
251 """
252 if isinstance(config, (ButlerConfig, ConfigSubset)):
253 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
255 # for "file" schemes we are assuming POSIX semantics for paths, for
256 # schemeless URIs we are assuming os.path semantics.
257 uri = ButlerURI(root)
258 if uri.scheme == "file" or not uri.scheme:
259 if not os.path.isdir(uri.ospath):
260 safeMakeDir(uri.ospath)
261 elif uri.scheme == "s3":
262 s3 = boto3.resource("s3")
 263 # This assumes the bucket already exists; if it may not, another level of checks is needed.
264 bucket = s3.Bucket(uri.netloc)
265 bucket.put_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
266 else:
267 raise ValueError(f"Unrecognized scheme: {uri.scheme}")
268 config = Config(config)
270 # If we are creating a new repo from scratch with relative roots,
271 # do not propagate an explicit root from the config file
272 if "root" in config:
273 del config["root"]
275 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
276 datastoreClass = doImport(full["datastore", "cls"])
277 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
279 # if key exists in given config, parse it, otherwise parse the defaults
280 # in the expanded config
281 if config.get(("registry", "db")):
282 registryConfig = RegistryConfig(config)
283 else:
284 registryConfig = RegistryConfig(full)
285 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
286 if defaultDatabaseUri is not None:
287 Config.updateParameters(RegistryConfig, config, full,
288 toUpdate={"db": defaultDatabaseUri},
289 overwrite=forceConfigRoot)
290 else:
291 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
292 overwrite=forceConfigRoot)
294 if standalone:
295 config.merge(full)
296 if outfile is not None:
297 # When writing to a separate location we must include
298 # the root of the butler repo in the config else it won't know
299 # where to look.
300 config["root"] = uri.geturl()
301 configURI = outfile
302 else:
303 configURI = uri
304 config.dumpToUri(configURI)
306 # Create Registry and populate tables
307 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
308 return config
310 @classmethod
311 def _unpickle(cls, config: ButlerConfig, collection: str, run: Optional[str], writeable: bool) -> Butler:
312 """Callable used to unpickle a Butler.
314 We prefer not to use ``Butler.__init__`` directly so we can force some
315 of its many arguments to be keyword-only (note that ``__reduce__``
316 can only invoke callables with positional arguments).
318 Parameters
319 ----------
320 config : `ButlerConfig`
321 Butler configuration, already coerced into a true `ButlerConfig`
322 instance (and hence after any search paths for overrides have been
323 utilized).
324 collection : `str`
325 String name of a collection to use for read operations.
326 run : `str`, optional
327 String name of a run to use for write operations, or `None` for a
 328 read-only butler.
 writeable : `bool`
 Whether the new Butler should support write operations.
330 Returns
331 -------
332 butler : `Butler`
333 A new `Butler` instance.
334 """
335 return cls(config=config, collection=collection, run=run, writeable=writeable)
337 def __reduce__(self):
338 """Support pickling.
339 """
340 return (Butler._unpickle, (self._config, self.collection, self.run, self.registry.isWriteable()))
342 def __str__(self):
343 return "Butler(collection='{}', datastore='{}', registry='{}')".format(
344 self.collection, self.datastore, self.registry)
346 def isWriteable(self) -> bool:
347 """Return `True` if this `Butler` supports write operations.
348 """
349 return self.registry.isWriteable()
351 @contextlib.contextmanager
352 def transaction(self):
353 """Context manager supporting `Butler` transactions.
355 Transactions can be nested.
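 Examples
 --------
 A sketch of grouping several writes so they succeed or fail together; the
 dataset type names and the ``catalog``, ``exposure``, and ``dataId``
 variables are illustrative and assumed to exist::

     with butler.transaction():
         butler.put(catalog, "src", dataId)
         butler.put(exposure, "calexp", dataId)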
356 """
357 with self.registry.transaction():
358 with self.datastore.transaction():
359 yield
361 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
362 dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
363 """Standardize the arguments passed to several Butler APIs.
365 Parameters
366 ----------
367 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
368 When `DatasetRef` the `dataId` should be `None`.
369 Otherwise the `DatasetType` or name thereof.
370 dataId : `dict` or `DataCoordinate`
371 A `dict` of `Dimension` link name, value pairs that label the
372 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
 373 should be provided as the first argument.
374 kwds
375 Additional keyword arguments used to augment or construct a
376 `DataCoordinate`. See `DataCoordinate.standardize`
377 parameters.
379 Returns
380 -------
381 datasetType : `DatasetType`
382 A `DatasetType` instance extracted from ``datasetRefOrType``.
383 dataId : `dict` or `DataId`, optional
384 Argument that can be used (along with ``kwds``) to construct a
385 `DataId`.
387 Notes
388 -----
389 Butler APIs that conceptually need a DatasetRef also allow passing a
390 `DatasetType` (or the name of one) and a `DataId` (or a dict and
391 keyword arguments that can be used to construct one) separately. This
392 method accepts those arguments and always returns a true `DatasetType`
393 and a `DataId` or `dict`.
395 Standardization of `dict` vs `DataId` is best handled by passing the
396 returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
397 generally similarly flexible.
398 """
399 if isinstance(datasetRefOrType, DatasetRef):
400 if dataId is not None or kwds:
401 raise ValueError("DatasetRef given, cannot use dataId as well")
402 datasetType = datasetRefOrType.datasetType
403 dataId = datasetRefOrType.dataId
404 else:
405 # Don't check whether DataId is provided, because Registry APIs
406 # can usually construct a better error message when it wasn't.
407 if isinstance(datasetRefOrType, DatasetType):
408 datasetType = datasetRefOrType
409 else:
410 datasetType = self.registry.getDatasetType(datasetRefOrType)
411 return datasetType, dataId
413 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
414 dataId: Optional[DataId] = None, *,
415 collection: Optional[str] = None,
416 allowUnresolved: bool = False,
417 **kwds: Any) -> DatasetRef:
418 """Shared logic for methods that start with a search for a dataset in
419 the registry.
421 Parameters
422 ----------
423 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
424 When `DatasetRef` the `dataId` should be `None`.
425 Otherwise the `DatasetType` or name thereof.
426 dataId : `dict` or `DataCoordinate`, optional
427 A `dict` of `Dimension` link name, value pairs that label the
428 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
429 should be provided as the first argument.
430 collection : `str`, optional
431 Name of the collection to search, overriding ``self.collection``.
432 allowUnresolved : `bool`, optional
433 If `True`, return an unresolved `DatasetRef` if finding a resolved
434 one in the `Registry` fails. Defaults to `False`.
435 kwds
436 Additional keyword arguments used to augment or construct a
437 `DataId`. See `DataId` parameters.
439 Returns
440 -------
441 ref : `DatasetRef`
442 A reference to the dataset identified by the given arguments.
444 Raises
445 ------
446 LookupError
447 Raised if no matching dataset exists in the `Registry` (and
448 ``allowUnresolved is False``).
449 ValueError
450 Raised if a resolved `DatasetRef` was passed as an input, but it
451 differs from the one found in the registry in this collection.
452 TypeError
453 Raised if ``collection`` and ``self.collection`` are both `None`.
454 """
455 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
456 if isinstance(datasetRefOrType, DatasetRef):
457 idNumber = datasetRefOrType.id
458 else:
459 idNumber = None
460 # Expand the data ID first instead of letting registry.find do it, so
461 # we get the result even if it returns None.
462 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
463 if collection is None:
464 collection = self.collection
465 if collection is None:
466 raise TypeError("No collection provided.")
467 # Always lookup the DatasetRef, even if one is given, to ensure it is
468 # present in the current collection.
469 ref = self.registry.find(collection, datasetType, dataId)
470 if ref is None:
471 if allowUnresolved:
472 return DatasetRef(datasetType, dataId)
473 else:
474 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
475 f"could not be found in collection '{collection}'.")
476 if idNumber is not None and idNumber != ref.id:
477 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
478 f"id ({ref.id}) in registry in collection '{collection}'.")
479 return ref
481 @transactional
482 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
483 dataId: Optional[DataId] = None, *,
484 producer: Optional[Quantum] = None,
485 run: Optional[str] = None,
486 **kwds: Any) -> DatasetRef:
487 """Store and register a dataset.
489 Parameters
490 ----------
491 obj : `object`
492 The dataset.
493 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
494 When `DatasetRef` is provided, ``dataId`` should be `None`.
495 Otherwise the `DatasetType` or name thereof.
496 dataId : `dict` or `DataCoordinate`
497 A `dict` of `Dimension` link name, value pairs that label the
498 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
499 should be provided as the second argument.
500 producer : `Quantum`, optional
501 The producer.
502 run : `str`, optional
503 The name of the run the dataset should be added to, overriding
504 ``self.run``.
505 kwds
506 Additional keyword arguments used to augment or construct a
507 `DataCoordinate`. See `DataCoordinate.standardize`
508 parameters.
510 Returns
511 -------
512 ref : `DatasetRef`
513 A reference to the stored dataset, updated with the correct id if
514 given.
516 Raises
517 ------
518 TypeError
519 Raised if the butler is read-only or if no run has been provided.
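 Examples
 --------
 A sketch assuming a dataset type named ``calexp`` is registered and the
 butler was constructed with a run; the data ID keys are illustrative::

     ref = butler.put(exposure, "calexp",
                      {"instrument": "HSC", "visit": 903334, "detector": 16})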
520 """
521 log.debug("Butler put: %s, dataId=%s, producer=%s, run=%s", datasetRefOrType, dataId, producer, run)
522 if not self.isWriteable():
523 raise TypeError("Butler is read-only.")
524 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
525 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
526 raise ValueError("DatasetRef must not be in registry, must have None id")
528 if run is None:
529 if self.run is None:
530 raise TypeError("No run provided.")
531 run = self.run
533 isVirtualComposite = self._composites.shouldBeDisassembled(datasetType)
535 # Add Registry Dataset entry. If not a virtual composite, add
536 # and attach components at the same time.
537 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
538 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId],
539 producer=producer, recursive=not isVirtualComposite)
541 # Check to see if this datasetType requires disassembly
542 if isVirtualComposite:
543 components = datasetType.storageClass.assembler().disassemble(obj)
544 for component, info in components.items():
545 compTypeName = datasetType.componentTypeName(component)
546 compRef = self.put(info.component, compTypeName, dataId, producer=producer, run=run)
547 self.registry.attachComponent(component, ref, compRef)
548 else:
549 # This is an entity without a disassembler.
550 self.datastore.put(obj, ref)
552 return ref
554 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
555 """Retrieve a stored dataset.
557 Unlike `Butler.get`, this method allows datasets outside the Butler's
558 collection to be read as long as the `DatasetRef` that identifies them
559 can be obtained separately.
561 Parameters
562 ----------
563 ref : `DatasetRef`
564 Reference to an already stored dataset.
565 parameters : `dict`
566 Additional StorageClass-defined options to control reading,
567 typically used to efficiently read only a subset of the dataset.
569 Returns
570 -------
571 obj : `object`
572 The dataset.
573 """
574 # if the ref exists in the store we return it directly
575 if self.datastore.exists(ref):
576 return self.datastore.get(ref, parameters=parameters)
577 elif ref.isComposite():
578 # Check that we haven't got any unknown parameters
579 ref.datasetType.storageClass.validateParameters(parameters)
580 # Reconstruct the composite
581 usedParams = set()
582 components = {}
583 for compName, compRef in ref.components.items():
584 # make a dictionary of parameters containing only the subset
585 # supported by the StorageClass of the components
586 compParams = compRef.datasetType.storageClass.filterParameters(parameters)
587 usedParams.update(set(compParams))
588 components[compName] = self.datastore.get(compRef, parameters=compParams)
590 # Any unused parameters will have to be passed to the assembler
591 if parameters:
592 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
593 else:
594 unusedParams = {}
596 # Assemble the components
597 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
598 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
599 parameters=unusedParams)
600 else:
601 # single entity in datastore
602 raise FileNotFoundError(f"Unable to locate dataset '{ref}' in datastore {self.datastore.name}")
604 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
605 dataId: Optional[DataId] = None, *,
606 parameters: Union[dict, None] = None,
607 collection: Optional[str] = None,
608 **kwds: Any) -> DeferredDatasetHandle:
609 """Create a `DeferredDatasetHandle` which can later retrieve a dataset
611 Parameters
612 ----------
613 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
614 When `DatasetRef` the `dataId` should be `None`.
615 Otherwise the `DatasetType` or name thereof.
616 dataId : `dict` or `DataCoordinate`, optional
617 A `dict` of `Dimension` link name, value pairs that label the
618 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
619 should be provided as the first argument.
 622 parameters : `dict`, optional
 623 Additional StorageClass-defined options to control reading,
 624 typically used to efficiently read only a subset of the dataset.
 625 collection : `str`, optional
 626 Collection to search, overriding ``self.collection``.
627 kwds
628 Additional keyword arguments used to augment or construct a
629 `DataId`. See `DataId` parameters.
631 Returns
632 -------
633 obj : `DeferredDatasetHandle`
634 A handle which can be used to retrieve a dataset at a later time.
636 Raises
637 ------
638 LookupError
 639 Raised if no matching dataset exists in the `Registry`.
641 ValueError
642 Raised if a resolved `DatasetRef` was passed as an input, but it
643 differs from the one found in the registry in this collection.
644 TypeError
645 Raised if ``collection`` and ``self.collection`` are both `None`.
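 Examples
 --------
 A sketch deferring the read until the dataset is needed; the dataset type
 name and data ID are illustrative, and the eventual read via
 ``handle.get()`` is assumed from `DeferredDatasetHandle`::

     handle = butler.getDeferred("calexp",
                                 {"instrument": "HSC", "visit": 903334,
                                  "detector": 16})
     exposure = handle.get()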
646 """
647 ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
648 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
650 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
651 dataId: Optional[DataId] = None, *,
652 parameters: Optional[Dict[str, Any]] = None,
653 collection: Optional[str] = None,
654 **kwds: Any) -> Any:
655 """Retrieve a stored dataset.
657 Parameters
658 ----------
659 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
660 When `DatasetRef` the `dataId` should be `None`.
661 Otherwise the `DatasetType` or name thereof.
662 dataId : `dict` or `DataCoordinate`
663 A `dict` of `Dimension` link name, value pairs that label the
664 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
665 should be provided as the first argument.
666 parameters : `dict`
667 Additional StorageClass-defined options to control reading,
668 typically used to efficiently read only a subset of the dataset.
669 collection : `str`, optional
670 Collection to search, overriding ``self.collection``.
671 kwds
672 Additional keyword arguments used to augment or construct a
673 `DataCoordinate`. See `DataCoordinate.standardize`
674 parameters.
676 Returns
677 -------
678 obj : `object`
679 The dataset.
681 Raises
682 ------
683 ValueError
684 Raised if a resolved `DatasetRef` was passed as an input, but it
685 differs from the one found in the registry in this collection.
686 LookupError
687 Raised if no matching dataset exists in the `Registry`.
688 TypeError
689 Raised if ``collection`` and ``self.collection`` are both `None`.
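 Examples
 --------
 A sketch assuming a registered ``calexp`` dataset type; the data ID keys
 and the ``bbox`` read parameter (a StorageClass-defined option) are
 illustrative::

     dataId = {"instrument": "HSC", "visit": 903334, "detector": 16}
     exposure = butler.get("calexp", dataId)
     cutout = butler.get("calexp", dataId, parameters={"bbox": bbox})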
690 """
691 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
692 ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
693 return self.getDirect(ref, parameters=parameters)
695 def getUri(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
696 dataId: Optional[DataId] = None, *,
697 predict: bool = False,
698 collection: Optional[str] = None,
699 run: Optional[str] = None,
700 **kwds: Any) -> str:
701 """Return the URI to the Dataset.
703 Parameters
704 ----------
705 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
706 When `DatasetRef` the `dataId` should be `None`.
707 Otherwise the `DatasetType` or name thereof.
708 dataId : `dict` or `DataCoordinate`
709 A `dict` of `Dimension` link name, value pairs that label the
710 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
711 should be provided as the first argument.
712 predict : `bool`
713 If `True`, allow URIs to be returned of datasets that have not
714 been written.
715 collection : `str`, optional
716 Collection to search, overriding ``self.collection``.
717 run : `str`, optional
718 Run to use for predictions, overriding ``self.run``.
719 kwds
720 Additional keyword arguments used to augment or construct a
721 `DataCoordinate`. See `DataCoordinate.standardize`
722 parameters.
724 Returns
725 -------
726 uri : `str`
727 URI string pointing to the Dataset within the datastore. If the
728 Dataset does not exist in the datastore, and if ``predict`` is
729 `True`, the URI will be a prediction and will include a URI
730 fragment "#predicted".
731 If the datastore does not have entities that relate well
732 to the concept of a URI the returned URI string will be
733 descriptive. The returned URI is not guaranteed to be obtainable.
735 Raises
736 ------
737 LookupError
 738 Raised if a URI has been requested for a dataset that does not exist and
739 guessing is not allowed.
740 ValueError
741 Raised if a resolved `DatasetRef` was passed as an input, but it
742 differs from the one found in the registry in this collection.
743 TypeError
744 Raised if ``collection`` and ``self.collection`` are both `None`.
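 Examples
 --------
 A sketch; the dataset type name, data ID, and run are illustrative::

     uri = butler.getUri("calexp", dataId)

     # Predict where a dataset that has not been written yet would go.
     uri = butler.getUri("calexp", dataId, predict=True, run="u/someone/run")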
745 """
746 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, collection=collection,
747 **kwds)
748 if ref.id is None: # only possible if predict is True
749 if run is None:
750 run = self.run
751 if run is None:
752 raise TypeError("Cannot predict location with run=None.")
753 # Lie about ID, because we can't guess it, and only
754 # Datastore.getUri() will ever see it (and it doesn't use it).
 755 ref = ref.resolved(id=0, run=run)
756 return self.datastore.getUri(ref, predict)
758 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
759 dataId: Optional[DataId] = None, *,
760 collection: Optional[str] = None,
761 **kwds: Any) -> bool:
762 """Return True if the Dataset is actually present in the Datastore.
764 Parameters
765 ----------
766 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
767 When `DatasetRef` the `dataId` should be `None`.
768 Otherwise the `DatasetType` or name thereof.
769 dataId : `dict` or `DataCoordinate`
770 A `dict` of `Dimension` link name, value pairs that label the
771 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
772 should be provided as the first argument.
773 collection : `str`, optional
774 Collection to search, overriding ``self.collection``.
775 kwds
776 Additional keyword arguments used to augment or construct a
777 `DataCoordinate`. See `DataCoordinate.standardize`
778 parameters.
780 Raises
781 ------
782 LookupError
783 Raised if the dataset is not even present in the Registry.
784 ValueError
785 Raised if a resolved `DatasetRef` was passed as an input, but it
786 differs from the one found in the registry in this collection.
787 TypeError
788 Raised if ``collection`` and ``self.collection`` are both `None`.
789 """
790 ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
791 return self.datastore.exists(ref)
793 def remove(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
794 dataId: Optional[DataId] = None, *,
795 delete: bool = True, remember: bool = True, collection: Optional[str] = None, **kwds: Any):
796 """Remove a dataset from the collection and possibly the repository.
798 The identified dataset is always at least removed from the Butler's
799 collection. By default it is also deleted from the Datastore (e.g.
800 files are actually deleted), but the dataset is "remembered" by
801 retaining its row in the dataset and provenance tables in the registry.
803 If the dataset is a composite, all components will also be removed.
805 Parameters
806 ----------
807 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
808 When `DatasetRef` the `dataId` should be `None`.
809 Otherwise the `DatasetType` or name thereof.
810 dataId : `dict` or `DataId`
811 A `dict` of `Dimension` link name, value pairs that label the
812 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
813 should be provided as the first argument.
814 delete : `bool`
815 If `True` (default) actually delete the dataset from the
816 Datastore (i.e. actually remove files).
817 remember : `bool`
818 If `True` (default), retain dataset and provenance records in
819 the `Registry` for this dataset.
820 collection : `str`, optional
821 Collection to search, overriding ``self.collection``.
822 kwds
823 Additional keyword arguments used to augment or construct a
824 `DataId`. See `DataId` parameters.
826 Raises
827 ------
828 TypeError
829 Raised if the butler is read-only, if no collection was provided,
830 or if ``delete`` and ``remember`` are both `False`; a dataset
 831 cannot remain in a `Datastore` if its `Registry` entries are
832 removed.
833 OrphanedRecordError
834 Raised if ``remember`` is `False` but the dataset is still present
835 in a `Datastore` not recognized by this `Butler` client.
836 ValueError
837 Raised if a resolved `DatasetRef` was passed as an input, but it
838 differs from the one found in the registry in this collection.
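 Examples
 --------
 A sketch; the dataset type name and data ID are illustrative::

     # Delete the files but keep the Registry record (the default).
     butler.remove("calexp", dataId)

     # Delete the files and forget the dataset entirely.
     butler.remove("calexp", dataId, remember=False)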
839 """
840 if not self.isWriteable():
841 raise TypeError("Butler is read-only.")
842 ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
843 if delete:
844 # There is a difference between a concrete composite and virtual
845 # composite. In a virtual composite the datastore is never
846 # given the top level DatasetRef. In the concrete composite
847 # the datastore knows all the refs and will clean up itself
848 # if asked to remove the parent ref.
 849 # We cannot check the configuration for this since we cannot trust
 850 # that the configuration is the same. We therefore have to ask
 851 # whether the ref exists or not.
852 if self.datastore.exists(ref):
853 self.datastore.remove(ref)
854 elif ref.isComposite():
855 datastoreNames = set(self.datastore.names)
856 for r in ref.components.values():
857 # If a dataset was removed previously but remembered
858 # in registry, skip the removal in the datastore.
859 datastoreLocations = self.registry.getDatasetLocations(r)
860 if datastoreLocations & datastoreNames:
861 self.datastore.remove(r)
862 else:
863 raise FileNotFoundError(f"Dataset {ref} not known to datastore")
864 elif not remember:
865 raise ValueError("Cannot retain dataset in Datastore without keeping Registry dataset record.")
866 if remember:
867 self.registry.disassociate(self.collection, [ref])
868 else:
869 # This also implicitly disassociates.
870 self.registry.removeDataset(ref)
872 @transactional
873 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None, run: Optional[str] = None):
874 """Store and register one or more datasets that already exist on disk.
876 Parameters
877 ----------
878 datasets : `FileDataset`
879 Each positional argument is a struct containing information about
880 a file to be ingested, including its path (either absolute or
881 relative to the datastore root, if applicable), a `DatasetRef`,
882 and optionally a formatter class or its fully-qualified string
883 name. If a formatter is not provided, the formatter that would be
884 used for `put` is assumed. On successful return, all
885 `FileDataset.ref` attributes will have their `DatasetRef.id`
886 attribute populated and all `FileDataset.formatter` attributes will
887 be set to the formatter class used. `FileDataset.path` attributes
888 may be modified to put paths in whatever the datastore considers a
889 standardized form.
890 transfer : `str`, optional
891 If not `None`, must be one of 'move', 'copy', 'hardlink', or
892 'symlink', indicating how to transfer the file.
893 run : `str`, optional
894 The name of the run ingested datasets should be added to,
895 overriding ``self.run``.
897 Raises
898 ------
899 TypeError
900 Raised if the butler is read-only or if no run was provided.
901 NotImplementedError
902 Raised if the `Datastore` does not support the given transfer mode.
903 DatasetTypeNotSupportedError
904 Raised if one or more files to be ingested have a dataset type that
 905 is not supported by the `Datastore`.
906 FileNotFoundError
907 Raised if one of the given files does not exist.
908 FileExistsError
909 Raised if transfer is not `None` but the (internal) location the
910 file would be moved to is already occupied.
912 Notes
913 -----
914 This operation is not fully exception safe: if a database operation
915 fails, the given `FileDataset` instances may be only partially updated.
917 It is atomic in terms of database operations (they will either all
 918 succeed or all fail) provided the database engine implements
919 transactions correctly. It will attempt to be atomic in terms of
920 filesystem operations as well, but this cannot be implemented
921 rigorously for most datastores.
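 Examples
 --------
 A sketch assuming ``FileDataset`` accepts ``path`` and ``refs`` keyword
 arguments and that a ``raw`` dataset type is registered; the file path and
 data ID are illustrative::

     rawType = butler.registry.getDatasetType("raw")
     ref = DatasetRef(rawType,
                      {"instrument": "HSC", "exposure": 903334, "detector": 16},
                      conform=False)
     butler.ingest(FileDataset(path="/data/raw_file.fits", refs=[ref]),
                   transfer="symlink")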
922 """
923 if not self.isWriteable():
924 raise TypeError("Butler is read-only.")
925 if run is None:
926 if self.run is None:
927 raise TypeError("No run provided.")
928 run = self.run
930 # Reorganize the inputs so they're grouped by DatasetType and then
931 # data ID. We also include a list of DatasetRefs for each FileDataset
932 # to hold the resolved DatasetRefs returned by the Registry, before
933 # it's safe to swap them into FileDataset.refs.
934 # Some type annotation aliases to make that clearer:
935 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
936 GroupedData = MutableMapping[DatasetType, GroupForType]
937 # The actual data structure:
938 groupedData: GroupedData = defaultdict(dict)
939 # And the nested loop that populates it:
940 for dataset in datasets:
 941 # This list is intentionally shared across the inner loop, since it's
942 # associated with `dataset`.
943 resolvedRefs = []
944 for ref in dataset.refs:
945 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
947 # Now we can bulk-insert into Registry for each DatasetType.
948 for datasetType, groupForType in groupedData.items():
949 refs = self.registry.insertDatasets(datasetType,
950 dataIds=groupForType.keys(),
951 run=run,
952 recursive=True)
953 # Append those resolved DatasetRefs to the new lists we set up for
954 # them.
955 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
956 resolvedRefs.append(ref)
958 # Go back to the original FileDatasets to replace their refs with the
959 # new resolved ones.
960 for groupForType in groupedData.values():
961 for dataset, resolvedRefs in groupForType.values():
962 dataset.refs = resolvedRefs
964 # Bulk-insert everything into Datastore.
965 self.datastore.ingest(*datasets, transfer=transfer)
967 @contextlib.contextmanager
968 def export(self, *, directory: Optional[str] = None,
969 filename: Optional[str] = None,
970 format: Optional[str] = None,
971 transfer: Optional[str] = None) -> ContextManager[RepoExport]:
972 """Export datasets from the repository represented by this `Butler`.
974 This method is a context manager that returns a helper object
975 (`RepoExport`) that is used to indicate what information from the
976 repository should be exported.
978 Parameters
979 ----------
980 directory : `str`, optional
981 Directory dataset files should be written to if ``transfer`` is not
982 `None`.
983 filename : `str`, optional
984 Name for the file that will include database information associated
985 with the exported datasets. If this is not an absolute path and
986 ``directory`` is not `None`, it will be written to ``directory``
987 instead of the current working directory. Defaults to
988 "export.{format}".
989 format : `str`, optional
990 File format for the database information file. If `None`, the
991 extension of ``filename`` will be used.
992 transfer : `str`, optional
993 Transfer mode passed to `Datastore.export`.
995 Raises
996 ------
997 TypeError
998 Raised if the set of arguments passed is inconsistent.
1000 Examples
1001 --------
1002 Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
1003 methods are used to provide the iterables over data IDs and/or datasets
1004 to be exported::
1006 with butler.export("exports.yaml") as export:
1007 # Export all flats, and the calibration_label dimensions
1008 # associated with them.
1009 export.saveDatasets(butler.registry.queryDatasets("flat"),
1010 elements=[butler.registry.dimensions["calibration_label"]])
1011 # Export all datasets that start with "deepCoadd_" and all of
1012 # their associated data ID information.
1013 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1014 """
1015 if directory is None and transfer is not None:
1016 raise TypeError("Cannot transfer without providing a directory.")
1017 if transfer == "move":
1018 raise TypeError("Transfer may not be 'move': export is read-only")
1019 if format is None:
1020 if filename is None:
1021 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1022 else:
1023 _, format = os.path.splitext(filename)
1024 elif filename is None:
1025 filename = f"export.{format}"
1026 if directory is not None:
1027 filename = os.path.join(directory, filename)
1028 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
1029 with open(filename, 'w') as stream:
1030 backend = BackendClass(stream)
1031 try:
1032 helper = RepoExport(self.registry, self.datastore, backend=backend,
1033 directory=directory, transfer=transfer)
1034 yield helper
1035 except BaseException:
1036 raise
1037 else:
1038 helper._finish()
1040 def import_(self, *, directory: Optional[str] = None,
1041 filename: Optional[str] = None,
1042 format: Optional[str] = None,
1043 transfer: Optional[str] = None):
1044 """Import datasets exported from a different butler repository.
1046 Parameters
1047 ----------
1048 directory : `str`, optional
1049 Directory containing dataset files. If `None`, all file paths
1050 must be absolute.
1051 filename : `str`, optional
1052 Name for the file that containing database information associated
1053 with the exported datasets. If this is not an absolute path, does
1054 not exist in the current working directory, and ``directory`` is
1055 not `None`, it is assumed to be in ``directory``. Defaults to
1056 "export.{format}".
1057 format : `str`, optional
1058 File format for the database information file. If `None`, the
1059 extension of ``filename`` will be used.
1060 transfer : `str`, optional
 1061 Transfer mode passed to `Datastore.ingest`.
1063 Raises
1064 ------
1065 TypeError
1066 Raised if the set of arguments passed is inconsistent, or if the
1067 butler is read-only.
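 Examples
 --------
 A sketch assuming an export file produced by `Butler.export`; the
 directory and filename are illustrative::

     butler.import_(directory="/path/to/exported/data",
                    filename="export.yaml", transfer="copy")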
1068 """
1069 if not self.isWriteable():
1070 raise TypeError("Butler is read-only.")
1071 if format is None:
1072 if filename is None:
1073 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1074 else:
1075 _, format = os.path.splitext(filename)
1076 elif filename is None:
1077 filename = f"export.{format}"
1078 if directory is not None and not os.path.exists(filename):
1079 filename = os.path.join(directory, filename)
1080 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1081 with open(filename, 'r') as stream:
1082 backend = BackendClass(stream, self.registry)
1083 backend.register()
1084 with self.transaction():
1085 backend.load(self.datastore, directory=directory, transfer=transfer)
1087 def validateConfiguration(self, logFailures: bool = False,
1088 datasetTypeNames: Optional[Iterable[str]] = None,
 1089 ignore: Optional[Iterable[str]] = None):
1090 """Validate butler configuration.
1092 Checks that each `DatasetType` can be stored in the `Datastore`.
1094 Parameters
1095 ----------
1096 logFailures : `bool`, optional
1097 If `True`, output a log message for every validation error
1098 detected.
1099 datasetTypeNames : iterable of `str`, optional
1100 The `DatasetType` names that should be checked. This allows
1101 only a subset to be selected.
1102 ignore : iterable of `str`, optional
1103 Names of DatasetTypes to skip over. This can be used to skip
1104 known problems. If a named `DatasetType` corresponds to a
 1105 composite, all components of that `DatasetType` will also be
1106 ignored.
1108 Raises
1109 ------
1110 ButlerValidationError
1111 Raised if there is some inconsistency with how this Butler
1112 is configured.
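 Examples
 --------
 A sketch restricting the check to a few dataset types and logging each
 problem found; the names are illustrative::

     butler.validateConfiguration(logFailures=True,
                                  datasetTypeNames=["calexp", "src"],
                                  ignore=["packages"])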
1113 """
1114 if datasetTypeNames:
1115 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1116 else:
1117 entities = list(self.registry.getAllDatasetTypes())
1119 # filter out anything from the ignore list
1120 if ignore:
1121 ignore = set(ignore)
1122 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1123 else:
1124 ignore = set()
1126 # Find all the registered instruments
1127 instruments = set(
1128 dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
1129 )
1131 # For each datasetType that has an instrument dimension, create
1132 # a DatasetRef for each defined instrument
1133 datasetRefs = []
1135 for datasetType in entities:
1136 if "instrument" in datasetType.dimensions:
1137 for instrument in instruments:
1138 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1139 datasetRefs.append(datasetRef)
1141 entities.extend(datasetRefs)
1143 datastoreErrorStr = None
1144 try:
1145 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1146 except ValidationError as e:
1147 datastoreErrorStr = str(e)
1149 # Also check that the LookupKeys used by the datastores match
1150 # registry and storage class definitions
1151 keys = self.datastore.getLookupKeys()
1153 failedNames = set()
1154 failedDataId = set()
1155 for key in keys:
1156 datasetType = None
1157 if key.name is not None:
1158 if key.name in ignore:
1159 continue
1161 # skip if specific datasetType names were requested and this
1162 # name does not match
1163 if datasetTypeNames and key.name not in datasetTypeNames:
1164 continue
1166 # See if it is a StorageClass or a DatasetType
1167 if key.name in self.storageClasses:
1168 pass
1169 else:
1170 try:
1171 self.registry.getDatasetType(key.name)
1172 except KeyError:
1173 if logFailures:
1174 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1175 failedNames.add(key)
1176 else:
1177 # Dimensions are checked for consistency when the Butler
1178 # is created and rendezvoused with a universe.
1179 pass
 1181 # Check that the instrument is a valid instrument.
 1182 # Currently only "instrument" is supported as a DataId override, so check for that.
1183 if key.dataId:
1184 dataIdKeys = set(key.dataId)
1185 if set(["instrument"]) != dataIdKeys:
1186 if logFailures:
1187 log.fatal("Key '%s' has unsupported DataId override", key)
1188 failedDataId.add(key)
1189 elif key.dataId["instrument"] not in instruments:
1190 if logFailures:
1191 log.fatal("Key '%s' has unknown instrument", key)
1192 failedDataId.add(key)
1194 messages = []
1196 if datastoreErrorStr:
1197 messages.append(datastoreErrorStr)
1199 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1200 (failedDataId, "Keys with bad DataId entries: ")):
1201 if failed:
1202 msg += ", ".join(str(k) for k in failed)
1203 messages.append(msg)
1205 if messages:
1206 raise ValidationError(";\n".join(messages))
1208 registry: Registry
1209 """The object that manages dataset metadata and relationships (`Registry`).
1211 Most operations that don't involve reading or writing butler datasets are
1212 accessible only via `Registry` methods.
1213 """
1215 datastore: Datastore
1216 """The object that manages actual dataset storage (`Datastore`).
1218 Direct user access to the datastore should rarely be necessary; the primary
1219 exception is the case where a `Datastore` implementation provides extra
1220 functionality beyond what the base class defines.
1221 """
1223 storageClasses: StorageClassFactory
1224 """An object that maps known storage class names to objects that fully
1225 describe them (`StorageClassFactory`).
1226 """
1228 run: Optional[str]
1229 """Name of the run this butler writes outputs to (`str` or `None`).
1230 """
1232 collection: Optional[str]
1233 """Name of the collection this butler searches for datasets (`str` or
1234 `None`).
1235 """