Coverage for python/lsst/daf/butler/_butler.py : 10%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
Butler top level classes.
"""
from __future__ import annotations

__all__ = ("Butler", "ButlerValidationError")

import os
from collections import defaultdict
import contextlib
import logging
from typing import (
    Any,
    ClassVar,
    ContextManager,
    Dict,
    Iterable,
    List,
    MutableMapping,
    Optional,
    Tuple,
    Union,
)

try:
    import boto3
except ImportError:
    boto3 = None

from lsst.utils import doImport
from .core import (
    ButlerURI,
    CompositesMap,
    Config,
    ConfigSubset,
    DataCoordinate,
    DataId,
    DatasetRef,
    DatasetType,
    Datastore,
    FileDataset,
    Quantum,
    RepoExport,
    StorageClassFactory,
    ValidationError,
)
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.safeFileIo import safeMakeDir
from .core.utils import transactional, getClassOf
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._butlerConfig import ButlerConfig
from .registry import Registry, RegistryConfig

log = logging.getLogger(__name__)


class ButlerValidationError(ValidationError):
    """There is a problem with the Butler configuration."""
    pass


class Butler:
    """Main entry point for the data access system.

    Attributes
    ----------
    config : `str`, `ButlerConfig` or `Config`, optional
        Configuration, or the path to a configuration file. If this is not a
        `ButlerConfig`, defaults will be read. If a `str`, may be the path to
        a directory containing a "butler.yaml" file.
    datastore : `Datastore`
        Datastore to use for storage.
    registry : `Registry`
        Registry to use for lookups.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration. Anything acceptable to the `ButlerConfig` constructor.
        If a directory path is given the configuration will be read from a
        ``butler.yaml`` file in that location. If `None` is given default
        values will be used.
    butler : `Butler`, optional
        If provided, construct a new Butler that uses the same registry and
        datastore as the given one, but with the given collection and run.
        Incompatible with the ``config``, ``searchPaths``, and ``writeable``
        arguments.
    collection : `str`, optional
        Collection to use for all input lookups. May be `None` to either use
        the value passed to ``run``, or to defer passing a collection until
        the methods that require one are called.
    run : `str`, optional
        Name of the run datasets should be output to; also used as a tagged
        collection name these datasets will be associated with. If the run
        does not exist, it will be created. If ``collection`` is `None`, this
        collection will be used for input lookups as well; if not, it must
        have the same value as ``run``.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration. Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations. If not
        provided, a read-only butler is created unless ``run`` is passed.

    Raises
    ------
    ValueError
        Raised if neither "collection" nor "run" is provided by argument or
        config, or if both are provided and are inconsistent.
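
    Examples
    --------
    A minimal sketch of typical construction (the repository path, collection,
    and run names here are purely illustrative)::

        # Read-only access, searching a single collection for inputs.
        butler = Butler("/path/to/repo", collection="shared/ci_hsc")

        # Writeable butler that records outputs in (and reads from) a run.
        butler = Butler("/path/to/repo", run="u/someone/my-run")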
132 """
133 def __init__(self, config: Union[Config, str, None] = None, *,
134 butler: Optional[Butler] = None,
135 collection: Optional[str] = None,
136 run: Optional[str] = None,
137 searchPaths: Optional[List[str]] = None,
138 writeable: Optional[bool] = None):
139 if butler is not None:
140 if config is not None or searchPaths is not None or writeable is not None:
141 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
142 "arguments with 'butler' argument.")
143 self.registry = butler.registry
144 self.datastore = butler.datastore
145 self.storageClasses = butler.storageClasses
146 self._composites = butler._composites
147 self._config = butler._config
148 else:
149 self._config = ButlerConfig(config, searchPaths=searchPaths)
150 if "root" in self._config:
151 butlerRoot = self._config["root"]
152 else:
153 butlerRoot = self._config.configDir
154 if writeable is None:
155 writeable = run is not None
156 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable)
157 self.datastore = Datastore.fromConfig(self._config, self.registry, butlerRoot=butlerRoot)
158 self.storageClasses = StorageClassFactory()
159 self.storageClasses.addFromConfig(self._config)
160 self._composites = CompositesMap(self._config, universe=self.registry.dimensions)
161 if "run" in self._config or "collection" in self._config:
162 raise ValueError("Passing a run or collection via configuration is no longer supported.")
163 if run is not None and writeable is False:
164 raise ValueError(f"Butler initialized with run='{run}', "
165 f"but is read-only; use collection='{run}' instead.")
166 self.run = run
167 if collection is None and run is not None:
168 collection = run
169 if self.run is not None and collection != self.run:
170 raise ValueError(
171 "Run ({}) and collection ({}) are inconsistent.".format(self.run, collection)
172 )
173 self.collection = collection
174 if self.run is not None:
175 self.registry.registerRun(self.run)

    GENERATION: ClassVar[int] = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

    @staticmethod
    def makeRepo(root: str, config: Union[Config, str, None] = None, standalone: bool = False,
                 createRegistry: bool = True, searchPaths: Optional[List[str]] = None,
                 forceConfigRoot: bool = True, outfile: Optional[str] = None,
                 overwrite: bool = False) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the new repository. Will be created
            if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options. Cannot
            be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
            configuration will be used. Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        standalone : `bool`
            If `True`, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        createRegistry : `bool`, optional
            If `True` create a new Registry.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config. This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `str`, optional
            If not-`None`, the output configuration will be written to this
            location rather than into the repository itself. Can be a URI
            string. Can refer to a directory that will be used to write
            ``butler.yaml``.
        overwrite : `bool`, optional
            Create a new configuration file even if one already exists
            in the specified output location. Default is to raise
            an exception.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        FileExistsError
            Raised if the output config file already exists.
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
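
        Examples
        --------
        A minimal sketch of creating a repository and then constructing a
        `Butler` against it (the root path and run name are illustrative)::

            Butler.makeRepo("/path/to/new/repo")
            butler = Butler("/path/to/new/repo", run="ingest/run")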
258 """
259 if isinstance(config, (ButlerConfig, ConfigSubset)):
260 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
262 # for "file" schemes we are assuming POSIX semantics for paths, for
263 # schemeless URIs we are assuming os.path semantics.
264 uri = ButlerURI(root)
265 if uri.scheme == "file" or not uri.scheme:
266 if not os.path.isdir(uri.ospath):
267 safeMakeDir(uri.ospath)
268 elif uri.scheme == "s3":
269 s3 = boto3.resource("s3")
270 # implies bucket exists, if not another level of checks
271 bucket = s3.Bucket(uri.netloc)
272 bucket.put_object(Bucket=uri.netloc, Key=uri.relativeToPathRoot)
273 else:
274 raise ValueError(f"Unrecognized scheme: {uri.scheme}")
275 config = Config(config)
277 # If we are creating a new repo from scratch with relative roots,
278 # do not propagate an explicit root from the config file
279 if "root" in config:
280 del config["root"]
282 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
283 datastoreClass = doImport(full["datastore", "cls"])
284 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
286 # if key exists in given config, parse it, otherwise parse the defaults
287 # in the expanded config
288 if config.get(("registry", "db")):
289 registryConfig = RegistryConfig(config)
290 else:
291 registryConfig = RegistryConfig(full)
292 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
293 if defaultDatabaseUri is not None:
294 Config.updateParameters(RegistryConfig, config, full,
295 toUpdate={"db": defaultDatabaseUri},
296 overwrite=forceConfigRoot)
297 else:
298 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
299 overwrite=forceConfigRoot)
301 if standalone:
302 config.merge(full)
303 if outfile is not None:
304 # When writing to a separate location we must include
305 # the root of the butler repo in the config else it won't know
306 # where to look.
307 config["root"] = uri.geturl()
308 configURI = outfile
309 else:
310 configURI = uri
311 config.dumpToUri(configURI, overwrite=overwrite)
313 # Create Registry and populate tables
314 Registry.fromConfig(config, create=createRegistry, butlerRoot=root)
315 return config

    @classmethod
    def _unpickle(cls, config: ButlerConfig, collection: str, run: Optional[str], writeable: bool) -> Butler:
        """Callable used to unpickle a Butler.

        We prefer not to use ``Butler.__init__`` directly so we can force some
        of its many arguments to be keyword-only (note that ``__reduce__``
        can only invoke callables with positional arguments).

        Parameters
        ----------
        config : `ButlerConfig`
            Butler configuration, already coerced into a true `ButlerConfig`
            instance (and hence after any search paths for overrides have been
            utilized).
        collection : `str`
            String name of a collection to use for read operations.
        run : `str`, optional
            String name of a run to use for write operations, or `None` for a
            read-only butler.
        writeable : `bool`
            Whether the new Butler should support write operations.

        Returns
        -------
        butler : `Butler`
            A new `Butler` instance.
        """
        return cls(config=config, collection=collection, run=run, writeable=writeable)

    def __reduce__(self):
        """Support pickling.
        """
        return (Butler._unpickle, (self._config, self.collection, self.run, self.registry.isWriteable()))

    def __str__(self):
        return "Butler(collection='{}', datastore='{}', registry='{}')".format(
            self.collection, self.datastore, self.registry)

    def isWriteable(self) -> bool:
        """Return `True` if this `Butler` supports write operations.
        """
        return self.registry.isWriteable()

    @contextlib.contextmanager
    def transaction(self):
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
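
        Examples
        --------
        A sketch of grouping several writes so that they are committed or
        rolled back together (dataset type names and data ID keys are
        illustrative)::

            with butler.transaction():
                butler.put(catalog, "src", instrument="DummyCam", visit=42)
                butler.put(exposure, "calexp", instrument="DummyCam", visit=42)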
363 """
364 with self.registry.transaction():
365 with self.datastore.transaction():
366 yield

    def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                         dataId: Optional[DataId] = None, **kwds: Any) -> Tuple[DatasetType, DataId]:
        """Standardize the arguments passed to several Butler APIs.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        datasetType : `DatasetType`
            A `DatasetType` instance extracted from ``datasetRefOrType``.
        dataId : `dict` or `DataId`, optional
            Argument that can be used (along with ``kwds``) to construct a
            `DataId`.

        Notes
        -----
        Butler APIs that conceptually need a DatasetRef also allow passing a
        `DatasetType` (or the name of one) and a `DataId` (or a dict and
        keyword arguments that can be used to construct one) separately. This
        method accepts those arguments and always returns a true `DatasetType`
        and a `DataId` or `dict`.

        Standardization of `dict` vs `DataId` is best handled by passing the
        returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
        generally similarly flexible.
        """
        externalDatasetType = None
        internalDatasetType = None
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None or kwds:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            externalDatasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
        else:
            # Don't check whether DataId is provided, because Registry APIs
            # can usually construct a better error message when it wasn't.
            if isinstance(datasetRefOrType, DatasetType):
                externalDatasetType = datasetRefOrType
            else:
                internalDatasetType = self.registry.getDatasetType(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
                                 f"registry definition ({internalDatasetType})")

        return internalDatasetType, dataId

    def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                        dataId: Optional[DataId] = None, *,
                        collection: Optional[str] = None,
                        allowUnresolved: bool = False,
                        **kwds: Any) -> DatasetRef:
        """Shared logic for methods that start with a search for a dataset in
        the registry.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collection : `str`, optional
            Name of the collection to search, overriding ``self.collection``.
        allowUnresolved : `bool`, optional
            If `True`, return an unresolved `DatasetRef` if finding a resolved
            one in the `Registry` fails. Defaults to `False`.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset identified by the given arguments.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
        """
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef):
            idNumber = datasetRefOrType.id
        else:
            idNumber = None
        # Expand the data ID first instead of letting registry.find do it, so
        # we get the result even if it returns None.
        dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
        if collection is None:
            collection = self.collection
            if collection is None:
                raise TypeError("No collection provided.")
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self.registry.find(collection, datasetType, dataId)
        if ref is None:
            if allowUnresolved:
                return DatasetRef(datasetType, dataId)
            else:
                raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
                                  f"could not be found in collection '{collection}'.")
        if idNumber is not None and idNumber != ref.id:
            raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
                             f"id ({ref.id}) in registry in collection '{collection}'.")
        return ref

    @transactional
    def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            producer: Optional[Quantum] = None,
            run: Optional[str] = None,
            **kwds: Any) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        producer : `Quantum`, optional
            The producer.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
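
        Examples
        --------
        A sketch of storing an in-memory object (the dataset type name and
        data ID keys are illustrative)::

            ref = butler.put(exposure, "calexp",
                             {"instrument": "DummyCam", "visit": 42})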
537 """
538 log.debug("Butler put: %s, dataId=%s, producer=%s, run=%s", datasetRefOrType, dataId, producer, run)
539 if not self.isWriteable():
540 raise TypeError("Butler is read-only.")
541 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
542 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
543 raise ValueError("DatasetRef must not be in registry, must have None id")
545 if run is None:
546 if self.run is None:
547 raise TypeError("No run provided.")
548 run = self.run
550 isVirtualComposite = self._composites.shouldBeDisassembled(datasetType)
552 # Add Registry Dataset entry. If not a virtual composite, add
553 # and attach components at the same time.
554 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
555 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId],
556 producer=producer, recursive=not isVirtualComposite)
558 # Check to see if this datasetType requires disassembly
559 if isVirtualComposite:
560 components = datasetType.storageClass.assembler().disassemble(obj)
561 for component, info in components.items():
562 compTypeName = datasetType.componentTypeName(component)
563 compRef = self.put(info.component, compTypeName, dataId, producer=producer, run=run)
564 self.registry.attachComponent(component, ref, compRef)
565 else:
566 # This is an entity without a disassembler.
567 self.datastore.put(obj, ref)
569 return ref

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None):
        """Retrieve a stored dataset.

        Unlike `Butler.get`, this method allows datasets outside the Butler's
        collection to be read as long as the `DatasetRef` that identifies them
        can be obtained separately.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.
        """
        # if the ref exists in the store we return it directly
        if self.datastore.exists(ref):
            return self.datastore.get(ref, parameters=parameters)
        elif ref.isComposite():
            # Check that we haven't got any unknown parameters
            ref.datasetType.storageClass.validateParameters(parameters)
            # Reconstruct the composite
            usedParams = set()
            components = {}
            for compName, compRef in ref.components.items():
                # make a dictionary of parameters containing only the subset
                # supported by the StorageClass of the components
                compParams = compRef.datasetType.storageClass.filterParameters(parameters)
                usedParams.update(set(compParams))
                components[compName] = self.datastore.get(compRef, parameters=compParams)

            # Any unused parameters will have to be passed to the assembler
            if parameters:
                unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
            else:
                unusedParams = {}

            # Assemble the components
            inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)
            return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
                                                                             parameters=unusedParams)
        else:
            # single entity in datastore
            raise FileNotFoundError(f"Unable to locate dataset '{ref}' in datastore {self.datastore.name}")

    def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                    dataId: Optional[DataId] = None, *,
                    parameters: Union[dict, None] = None,
                    collection: Optional[str] = None,
                    **kwds: Any) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collection : `str`, optional
            Name of the collection to search, overriding ``self.collection``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
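
        Examples
        --------
        A sketch of deferring a read; the dataset type name and data ID keys
        are illustrative, and retrieval through the handle is assumed to
        happen via its ``get`` method::

            handle = butler.getDeferred("calexp", instrument="DummyCam", visit=42)
            # later, when the pixels are actually needed:
            exposure = handle.get()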
663 """
664 ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
665 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)

    def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            parameters: Optional[Dict[str, Any]] = None,
            collection: Optional[str] = None,
            **kwds: Any) -> Any:
        """Retrieve a stored dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collection : `str`, optional
            Collection to search, overriding ``self.collection``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
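
        Examples
        --------
        A sketch of retrieving a dataset, optionally using StorageClass
        parameters to read only part of it (all names here are illustrative)::

            exposure = butler.get("calexp", instrument="DummyCam", visit=42)
            cutout = butler.get("calexp", dataId, parameters={"bbox": bbox})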
707 """
708 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
709 ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
710 return self.getDirect(ref, parameters=parameters)

    def getUri(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
               dataId: Optional[DataId] = None, *,
               predict: bool = False,
               collection: Optional[str] = None,
               run: Optional[str] = None,
               **kwds: Any) -> str:
        """Return the URI to the Dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collection : `str`, optional
            Collection to search, overriding ``self.collection``.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        LookupError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
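
        Examples
        --------
        A sketch of looking up the location of an existing dataset, and of
        predicting one that has not been written yet (all names here are
        illustrative)::

            uri = butler.getUri("calexp", instrument="DummyCam", visit=42)
            future = butler.getUri("calexp", dataId, predict=True, run="my/run")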
762 """
763 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, collection=collection,
764 **kwds)
765 if ref.id is None: # only possible if predict is True
766 if run is None:
767 run = self.run
768 if run is None:
769 raise TypeError("Cannot predict location with run=None.")
770 # Lie about ID, because we can't guess it, and only
771 # Datastore.getUri() will ever see it (and it doesn't use it).
772 ref = ref.resolved(id=0, run=self.run)
773 return self.datastore.getUri(ref, predict)

    def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                      dataId: Optional[DataId] = None, *,
                      collection: Optional[str] = None,
                      **kwds: Any) -> bool:
        """Return True if the Dataset is actually present in the Datastore.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collection : `str`, optional
            Collection to search, overriding ``self.collection``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Raises
        ------
        LookupError
            Raised if the dataset is not even present in the Registry.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        TypeError
            Raised if ``collection`` and ``self.collection`` are both `None`.
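
        Examples
        --------
        A sketch of checking for a dataset before reading it (the dataset type
        name and data ID keys are illustrative)::

            if butler.datasetExists("calexp", instrument="DummyCam", visit=42):
                exposure = butler.get("calexp", instrument="DummyCam", visit=42)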
806 """
807 ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
808 return self.datastore.exists(ref)

    def remove(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
               dataId: Optional[DataId] = None, *,
               delete: bool = True, remember: bool = True, collection: Optional[str] = None, **kwds: Any):
        """Remove a dataset from the collection and possibly the repository.

        The identified dataset is always at least removed from the Butler's
        collection. By default it is also deleted from the Datastore (e.g.
        files are actually deleted), but the dataset is "remembered" by
        retaining its row in the dataset and provenance tables in the registry.

        If the dataset is a composite, all components will also be removed.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataId`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        delete : `bool`
            If `True` (default) actually delete the dataset from the
            Datastore (i.e. actually remove files).
        remember : `bool`
            If `True` (default), retain dataset and provenance records in
            the `Registry` for this dataset.
        collection : `str`, optional
            Collection to search, overriding ``self.collection``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Raises
        ------
        TypeError
            Raised if the butler is read-only, if no collection was provided,
            or if ``delete`` and ``remember`` are both `False`; a dataset
            cannot remain in a `Datastore` if its `Registry` entries are
            removed.
        OrphanedRecordError
            Raised if ``remember`` is `False` but the dataset is still present
            in a `Datastore` not recognized by this `Butler` client.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry in this collection.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        ref = self._findDatasetRef(datasetRefOrType, dataId, collection=collection, **kwds)
        if delete:
            # There is a difference between a concrete composite and virtual
            # composite. In a virtual composite the datastore is never
            # given the top level DatasetRef. In the concrete composite
            # the datastore knows all the refs and will clean up itself
            # if asked to remove the parent ref.
            # We can not check configuration for this since we can not trust
            # that the configuration is the same. We therefore have to ask
            # if the ref exists or not.
            if self.datastore.exists(ref):
                self.datastore.remove(ref)
            elif ref.isComposite():
                datastoreNames = set(self.datastore.names)
                for r in ref.components.values():
                    # If a dataset was removed previously but remembered
                    # in registry, skip the removal in the datastore.
                    datastoreLocations = self.registry.getDatasetLocations(r)
                    if datastoreLocations & datastoreNames:
                        self.datastore.remove(r)
            else:
                raise FileNotFoundError(f"Dataset {ref} not known to datastore")
        elif not remember:
            raise ValueError("Cannot retain dataset in Datastore without keeping Registry dataset record.")
        if remember:
            self.registry.disassociate(self.collection, [ref])
        else:
            # This also implicitly disassociates.
            self.registry.removeDataset(ref)

    @transactional
    def ingest(self, *datasets: FileDataset, transfer: Optional[str] = None, run: Optional[str] = None):
        """Store and register one or more datasets that already exist on disk.

        Parameters
        ----------
        datasets : `FileDataset`
            Each positional argument is a struct containing information about
            a file to be ingested, including its path (either absolute or
            relative to the datastore root, if applicable), a `DatasetRef`,
            and optionally a formatter class or its fully-qualified string
            name. If a formatter is not provided, the formatter that would be
            used for `put` is assumed. On successful return, all
            `FileDataset.refs` attributes will have their `DatasetRef.id`
            attribute populated and all `FileDataset.formatter` attributes will
            be set to the formatter class used. `FileDataset.path` attributes
            may be modified to put paths in whatever the datastore considers a
            standardized form.
        transfer : `str`, optional
            If not `None`, must be one of 'move', 'copy', 'hardlink', or
            'symlink', indicating how to transfer the file.
        run : `str`, optional
            The name of the run ingested datasets should be added to,
            overriding ``self.run``.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run was provided.
        NotImplementedError
            Raised if the `Datastore` does not support the given transfer mode.
        DatasetTypeNotSupportedError
            Raised if one or more files to be ingested have a dataset type that
            is not supported by the `Datastore`.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.

        Notes
        -----
        This operation is not fully exception safe: if a database operation
        fails, the given `FileDataset` instances may be only partially updated.

        It is atomic in terms of database operations (they will either all
        succeed or all fail) provided the database engine implements
        transactions correctly. It will attempt to be atomic in terms of
        filesystem operations as well, but this cannot be implemented
        rigorously for most datastores.
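
        Examples
        --------
        A sketch of ingesting a single existing file by symlinking it into the
        datastore; the path, dataset type, and data ID are illustrative, and
        `FileDataset` is assumed to accept a path together with one or more
        unresolved `DatasetRef` objects::

            ref = DatasetRef(rawType, {"instrument": "DummyCam", "exposure": 1})
            butler.ingest(FileDataset(path="/data/raw_0001.fits", refs=ref),
                          transfer="symlink")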
939 """
940 if not self.isWriteable():
941 raise TypeError("Butler is read-only.")
942 if run is None:
943 if self.run is None:
944 raise TypeError("No run provided.")
945 run = self.run
947 # Reorganize the inputs so they're grouped by DatasetType and then
948 # data ID. We also include a list of DatasetRefs for each FileDataset
949 # to hold the resolved DatasetRefs returned by the Registry, before
950 # it's safe to swap them into FileDataset.refs.
951 # Some type annotation aliases to make that clearer:
952 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]]
953 GroupedData = MutableMapping[DatasetType, GroupForType]
954 # The actual data structure:
955 groupedData: GroupedData = defaultdict(dict)
956 # And the nested loop that populates it:
957 for dataset in datasets:
958 # This list intentionally shared across the inner loop, since it's
959 # associated with `dataset`.
960 resolvedRefs = []
961 for ref in dataset.refs:
962 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs)
964 # Now we can bulk-insert into Registry for each DatasetType.
965 for datasetType, groupForType in groupedData.items():
966 refs = self.registry.insertDatasets(datasetType,
967 dataIds=groupForType.keys(),
968 run=run,
969 recursive=True)
970 # Append those resolved DatasetRefs to the new lists we set up for
971 # them.
972 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()):
973 resolvedRefs.append(ref)
975 # Go back to the original FileDatasets to replace their refs with the
976 # new resolved ones.
977 for groupForType in groupedData.values():
978 for dataset, resolvedRefs in groupForType.values():
979 dataset.refs = resolvedRefs
981 # Bulk-insert everything into Datastore.
982 self.datastore.ingest(*datasets, transfer=transfer)

    @contextlib.contextmanager
    def export(self, *, directory: Optional[str] = None,
               filename: Optional[str] = None,
               format: Optional[str] = None,
               transfer: Optional[str] = None) -> ContextManager[RepoExport]:
        """Export datasets from the repository represented by this `Butler`.

        This method is a context manager that returns a helper object
        (`RepoExport`) that is used to indicate what information from the
        repository should be exported.

        Parameters
        ----------
        directory : `str`, optional
            Directory dataset files should be written to if ``transfer`` is not
            `None`.
        filename : `str`, optional
            Name for the file that will include database information associated
            with the exported datasets. If this is not an absolute path and
            ``directory`` is not `None`, it will be written to ``directory``
            instead of the current working directory. Defaults to
            "export.{format}".
        format : `str`, optional
            File format for the database information file. If `None`, the
            extension of ``filename`` will be used.
        transfer : `str`, optional
            Transfer mode passed to `Datastore.export`.

        Raises
        ------
        TypeError
            Raised if the set of arguments passed is inconsistent.

        Examples
        --------
        Typically the `Registry.queryDimensions` and `Registry.queryDatasets`
        methods are used to provide the iterables over data IDs and/or datasets
        to be exported::

            with butler.export(filename="exports.yaml") as export:
                # Export all flats, and the calibration_label dimensions
                # associated with them.
                export.saveDatasets(butler.registry.queryDatasets("flat"),
                                    elements=[butler.registry.dimensions["calibration_label"]])
                # Export all datasets that start with "deepCoadd_" and all of
                # their associated data ID information.
                export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
        """
        if directory is None and transfer is not None:
            raise TypeError("Cannot transfer without providing a directory.")
        if transfer == "move":
            raise TypeError("Transfer may not be 'move': export is read-only")
        if format is None:
            if filename is None:
                raise TypeError("At least one of 'filename' or 'format' must be provided.")
            else:
                _, format = os.path.splitext(filename)
        elif filename is None:
            filename = f"export.{format}"
        if directory is not None:
            filename = os.path.join(directory, filename)
        BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"])
        with open(filename, 'w') as stream:
            backend = BackendClass(stream)
            try:
                helper = RepoExport(self.registry, self.datastore, backend=backend,
                                    directory=directory, transfer=transfer)
                yield helper
            except BaseException:
                raise
            else:
                helper._finish()

    def import_(self, *, directory: Optional[str] = None,
                filename: Optional[str] = None,
                format: Optional[str] = None,
                transfer: Optional[str] = None):
        """Import datasets exported from a different butler repository.

        Parameters
        ----------
        directory : `str`, optional
            Directory containing dataset files. If `None`, all file paths
            must be absolute.
        filename : `str`, optional
            Name of the file containing database information associated
            with the exported datasets. If this is not an absolute path, does
            not exist in the current working directory, and ``directory`` is
            not `None`, it is assumed to be in ``directory``. Defaults to
            "export.{format}".
        format : `str`, optional
            File format for the database information file. If `None`, the
            extension of ``filename`` will be used.
        transfer : `str`, optional
            Transfer mode passed to `Datastore.export`.

        Raises
        ------
        TypeError
            Raised if the set of arguments passed is inconsistent, or if the
            butler is read-only.
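
        Examples
        --------
        A sketch of loading a previously exported repository subset (the
        directory and file names are illustrative)::

            butler.import_(directory="/path/to/exported/data",
                           filename="exports.yaml", transfer="symlink")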
1085 """
1086 if not self.isWriteable():
1087 raise TypeError("Butler is read-only.")
1088 if format is None:
1089 if filename is None:
1090 raise TypeError("At least one of 'filename' or 'format' must be provided.")
1091 else:
1092 _, format = os.path.splitext(filename)
1093 elif filename is None:
1094 filename = f"export.{format}"
1095 if directory is not None and not os.path.exists(filename):
1096 filename = os.path.join(directory, filename)
1097 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"])
1098 with open(filename, 'r') as stream:
1099 backend = BackendClass(stream, self.registry)
1100 backend.register()
1101 with self.transaction():
1102 backend.load(self.datastore, directory=directory, transfer=transfer)

    def validateConfiguration(self, logFailures: bool = False,
                              datasetTypeNames: Optional[Iterable[str]] = None,
                              ignore: Optional[Iterable[str]] = None):
        """Validate butler configuration.

        Checks that each `DatasetType` can be stored in the `Datastore`.

        Parameters
        ----------
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.
        datasetTypeNames : iterable of `str`, optional
            The `DatasetType` names that should be checked. This allows
            only a subset to be selected.
        ignore : iterable of `str`, optional
            Names of DatasetTypes to skip over. This can be used to skip
            known problems. If a named `DatasetType` corresponds to a
            composite, all components of that `DatasetType` will also be
            ignored.

        Raises
        ------
        ButlerValidationError
            Raised if there is some inconsistency with how this Butler
            is configured.
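
        Examples
        --------
        A sketch of validating a subset of dataset types while skipping a
        known problem (the dataset type names are illustrative)::

            butler.validateConfiguration(logFailures=True,
                                         datasetTypeNames=["calexp", "src"],
                                         ignore=["raw"])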
1130 """
1131 if datasetTypeNames:
1132 entities = [self.registry.getDatasetType(name) for name in datasetTypeNames]
1133 else:
1134 entities = list(self.registry.getAllDatasetTypes())
1136 # filter out anything from the ignore list
1137 if ignore:
1138 ignore = set(ignore)
1139 entities = [e for e in entities if e.name not in ignore and e.nameAndComponent()[0] not in ignore]
1140 else:
1141 ignore = set()
1143 # Find all the registered instruments
1144 instruments = set(
1145 dataId["instrument"] for dataId in self.registry.queryDimensions(["instrument"])
1146 )
1148 # For each datasetType that has an instrument dimension, create
1149 # a DatasetRef for each defined instrument
1150 datasetRefs = []
1152 for datasetType in entities:
1153 if "instrument" in datasetType.dimensions:
1154 for instrument in instruments:
1155 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, conform=False)
1156 datasetRefs.append(datasetRef)
1158 entities.extend(datasetRefs)
1160 datastoreErrorStr = None
1161 try:
1162 self.datastore.validateConfiguration(entities, logFailures=logFailures)
1163 except ValidationError as e:
1164 datastoreErrorStr = str(e)
1166 # Also check that the LookupKeys used by the datastores match
1167 # registry and storage class definitions
1168 keys = self.datastore.getLookupKeys()
1170 failedNames = set()
1171 failedDataId = set()
1172 for key in keys:
1173 datasetType = None
1174 if key.name is not None:
1175 if key.name in ignore:
1176 continue
1178 # skip if specific datasetType names were requested and this
1179 # name does not match
1180 if datasetTypeNames and key.name not in datasetTypeNames:
1181 continue
1183 # See if it is a StorageClass or a DatasetType
1184 if key.name in self.storageClasses:
1185 pass
1186 else:
1187 try:
1188 self.registry.getDatasetType(key.name)
1189 except KeyError:
1190 if logFailures:
1191 log.fatal("Key '%s' does not correspond to a DatasetType or StorageClass", key)
1192 failedNames.add(key)
1193 else:
1194 # Dimensions are checked for consistency when the Butler
1195 # is created and rendezvoused with a universe.
1196 pass
1198 # Check that the instrument is a valid instrument
1199 # Currently only support instrument so check for that
1200 if key.dataId:
1201 dataIdKeys = set(key.dataId)
1202 if set(["instrument"]) != dataIdKeys:
1203 if logFailures:
1204 log.fatal("Key '%s' has unsupported DataId override", key)
1205 failedDataId.add(key)
1206 elif key.dataId["instrument"] not in instruments:
1207 if logFailures:
1208 log.fatal("Key '%s' has unknown instrument", key)
1209 failedDataId.add(key)
1211 messages = []
1213 if datastoreErrorStr:
1214 messages.append(datastoreErrorStr)
1216 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
1217 (failedDataId, "Keys with bad DataId entries: ")):
1218 if failed:
1219 msg += ", ".join(str(k) for k in failed)
1220 messages.append(msg)
1222 if messages:
1223 raise ValidationError(";\n".join(messages))

    registry: Registry
    """The object that manages dataset metadata and relationships (`Registry`).

    Most operations that don't involve reading or writing butler datasets are
    accessible only via `Registry` methods.
    """

    datastore: Datastore
    """The object that manages actual dataset storage (`Datastore`).

    Direct user access to the datastore should rarely be necessary; the primary
    exception is the case where a `Datastore` implementation provides extra
    functionality beyond what the base class defines.
    """

    storageClasses: StorageClassFactory
    """An object that maps known storage class names to objects that fully
    describe them (`StorageClassFactory`).
    """

    run: Optional[str]
    """Name of the run this butler writes outputs to (`str` or `None`).
    """

    collection: Optional[str]
    """Name of the collection this butler searches for datasets (`str` or
    `None`).
    """