Coverage for python/lsst/daf/butler/_butler.py: 64%
140 statements
coverage.py v7.3.2, created at 2023-11-04 09:46 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ["Butler"]
32from abc import abstractmethod
33from collections.abc import Collection, Iterable, Sequence
34from contextlib import AbstractContextManager
35from typing import Any, TextIO
37from lsst.resources import ResourcePath, ResourcePathExpression
38from lsst.utils import doImportType
39from lsst.utils.logging import getLogger
41from ._butler_config import ButlerConfig
42from ._butler_repo_index import ButlerRepoIndex
43from ._config import Config, ConfigSubset
44from ._dataset_existence import DatasetExistence
45from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
46from ._dataset_type import DatasetType
47from ._deferredDatasetHandle import DeferredDatasetHandle
48from ._file_dataset import FileDataset
49from ._limited_butler import LimitedButler
50from ._storage_class import StorageClass
51from ._timespan import Timespan
52from .datastore import DatasetRefURIs, Datastore
53from .dimensions import DataId, DimensionConfig
54from .registry import Registry, RegistryConfig, _RegistryFactory
55from .repo_relocation import BUTLER_ROOT_TAG
56from .transfers import RepoExportContext
58_LOG = getLogger(__name__)
61class Butler(LimitedButler):
62 """Interface for data butler and factory for Butler instances.
64 Parameters
65 ----------
66 config : `ButlerConfig`, `Config` or `str`, optional
67 Configuration. Anything acceptable to the `ButlerConfig` constructor.
68 If a directory path is given the configuration will be read from a
69 ``butler.yaml`` file in that location. If `None` is given default
70 values will be used. If ``config`` contains a "cls" key, its value is
71 used as the name of the butler class, which must be a subclass of this
72 class; otherwise `DirectButler` is instantiated.
73 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
74 An expression specifying the collections to be searched (in order) when
75 reading datasets.
76 This may be a `str` collection name or an iterable thereof.
77 See :ref:`daf_butler_collection_expressions` for more information.
78 These collections are not registered automatically and must be
79 manually registered before they are used by any method, but they may be
80 manually registered after the `Butler` is initialized.
81 run : `str`, optional
82 Name of the `~CollectionType.RUN` collection new datasets should be
83 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
84 ``collections`` will be set to ``[run]``. If not `None`, this
85 collection will automatically be registered. If this is not set (and
86 ``writeable`` is not set either), a read-only butler will be created.
87 searchPaths : `list` of `str`, optional
88 Directory paths to search when calculating the full Butler
89 configuration. Not used if the supplied config is already a
90 `ButlerConfig`.
91 writeable : `bool`, optional
92 Explicitly sets whether the butler supports write operations. If not
93 provided, a read-write butler is created if any of ``run``, ``tags``,
94 or ``chains`` is non-empty.
95 inferDefaults : `bool`, optional
96 If `True` (default) infer default data ID values from the values
97 present in the datasets in ``collections``: if all collections have the
98 same value (or no value) for a governor dimension, that value will be
99 the default for that dimension. Nonexistent collections are ignored.
100 If a default value is provided explicitly for a governor dimension via
101 ``**kwargs``, no default will be inferred for that dimension.
102 **kwargs : `Any`
103 Additional keyword arguments passed to the constructor of the actual
104 butler class.
106 Notes
107 -----
108 The preferred way to instantiate Butler is via the `from_config` method.
109 The call to ``Butler(...)`` is equivalent to ``Butler.from_config(...)``,
110 but ``mypy`` will complain about the former.
111 """
113 def __new__(
114 cls,
115 config: Config | ResourcePathExpression | None = None,
116 *,
117 collections: Any = None,
118 run: str | None = None,
119 searchPaths: Sequence[ResourcePathExpression] | None = None,
120 writeable: bool | None = None,
121 inferDefaults: bool = True,
122 **kwargs: Any,
123 ) -> Butler:
124 if cls is Butler:
125 cls = cls._find_butler_class(config, searchPaths)
126 # Note: we do not pass any parameters to __new__; Python will pass them
127 # to __init__ after __new__ returns the subclass instance.
128 return super().__new__(cls)
130 @staticmethod
131 def _find_butler_class(
132 config: Config | ResourcePathExpression | None = None,
133 searchPaths: Sequence[ResourcePathExpression] | None = None,
134 ) -> type[Butler]:
135 """Find actual class to instantiate."""
136 butler_class_name: str | None = None
137 if config is not None:
138 # Check for optional "cls" key in config.
139 if not isinstance(config, Config):
140 config = ButlerConfig(config, searchPaths=searchPaths)
141 butler_class_name = config.get("cls")
143 # Make DirectButler if class is not specified.
144 butler_class: type[Butler]
145 if butler_class_name is None:
146 from .direct_butler import DirectButler
148 butler_class = DirectButler
149 else:
150 butler_class = doImportType(butler_class_name)
151 if not issubclass(butler_class, Butler):
152 raise TypeError(f"{butler_class_name} is not a subclass of Butler")
153 return butler_class
155 @classmethod
156 def from_config(
157 cls,
158 config: Config | ResourcePathExpression | None = None,
159 *,
160 collections: Any = None,
161 run: str | None = None,
162 searchPaths: Sequence[ResourcePathExpression] | None = None,
163 writeable: bool | None = None,
164 inferDefaults: bool = True,
165 **kwargs: Any,
166 ) -> Butler:
167 """Create butler instance from configuration.
169 Parameters
170 ----------
171 config : `ButlerConfig`, `Config` or `str`, optional
172 Configuration. Anything acceptable to the `ButlerConfig`
173 constructor. If a directory path is given the configuration will be
174 read from a ``butler.yaml`` file in that location. If `None` is
175 given default values will be used. If ``config`` contains a "cls" key,
176 its value is used as the name of the butler class, which must be a
177 subclass of this class; otherwise `DirectButler` is instantiated.
178 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
179 An expression specifying the collections to be searched (in order)
180 when reading datasets.
181 This may be a `str` collection name or an iterable thereof.
182 See :ref:`daf_butler_collection_expressions` for more information.
183 These collections are not registered automatically and must be
184 manually registered before they are used by any method, but they
185 may be manually registered after the `Butler` is initialized.
186 run : `str`, optional
187 Name of the `~CollectionType.RUN` collection new datasets should be
188 inserted into. If ``collections`` is `None` and ``run`` is not
189 `None`, ``collections`` will be set to ``[run]``. If not `None`,
190 this collection will automatically be registered. If this is not
191 set (and ``writeable`` is not set either), a read-only butler will
192 be created.
193 searchPaths : `list` of `str`, optional
194 Directory paths to search when calculating the full Butler
195 configuration. Not used if the supplied config is already a
196 `ButlerConfig`.
197 writeable : `bool`, optional
198 Explicitly sets whether the butler supports write operations. If
199 not provided, a read-write butler is created if any of ``run``,
200 ``tags``, or ``chains`` is non-empty.
201 inferDefaults : `bool`, optional
202 If `True` (default) infer default data ID values from the values
203 present in the datasets in ``collections``: if all collections have
204 the same value (or no value) for a governor dimension, that value
205 will be the default for that dimension. Nonexistent collections
206 are ignored. If a default value is provided explicitly for a
207 governor dimension via ``**kwargs``, no default will be inferred
208 for that dimension.
209 **kwargs : `Any`
210 Additional keyword arguments passed to the constructor of the
211 actual butler class.
213 Notes
214 -----
215 Calling this factory method is identical to calling
216 ``Butler(config, ...)``. Its only raison d'être is that ``mypy``
217 complains about a direct ``Butler()`` call.
219 Examples
220 --------
221 While there are many ways to control exactly how a `Butler` interacts
222 with the collections in its `Registry`, the most common cases are still
223 simple.
225 For a read-only `Butler` that searches one collection, do::
227 butler = Butler.from_config(
228 "/path/to/repo", collections=["u/alice/DM-50000"]
229 )
231 For a read-write `Butler` that writes to and reads from a
232 `~CollectionType.RUN` collection::
234 butler = Butler.from_config(
235 "/path/to/repo", run="u/alice/DM-50000/a"
236 )
238 The `Butler` passed to a ``PipelineTask`` is often much more complex,
239 because we want to write to one `~CollectionType.RUN` collection but
240 read from several others (as well)::
242 butler = Butler.from_config(
243 "/path/to/repo",
244 run="u/alice/DM-50000/a",
245 collections=[
246 "u/alice/DM-50000/a", "u/bob/DM-49998", "HSC/defaults"
247 ]
248 )
250 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
251 Datasets will be read first from that run (since it appears first in
252 the chain), and then from ``u/bob/DM-49998`` and finally
253 ``HSC/defaults``.
255 Finally, one can always create a `Butler` with no collections::
257 butler = Butler.from_config("/path/to/repo", writeable=True)
259 This can be extremely useful when you just want to use
260 ``butler.registry``, e.g. for inserting dimension data or managing
261 collections, or when the collections you want to use with the butler
262 are not consistent. Passing ``writeable`` explicitly here is only
263 necessary if you want to be able to make changes to the repo; usually
264 the value for ``writeable`` can be guessed from the collection
265 arguments provided, but it defaults to `False` when no collection
266 arguments are given.
267 """
268 cls = cls._find_butler_class(config, searchPaths)
269 return cls(
270 config,
271 collections=collections,
272 run=run,
273 searchPaths=searchPaths,
274 writeable=writeable,
275 inferDefaults=inferDefaults,
276 **kwargs,
277 )
279 @staticmethod
280 def makeRepo(
281 root: ResourcePathExpression,
282 config: Config | str | None = None,
283 dimensionConfig: Config | str | None = None,
284 standalone: bool = False,
285 searchPaths: list[str] | None = None,
286 forceConfigRoot: bool = True,
287 outfile: ResourcePathExpression | None = None,
288 overwrite: bool = False,
289 ) -> Config:
290 """Create an empty data repository by adding a butler.yaml config
291 to a repository root directory.
293 Parameters
294 ----------
295 root : `lsst.resources.ResourcePathExpression`
296 Path or URI to the root location of the new repository. Will be
297 created if it does not exist.
298 config : `Config` or `str`, optional
299 Configuration to write to the repository, after setting any
300 root-dependent Registry or Datastore config options. Can not
301 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
302 configuration will be used. Root-dependent config options
303 specified in this config are overwritten if ``forceConfigRoot``
304 is `True`.
305 dimensionConfig : `Config` or `str`, optional
306 Configuration for dimensions, will be used to initialize registry
307 database.
308 standalone : `bool`
309 If `True`, write all expanded defaults, not just customized or
310 repository-specific settings.
311 This (mostly) decouples the repository from the default
312 configuration, insulating it from changes to the defaults (which
313 may be good or bad, depending on the nature of the changes).
314 Future *additions* to the defaults will still be picked up when
315 initializing `Butlers` to repos created with ``standalone=True``.
316 searchPaths : `list` of `str`, optional
317 Directory paths to search when calculating the full butler
318 configuration.
319 forceConfigRoot : `bool`, optional
320 If `False`, any values present in the supplied ``config`` that
321 would normally be reset are not overridden and will appear
322 directly in the output config. This allows non-standard overrides
323 of the root directory for a datastore or registry to be given.
324 If this parameter is `True` the values for ``root`` will be
325 forced into the resulting config if appropriate.
326 outfile : `lsst.resources.ResourcePathExpression`, optional
327 If not-`None`, the output configuration will be written to this
328 location rather than into the repository itself. Can be a URI
329 string. Can refer to a directory that will be used to write
330 ``butler.yaml``.
331 overwrite : `bool`, optional
332 Create a new configuration file even if one already exists
333 in the specified output location. Default is to raise
334 an exception.
336 Returns
337 -------
338 config : `Config`
339 The updated `Config` instance written to the repo.
341 Raises
342 ------
343 ValueError
344 Raised if a ButlerConfig or ConfigSubset is passed instead of a
345 regular Config (as these subclasses would make it impossible to
346 support ``standalone=False``).
347 FileExistsError
348 Raised if the output config file already exists.
349 os.error
350 Raised if the directory does not exist, exists but is not a
351 directory, or cannot be created.
353 Notes
354 -----
355 Note that when ``standalone=False`` (the default), the configuration
356 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
357 construct the repository should also be used to construct any Butlers
358 to avoid configuration inconsistencies.
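Examples
--------
A minimal sketch; the repository path is illustrative::
    from lsst.daf.butler import Butler
    # Write a default butler.yaml and create the registry tables.
    config = Butler.makeRepo("/path/to/new/repo")
    # Construct a writeable butler against the freshly created repo.
    butler = Butler.from_config("/path/to/new/repo", writeable=True)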
359 """
360 if isinstance(config, ButlerConfig | ConfigSubset):
361 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
363 # Ensure that the root of the repository exists or can be made
364 root_uri = ResourcePath(root, forceDirectory=True)
365 root_uri.mkdir()
367 config = Config(config)
369 # If we are creating a new repo from scratch with relative roots,
370 # do not propagate an explicit root from the config file
371 if "root" in config:
372 del config["root"]
374 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
375 imported_class = doImportType(full["datastore", "cls"])
376 if not issubclass(imported_class, Datastore):
377 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
378 datastoreClass: type[Datastore] = imported_class
379 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
381 # if key exists in given config, parse it, otherwise parse the defaults
382 # in the expanded config
383 if config.get(("registry", "db")):
384 registryConfig = RegistryConfig(config)
385 else:
386 registryConfig = RegistryConfig(full)
387 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
388 if defaultDatabaseUri is not None:
389 Config.updateParameters(
390 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
391 )
392 else:
393 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
395 if standalone:
396 config.merge(full)
397 else:
398 # Always expand the registry.managers section into the per-repo
399 # config, because after the database schema is created, it's not
400 # allowed to change anymore. Note that in the standalone=True
401 # branch, _everything_ in the config is expanded, so there's no
402 # need to special case this.
403 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
404 configURI: ResourcePathExpression
405 if outfile is not None:
406 # When writing to a separate location we must include
407 # the root of the butler repo in the config else it won't know
408 # where to look.
409 config["root"] = root_uri.geturl()
410 configURI = outfile
411 else:
412 configURI = root_uri
413 # Strip obscore configuration, if it is present, before writing config
414 # to a file, obscore config will be stored in registry.
415 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
416 config_to_write = config.copy()
417 del config_to_write[obscore_config_key]
418 config_to_write.dumpToUri(configURI, overwrite=overwrite)
419 # configFile attribute is updated, need to copy it to original.
420 config.configFile = config_to_write.configFile
421 else:
422 config.dumpToUri(configURI, overwrite=overwrite)
424 # Create Registry and populate tables
425 registryConfig = RegistryConfig(config.get("registry"))
426 dimensionConfig = DimensionConfig(dimensionConfig)
427 _RegistryFactory(registryConfig).create_from_config(
428 dimensionConfig=dimensionConfig, butlerRoot=root_uri
429 )
431 _LOG.verbose("Wrote new Butler configuration file to %s", configURI)
433 return config
435 @classmethod
436 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
437 """Look up the label in a butler repository index.
439 Parameters
440 ----------
441 label : `str`
442 Label of the Butler repository to look up.
443 return_label : `bool`, optional
444 If ``label`` cannot be found in the repository index (either
445 because the index is not defined or ``label`` is not in the index) and
446 ``return_label`` is `True` then return ``ResourcePath(label)``.
447 If ``return_label`` is `False` (default) then an exception will be
448 raised instead.
450 Returns
451 -------
452 uri : `lsst.resources.ResourcePath`
453 URI to the Butler repository associated with the given label or
454 the default value if it is provided.
456 Raises
457 ------
458 KeyError
459 Raised if the label is not found in the index, or if an index
460 is not defined, and ``return_label`` is `False`.
462 Notes
463 -----
464 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
465 information is discovered.
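Examples
--------
A minimal sketch; the label ``main`` is hypothetical and assumes a
repository index has been configured::
    from lsst.daf.butler import Butler
    uri = Butler.get_repo_uri("main")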
466 """
467 return ButlerRepoIndex.get_repo_uri(label, return_label)
469 @classmethod
470 def get_known_repos(cls) -> set[str]:
471 """Retrieve the set of known repository labels.
473 Returns
474 -------
475 repos : `set` of `str`
476 All the known labels. Can be empty if no index can be found.
478 Notes
479 -----
480 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
481 information is discovered.
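Examples
--------
A minimal sketch; the index may well be empty::
    from lsst.daf.butler import Butler
    for label in sorted(Butler.get_known_repos()):
        print(label)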
482 """
483 return ButlerRepoIndex.get_known_repos()
485 @abstractmethod
486 def transaction(self) -> AbstractContextManager[None]:
487 """Context manager supporting `Butler` transactions.
489 Transactions can be nested.
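For example, changes made inside the context are rolled back if an
exception escapes it (a sketch; ``butler``, ``obj``, and the resolved
``ref`` are assumed to exist)::
    with butler.transaction():
        butler.put(obj, ref)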
490 """
491 raise NotImplementedError()
493 @abstractmethod
494 def put(
495 self,
496 obj: Any,
497 datasetRefOrType: DatasetRef | DatasetType | str,
498 /,
499 dataId: DataId | None = None,
500 *,
501 run: str | None = None,
502 **kwargs: Any,
503 ) -> DatasetRef:
504 """Store and register a dataset.
506 Parameters
507 ----------
508 obj : `object`
509 The dataset.
510 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
511 When `DatasetRef` is provided, ``dataId`` should be `None`.
512 Otherwise the `DatasetType` or name thereof. If a fully resolved
513 `DatasetRef` is given the run and ID are used directly.
514 dataId : `dict` or `DataCoordinate`
515 A `dict` of `Dimension` link name, value pairs that label the
516 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
517 should be provided as the second argument.
518 run : `str`, optional
519 The name of the run the dataset should be added to, overriding
520 ``self.run``. Not used if a resolved `DatasetRef` is provided.
521 **kwargs
522 Additional keyword arguments used to augment or construct a
523 `DataCoordinate`. See `DataCoordinate.standardize`
524 parameters. Not used if a resolved `DatasetRef` is provided.
526 Returns
527 -------
528 ref : `DatasetRef`
529 A reference to the stored dataset, updated with the correct id if
530 given.
532 Raises
533 ------
534 TypeError
535 Raised if the butler is read-only or if no run has been provided.
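Examples
--------
A sketch; the repository path, run, dataset type name, and dimension
values are illustrative, and ``exposure`` is assumed to be an
in-memory dataset::
    butler = Butler.from_config("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(
        exposure, "calexp", instrument="HSC", visit=903334, detector=20
    )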
536 """
537 raise NotImplementedError()
539 @abstractmethod
540 def getDeferred(
541 self,
542 datasetRefOrType: DatasetRef | DatasetType | str,
543 /,
544 dataId: DataId | None = None,
545 *,
546 parameters: dict | None = None,
547 collections: Any = None,
548 storageClass: str | StorageClass | None = None,
549 **kwargs: Any,
550 ) -> DeferredDatasetHandle:
551 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
552 after an immediate registry lookup.
554 Parameters
555 ----------
556 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
557 When `DatasetRef` is provided, ``dataId`` should be `None`.
558 Otherwise the `DatasetType` or name thereof.
559 dataId : `dict` or `DataCoordinate`, optional
560 A `dict` of `Dimension` link name, value pairs that label the
561 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
562 should be provided as the first argument.
563 parameters : `dict`
564 Additional StorageClass-defined options to control reading,
565 typically used to efficiently read only a subset of the dataset.
566 collections : Any, optional
567 Collections to be searched, overriding ``self.collections``.
568 Can be any of the types supported by the ``collections`` argument
569 to butler construction.
570 storageClass : `StorageClass` or `str`, optional
571 The storage class to be used to override the Python type
572 returned by this method. By default the returned type matches
573 the dataset type definition for this dataset. Specifying a
574 read `StorageClass` can force a different type to be returned.
575 This type must be compatible with the original type.
576 **kwargs
577 Additional keyword arguments used to augment or construct a
578 `DataId`. See `DataId` parameters.
580 Returns
581 -------
582 obj : `DeferredDatasetHandle`
583 A handle which can be used to retrieve a dataset at a later time.
585 Raises
586 ------
587 LookupError
588 Raised if no matching dataset exists in the `Registry` or
589 datastore.
590 ValueError
591 Raised if a resolved `DatasetRef` was passed as an input, but it
592 differs from the one found in the registry.
593 TypeError
594 Raised if no collections were provided.
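Examples
--------
A sketch; the dataset type and data ID values are illustrative::
    handle = butler.getDeferred("deepCoadd", tract=9813, patch=42, band="i")
    # No I/O has happened yet; the read occurs when the handle is used.
    coadd = handle.get()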
595 """
596 raise NotImplementedError()
598 @abstractmethod
599 def get(
600 self,
601 datasetRefOrType: DatasetRef | DatasetType | str,
602 /,
603 dataId: DataId | None = None,
604 *,
605 parameters: dict[str, Any] | None = None,
606 collections: Any = None,
607 storageClass: StorageClass | str | None = None,
608 **kwargs: Any,
609 ) -> Any:
610 """Retrieve a stored dataset.
612 Parameters
613 ----------
614 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
615 When `DatasetRef` is provided, ``dataId`` should be `None`.
616 Otherwise the `DatasetType` or name thereof.
617 If a resolved `DatasetRef`, the associated dataset
618 is returned directly without additional querying.
619 dataId : `dict` or `DataCoordinate`
620 A `dict` of `Dimension` link name, value pairs that label the
621 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
622 should be provided as the first argument.
623 parameters : `dict`
624 Additional StorageClass-defined options to control reading,
625 typically used to efficiently read only a subset of the dataset.
626 collections : Any, optional
627 Collections to be searched, overriding ``self.collections``.
628 Can be any of the types supported by the ``collections`` argument
629 to butler construction.
630 storageClass : `StorageClass` or `str`, optional
631 The storage class to be used to override the Python type
632 returned by this method. By default the returned type matches
633 the dataset type definition for this dataset. Specifying a
634 read `StorageClass` can force a different type to be returned.
635 This type must be compatible with the original type.
636 **kwargs
637 Additional keyword arguments used to augment or construct a
638 `DataCoordinate`. See `DataCoordinate.standardize`
639 parameters.
641 Returns
642 -------
643 obj : `object`
644 The dataset.
646 Raises
647 ------
648 LookupError
649 Raised if no matching dataset exists in the `Registry`.
650 TypeError
651 Raised if no collections were provided.
653 Notes
654 -----
655 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
656 this method requires that the given data ID include temporal dimensions
657 beyond the dimensions of the dataset type itself, in order to find the
658 dataset with the appropriate validity range. For example, a "bias"
659 dataset with native dimensions ``{instrument, detector}`` could be
660 fetched with a ``{instrument, detector, exposure}`` data ID, because
661 ``exposure`` is a temporal dimension.
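Examples
--------
A sketch of the calibration lookup described above; the instrument,
detector, exposure, and collection values are illustrative::
    bias = butler.get(
        "bias",
        instrument="HSC", detector=42, exposure=903334,
        collections="HSC/calib",
    )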
662 """
663 raise NotImplementedError()
665 @abstractmethod
666 def getURIs(
667 self,
668 datasetRefOrType: DatasetRef | DatasetType | str,
669 /,
670 dataId: DataId | None = None,
671 *,
672 predict: bool = False,
673 collections: Any = None,
674 run: str | None = None,
675 **kwargs: Any,
676 ) -> DatasetRefURIs:
677 """Return the URIs associated with the dataset.
679 Parameters
680 ----------
681 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
682 When `DatasetRef` is provided, ``dataId`` should be `None`.
683 Otherwise the `DatasetType` or name thereof.
684 dataId : `dict` or `DataCoordinate`
685 A `dict` of `Dimension` link name, value pairs that label the
686 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
687 should be provided as the first argument.
688 predict : `bool`
689 If `True`, allow URIs to be returned of datasets that have not
690 been written.
691 collections : Any, optional
692 Collections to be searched, overriding ``self.collections``.
693 Can be any of the types supported by the ``collections`` argument
694 to butler construction.
695 run : `str`, optional
696 Run to use for predictions, overriding ``self.run``.
697 **kwargs
698 Additional keyword arguments used to augment or construct a
699 `DataCoordinate`. See `DataCoordinate.standardize`
700 parameters.
702 Returns
703 -------
704 uris : `DatasetRefURIs`
705 The URI to the primary artifact associated with this dataset (if
706 the dataset was disassembled within the datastore this may be
707 `None`), and the URIs to any components associated with the dataset
708 artifact (which can be empty if there are no components).
709 """
710 raise NotImplementedError()
712 @abstractmethod
713 def getURI(
714 self,
715 datasetRefOrType: DatasetRef | DatasetType | str,
716 /,
717 dataId: DataId | None = None,
718 *,
719 predict: bool = False,
720 collections: Any = None,
721 run: str | None = None,
722 **kwargs: Any,
723 ) -> ResourcePath:
724 """Return the URI to the Dataset.
726 Parameters
727 ----------
728 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
729 When `DatasetRef` is provided, ``dataId`` should be `None`.
730 Otherwise the `DatasetType` or name thereof.
731 dataId : `dict` or `DataCoordinate`
732 A `dict` of `Dimension` link name, value pairs that label the
733 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
734 should be provided as the first argument.
735 predict : `bool`
736 If `True`, allow URIs to be returned of datasets that have not
737 been written.
738 collections : Any, optional
739 Collections to be searched, overriding ``self.collections``.
740 Can be any of the types supported by the ``collections`` argument
741 to butler construction.
742 run : `str`, optional
743 Run to use for predictions, overriding ``self.run``.
744 **kwargs
745 Additional keyword arguments used to augment or construct a
746 `DataCoordinate`. See `DataCoordinate.standardize`
747 parameters.
749 Returns
750 -------
751 uri : `lsst.resources.ResourcePath`
752 URI pointing to the Dataset within the datastore. If the
753 Dataset does not exist in the datastore, and if ``predict`` is
754 `True`, the URI will be a prediction and will include a URI
755 fragment "#predicted".
756 If the datastore does not have entities that relate well
757 to the concept of a URI the returned URI string will be
758 descriptive. The returned URI is not guaranteed to be obtainable.
760 Raises
761 ------
762 LookupError
763 Raised if a URI has been requested for a dataset that does not
764 exist and guessing is not allowed.
765 ValueError
766 Raised if a resolved `DatasetRef` was passed as an input, but it
767 differs from the one found in the registry.
768 TypeError
769 Raised if no collections were provided.
770 RuntimeError
771 Raised if a URI is requested for a dataset that consists of
772 multiple artifacts.
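Examples
--------
A sketch; the dataset type and data ID values are illustrative::
    uri = butler.getURI("raw", instrument="HSC", exposure=903334, detector=20)
    print(uri.geturl())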
773 """
774 raise NotImplementedError()
776 @abstractmethod
777 def get_dataset_type(self, name: str) -> DatasetType:
778 """Get the `DatasetType`.
780 Parameters
781 ----------
782 name : `str`
783 Name of the type.
785 Returns
786 -------
787 type : `DatasetType`
788 The `DatasetType` associated with the given name.
790 Raises
791 ------
792 lsst.daf.butler.MissingDatasetTypeError
793 Raised if the requested dataset type has not been registered.
795 Notes
796 -----
797 This method handles component dataset types automatically, though most
798 other operations do not.
799 """
800 raise NotImplementedError()
802 @abstractmethod
803 def get_dataset(
804 self,
805 id: DatasetId,
806 storage_class: str | StorageClass | None,
807 dimension_records: bool = False,
808 datastore_records: bool = False,
809 ) -> DatasetRef | None:
810 """Retrieve a Dataset entry.
812 Parameters
813 ----------
814 id : `DatasetId`
815 The unique identifier for the dataset.
816 storage_class : `str` or `StorageClass` or `None`
817 A storage class to use when creating the returned entry. If given
818 it must be compatible with the default storage class.
819 dimension_records : `bool`, optional
820 If `True` the ref will be expanded and contain dimension records.
821 datastore_records : `bool`, optional
822 If `True` the ref will contain associated datastore records.
824 Returns
825 -------
826 ref : `DatasetRef` or `None`
827 A ref to the Dataset, or `None` if no matching Dataset
828 was found.
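Examples
--------
A sketch; ``dataset_id`` is assumed to be a `DatasetId` obtained
elsewhere, e.g. from a previously stored `DatasetRef`::
    ref = butler.get_dataset(dataset_id, storage_class=None)
    if ref is None:
        print("no dataset with that ID")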
829 """
830 raise NotImplementedError()
832 @abstractmethod
833 def find_dataset(
834 self,
835 dataset_type: DatasetType | str,
836 data_id: DataId | None = None,
837 *,
838 collections: str | Sequence[str] | None = None,
839 timespan: Timespan | None = None,
840 storage_class: str | StorageClass | None = None,
841 dimension_records: bool = False,
842 datastore_records: bool = False,
843 **kwargs: Any,
844 ) -> DatasetRef | None:
845 """Find a dataset given its `DatasetType` and data ID.
847 This can be used to obtain a `DatasetRef` that permits the dataset to
848 be read from a `Datastore`. If the dataset is a component and can not
849 be found using the provided dataset type, a dataset ref for the parent
850 will be returned instead but with the correct dataset type.
852 Parameters
853 ----------
854 dataset_type : `DatasetType` or `str`
855 A `DatasetType` or the name of one. If this is a `DatasetType`
856 instance, its storage class will be respected and propagated to
857 the output, even if it differs from the dataset type definition
858 in the registry, as long as the storage classes are convertible.
859 data_id : `dict` or `DataCoordinate`, optional
860 A `dict`-like object containing the `Dimension` links that identify
861 the dataset within a collection. If it is a `dict` the dataId
862 can include dimension record values such as ``day_obs`` and
863 ``seq_num`` or ``full_name`` that can be used to derive the
864 primary dimension.
865 collections : `str` or `list` [`str`], optional
866 An ordered list of collections to search for the dataset.
867 Defaults to ``self.defaults.collections``.
868 timespan : `Timespan`, optional
869 A timespan that the validity range of the dataset must overlap.
870 If not provided, any `~CollectionType.CALIBRATION` collections
871 matched by the ``collections`` argument will not be searched.
872 storage_class : `str` or `StorageClass` or `None`
873 A storage class to use when creating the returned entry. If given
874 it must be compatible with the default storage class.
875 dimension_records : `bool`, optional
876 If `True` the ref will be expanded and contain dimension records.
877 datastore_records : `bool`, optional
878 If `True` the ref will contain associated datastore records.
879 **kwargs
880 Additional keyword arguments passed to
881 `DataCoordinate.standardize` to convert ``dataId`` to a true
882 `DataCoordinate` or augment an existing one. This can also include
883 dimension record metadata that can be used to derive a primary
884 dimension value.
886 Returns
887 -------
888 ref : `DatasetRef` or `None`
889 A reference to the dataset, or `None` if no matching Dataset
890 was found.
892 Raises
893 ------
894 lsst.daf.butler.NoDefaultCollectionError
895 Raised if ``collections`` is `None` and
896 ``self.collections`` is `None`.
897 LookupError
898 Raised if one or more data ID keys are missing.
899 lsst.daf.butler.MissingDatasetTypeError
900 Raised if the dataset type does not exist.
901 lsst.daf.butler.MissingCollectionError
902 Raised if any of ``collections`` does not exist in the registry.
904 Notes
905 -----
906 This method simply returns `None` and does not raise an exception even
907 when the set of collections searched is intrinsically incompatible with
908 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
909 only `~CollectionType.CALIBRATION` collections are being searched.
910 This may make it harder to debug some lookup failures, but the behavior
911 is intentional; we consider it more important that failed searches are
912 reported consistently, regardless of the reason, and that adding
913 additional collections that do not contain a match to the search path
914 never changes the behavior.
916 This method handles component dataset types automatically, though most
917 other query operations do not.
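Examples
--------
A sketch; the dataset type, collection, and data ID values are
illustrative::
    ref = butler.find_dataset(
        "calexp",
        instrument="HSC", visit=903334, detector=20,
        collections="HSC/runs/RC2",
    )
    if ref is not None:
        calexp = butler.get(ref)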
918 """
919 raise NotImplementedError()
921 @abstractmethod
922 def retrieveArtifacts(
923 self,
924 refs: Iterable[DatasetRef],
925 destination: ResourcePathExpression,
926 transfer: str = "auto",
927 preserve_path: bool = True,
928 overwrite: bool = False,
929 ) -> list[ResourcePath]:
930 """Retrieve the artifacts associated with the supplied refs.
932 Parameters
933 ----------
934 refs : iterable of `DatasetRef`
935 The datasets for which artifacts are to be retrieved.
936 A single ref can result in multiple artifacts. The refs must
937 be resolved.
938 destination : `lsst.resources.ResourcePath` or `str`
939 Location to write the artifacts.
940 transfer : `str`, optional
941 Method to use to transfer the artifacts. Must be one of the options
942 supported by `~lsst.resources.ResourcePath.transfer_from()`.
943 "move" is not allowed.
944 preserve_path : `bool`, optional
945 If `True` the full path of the artifact within the datastore
946 is preserved. If `False` the final file component of the path
947 is used.
948 overwrite : `bool`, optional
949 If `True` allow transfers to overwrite existing files at the
950 destination.
952 Returns
953 -------
954 targets : `list` of `lsst.resources.ResourcePath`
955 URIs of file artifacts in destination location. Order is not
956 preserved.
958 Notes
959 -----
960 For non-file datastores the artifacts written to the destination
961 may not match the representation inside the datastore. For example
962 a hierarchical data structure in a NoSQL database may well be stored
963 as a JSON file.
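Examples
--------
A sketch; the dataset type, collection, and destination are
illustrative::
    refs = butler.registry.queryDatasets("raw", collections="HSC/raw/all")
    paths = butler.retrieveArtifacts(refs, "/tmp/raw_export", transfer="copy")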
964 """
965 raise NotImplementedError()
967 @abstractmethod
968 def exists(
969 self,
970 dataset_ref_or_type: DatasetRef | DatasetType | str,
971 /,
972 data_id: DataId | None = None,
973 *,
974 full_check: bool = True,
975 collections: Any = None,
976 **kwargs: Any,
977 ) -> DatasetExistence:
978 """Indicate whether a dataset is known to Butler registry and
979 datastore.
981 Parameters
982 ----------
983 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str`
984 When `DatasetRef` is provided, ``data_id`` should be `None`.
985 Otherwise the `DatasetType` or name thereof.
986 data_id : `dict` or `DataCoordinate`
987 A `dict` of `Dimension` link name, value pairs that label the
988 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
989 should be provided as the first argument.
990 full_check : `bool`, optional
991 If `True`, an additional check will be made for dataset artifact
992 existence. This will involve additional overhead due to the need
993 to query an external system. If `False` registry and datastore
994 will solely be asked if they know about the dataset but no
995 check for the artifact will be performed.
996 collections : Any, optional
997 Collections to be searched, overriding ``self.collections``.
998 Can be any of the types supported by the ``collections`` argument
999 to butler construction.
1000 **kwargs
1001 Additional keyword arguments used to augment or construct a
1002 `DataCoordinate`. See `DataCoordinate.standardize`
1003 parameters.
1005 Returns
1006 -------
1007 existence : `DatasetExistence`
1008 Object indicating whether the dataset is known to registry and
1009 datastore. Evaluates to `True` if the dataset is present and known
1010 to both.
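Examples
--------
A sketch; the dataset type and data ID values are illustrative::
    existence = butler.exists(
        "calexp", instrument="HSC", visit=903334, detector=20
    )
    if existence:
        print("dataset is known to registry and datastore")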
1011 """
1012 raise NotImplementedError()
1014 @abstractmethod
1015 def _exists_many(
1016 self,
1017 refs: Iterable[DatasetRef],
1018 /,
1019 *,
1020 full_check: bool = True,
1021 ) -> dict[DatasetRef, DatasetExistence]:
1022 """Indicate whether multiple datasets are known to Butler registry and
1023 datastore.
1025 This is an experimental API that may change at any moment.
1027 Parameters
1028 ----------
1029 refs : iterable of `DatasetRef`
1030 The datasets to be checked.
1031 full_check : `bool`, optional
1032 If `True`, an additional check will be made for dataset artifact
1033 existence. This will involve additional overhead due to the need
1034 to query an external system. If `False` registry and datastore
1035 will solely be asked if they know about the dataset but no
1036 check for the artifact will be performed.
1038 Returns
1039 -------
1040 existence : `dict` [`DatasetRef`, `DatasetExistence`]
1041 Mapping from the given dataset refs to an enum indicating the
1042 status of the dataset in registry and datastore.
1043 Each value evaluates to `True` if the dataset is present and known
1044 to both.
1045 """
1046 raise NotImplementedError()
1048 @abstractmethod
1049 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1050 """Remove one or more `~CollectionType.RUN` collections and the
1051 datasets within them.
1053 Parameters
1054 ----------
1055 names : `~collections.abc.Iterable` [ `str` ]
1056 The names of the collections to remove.
1057 unstore : `bool`, optional
1058 If `True` (default), delete datasets from all datastores in which
1059 they are present, and attempt to roll back the registry deletions if
1060 datastore deletions fail (which may not always be possible). If
1061 `False`, datastore records for these datasets are still removed,
1062 but any artifacts (e.g. files) will not be.
1064 Raises
1065 ------
1066 TypeError
1067 Raised if one or more collections are not of type
1068 `~CollectionType.RUN`.
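Examples
--------
A sketch; the run name is illustrative::
    butler.removeRuns(["u/alice/scratch"], unstore=True)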
1069 """
1070 raise NotImplementedError()
1072 @abstractmethod
1073 def ingest(
1074 self,
1075 *datasets: FileDataset,
1076 transfer: str | None = "auto",
1077 run: str | None = None,
1078 idGenerationMode: DatasetIdGenEnum | None = None,
1079 record_validation_info: bool = True,
1080 ) -> None:
1081 """Store and register one or more datasets that already exist on disk.
1083 Parameters
1084 ----------
1085 datasets : `FileDataset`
1086 Each positional argument is a struct containing information about
1087 a file to be ingested, including its URI (either absolute or
1088 relative to the datastore root, if applicable), a resolved
1089 `DatasetRef`, and optionally a formatter class or its
1090 fully-qualified string name. If a formatter is not provided, the
1091 formatter that would be used for `put` is assumed. On successful
1092 ingest all `FileDataset.formatter` attributes will be set to the
1093 formatter class used. `FileDataset.path` attributes may be modified
1094 to put paths in whatever the datastore considers a standardized
1095 form.
1096 transfer : `str`, optional
1097 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1098 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1099 transfer the file.
1100 run : `str`, optional
1101 The name of the run ingested datasets should be added to,
1102 overriding ``self.run``. This parameter is now deprecated since
1103 the run is encoded in the ``FileDataset``.
1104 idGenerationMode : `DatasetIdGenEnum`, optional
1105 Specifies option for generating dataset IDs. Parameter is
1106 deprecated.
1107 record_validation_info : `bool`, optional
1108 If `True`, the default, the datastore can record validation
1109 information associated with the file. If `False` the datastore
1110 will not attempt to track any information such as checksums
1111 or file sizes. This can be useful if such information is tracked
1112 in an external system or if the file is to be compressed in place.
1113 It is up to the datastore whether this parameter is relevant.
1115 Raises
1116 ------
1117 TypeError
1118 Raised if the butler is read-only or if no run was provided.
1119 NotImplementedError
1120 Raised if the `Datastore` does not support the given transfer mode.
1121 DatasetTypeNotSupportedError
1122 Raised if one or more files to be ingested have a dataset type that
1123 is not supported by the `Datastore`.
1124 FileNotFoundError
1125 Raised if one of the given files does not exist.
1126 FileExistsError
1127 Raised if transfer is not `None` but the (internal) location the
1128 file would be moved to is already occupied.
1130 Notes
1131 -----
1132 This operation is not fully exception safe: if a database operation
1133 fails, the given `FileDataset` instances may be only partially updated.
1135 It is atomic in terms of database operations (they will either all
1136 succeed or all fail) providing the database engine implements
1137 transactions correctly. It will attempt to be atomic in terms of
1138 filesystem operations as well, but this cannot be implemented
1139 rigorously for most datastores.
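Examples
--------
A sketch; the file path is illustrative and ``ref`` is assumed to be a
resolved `DatasetRef` whose run already exists::
    from lsst.daf.butler import FileDataset
    dataset = FileDataset(path="/data/raw_903334_20.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy")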
1140 """
1141 raise NotImplementedError()
1143 @abstractmethod
1144 def export(
1145 self,
1146 *,
1147 directory: str | None = None,
1148 filename: str | None = None,
1149 format: str | None = None,
1150 transfer: str | None = None,
1151 ) -> AbstractContextManager[RepoExportContext]:
1152 """Export datasets from the repository represented by this `Butler`.
1154 This method is a context manager that returns a helper object
1155 (`RepoExportContext`) that is used to indicate what information from
1156 the repository should be exported.
1158 Parameters
1159 ----------
1160 directory : `str`, optional
1161 Directory dataset files should be written to if ``transfer`` is not
1162 `None`.
1163 filename : `str`, optional
1164 Name for the file that will include database information associated
1165 with the exported datasets. If this is not an absolute path and
1166 ``directory`` is not `None`, it will be written to ``directory``
1167 instead of the current working directory. Defaults to
1168 "export.{format}".
1169 format : `str`, optional
1170 File format for the database information file. If `None`, the
1171 extension of ``filename`` will be used.
1172 transfer : `str`, optional
1173 Transfer mode passed to `Datastore.export`.
1175 Raises
1176 ------
1177 TypeError
1178 Raised if the set of arguments passed is inconsistent.
1180 Examples
1181 --------
1182 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1183 methods are used to provide the iterables over data IDs and/or datasets
1184 to be exported::
1186 with butler.export(filename="exports.yaml") as export:
1187 # Export all flats, but none of the dimension element rows
1188 # (i.e. data ID information) associated with them.
1189 export.saveDatasets(butler.registry.queryDatasets("flat"),
1190 elements=())
1191 # Export all datasets that start with "deepCoadd_" and all of
1192 # their associated data ID information.
1193 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1194 """
1195 raise NotImplementedError()
1197 @abstractmethod
1198 def import_(
1199 self,
1200 *,
1201 directory: ResourcePathExpression | None = None,
1202 filename: ResourcePathExpression | TextIO | None = None,
1203 format: str | None = None,
1204 transfer: str | None = None,
1205 skip_dimensions: set | None = None,
1206 ) -> None:
1207 """Import datasets into this repository that were exported from a
1208 different butler repository via `~lsst.daf.butler.Butler.export`.
1210 Parameters
1211 ----------
1212 directory : `~lsst.resources.ResourcePathExpression`, optional
1213 Directory containing dataset files to import from. If `None`,
1214 ``filename`` and all dataset file paths specified therein must
1215 be absolute.
1216 filename : `~lsst.resources.ResourcePathExpression` or `TextIO`
1217 A stream or name of file that contains database information
1218 associated with the exported datasets, typically generated by
1219 `~lsst.daf.butler.Butler.export`. If this is a string (name) or
1220 `~lsst.resources.ResourcePath` and is not an absolute path,
1221 it will first be looked for relative to ``directory`` and if not
1222 found there it will be looked for in the current working
1223 directory. Defaults to "export.{format}".
1224 format : `str`, optional
1225 File format for ``filename``. If `None`, the extension of
1226 ``filename`` will be used.
1227 transfer : `str`, optional
1228 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1229 skip_dimensions : `set`, optional
1230 Names of dimensions that should be skipped and not imported.
1232 Raises
1233 ------
1234 TypeError
1235 Raised if the set of arguments passed is inconsistent, or if the
1236 butler is read-only.
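Examples
--------
A sketch; the paths are illustrative and the export file is assumed to
have been produced by `~lsst.daf.butler.Butler.export`::
    butler.import_(
        directory="/path/to/exports",
        filename="exports.yaml",
        transfer="copy",
    )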
1237 """
1238 raise NotImplementedError()
1240 @abstractmethod
1241 def transfer_from(
1242 self,
1243 source_butler: LimitedButler,
1244 source_refs: Iterable[DatasetRef],
1245 transfer: str = "auto",
1246 skip_missing: bool = True,
1247 register_dataset_types: bool = False,
1248 transfer_dimensions: bool = False,
1249 ) -> Collection[DatasetRef]:
1250 """Transfer datasets to this Butler from a run in another Butler.
1252 Parameters
1253 ----------
1254 source_butler : `LimitedButler`
1255 Butler from which the datasets are to be transferred. If data IDs
1256 in ``source_refs`` are not expanded then this has to be a full
1257 `Butler` whose registry will be used to expand data IDs.
1258 source_refs : iterable of `DatasetRef`
1259 Datasets defined in the source butler that should be transferred to
1260 this butler. In most circumstances, ``transfer_from`` is faster if
1261 the dataset refs are expanded.
1262 transfer : `str`, optional
1263 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1264 skip_missing : `bool`
1265 If `True`, datasets with no datastore artifact associated with
1266 them are not transferred. If `False` a registry entry will be
1267 created even if no datastore record is created (and so will
1268 look equivalent to the dataset being unstored).
1269 register_dataset_types : `bool`
1270 If `True` any missing dataset types are registered. Otherwise
1271 an exception is raised.
1272 transfer_dimensions : `bool`, optional
1273 If `True`, dimension record data associated with the new datasets
1274 will be transferred.
1276 Returns
1277 -------
1278 refs : `~collections.abc.Collection` of `DatasetRef`
1279 The refs added to this Butler.
1281 Notes
1282 -----
1283 The datastore artifact has to exist for a transfer
1284 to be made but non-existence is not an error.
1286 Datasets that already exist in this run will be skipped.
1288 The datasets are imported as part of a transaction, although
1289 dataset types are registered before the transaction is started.
1290 This means that it is possible for a dataset type to be registered
1291 even though transfer has failed.
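Examples
--------
A sketch; both repository paths and the query are illustrative::
    source = Butler.from_config("/path/to/source/repo")
    refs = source.registry.queryDatasets("calexp", collections="HSC/runs/RC2")
    transferred = butler.transfer_from(
        source, refs, transfer="copy", register_dataset_types=True
    )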
1292 """
1293 raise NotImplementedError()
1295 @abstractmethod
1296 def validateConfiguration(
1297 self,
1298 logFailures: bool = False,
1299 datasetTypeNames: Iterable[str] | None = None,
1300 ignore: Iterable[str] | None = None,
1301 ) -> None:
1302 """Validate butler configuration.
1304 Checks that each `DatasetType` can be stored in the `Datastore`.
1306 Parameters
1307 ----------
1308 logFailures : `bool`, optional
1309 If `True`, output a log message for every validation error
1310 detected.
1311 datasetTypeNames : iterable of `str`, optional
1312 The `DatasetType` names that should be checked. This allows
1313 only a subset to be selected.
1314 ignore : iterable of `str`, optional
1315 Names of DatasetTypes to skip over. This can be used to skip
1316 known problems. If a named `DatasetType` corresponds to a
1317 composite, all components of that `DatasetType` will also be
1318 ignored.
1320 Raises
1321 ------
1322 ButlerValidationError
1323 Raised if there is some inconsistency with how this Butler
1324 is configured.
1325 """
1326 raise NotImplementedError()
1328 @property
1329 @abstractmethod
1330 def collections(self) -> Sequence[str]:
1331 """The collections to search by default, in order
1332 (`~collections.abc.Sequence` [ `str` ]).
1333 """
1334 raise NotImplementedError()
1336 @property
1337 @abstractmethod
1338 def run(self) -> str | None:
1339 """Name of the run this butler writes outputs to by default (`str` or
1340 `None`).
1341 """
1342 raise NotImplementedError()
1344 @property
1345 @abstractmethod
1346 def registry(self) -> Registry:
1347 """The object that manages dataset metadata and relationships
1348 (`Registry`).
1350 Many operations that don't involve reading or writing butler datasets
1351 are accessible only via `Registry` methods. Eventually these methods
1352 will be replaced by equivalent `Butler` methods.
1353 """
1354 raise NotImplementedError()