Coverage for python/lsst/daf/butler/_butler.py: 65%
142 statements
coverage.py v7.3.2, created at 2023-12-01 11:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ["Butler"]
32from abc import abstractmethod
33from collections.abc import Collection, Iterable, Sequence
34from contextlib import AbstractContextManager
35from typing import Any, TextIO
37from lsst.resources import ResourcePath, ResourcePathExpression
38from lsst.utils import doImportType
39from lsst.utils.logging import getLogger
41from ._butler_config import ButlerConfig
42from ._butler_repo_index import ButlerRepoIndex
43from ._config import Config, ConfigSubset
44from ._dataset_existence import DatasetExistence
45from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
46from ._dataset_type import DatasetType
47from ._deferredDatasetHandle import DeferredDatasetHandle
48from ._file_dataset import FileDataset
49from ._limited_butler import LimitedButler
50from ._storage_class import StorageClass
51from ._timespan import Timespan
52from .datastore import DatasetRefURIs, Datastore
53from .dimensions import DataId, DimensionConfig
54from .registry import Registry, RegistryConfig, _RegistryFactory
55from .repo_relocation import BUTLER_ROOT_TAG
56from .transfers import RepoExportContext
58_LOG = getLogger(__name__)
61class Butler(LimitedButler):
62 """Interface for data butler and factory for Butler instances.
64 Parameters
65 ----------
66 config : `ButlerConfig`, `Config` or `str`, optional
67 Configuration. Anything acceptable to the `ButlerConfig` constructor.
68 If a directory path is given the configuration will be read from a
69 ``butler.yaml`` file in that location. If `None` is given default
70 values will be used. If ``config`` contains a "cls" key, its value is
71 used as the name of the butler class, which must be a subclass of this
72 class; otherwise `DirectButler` is instantiated.
73 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
74 An expression specifying the collections to be searched (in order) when
75 reading datasets.
76 This may be a `str` collection name or an iterable thereof.
77 See :ref:`daf_butler_collection_expressions` for more information.
78 These collections are not registered automatically and must be
79 registered manually before they are used by any method, but they may
80 be registered after the `Butler` is initialized.
81 run : `str`, optional
82 Name of the `~CollectionType.RUN` collection new datasets should be
83 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
84 ``collections`` will be set to ``[run]``. If not `None`, this
85 collection will automatically be registered. If this is not set (and
86 ``writeable`` is not set either), a read-only butler will be created.
87 searchPaths : `list` of `str`, optional
88 Directory paths to search when calculating the full Butler
89 configuration. Not used if the supplied config is already a
90 `ButlerConfig`.
91 writeable : `bool`, optional
92 Explicitly sets whether the butler supports write operations. If not
93 provided, a read-write butler is created if any of ``run``, ``tags``,
94 or ``chains`` is non-empty.
95 inferDefaults : `bool`, optional
96 If `True` (default) infer default data ID values from the values
97 present in the datasets in ``collections``: if all collections have the
98 same value (or no value) for a governor dimension, that value will be
99 the default for that dimension. Nonexistent collections are ignored.
100 If a default value is provided explicitly for a governor dimension via
101 ``**kwargs``, no default will be inferred for that dimension.
102 **kwargs : `Any`
103 Additional keyword arguments passed to a constructor of actual butler
104 class.
106 Notes
107 -----
108 The preferred way to instantiate Butler is via the `from_config` method.
109 The call to ``Butler(...)`` is equivalent to ``Butler.from_config(...)``,
110 but ``mypy`` will complain about the former.
111 """
113 def __new__(
114 cls,
115 config: Config | ResourcePathExpression | None = None,
116 *,
117 collections: Any = None,
118 run: str | None = None,
119 searchPaths: Sequence[ResourcePathExpression] | None = None,
120 writeable: bool | None = None,
121 inferDefaults: bool = True,
122 **kwargs: Any,
123 ) -> Butler:
124 if cls is Butler:
125 cls = cls._find_butler_class(config, searchPaths)
126 # Note: we do not pass any parameters to __new__, Python will pass them
127 # to __init__ after __new__ returns the sub-class instance.
128 return super().__new__(cls)
130 @staticmethod
131 def _find_butler_class(
132 config: Config | ResourcePathExpression | None = None,
133 searchPaths: Sequence[ResourcePathExpression] | None = None,
134 ) -> type[Butler]:
135 """Find actual class to instantiate."""
136 butler_class_name: str | None = None
137 if config is not None:
138 # Check for optional "cls" key in config.
139 if not isinstance(config, Config):
140 config = ButlerConfig(config, searchPaths=searchPaths)
141 butler_class_name = config.get("cls")
143 # Make DirectButler if class is not specified.
144 butler_class: type[Butler]
145 if butler_class_name is None:
146 from .direct_butler import DirectButler
148 butler_class = DirectButler
149 else:
150 butler_class = doImportType(butler_class_name)
151 if not issubclass(butler_class, Butler):
152 raise TypeError(f"{butler_class_name} is not a subclass of Butler")
153 return butler_class
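# A sketch of the "cls" lookup performed above: a config carrying a "cls" key
# selects the Butler subclass to instantiate; the class path shown here is a
# hypothetical placeholder, not a real subclass.
#
#     config = Config({"cls": "mypackage.MyButler"})
#     butler = Butler.from_config(config)  # instantiates mypackage.MyButler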
155 @classmethod
156 def from_config(
157 cls,
158 config: Config | ResourcePathExpression | None = None,
159 *,
160 collections: Any = None,
161 run: str | None = None,
162 searchPaths: Sequence[ResourcePathExpression] | None = None,
163 writeable: bool | None = None,
164 inferDefaults: bool = True,
165 **kwargs: Any,
166 ) -> Butler:
167 """Create butler instance from configuration.
169 Parameters
170 ----------
171 config : `ButlerConfig`, `Config` or `str`, optional
172 Configuration. Anything acceptable to the `ButlerConfig`
173 constructor. If a directory path is given the configuration will be
174 read from a ``butler.yaml`` file in that location. If `None` is
175 given default values will be used. If ``config`` contains a "cls" key,
176 its value is used as the name of the butler class, which must be a
177 subclass of this class; otherwise `DirectButler` is instantiated.
178 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
179 An expression specifying the collections to be searched (in order)
180 when reading datasets.
181 This may be a `str` collection name or an iterable thereof.
182 See :ref:`daf_butler_collection_expressions` for more information.
183 These collections are not registered automatically and must be
184 registered manually before they are used by any method, but they
185 may be registered after the `Butler` is initialized.
186 run : `str`, optional
187 Name of the `~CollectionType.RUN` collection new datasets should be
188 inserted into. If ``collections`` is `None` and ``run`` is not
189 `None`, ``collections`` will be set to ``[run]``. If not `None`,
190 this collection will automatically be registered. If this is not
191 set (and ``writeable`` is not set either), a read-only butler will
192 be created.
193 searchPaths : `list` of `str`, optional
194 Directory paths to search when calculating the full Butler
195 configuration. Not used if the supplied config is already a
196 `ButlerConfig`.
197 writeable : `bool`, optional
198 Explicitly sets whether the butler supports write operations. If
199 not provided, a read-write butler is created if any of ``run``,
200 ``tags``, or ``chains`` is non-empty.
201 inferDefaults : `bool`, optional
202 If `True` (default) infer default data ID values from the values
203 present in the datasets in ``collections``: if all collections have
204 the same value (or no value) for a governor dimension, that value
205 will be the default for that dimension. Nonexistent collections
206 are ignored. If a default value is provided explicitly for a
207 governor dimension via ``**kwargs``, no default will be inferred
208 for that dimension.
209 **kwargs : `Any`
210 Additional keyword arguments passed to a constructor of actual
211 butler class.
213 Notes
214 -----
215 Calling this factory method is identical to calling
216 ``Butler(config, ...)``. Its only raison d'être is that ``mypy``
217 complains about a ``Butler()`` call.
219 Examples
220 --------
221 While there are many ways to control exactly how a `Butler` interacts
222 with the collections in its `Registry`, the most common cases are still
223 simple.
225 For a read-only `Butler` that searches one collection, do::
227 butler = Butler.from_config(
228 "/path/to/repo", collections=["u/alice/DM-50000"]
229 )
231 For a read-write `Butler` that writes to and reads from a
232 `~CollectionType.RUN` collection::
234 butler = Butler.from_config(
235 "/path/to/repo", run="u/alice/DM-50000/a"
236 )
238 The `Butler` passed to a ``PipelineTask`` is often much more complex,
239 because we want to write to one `~CollectionType.RUN` collection but
240 read from several others (as well)::
242 butler = Butler.from_config(
243 "/path/to/repo",
244 run="u/alice/DM-50000/a",
245 collections=[
246 "u/alice/DM-50000/a", "u/bob/DM-49998", "HSC/defaults"
247 ]
248 )
250 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
251 Datasets will be read first from that run (since it appears first in
252 the chain), and then from ``u/bob/DM-49998`` and finally
253 ``HSC/defaults``.
255 Finally, one can always create a `Butler` with no collections::
257 butler = Butler.from_config("/path/to/repo", writeable=True)
259 This can be extremely useful when you just want to use
260 ``butler.registry``, e.g. for inserting dimension data or managing
261 collections, or when the collections you want to use with the butler
262 are not consistent. Passing ``writeable`` explicitly here is only
263 necessary if you want to be able to make changes to the repo - usually
264 the value for ``writeable`` can be guessed from the collection
265 arguments provided, but it defaults to `False` when there are no
266 collection arguments.
267 """
268 cls = cls._find_butler_class(config, searchPaths)
269 return cls(
270 config,
271 collections=collections,
272 run=run,
273 searchPaths=searchPaths,
274 writeable=writeable,
275 inferDefaults=inferDefaults,
276 **kwargs,
277 )
279 @staticmethod
280 def makeRepo(
281 root: ResourcePathExpression,
282 config: Config | str | None = None,
283 dimensionConfig: Config | str | None = None,
284 standalone: bool = False,
285 searchPaths: list[str] | None = None,
286 forceConfigRoot: bool = True,
287 outfile: ResourcePathExpression | None = None,
288 overwrite: bool = False,
289 ) -> Config:
290 """Create an empty data repository by adding a butler.yaml config
291 to a repository root directory.
293 Parameters
294 ----------
295 root : `lsst.resources.ResourcePathExpression`
296 Path or URI to the root location of the new repository. Will be
297 created if it does not exist.
298 config : `Config` or `str`, optional
299 Configuration to write to the repository, after setting any
300 root-dependent Registry or Datastore config options. Can not
301 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
302 configuration will be used. Root-dependent config options
303 specified in this config are overwritten if ``forceConfigRoot``
304 is `True`.
305 dimensionConfig : `Config` or `str`, optional
306 Configuration for dimensions, will be used to initialize registry
307 database.
308 standalone : `bool`
309 If `True`, write all expanded defaults, not just customized or
310 repository-specific settings.
311 This (mostly) decouples the repository from the default
312 configuration, insulating it from changes to the defaults (which
313 may be good or bad, depending on the nature of the changes).
314 Future *additions* to the defaults will still be picked up when
315 initializing `Butlers` to repos created with ``standalone=True``.
316 searchPaths : `list` of `str`, optional
317 Directory paths to search when calculating the full butler
318 configuration.
319 forceConfigRoot : `bool`, optional
320 If `False`, any values present in the supplied ``config`` that
321 would normally be reset are not overridden and will appear
322 directly in the output config. This allows non-standard overrides
323 of the root directory for a datastore or registry to be given.
324 If this parameter is `True` the values for ``root`` will be
325 forced into the resulting config if appropriate.
326 outfile : `lsst.resources.ResourcePathExpression`, optional
327 If not-`None`, the output configuration will be written to this
328 location rather than into the repository itself. Can be a URI
329 string. Can refer to a directory that will be used to write
330 ``butler.yaml``.
331 overwrite : `bool`, optional
332 Create a new configuration file even if one already exists
333 in the specified output location. Default is to raise
334 an exception.
336 Returns
337 -------
338 config : `Config`
339 The updated `Config` instance written to the repo.
341 Raises
342 ------
343 ValueError
344 Raised if a ButlerConfig or ConfigSubset is passed instead of a
345 regular Config (as these subclasses would make it impossible to
346 support ``standalone=False``).
347 FileExistsError
348 Raised if the output config file already exists.
349 os.error
350 Raised if the directory does not exist, exists but is not a
351 directory, or cannot be created.
353 Notes
354 -----
355 Note that when ``standalone=False`` (the default), the configuration
356 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
357 construct the repository should also be used to construct any Butlers
358 to avoid configuration inconsistencies.
359 """
360 if isinstance(config, ButlerConfig | ConfigSubset):
361 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
363 # Ensure that the root of the repository exists or can be made
364 root_uri = ResourcePath(root, forceDirectory=True)
365 root_uri.mkdir()
367 config = Config(config)
369 # If we are creating a new repo from scratch with relative roots,
370 # do not propagate an explicit root from the config file
371 if "root" in config:
372 del config["root"]
374 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
375 imported_class = doImportType(full["datastore", "cls"])
376 if not issubclass(imported_class, Datastore):
377 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
378 datastoreClass: type[Datastore] = imported_class
379 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
381 # if key exists in given config, parse it, otherwise parse the defaults
382 # in the expanded config
383 if config.get(("registry", "db")):
384 registryConfig = RegistryConfig(config)
385 else:
386 registryConfig = RegistryConfig(full)
387 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
388 if defaultDatabaseUri is not None:
389 Config.updateParameters(
390 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
391 )
392 else:
393 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
395 if standalone:
396 config.merge(full)
397 else:
398 # Always expand the registry.managers section into the per-repo
399 # config, because after the database schema is created, it's not
400 # allowed to change anymore. Note that in the standalone=True
401 # branch, _everything_ in the config is expanded, so there's no
402 # need to special case this.
403 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
404 configURI: ResourcePathExpression
405 if outfile is not None:
406 # When writing to a separate location we must include
407 # the root of the butler repo in the config else it won't know
408 # where to look.
409 config["root"] = root_uri.geturl()
410 configURI = outfile
411 else:
412 configURI = root_uri
413 # Strip obscore configuration, if it is present, before writing config
414 # to a file; obscore config will be stored in registry.
415 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
416 config_to_write = config.copy()
417 del config_to_write[obscore_config_key]
418 config_to_write.dumpToUri(configURI, overwrite=overwrite)
419 # configFile attribute is updated, need to copy it to original.
420 config.configFile = config_to_write.configFile
421 else:
422 config.dumpToUri(configURI, overwrite=overwrite)
424 # Create Registry and populate tables
425 registryConfig = RegistryConfig(config.get("registry"))
426 dimensionConfig = DimensionConfig(dimensionConfig)
427 _RegistryFactory(registryConfig).create_from_config(
428 dimensionConfig=dimensionConfig, butlerRoot=root_uri
429 )
431 _LOG.verbose("Wrote new Butler configuration file to %s", configURI)
433 return config
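# A sketch of creating a repository and then constructing a butler on it; the
# path and run name are placeholder values.
#
#     Butler.makeRepo("/path/to/new_repo")
#     butler = Butler.from_config("/path/to/new_repo", run="u/alice/ingest")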
435 @classmethod
436 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
437 """Look up the label in a butler repository index.
439 Parameters
440 ----------
441 label : `str`
442 Label of the Butler repository to look up.
443 return_label : `bool`, optional
444 If ``label`` cannot be found in the repository index (either
445 because the index is not defined or ``label`` is not in the index) and
446 ``return_label`` is `True` then return ``ResourcePath(label)``.
447 If ``return_label`` is `False` (default) then an exception will be
448 raised instead.
450 Returns
451 -------
452 uri : `lsst.resources.ResourcePath`
453 URI to the Butler repository associated with the given label or
454 default value if it is provided.
456 Raises
457 ------
458 KeyError
459 Raised if the label is not found in the index, or if an index
460 is not defined, and ``return_label`` is `False`.
462 Notes
463 -----
464 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
465 information is discovered.
466 """
467 return ButlerRepoIndex.get_repo_uri(label, return_label)
469 @classmethod
470 def get_known_repos(cls) -> set[str]:
471 """Retrieve the list of known repository labels.
473 Returns
474 -------
475 repos : `set` of `str`
476 All the known labels. Can be empty if no index can be found.
478 Notes
479 -----
480 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
481 information is discovered.
482 """
483 return ButlerRepoIndex.get_known_repos()
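# A sketch of the repository-index lookups above; the label "main" is a
# placeholder and depends on the index configured at a given site.
#
#     uri = Butler.get_repo_uri("main")        # KeyError if label is unknown
#     uri = Butler.get_repo_uri("main", True)  # falls back to ResourcePath("main")
#     labels = Butler.get_known_repos()        # may be an empty set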
485 @abstractmethod
486 def _caching_context(self) -> AbstractContextManager[None]:
487 """Context manager that enables caching."""
488 raise NotImplementedError()
490 @abstractmethod
491 def transaction(self) -> AbstractContextManager[None]:
492 """Context manager supporting `Butler` transactions.
494 Transactions can be nested.
495 """
496 raise NotImplementedError()
498 @abstractmethod
499 def put(
500 self,
501 obj: Any,
502 datasetRefOrType: DatasetRef | DatasetType | str,
503 /,
504 dataId: DataId | None = None,
505 *,
506 run: str | None = None,
507 **kwargs: Any,
508 ) -> DatasetRef:
509 """Store and register a dataset.
511 Parameters
512 ----------
513 obj : `object`
514 The dataset.
515 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
516 When `DatasetRef` is provided, ``dataId`` should be `None`.
517 Otherwise the `DatasetType` or name thereof. If a fully resolved
518 `DatasetRef` is given the run and ID are used directly.
519 dataId : `dict` or `DataCoordinate`
520 A `dict` of `Dimension` link name, value pairs that label the
521 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
522 should be provided as the second argument.
523 run : `str`, optional
524 The name of the run the dataset should be added to, overriding
525 ``self.run``. Not used if a resolved `DatasetRef` is provided.
526 **kwargs
527 Additional keyword arguments used to augment or construct a
528 `DataCoordinate`. See `DataCoordinate.standardize`
529 parameters. Not used if a resolved `DatasetRef` is provided.
531 Returns
532 -------
533 ref : `DatasetRef`
534 A reference to the stored dataset, updated with the correct id if
535 given.
537 Raises
538 ------
539 TypeError
540 Raised if the butler is read-only or if no run has been provided.
541 """
542 raise NotImplementedError()
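# A put() sketch: ``exposure`` stands for some in-memory dataset, and the
# dataset type name, data ID keys, and run are placeholder values.
#
#     ref = butler.put(exposure, "calexp",
#                      instrument="HSC", visit=903334, detector=42,
#                      run="u/alice/DM-50000/a")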
544 @abstractmethod
545 def getDeferred(
546 self,
547 datasetRefOrType: DatasetRef | DatasetType | str,
548 /,
549 dataId: DataId | None = None,
550 *,
551 parameters: dict | None = None,
552 collections: Any = None,
553 storageClass: str | StorageClass | None = None,
554 **kwargs: Any,
555 ) -> DeferredDatasetHandle:
556 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
557 after an immediate registry lookup.
559 Parameters
560 ----------
561 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
562 When `DatasetRef` the `dataId` should be `None`.
563 Otherwise the `DatasetType` or name thereof.
564 dataId : `dict` or `DataCoordinate`, optional
565 A `dict` of `Dimension` link name, value pairs that label the
566 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
567 should be provided as the first argument.
568 parameters : `dict`
569 Additional StorageClass-defined options to control reading,
570 typically used to efficiently read only a subset of the dataset.
571 collections : Any, optional
572 Collections to be searched, overriding ``self.collections``.
573 Can be any of the types supported by the ``collections`` argument
574 to butler construction.
575 storageClass : `StorageClass` or `str`, optional
576 The storage class to be used to override the Python type
577 returned by this method. By default the returned type matches
578 the dataset type definition for this dataset. Specifying a
579 read `StorageClass` can force a different type to be returned.
580 This type must be compatible with the original type.
581 **kwargs
582 Additional keyword arguments used to augment or construct a
583 `DataId`. See `DataId` parameters.
585 Returns
586 -------
587 obj : `DeferredDatasetHandle`
588 A handle which can be used to retrieve a dataset at a later time.
590 Raises
591 ------
592 LookupError
593 Raised if no matching dataset exists in the `Registry` or
594 datastore.
595 ValueError
596 Raised if a resolved `DatasetRef` was passed as an input, but it
597 differs from the one found in the registry.
598 TypeError
599 Raised if no collections were provided.
600 """
601 raise NotImplementedError()
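# A getDeferred() sketch: the registry lookup happens immediately, the actual
# read only when the handle is used. Dataset type, data ID, and the ``bbox``
# parameter value are placeholders.
#
#     handle = butler.getDeferred("calexp", instrument="HSC", visit=903334,
#                                 detector=42, parameters={"bbox": bbox})
#     calexp = handle.get()  # I/O happens here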
603 @abstractmethod
604 def get(
605 self,
606 datasetRefOrType: DatasetRef | DatasetType | str,
607 /,
608 dataId: DataId | None = None,
609 *,
610 parameters: dict[str, Any] | None = None,
611 collections: Any = None,
612 storageClass: StorageClass | str | None = None,
613 **kwargs: Any,
614 ) -> Any:
615 """Retrieve a stored dataset.
617 Parameters
618 ----------
619 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
620 When `DatasetRef` the `dataId` should be `None`.
621 Otherwise the `DatasetType` or name thereof.
622 If a resolved `DatasetRef`, the associated dataset
623 is returned directly without additional querying.
624 dataId : `dict` or `DataCoordinate`
625 A `dict` of `Dimension` link name, value pairs that label the
626 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
627 should be provided as the first argument.
628 parameters : `dict`
629 Additional StorageClass-defined options to control reading,
630 typically used to efficiently read only a subset of the dataset.
631 collections : Any, optional
632 Collections to be searched, overriding ``self.collections``.
633 Can be any of the types supported by the ``collections`` argument
634 to butler construction.
635 storageClass : `StorageClass` or `str`, optional
636 The storage class to be used to override the Python type
637 returned by this method. By default the returned type matches
638 the dataset type definition for this dataset. Specifying a
639 read `StorageClass` can force a different type to be returned.
640 This type must be compatible with the original type.
641 **kwargs
642 Additional keyword arguments used to augment or construct a
643 `DataCoordinate`. See `DataCoordinate.standardize`
644 parameters.
646 Returns
647 -------
648 obj : `object`
649 The dataset.
651 Raises
652 ------
653 LookupError
654 Raised if no matching dataset exists in the `Registry`.
655 TypeError
656 Raised if no collections were provided.
658 Notes
659 -----
660 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
661 this method requires that the given data ID include temporal dimensions
662 beyond the dimensions of the dataset type itself, in order to find the
663 dataset with the appropriate validity range. For example, a "bias"
664 dataset with native dimensions ``{instrument, detector}`` could be
665 fetched with a ``{instrument, detector, exposure}`` data ID, because
666 ``exposure`` is a temporal dimension.
667 """
668 raise NotImplementedError()
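# A get() sketch, including the calibration lookup described in the Notes: a
# "bias" with dimensions {instrument, detector} is found by adding a temporal
# dimension (exposure) to the data ID. All values are placeholders.
#
#     calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=42)
#     bias = butler.get("bias", instrument="HSC", detector=42,
#                       exposure=903334, collections="HSC/calib")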
670 @abstractmethod
671 def getURIs(
672 self,
673 datasetRefOrType: DatasetRef | DatasetType | str,
674 /,
675 dataId: DataId | None = None,
676 *,
677 predict: bool = False,
678 collections: Any = None,
679 run: str | None = None,
680 **kwargs: Any,
681 ) -> DatasetRefURIs:
682 """Return the URIs associated with the dataset.
684 Parameters
685 ----------
686 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
687 When `DatasetRef` the `dataId` should be `None`.
688 Otherwise the `DatasetType` or name thereof.
689 dataId : `dict` or `DataCoordinate`
690 A `dict` of `Dimension` link name, value pairs that label the
691 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
692 should be provided as the first argument.
693 predict : `bool`
694 If `True`, allow URIs to be returned of datasets that have not
695 been written.
696 collections : Any, optional
697 Collections to be searched, overriding ``self.collections``.
698 Can be any of the types supported by the ``collections`` argument
699 to butler construction.
700 run : `str`, optional
701 Run to use for predictions, overriding ``self.run``.
702 **kwargs
703 Additional keyword arguments used to augment or construct a
704 `DataCoordinate`. See `DataCoordinate.standardize`
705 parameters.
707 Returns
708 -------
709 uris : `DatasetRefURIs`
710 The URI to the primary artifact associated with this dataset (if
711 the dataset was disassembled within the datastore this may be
712 `None`), and the URIs to any components associated with the dataset
713 artifact (can be empty if there are no components).
714 """
715 raise NotImplementedError()
717 @abstractmethod
718 def getURI(
719 self,
720 datasetRefOrType: DatasetRef | DatasetType | str,
721 /,
722 dataId: DataId | None = None,
723 *,
724 predict: bool = False,
725 collections: Any = None,
726 run: str | None = None,
727 **kwargs: Any,
728 ) -> ResourcePath:
729 """Return the URI to the Dataset.
731 Parameters
732 ----------
733 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
734 When `DatasetRef` the `dataId` should be `None`.
735 Otherwise the `DatasetType` or name thereof.
736 dataId : `dict` or `DataCoordinate`
737 A `dict` of `Dimension` link name, value pairs that label the
738 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
739 should be provided as the first argument.
740 predict : `bool`
741 If `True`, allow URIs to be returned of datasets that have not
742 been written.
743 collections : Any, optional
744 Collections to be searched, overriding ``self.collections``.
745 Can be any of the types supported by the ``collections`` argument
746 to butler construction.
747 run : `str`, optional
748 Run to use for predictions, overriding ``self.run``.
749 **kwargs
750 Additional keyword arguments used to augment or construct a
751 `DataCoordinate`. See `DataCoordinate.standardize`
752 parameters.
754 Returns
755 -------
756 uri : `lsst.resources.ResourcePath`
757 URI pointing to the Dataset within the datastore. If the
758 Dataset does not exist in the datastore, and if ``predict`` is
759 `True`, the URI will be a prediction and will include a URI
760 fragment "#predicted".
761 If the datastore does not have entities that relate well
762 to the concept of a URI the returned URI string will be
763 descriptive. The returned URI is not guaranteed to be obtainable.
765 Raises
766 ------
767 LookupError
768 Raised if a URI has been requested for a dataset that does not
769 exist and guessing is not allowed.
770 ValueError
771 Raised if a resolved `DatasetRef` was passed as an input, but it
772 differs from the one found in the registry.
773 TypeError
774 Raised if no collections were provided.
775 RuntimeError
776 Raised if a URI is requested for a dataset that consists of
777 multiple artifacts.
778 """
779 raise NotImplementedError()
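# A URI-lookup sketch: getURIs() also covers disassembled datasets via the
# primaryURI/componentURIs fields of DatasetRefURIs, while getURI() raises
# RuntimeError for multi-artifact datasets. Values are placeholders.
#
#     uris = butler.getURIs("calexp", instrument="HSC", visit=903334, detector=42)
#     primary, components = uris.primaryURI, uris.componentURIs
#     uri = butler.getURI("raw", instrument="HSC", exposure=903334, detector=42,
#                         predict=True, run="HSC/raw/all")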
781 @abstractmethod
782 def get_dataset_type(self, name: str) -> DatasetType:
783 """Get the `DatasetType`.
785 Parameters
786 ----------
787 name : `str`
788 Name of the type.
790 Returns
791 -------
792 type : `DatasetType`
793 The `DatasetType` associated with the given name.
795 Raises
796 ------
797 lsst.daf.butler.MissingDatasetTypeError
798 Raised if the requested dataset type has not been registered.
800 Notes
801 -----
802 This method handles component dataset types automatically, though most
803 other operations do not.
804 """
805 raise NotImplementedError()
807 @abstractmethod
808 def get_dataset(
809 self,
810 id: DatasetId,
811 storage_class: str | StorageClass | None,
812 dimension_records: bool = False,
813 datastore_records: bool = False,
814 ) -> DatasetRef | None:
815 """Retrieve a Dataset entry.
817 Parameters
818 ----------
819 id : `DatasetId`
820 The unique identifier for the dataset.
821 storage_class : `str` or `StorageClass` or `None`
822 A storage class to use when creating the returned entry. If given
823 it must be compatible with the default storage class.
824 dimension_records : `bool`, optional
825 If `True` the ref will be expanded and contain dimension records.
826 datastore_records : `bool`, optional
827 If `True` the ref will contain associated datastore records.
829 Returns
830 -------
831 ref : `DatasetRef` or `None`
832 A ref to the Dataset, or `None` if no matching Dataset
833 was found.
834 """
835 raise NotImplementedError()
837 @abstractmethod
838 def find_dataset(
839 self,
840 dataset_type: DatasetType | str,
841 data_id: DataId | None = None,
842 *,
843 collections: str | Sequence[str] | None = None,
844 timespan: Timespan | None = None,
845 storage_class: str | StorageClass | None = None,
846 dimension_records: bool = False,
847 datastore_records: bool = False,
848 **kwargs: Any,
849 ) -> DatasetRef | None:
850 """Find a dataset given its `DatasetType` and data ID.
852 This can be used to obtain a `DatasetRef` that permits the dataset to
853 be read from a `Datastore`. If the dataset is a component and can not
854 be found using the provided dataset type, a dataset ref for the parent
855 will be returned instead but with the correct dataset type.
857 Parameters
858 ----------
859 dataset_type : `DatasetType` or `str`
860 A `DatasetType` or the name of one. If this is a `DatasetType`
861 instance, its storage class will be respected and propagated to
862 the output, even if it differs from the dataset type definition
863 in the registry, as long as the storage classes are convertible.
864 data_id : `dict` or `DataCoordinate`, optional
865 A `dict`-like object containing the `Dimension` links that identify
866 the dataset within a collection. If it is a `dict` the dataId
867 can include dimension record values such as ``day_obs`` and
868 ``seq_num`` or ``full_name`` that can be used to derive the
869 primary dimension.
870 collections : `str` or `list` [`str`], optional
871 An ordered list of collections to search for the dataset.
872 Defaults to ``self.defaults.collections``.
873 timespan : `Timespan`, optional
874 A timespan that the validity range of the dataset must overlap.
875 If not provided, any `~CollectionType.CALIBRATION` collections
876 matched by the ``collections`` argument will not be searched.
877 storage_class : `str` or `StorageClass` or `None`
878 A storage class to use when creating the returned entry. If given
879 it must be compatible with the default storage class.
880 dimension_records : `bool`, optional
881 If `True` the ref will be expanded and contain dimension records.
882 datastore_records : `bool`, optional
883 If `True` the ref will contain associated datastore records.
884 **kwargs
885 Additional keyword arguments passed to
886 `DataCoordinate.standardize` to convert ``dataId`` to a true
887 `DataCoordinate` or augment an existing one. This can also include
888 dimension record metadata that can be used to derive a primary
889 dimension value.
891 Returns
892 -------
893 ref : `DatasetRef` or `None`
894 A reference to the dataset, or `None` if no matching Dataset
895 was found.
897 Raises
898 ------
899 lsst.daf.butler.NoDefaultCollectionError
900 Raised if ``collections`` is `None` and
901 ``self.collections`` is `None`.
902 LookupError
903 Raised if one or more data ID keys are missing.
904 lsst.daf.butler.MissingDatasetTypeError
905 Raised if the dataset type does not exist.
906 lsst.daf.butler.MissingCollectionError
907 Raised if any of ``collections`` does not exist in the registry.
909 Notes
910 -----
911 This method simply returns `None` and does not raise an exception even
912 when the set of collections searched is intrinsically incompatible with
913 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
914 only `~CollectionType.CALIBRATION` collections are being searched.
915 This may make it harder to debug some lookup failures, but the behavior
916 is intentional; we consider it more important that failed searches are
917 reported consistently, regardless of the reason, and that adding
918 additional collections that do not contain a match to the search path
919 never changes the behavior.
921 This method handles component dataset types automatically, though most
922 other query operations do not.
923 """
924 raise NotImplementedError()
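# A find_dataset() sketch: dimension-record metadata such as ``day_obs`` and
# ``seq_num`` can stand in for the primary dimension, as described above.
# Instrument, values, and collection name are placeholders.
#
#     ref = butler.find_dataset("raw", instrument="LSSTCam", detector=10,
#                               day_obs=20231201, seq_num=42,
#                               collections="LSSTCam/raw/all")
#     if ref is None:
#         ...  # no matching dataset in the searched collections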
926 @abstractmethod
927 def retrieveArtifacts(
928 self,
929 refs: Iterable[DatasetRef],
930 destination: ResourcePathExpression,
931 transfer: str = "auto",
932 preserve_path: bool = True,
933 overwrite: bool = False,
934 ) -> list[ResourcePath]:
935 """Retrieve the artifacts associated with the supplied refs.
937 Parameters
938 ----------
939 refs : iterable of `DatasetRef`
940 The datasets for which artifacts are to be retrieved.
941 A single ref can result in multiple artifacts. The refs must
942 be resolved.
943 destination : `lsst.resources.ResourcePath` or `str`
944 Location to write the artifacts.
945 transfer : `str`, optional
946 Method to use to transfer the artifacts. Must be one of the options
947 supported by `~lsst.resources.ResourcePath.transfer_from()`.
948 "move" is not allowed.
949 preserve_path : `bool`, optional
950 If `True` the full path of the artifact within the datastore
951 is preserved. If `False` the final file component of the path
952 is used.
953 overwrite : `bool`, optional
954 If `True` allow transfers to overwrite existing files at the
955 destination.
957 Returns
958 -------
959 targets : `list` of `lsst.resources.ResourcePath`
960 URIs of file artifacts in destination location. Order is not
961 preserved.
963 Notes
964 -----
965 For non-file datastores the artifacts written to the destination
966 may not match the representation inside the datastore. For example
967 a hierarchical data structure in a NoSQL database may well be stored
968 as a JSON file.
969 """
970 raise NotImplementedError()
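# A retrieveArtifacts() sketch: copy the file artifacts behind a set of refs
# into a scratch directory, keeping their datastore-relative paths. Dataset
# type, collection, and destination are placeholders.
#
#     refs = butler.registry.queryDatasets("calexp", collections="u/alice/DM-50000")
#     paths = butler.retrieveArtifacts(refs, "/tmp/export",
#                                      transfer="copy", preserve_path=True)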
972 @abstractmethod
973 def exists(
974 self,
975 dataset_ref_or_type: DatasetRef | DatasetType | str,
976 /,
977 data_id: DataId | None = None,
978 *,
979 full_check: bool = True,
980 collections: Any = None,
981 **kwargs: Any,
982 ) -> DatasetExistence:
983 """Indicate whether a dataset is known to Butler registry and
984 datastore.
986 Parameters
987 ----------
988 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str`
989 When `DatasetRef` the ``data_id`` should be `None`.
990 Otherwise the `DatasetType` or name thereof.
991 data_id : `dict` or `DataCoordinate`
992 A `dict` of `Dimension` link name, value pairs that label the
993 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
994 should be provided as the first argument.
995 full_check : `bool`, optional
996 If `True`, an additional check will be made for dataset artifact
997 existence. This will involve additional overhead due to the need
998 to query an external system. If `False` registry and datastore
999 will solely be asked if they know about the dataset but no
1000 check for the artifact will be performed.
1001 collections : Any, optional
1002 Collections to be searched, overriding ``self.collections``.
1003 Can be any of the types supported by the ``collections`` argument
1004 to butler construction.
1005 **kwargs
1006 Additional keyword arguments used to augment or construct a
1007 `DataCoordinate`. See `DataCoordinate.standardize`
1008 parameters.
1010 Returns
1011 -------
1012 existence : `DatasetExistence`
1013 Object indicating whether the dataset is known to registry and
1014 datastore. Evaluates to `True` if the dataset is present and known
1015 to both.
1016 """
1017 raise NotImplementedError()
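# An exists() sketch: the returned DatasetExistence evaluates to True only if
# both registry and datastore know the dataset (and, when full_check is True,
# the artifact itself is present). Values are placeholders.
#
#     if butler.exists("calexp", instrument="HSC", visit=903334, detector=42,
#                      full_check=False):
#         ...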
1019 @abstractmethod
1020 def _exists_many(
1021 self,
1022 refs: Iterable[DatasetRef],
1023 /,
1024 *,
1025 full_check: bool = True,
1026 ) -> dict[DatasetRef, DatasetExistence]:
1027 """Indicate whether multiple datasets are known to Butler registry and
1028 datastore.
1030 This is an experimental API that may change at any moment.
1032 Parameters
1033 ----------
1034 refs : iterable of `DatasetRef`
1035 The datasets to be checked.
1036 full_check : `bool`, optional
1037 If `True`, an additional check will be made for dataset artifact
1038 existence. This will involve additional overhead due to the need
1039 to query an external system. If `False` registry and datastore
1040 will solely be asked if they know about the dataset but no
1041 check for the artifact will be performed.
1043 Returns
1044 -------
1045 existence : dict of [`DatasetRef`, `DatasetExistence`]
1046 Mapping from the given dataset refs to an enum indicating the
1047 status of the dataset in registry and datastore.
1048 Each value evaluates to `True` if the dataset is present and known
1049 to both.
1050 """
1051 raise NotImplementedError()
1053 @abstractmethod
1054 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
1055 """Remove one or more `~CollectionType.RUN` collections and the
1056 datasets within them.
1058 Parameters
1059 ----------
1060 names : `~collections.abc.Iterable` [ `str` ]
1061 The names of the collections to remove.
1062 unstore : `bool`, optional
1063 If `True` (default), delete datasets from all datastores in which
1064 they are present, and attempt to rollback the registry deletions if
1065 datastore deletions fail (which may not always be possible). If
1066 `False`, datastore records for these datasets are still removed,
1067 but any artifacts (e.g. files) will not be.
1069 Raises
1070 ------
1071 TypeError
1072 Raised if one or more collections are not of type
1073 `~CollectionType.RUN`.
1074 """
1075 raise NotImplementedError()
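# A removeRuns() sketch: delete a scratch RUN collection together with the
# file artifacts of its datasets; the run name is a placeholder.
#
#     butler.removeRuns(["u/alice/scratch"], unstore=True)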
1077 @abstractmethod
1078 def ingest(
1079 self,
1080 *datasets: FileDataset,
1081 transfer: str | None = "auto",
1082 run: str | None = None,
1083 idGenerationMode: DatasetIdGenEnum | None = None,
1084 record_validation_info: bool = True,
1085 ) -> None:
1086 """Store and register one or more datasets that already exist on disk.
1088 Parameters
1089 ----------
1090 datasets : `FileDataset`
1091 Each positional argument is a struct containing information about
1092 a file to be ingested, including its URI (either absolute or
1093 relative to the datastore root, if applicable), a resolved
1094 `DatasetRef`, and optionally a formatter class or its
1095 fully-qualified string name. If a formatter is not provided, the
1096 formatter that would be used for `put` is assumed. On successful
1097 ingest all `FileDataset.formatter` attributes will be set to the
1098 formatter class used. `FileDataset.path` attributes may be modified
1099 to put paths in whatever the datastore considers a standardized
1100 form.
1101 transfer : `str`, optional
1102 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1103 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1104 transfer the file.
1105 run : `str`, optional
1106 The name of the run ingested datasets should be added to,
1107 overriding ``self.run``. This parameter is now deprecated since
1108 the run is encoded in the ``FileDataset``.
1109 idGenerationMode : `DatasetIdGenEnum`, optional
1110 Specifies option for generating dataset IDs. Parameter is
1111 deprecated.
1112 record_validation_info : `bool`, optional
1113 If `True`, the default, the datastore can record validation
1114 information associated with the file. If `False` the datastore
1115 will not attempt to track any information such as checksums
1116 or file sizes. This can be useful if such information is tracked
1117 in an external system or if the file is to be compressed in place.
1118 It is up to the datastore whether this parameter is relevant.
1120 Raises
1121 ------
1122 TypeError
1123 Raised if the butler is read-only or if no run was provided.
1124 NotImplementedError
1125 Raised if the `Datastore` does not support the given transfer mode.
1126 DatasetTypeNotSupportedError
1127 Raised if one or more files to be ingested have a dataset type that
1128 is not supported by the `Datastore`.
1129 FileNotFoundError
1130 Raised if one of the given files does not exist.
1131 FileExistsError
1132 Raised if transfer is not `None` but the (internal) location the
1133 file would be moved to is already occupied.
1135 Notes
1136 -----
1137 This operation is not fully exception safe: if a database operation
1138 fails, the given `FileDataset` instances may be only partially updated.
1140 It is atomic in terms of database operations (they will either all
1141 succeed or all fail) providing the database engine implements
1142 transactions correctly. It will attempt to be atomic in terms of
1143 filesystem operations as well, but this cannot be implemented
1144 rigorously for most datastores.
1145 """
1146 raise NotImplementedError()
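# An ingest() sketch: wrap an existing file and a resolved DatasetRef
# (obtained elsewhere) in a FileDataset and copy it into the datastore. The
# path is a placeholder.
#
#     dataset = FileDataset(path="/data/raw/file.fits", refs=[ref])
#     butler.ingest(dataset, transfer="copy")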
1148 @abstractmethod
1149 def export(
1150 self,
1151 *,
1152 directory: str | None = None,
1153 filename: str | None = None,
1154 format: str | None = None,
1155 transfer: str | None = None,
1156 ) -> AbstractContextManager[RepoExportContext]:
1157 """Export datasets from the repository represented by this `Butler`.
1159 This method is a context manager that returns a helper object
1160 (`RepoExportContext`) that is used to indicate what information from
1161 the repository should be exported.
1163 Parameters
1164 ----------
1165 directory : `str`, optional
1166 Directory dataset files should be written to if ``transfer`` is not
1167 `None`.
1168 filename : `str`, optional
1169 Name for the file that will include database information associated
1170 with the exported datasets. If this is not an absolute path and
1171 ``directory`` is not `None`, it will be written to ``directory``
1172 instead of the current working directory. Defaults to
1173 "export.{format}".
1174 format : `str`, optional
1175 File format for the database information file. If `None`, the
1176 extension of ``filename`` will be used.
1177 transfer : `str`, optional
1178 Transfer mode passed to `Datastore.export`.
1180 Raises
1181 ------
1182 TypeError
1183 Raised if the set of arguments passed is inconsistent.
1185 Examples
1186 --------
1187 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1188 methods are used to provide the iterables over data IDs and/or datasets
1189 to be exported::
1191 with butler.export("exports.yaml") as export:
1192 # Export all flats, but none of the dimension element rows
1193 # (i.e. data ID information) associated with them.
1194 export.saveDatasets(butler.registry.queryDatasets("flat"),
1195 elements=())
1196 # Export all datasets that start with "deepCoadd_" and all of
1197 # their associated data ID information.
1198 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1199 """
1200 raise NotImplementedError()
1202 @abstractmethod
1203 def import_(
1204 self,
1205 *,
1206 directory: ResourcePathExpression | None = None,
1207 filename: ResourcePathExpression | TextIO | None = None,
1208 format: str | None = None,
1209 transfer: str | None = None,
1210 skip_dimensions: set | None = None,
1211 ) -> None:
1212 """Import datasets into this repository that were exported from a
1213 different butler repository via `~lsst.daf.butler.Butler.export`.
1215 Parameters
1216 ----------
1217 directory : `~lsst.resources.ResourcePathExpression`, optional
1218 Directory containing dataset files to import from. If `None`,
1219 ``filename`` and all dataset file paths specified therein must
1220 be absolute.
1221 filename : `~lsst.resources.ResourcePathExpression` or `TextIO`
1222 A stream or name of file that contains database information
1223 associated with the exported datasets, typically generated by
1224 `~lsst.daf.butler.Butler.export`. If this a string (name) or
1225 `~lsst.resources.ResourcePath` and is not an absolute path,
1226 it will first be looked for relative to ``directory`` and if not
1227 found there it will be looked for in the current working
1228 directory. Defaults to "export.{format}".
1229 format : `str`, optional
1230 File format for ``filename``. If `None`, the extension of
1231 ``filename`` will be used.
1232 transfer : `str`, optional
1233 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1234 skip_dimensions : `set`, optional
1235 Names of dimensions that should be skipped and not imported.
1237 Raises
1238 ------
1239 TypeError
1240 Raised if the set of arguments passed is inconsistent, or if the
1241 butler is read-only.
1242 """
1243 raise NotImplementedError()
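# An import_() sketch, pairing with the export() context manager above; the
# directory and file names are placeholders.
#
#     butler.import_(directory="/path/to/exported_files",
#                    filename="export.yaml", transfer="copy")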
1245 @abstractmethod
1246 def transfer_from(
1247 self,
1248 source_butler: LimitedButler,
1249 source_refs: Iterable[DatasetRef],
1250 transfer: str = "auto",
1251 skip_missing: bool = True,
1252 register_dataset_types: bool = False,
1253 transfer_dimensions: bool = False,
1254 ) -> Collection[DatasetRef]:
1255 """Transfer datasets to this Butler from a run in another Butler.
1257 Parameters
1258 ----------
1259 source_butler : `LimitedButler`
1260 Butler from which the datasets are to be transferred. If data IDs
1261 in ``source_refs`` are not expanded then this has to be a full
1262 `Butler` whose registry will be used to expand data IDs.
1263 source_refs : iterable of `DatasetRef`
1264 Datasets defined in the source butler that should be transferred to
1265 this butler. In most circumstances, ``transfer_from`` is faster if
1266 the dataset refs are expanded.
1267 transfer : `str`, optional
1268 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1269 skip_missing : `bool`
1270 If `True`, datasets with no datastore artifact associated with
1271 them are not transferred. If `False` a registry entry will be
1272 created even if no datastore record is created (and so will
1273 look equivalent to the dataset being unstored).
1274 register_dataset_types : `bool`
1275 If `True` any missing dataset types are registered. Otherwise
1276 an exception is raised.
1277 transfer_dimensions : `bool`, optional
1278 If `True`, dimension record data associated with the new datasets
1279 will be transferred.
1281 Returns
1282 -------
1283 refs : `~collections.abc.Collection` of `DatasetRef`
1284 The refs added to this Butler.
1286 Notes
1287 -----
1288 The datastore artifact has to exist for a transfer
1289 to be made but non-existence is not an error.
1291 Datasets that already exist in this run will be skipped.
1293 The datasets are imported as part of a transaction, although
1294 dataset types are registered before the transaction is started.
1295 This means that it is possible for a dataset type to be registered
1296 even though transfer has failed.
1297 """
1298 raise NotImplementedError()
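# A transfer_from() sketch, assuming ``source_butler`` is a full Butler so its
# registry can be queried; dataset type and collection are placeholders.
#
#     refs = source_butler.registry.queryDatasets("calexp",
#                                                 collections="u/bob/DM-49998")
#     transferred = butler.transfer_from(source_butler, refs, transfer="copy",
#                                        register_dataset_types=True)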
1300 @abstractmethod
1301 def validateConfiguration(
1302 self,
1303 logFailures: bool = False,
1304 datasetTypeNames: Iterable[str] | None = None,
1305 ignore: Iterable[str] | None = None,
1306 ) -> None:
1307 """Validate butler configuration.
1309 Checks that each `DatasetType` can be stored in the `Datastore`.
1311 Parameters
1312 ----------
1313 logFailures : `bool`, optional
1314 If `True`, output a log message for every validation error
1315 detected.
1316 datasetTypeNames : iterable of `str`, optional
1317 The `DatasetType` names that should be checked. This allows
1318 only a subset to be selected.
1319 ignore : iterable of `str`, optional
1320 Names of DatasetTypes to skip over. This can be used to skip
1321 known problems. If a named `DatasetType` corresponds to a
1322 composite, all components of that `DatasetType` will also be
1323 ignored.
1325 Raises
1326 ------
1327 ButlerValidationError
1328 Raised if there is some inconsistency with how this Butler
1329 is configured.
1330 """
1331 raise NotImplementedError()
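# A validateConfiguration() sketch: log every mismatch and skip a
# known-problematic dataset type; the ignored name is a placeholder.
#
#     butler.validateConfiguration(logFailures=True, ignore=["raw"])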
1333 @property
1334 @abstractmethod
1335 def collections(self) -> Sequence[str]:
1336 """The collections to search by default, in order
1337 (`~collections.abc.Sequence` [ `str` ]).
1338 """
1339 raise NotImplementedError()
1341 @property
1342 @abstractmethod
1343 def run(self) -> str | None:
1344 """Name of the run this butler writes outputs to by default (`str` or
1345 `None`).
1346 """
1347 raise NotImplementedError()
1349 @property
1350 @abstractmethod
1351 def registry(self) -> Registry:
1352 """The object that manages dataset metadata and relationships
1353 (`Registry`).
1355 Many operations that don't involve reading or writing butler datasets
1356 are accessible only via `Registry` methods. Eventually these methods
1357 will be replaced by equivalent `Butler` methods.
1358 """
1359 raise NotImplementedError()