Coverage for python/lsst/daf/butler/_butler.py: 62%
133 statements
coverage.py v7.3.2, created at 2023-10-27 09:44 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ["Butler"]
32from abc import abstractmethod
33from collections.abc import Collection, Iterable, Sequence
34from contextlib import AbstractContextManager
35from typing import Any, TextIO
37from lsst.resources import ResourcePath, ResourcePathExpression
38from lsst.utils import doImportType
39from lsst.utils.logging import getLogger
41from ._butler_config import ButlerConfig
42from ._butler_repo_index import ButlerRepoIndex
43from ._config import Config, ConfigSubset
44from ._dataset_existence import DatasetExistence
45from ._dataset_ref import DatasetIdGenEnum, DatasetRef
46from ._dataset_type import DatasetType
47from ._deferredDatasetHandle import DeferredDatasetHandle
48from ._file_dataset import FileDataset
49from ._limited_butler import LimitedButler
50from ._storage_class import StorageClass
51from .datastore import DatasetRefURIs, Datastore
52from .dimensions import DataId, DimensionConfig
53from .registry import Registry, RegistryConfig, _RegistryFactory
54from .repo_relocation import BUTLER_ROOT_TAG
55from .transfers import RepoExportContext
57_LOG = getLogger(__name__)
60class Butler(LimitedButler):
61 """Interface for data butler and factory for Butler instances.
63 Parameters
64 ----------
65 config : `ButlerConfig`, `Config` or `str`, optional
66 Configuration. Anything acceptable to the `ButlerConfig` constructor.
67 If a directory path is given the configuration will be read from a
68 ``butler.yaml`` file in that location. If `None` is given, default
69 values will be used. If ``config`` contains a "cls" key then its value
70 is used as the name of the butler class, which must be a subclass of
71 this class; otherwise `DirectButler` is instantiated.
72 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
73 An expression specifying the collections to be searched (in order) when
74 reading datasets.
75 This may be a `str` collection name or an iterable thereof.
76 See :ref:`daf_butler_collection_expressions` for more information.
77 These collections are not registered automatically and must be
78 manually registered before they are used by any method, but they may be
79 manually registered after the `Butler` is initialized.
80 run : `str`, optional
81 Name of the `~CollectionType.RUN` collection new datasets should be
82 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
83 ``collections`` will be set to ``[run]``. If not `None`, this
84 collection will automatically be registered. If this is not set (and
85 ``writeable`` is not set either), a read-only butler will be created.
86 searchPaths : `list` of `str`, optional
87 Directory paths to search when calculating the full Butler
88 configuration. Not used if the supplied config is already a
89 `ButlerConfig`.
90 writeable : `bool`, optional
91 Explicitly sets whether the butler supports write operations. If not
92 provided, a read-write butler is created if any of ``run``, ``tags``,
93 or ``chains`` is non-empty.
94 inferDefaults : `bool`, optional
95 If `True` (default) infer default data ID values from the values
96 present in the datasets in ``collections``: if all collections have the
97 same value (or no value) for a governor dimension, that value will be
98 the default for that dimension. Nonexistent collections are ignored.
99 If a default value is provided explicitly for a governor dimension via
100 ``**kwargs``, no default will be inferred for that dimension.
101 **kwargs : `Any`
102 Additional keyword arguments passed to the constructor of the actual
103 butler class.
105 Notes
106 -----
107 The preferred way to instantiate Butler is via the `from_config` method.
108 The call to ``Butler(...)`` is equivalent to ``Butler.from_config(...)``,
109 but ``mypy`` will complain about the former.
110 """
112 def __new__(
113 cls,
114 config: Config | ResourcePathExpression | None = None,
115 *,
116 collections: Any = None,
117 run: str | None = None,
118 searchPaths: Sequence[ResourcePathExpression] | None = None,
119 writeable: bool | None = None,
120 inferDefaults: bool = True,
121 **kwargs: Any,
122 ) -> Butler:
123 if cls is Butler:
124 cls = cls._find_butler_class(config, searchPaths)
125 # Note: we do not pass any parameters to __new__; Python will pass them
126 # to __init__ after __new__ returns the subclass instance.
127 return super().__new__(cls)
129 @staticmethod
130 def _find_butler_class(
131 config: Config | ResourcePathExpression | None = None,
132 searchPaths: Sequence[ResourcePathExpression] | None = None,
133 ) -> type[Butler]:
134 """Find actual class to instantiate."""
135 butler_class_name: str | None = None
136 if config is not None:
137 # Check for optional "cls" key in config.
138 if not isinstance(config, Config):
139 config = ButlerConfig(config, searchPaths=searchPaths)
140 butler_class_name = config.get("cls")
142 # Make DirectButler if class is not specified.
143 butler_class: type[Butler]
144 if butler_class_name is None:
145 from .direct_butler import DirectButler
147 butler_class = DirectButler
148 else:
149 butler_class = doImportType(butler_class_name)
150 if not issubclass(butler_class, Butler):
151 raise TypeError(f"{butler_class_name} is not a subclass of Butler")
152 return butler_class
154 @classmethod
155 def from_config(
156 cls,
157 config: Config | ResourcePathExpression | None = None,
158 *,
159 collections: Any = None,
160 run: str | None = None,
161 searchPaths: Sequence[ResourcePathExpression] | None = None,
162 writeable: bool | None = None,
163 inferDefaults: bool = True,
164 **kwargs: Any,
165 ) -> Butler:
166 """Create butler instance from configuration.
168 Parameters
169 ----------
170 config : `ButlerConfig`, `Config` or `str`, optional
171 Configuration. Anything acceptable to the `ButlerConfig`
172 constructor. If a directory path is given the configuration will be
173 read from a ``butler.yaml`` file in that location. If `None` is
174 given, default values will be used. If ``config`` contains a "cls"
175 key then its value is used as the name of the butler class, which must
176 be a subclass of this class; otherwise `DirectButler` is instantiated.
177 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
178 An expression specifying the collections to be searched (in order)
179 when reading datasets.
180 This may be a `str` collection name or an iterable thereof.
181 See :ref:`daf_butler_collection_expressions` for more information.
182 These collections are not registered automatically and must be
183 manually registered before they are used by any method, but they
184 may be manually registered after the `Butler` is initialized.
185 run : `str`, optional
186 Name of the `~CollectionType.RUN` collection new datasets should be
187 inserted into. If ``collections`` is `None` and ``run`` is not
188 `None`, ``collections`` will be set to ``[run]``. If not `None`,
189 this collection will automatically be registered. If this is not
190 set (and ``writeable`` is not set either), a read-only butler will
191 be created.
192 searchPaths : `list` of `str`, optional
193 Directory paths to search when calculating the full Butler
194 configuration. Not used if the supplied config is already a
195 `ButlerConfig`.
196 writeable : `bool`, optional
197 Explicitly sets whether the butler supports write operations. If
198 not provided, a read-write butler is created if any of ``run``,
199 ``tags``, or ``chains`` is non-empty.
200 inferDefaults : `bool`, optional
201 If `True` (default) infer default data ID values from the values
202 present in the datasets in ``collections``: if all collections have
203 the same value (or no value) for a governor dimension, that value
204 will be the default for that dimension. Nonexistent collections
205 are ignored. If a default value is provided explicitly for a
206 governor dimension via ``**kwargs``, no default will be inferred
207 for that dimension.
208 **kwargs : `Any`
209 Additional keyword arguments passed to the constructor of the
210 actual butler class.
212 Notes
213 -----
214 Calling this factory method is identical to calling
215 ``Butler(config, ...)``. Its only raison d'être is that ``mypy``
216 complains about a direct ``Butler()`` call.
218 Examples
219 --------
220 While there are many ways to control exactly how a `Butler` interacts
221 with the collections in its `Registry`, the most common cases are still
222 simple.
224 For a read-only `Butler` that searches one collection, do::
226 butler = Butler.from_config(
227 "/path/to/repo", collections=["u/alice/DM-50000"]
228 )
230 For a read-write `Butler` that writes to and reads from a
231 `~CollectionType.RUN` collection::
233 butler = Butler.from_config(
234 "/path/to/repo", run="u/alice/DM-50000/a"
235 )
237 The `Butler` passed to a ``PipelineTask`` is often much more complex,
238 because we want to write to one `~CollectionType.RUN` collection but
239 read from several others (as well)::
241 butler = Butler.from_config(
242 "/path/to/repo",
243 run="u/alice/DM-50000/a",
244 collections=[
245 "u/alice/DM-50000/a", "u/bob/DM-49998", "HSC/defaults"
246 ]
247 )
249 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
250 Datasets will be read first from that run (since it appears first in
251 the chain), and then from ``u/bob/DM-49998`` and finally
252 ``HSC/defaults``.
254 Finally, one can always create a `Butler` with no collections::
256 butler = Butler.from_config("/path/to/repo", writeable=True)
258 This can be extremely useful when you just want to use
259 ``butler.registry``, e.g. for inserting dimension data or managing
260 collections, or when the collections you want to use with the butler
261 are not consistent. Passing ``writeable`` explicitly here is only
262 necessary if you want to be able to make changes to the repo; usually
263 the value for ``writeable`` can be guessed from the collection
264 arguments provided, but it defaults to `False` when no collection
265 arguments are given.
266 """
267 cls = cls._find_butler_class(config, searchPaths)
268 return cls(
269 config,
270 collections=collections,
271 run=run,
272 searchPaths=searchPaths,
273 writeable=writeable,
274 inferDefaults=inferDefaults,
275 **kwargs,
276 )
278 @staticmethod
279 def makeRepo(
280 root: ResourcePathExpression,
281 config: Config | str | None = None,
282 dimensionConfig: Config | str | None = None,
283 standalone: bool = False,
284 searchPaths: list[str] | None = None,
285 forceConfigRoot: bool = True,
286 outfile: ResourcePathExpression | None = None,
287 overwrite: bool = False,
288 ) -> Config:
289 """Create an empty data repository by adding a butler.yaml config
290 to a repository root directory.
292 Parameters
293 ----------
294 root : `lsst.resources.ResourcePathExpression`
295 Path or URI to the root location of the new repository. Will be
296 created if it does not exist.
297 config : `Config` or `str`, optional
298 Configuration to write to the repository, after setting any
299 root-dependent Registry or Datastore config options. Can not
300 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
301 configuration will be used. Root-dependent config options
302 specified in this config are overwritten if ``forceConfigRoot``
303 is `True`.
304 dimensionConfig : `Config` or `str`, optional
305 Configuration for dimensions, will be used to initialize registry
306 database.
307 standalone : `bool`
308 If `True`, write all expanded defaults, not just customized or
309 repository-specific settings.
310 This (mostly) decouples the repository from the default
311 configuration, insulating it from changes to the defaults (which
312 may be good or bad, depending on the nature of the changes).
313 Future *additions* to the defaults will still be picked up when
314 initializing `Butlers` to repos created with ``standalone=True``.
315 searchPaths : `list` of `str`, optional
316 Directory paths to search when calculating the full butler
317 configuration.
318 forceConfigRoot : `bool`, optional
319 If `False`, any values present in the supplied ``config`` that
320 would normally be reset are not overridden and will appear
321 directly in the output config. This allows non-standard overrides
322 of the root directory for a datastore or registry to be given.
323 If this parameter is `True` the values for ``root`` will be
324 forced into the resulting config if appropriate.
325 outfile : `lsst.resources.ResourcePathExpression`, optional
326 If not `None`, the output configuration will be written to this
327 location rather than into the repository itself. Can be a URI
328 string. Can refer to a directory that will be used to write
329 ``butler.yaml``.
330 overwrite : `bool`, optional
331 Create a new configuration file even if one already exists
332 in the specified output location. Default is to raise
333 an exception.
335 Returns
336 -------
337 config : `Config`
338 The updated `Config` instance written to the repo.
340 Raises
341 ------
342 ValueError
343 Raised if a ButlerConfig or ConfigSubset is passed instead of a
344 regular Config (as these subclasses would make it impossible to
345 support ``standalone=False``).
346 FileExistsError
347 Raised if the output config file already exists.
348 os.error
349 Raised if the directory does not exist, exists but is not a
350 directory, or cannot be created.
352 Notes
353 -----
354 Note that when ``standalone=False`` (the default), the configuration
355 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
356 construct the repository should also be used to construct any Butlers
357 to avoid configuration inconsistencies.
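
Examples
--------
A minimal sketch of creating a new repository and then constructing a
butler against it; the repository path used here is illustrative only::

    Butler.makeRepo("/path/to/new/repo")
    butler = Butler.from_config("/path/to/new/repo", writeable=True)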
358 """
359 if isinstance(config, ButlerConfig | ConfigSubset):
360 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
362 # Ensure that the root of the repository exists or can be made
363 root_uri = ResourcePath(root, forceDirectory=True)
364 root_uri.mkdir()
366 config = Config(config)
368 # If we are creating a new repo from scratch with relative roots,
369 # do not propagate an explicit root from the config file
370 if "root" in config:
371 del config["root"]
373 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
374 imported_class = doImportType(full["datastore", "cls"])
375 if not issubclass(imported_class, Datastore):
376 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
377 datastoreClass: type[Datastore] = imported_class
378 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
380 # if key exists in given config, parse it, otherwise parse the defaults
381 # in the expanded config
382 if config.get(("registry", "db")):
383 registryConfig = RegistryConfig(config)
384 else:
385 registryConfig = RegistryConfig(full)
386 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
387 if defaultDatabaseUri is not None:
388 Config.updateParameters(
389 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
390 )
391 else:
392 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
394 if standalone:
395 config.merge(full)
396 else:
397 # Always expand the registry.managers section into the per-repo
398 # config, because after the database schema is created, it's not
399 # allowed to change anymore. Note that in the standalone=True
400 # branch, _everything_ in the config is expanded, so there's no
401 # need to special case this.
402 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
403 configURI: ResourcePathExpression
404 if outfile is not None:
405 # When writing to a separate location we must include
406 # the root of the butler repo in the config else it won't know
407 # where to look.
408 config["root"] = root_uri.geturl()
409 configURI = outfile
410 else:
411 configURI = root_uri
412 # Strip obscore configuration, if it is present, before writing config
413 # to a file, obscore config will be stored in registry.
414 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
415 config_to_write = config.copy()
416 del config_to_write[obscore_config_key]
417 config_to_write.dumpToUri(configURI, overwrite=overwrite)
418 # configFile attribute is updated, need to copy it to original.
419 config.configFile = config_to_write.configFile
420 else:
421 config.dumpToUri(configURI, overwrite=overwrite)
423 # Create Registry and populate tables
424 registryConfig = RegistryConfig(config.get("registry"))
425 dimensionConfig = DimensionConfig(dimensionConfig)
426 _RegistryFactory(registryConfig).create_from_config(
427 dimensionConfig=dimensionConfig, butlerRoot=root_uri
428 )
430 _LOG.verbose("Wrote new Butler configuration file to %s", configURI)
432 return config
434 @classmethod
435 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
436 """Look up the label in a butler repository index.
438 Parameters
439 ----------
440 label : `str`
441 Label of the Butler repository to look up.
442 return_label : `bool`, optional
443 If ``label`` cannot be found in the repository index (either
444 because no index is defined or ``label`` is not in the index) and
445 ``return_label`` is `True` then return ``ResourcePath(label)``.
446 If ``return_label`` is `False` (default) then an exception will be
447 raised instead.
449 Returns
450 -------
451 uri : `lsst.resources.ResourcePath`
452 URI to the Butler repository associated with the given label or
453 default value if it is provided.
455 Raises
456 ------
457 KeyError
458 Raised if the label is not found in the index, or if an index
459 is not defined, and ``return_label`` is `False`.
461 Notes
462 -----
463 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
464 information is discovered.
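
Examples
--------
A short sketch; the label ``"main"`` is illustrative and only resolves
if a repository index defining it has been configured::

    uri = Butler.get_repo_uri("main")
    # Fall back to treating the argument as a plain path or URI if the
    # label is not present in the index.
    uri = Butler.get_repo_uri("/path/to/repo", return_label=True)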
465 """
466 return ButlerRepoIndex.get_repo_uri(label, return_label)
468 @classmethod
469 def get_known_repos(cls) -> set[str]:
470 """Retrieve the list of known repository labels.
472 Returns
473 -------
474 repos : `set` of `str`
475 All the known labels. Can be empty if no index can be found.
477 Notes
478 -----
479 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
480 information is discovered.
481 """
482 return ButlerRepoIndex.get_known_repos()
484 @abstractmethod
485 def transaction(self) -> AbstractContextManager[None]:
486 """Context manager supporting `Butler` transactions.
488 Transactions can be nested.
489 """
490 raise NotImplementedError()
492 @abstractmethod
493 def put(
494 self,
495 obj: Any,
496 datasetRefOrType: DatasetRef | DatasetType | str,
497 /,
498 dataId: DataId | None = None,
499 *,
500 run: str | None = None,
501 **kwargs: Any,
502 ) -> DatasetRef:
503 """Store and register a dataset.
505 Parameters
506 ----------
507 obj : `object`
508 The dataset.
509 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
510 When `DatasetRef` is provided, ``dataId`` should be `None`.
511 Otherwise the `DatasetType` or name thereof. If a fully resolved
512 `DatasetRef` is given the run and ID are used directly.
513 dataId : `dict` or `DataCoordinate`
514 A `dict` of `Dimension` link name, value pairs that label the
515 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
516 should be provided as the second argument.
517 run : `str`, optional
518 The name of the run the dataset should be added to, overriding
519 ``self.run``. Not used if a resolved `DatasetRef` is provided.
520 **kwargs
521 Additional keyword arguments used to augment or construct a
522 `DataCoordinate`. See `DataCoordinate.standardize`
523 parameters. Not used if a resolved `DatasetRef` is provided.
525 Returns
526 -------
527 ref : `DatasetRef`
528 A reference to the stored dataset, updated with the correct id if
529 given.
531 Raises
532 ------
533 TypeError
534 Raised if the butler is read-only or if no run has been provided.
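
Examples
--------
A schematic sketch of a typical call; the run name, dataset type, and
data ID values are illustrative, and ``catalog`` is assumed to be an
in-memory object compatible with the dataset type's storage class::

    butler = Butler.from_config("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(catalog, "src", instrument="HSC", visit=903334,
                     detector=42)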
535 """
536 raise NotImplementedError()
538 @abstractmethod
539 def getDeferred(
540 self,
541 datasetRefOrType: DatasetRef | DatasetType | str,
542 /,
543 dataId: DataId | None = None,
544 *,
545 parameters: dict | None = None,
546 collections: Any = None,
547 storageClass: str | StorageClass | None = None,
548 **kwargs: Any,
549 ) -> DeferredDatasetHandle:
550 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
551 after an immediate registry lookup.
553 Parameters
554 ----------
555 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
556 When a `DatasetRef` is provided, ``dataId`` should be `None`.
557 Otherwise the `DatasetType` or name thereof.
558 dataId : `dict` or `DataCoordinate`, optional
559 A `dict` of `Dimension` link name, value pairs that label the
560 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
561 should be provided as the first argument.
562 parameters : `dict`
563 Additional StorageClass-defined options to control reading,
564 typically used to efficiently read only a subset of the dataset.
565 collections : Any, optional
566 Collections to be searched, overriding ``self.collections``.
567 Can be any of the types supported by the ``collections`` argument
568 to butler construction.
569 storageClass : `StorageClass` or `str`, optional
570 The storage class to be used to override the Python type
571 returned by this method. By default the returned type matches
572 the dataset type definition for this dataset. Specifying a
573 read `StorageClass` can force a different type to be returned.
574 This type must be compatible with the original type.
575 **kwargs
576 Additional keyword arguments used to augment or construct a
577 `DataId`. See `DataId` parameters.
579 Returns
580 -------
581 obj : `DeferredDatasetHandle`
582 A handle which can be used to retrieve a dataset at a later time.
584 Raises
585 ------
586 LookupError
587 Raised if no matching dataset exists in the `Registry` or
588 datastore.
589 ValueError
590 Raised if a resolved `DatasetRef` was passed as an input, but it
591 differs from the one found in the registry.
592 TypeError
593 Raised if no collections were provided.
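
Examples
--------
A schematic sketch; the dataset type and data ID values are
illustrative. The registry lookup happens immediately, but the actual
read is deferred until ``get`` is called on the handle::

    handle = butler.getDeferred("calexp", instrument="HSC", visit=903334,
                                detector=42)
    exposure = handle.get()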
594 """
595 raise NotImplementedError()
597 @abstractmethod
598 def get(
599 self,
600 datasetRefOrType: DatasetRef | DatasetType | str,
601 /,
602 dataId: DataId | None = None,
603 *,
604 parameters: dict[str, Any] | None = None,
605 collections: Any = None,
606 storageClass: StorageClass | str | None = None,
607 **kwargs: Any,
608 ) -> Any:
609 """Retrieve a stored dataset.
611 Parameters
612 ----------
613 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
614 When a `DatasetRef` is provided, ``dataId`` should be `None`.
615 Otherwise the `DatasetType` or name thereof.
616 If a resolved `DatasetRef`, the associated dataset
617 is returned directly without additional querying.
618 dataId : `dict` or `DataCoordinate`
619 A `dict` of `Dimension` link name, value pairs that label the
620 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
621 should be provided as the first argument.
622 parameters : `dict`
623 Additional StorageClass-defined options to control reading,
624 typically used to efficiently read only a subset of the dataset.
625 collections : Any, optional
626 Collections to be searched, overriding ``self.collections``.
627 Can be any of the types supported by the ``collections`` argument
628 to butler construction.
629 storageClass : `StorageClass` or `str`, optional
630 The storage class to be used to override the Python type
631 returned by this method. By default the returned type matches
632 the dataset type definition for this dataset. Specifying a
633 read `StorageClass` can force a different type to be returned.
634 This type must be compatible with the original type.
635 **kwargs
636 Additional keyword arguments used to augment or construct a
637 `DataCoordinate`. See `DataCoordinate.standardize`
638 parameters.
640 Returns
641 -------
642 obj : `object`
643 The dataset.
645 Raises
646 ------
647 LookupError
648 Raised if no matching dataset exists in the `Registry`.
649 TypeError
650 Raised if no collections were provided.
652 Notes
653 -----
654 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
655 this method requires that the given data ID include temporal dimensions
656 beyond the dimensions of the dataset type itself, in order to find the
657 dataset with the appropriate validity range. For example, a "bias"
658 dataset with native dimensions ``{instrument, detector}`` could be
659 fetched with a ``{instrument, detector, exposure}`` data ID, because
660 ``exposure`` is a temporal dimension.
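
Examples
--------
Two schematic ways to call this method; the dataset type and data ID
values are illustrative only::

    # Look the dataset up by dataset type and data ID in the default
    # collections.
    calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=42)

    # Or fetch directly via a resolved DatasetRef obtained from a query.
    calexp = butler.get(ref)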
661 """
662 raise NotImplementedError()
664 @abstractmethod
665 def getURIs(
666 self,
667 datasetRefOrType: DatasetRef | DatasetType | str,
668 /,
669 dataId: DataId | None = None,
670 *,
671 predict: bool = False,
672 collections: Any = None,
673 run: str | None = None,
674 **kwargs: Any,
675 ) -> DatasetRefURIs:
676 """Return the URIs associated with the dataset.
678 Parameters
679 ----------
680 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
681 When a `DatasetRef` is provided, ``dataId`` should be `None`.
682 Otherwise the `DatasetType` or name thereof.
683 dataId : `dict` or `DataCoordinate`
684 A `dict` of `Dimension` link name, value pairs that label the
685 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
686 should be provided as the first argument.
687 predict : `bool`
688 If `True`, allow URIs to be returned of datasets that have not
689 been written.
690 collections : Any, optional
691 Collections to be searched, overriding ``self.collections``.
692 Can be any of the types supported by the ``collections`` argument
693 to butler construction.
694 run : `str`, optional
695 Run to use for predictions, overriding ``self.run``.
696 **kwargs
697 Additional keyword arguments used to augment or construct a
698 `DataCoordinate`. See `DataCoordinate.standardize`
699 parameters.
701 Returns
702 -------
703 uris : `DatasetRefURIs`
704 The URI to the primary artifact associated with this dataset (if
705 the dataset was disassembled within the datastore this may be
706 `None`), and the URIs to any components associated with the dataset
707 artifact (this can be empty if there are no components).
708 """
709 raise NotImplementedError()
711 @abstractmethod
712 def getURI(
713 self,
714 datasetRefOrType: DatasetRef | DatasetType | str,
715 /,
716 dataId: DataId | None = None,
717 *,
718 predict: bool = False,
719 collections: Any = None,
720 run: str | None = None,
721 **kwargs: Any,
722 ) -> ResourcePath:
723 """Return the URI to the Dataset.
725 Parameters
726 ----------
727 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
728 When a `DatasetRef` is provided, ``dataId`` should be `None`.
729 Otherwise the `DatasetType` or name thereof.
730 dataId : `dict` or `DataCoordinate`
731 A `dict` of `Dimension` link name, value pairs that label the
732 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
733 should be provided as the first argument.
734 predict : `bool`
735 If `True`, allow URIs to be returned of datasets that have not
736 been written.
737 collections : Any, optional
738 Collections to be searched, overriding ``self.collections``.
739 Can be any of the types supported by the ``collections`` argument
740 to butler construction.
741 run : `str`, optional
742 Run to use for predictions, overriding ``self.run``.
743 **kwargs
744 Additional keyword arguments used to augment or construct a
745 `DataCoordinate`. See `DataCoordinate.standardize`
746 parameters.
748 Returns
749 -------
750 uri : `lsst.resources.ResourcePath`
751 URI pointing to the Dataset within the datastore. If the
752 Dataset does not exist in the datastore, and if ``predict`` is
753 `True`, the URI will be a prediction and will include a URI
754 fragment "#predicted".
755 If the datastore does not have entities that relate well
756 to the concept of a URI, the returned URI string will be
757 descriptive. The returned URI is not guaranteed to be obtainable.
759 Raises
760 ------
761 LookupError
762 Raised if a URI has been requested for a dataset that does not
763 exist and guessing is not allowed.
764 ValueError
765 Raised if a resolved `DatasetRef` was passed as an input, but it
766 differs from the one found in the registry.
767 TypeError
768 Raised if no collections were provided.
769 RuntimeError
770 Raised if a URI is requested for a dataset that consists of
771 multiple artifacts.
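
Examples
--------
A schematic sketch; the dataset type, data ID, and run are illustrative.
Passing ``predict=True`` allows a URI to be returned for a dataset that
has not been written yet::

    uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42)
    future_uri = butler.getURI("calexp", instrument="HSC", visit=903334,
                               detector=42, predict=True,
                               run="u/alice/DM-50000/a")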
772 """
773 raise NotImplementedError()
775 @abstractmethod
776 def retrieveArtifacts(
777 self,
778 refs: Iterable[DatasetRef],
779 destination: ResourcePathExpression,
780 transfer: str = "auto",
781 preserve_path: bool = True,
782 overwrite: bool = False,
783 ) -> list[ResourcePath]:
784 """Retrieve the artifacts associated with the supplied refs.
786 Parameters
787 ----------
788 refs : iterable of `DatasetRef`
789 The datasets for which artifacts are to be retrieved.
790 A single ref can result in multiple artifacts. The refs must
791 be resolved.
792 destination : `lsst.resources.ResourcePath` or `str`
793 Location to write the artifacts.
794 transfer : `str`, optional
795 Method to use to transfer the artifacts. Must be one of the options
796 supported by `~lsst.resources.ResourcePath.transfer_from()`.
797 "move" is not allowed.
798 preserve_path : `bool`, optional
799 If `True` the full path of the artifact within the datastore
800 is preserved. If `False` the final file component of the path
801 is used.
802 overwrite : `bool`, optional
803 If `True` allow transfers to overwrite existing files at the
804 destination.
806 Returns
807 -------
808 targets : `list` of `lsst.resources.ResourcePath`
809 URIs of file artifacts in destination location. Order is not
810 preserved.
812 Notes
813 -----
814 For non-file datastores the artifacts written to the destination
815 may not match the representation inside the datastore. For example
816 a hierarchical data structure in a NoSQL database may well be stored
817 as a JSON file.
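
Examples
--------
A schematic sketch; the dataset type, collection, and destination are
illustrative. The refs are typically obtained from a registry query::

    refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/RC2")
    paths = butler.retrieveArtifacts(refs, "/tmp/export", transfer="copy")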
818 """
819 raise NotImplementedError()
821 @abstractmethod
822 def exists(
823 self,
824 dataset_ref_or_type: DatasetRef | DatasetType | str,
825 /,
826 data_id: DataId | None = None,
827 *,
828 full_check: bool = True,
829 collections: Any = None,
830 **kwargs: Any,
831 ) -> DatasetExistence:
832 """Indicate whether a dataset is known to Butler registry and
833 datastore.
835 Parameters
836 ----------
837 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str`
838 When a `DatasetRef` is provided, ``data_id`` should be `None`.
839 Otherwise the `DatasetType` or name thereof.
840 data_id : `dict` or `DataCoordinate`
841 A `dict` of `Dimension` link name, value pairs that label the
842 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
843 should be provided as the first argument.
844 full_check : `bool`, optional
845 If `True`, an additional check will be made for dataset artifact
846 existence. This will involve additional overhead due to the need
847 to query an external system. If `False` registry and datastore
848 will solely be asked if they know about the dataset but no
849 check for the artifact will be performed.
850 collections : Any, optional
851 Collections to be searched, overriding ``self.collections``.
852 Can be any of the types supported by the ``collections`` argument
853 to butler construction.
854 **kwargs
855 Additional keyword arguments used to augment or construct a
856 `DataCoordinate`. See `DataCoordinate.standardize`
857 parameters.
859 Returns
860 -------
861 existence : `DatasetExistence`
862 Object indicating whether the dataset is known to registry and
863 datastore. Evaluates to `True` if the dataset is present and known
864 to both.
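
Examples
--------
A schematic sketch; the dataset type and data ID are illustrative. The
returned `DatasetExistence` is truthy only when the dataset is fully
known and present::

    existence = butler.exists("calexp", instrument="HSC", visit=903334,
                              detector=42)
    if not existence:
        print("dataset is missing or incomplete:", existence)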
865 """
866 raise NotImplementedError()
868 @abstractmethod
869 def _exists_many(
870 self,
871 refs: Iterable[DatasetRef],
872 /,
873 *,
874 full_check: bool = True,
875 ) -> dict[DatasetRef, DatasetExistence]:
876 """Indicate whether multiple datasets are known to Butler registry and
877 datastore.
879 This is an experimental API that may change at any moment.
881 Parameters
882 ----------
883 refs : iterable of `DatasetRef`
884 The datasets to be checked.
885 full_check : `bool`, optional
886 If `True`, an additional check will be made for dataset artifact
887 existence. This will involve additional overhead due to the need
888 to query an external system. If `False` registry and datastore
889 will solely be asked if they know about the dataset but no
890 check for the artifact will be performed.
892 Returns
893 -------
894 existence : dict of [`DatasetRef`, `DatasetExistence`]
895 Mapping from the given dataset refs to an enum indicating the
896 status of the dataset in registry and datastore.
897 Each value evaluates to `True` if the dataset is present and known
898 to both.
899 """
900 raise NotImplementedError()
902 @abstractmethod
903 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
904 """Remove one or more `~CollectionType.RUN` collections and the
905 datasets within them.
907 Parameters
908 ----------
909 names : `~collections.abc.Iterable` [ `str` ]
910 The names of the collections to remove.
911 unstore : `bool`, optional
912 If `True` (default), delete datasets from all datastores in which
913 they are present, and attempt to roll back the registry deletions if
914 datastore deletions fail (which may not always be possible). If
915 `False`, datastore records for these datasets are still removed,
916 but any artifacts (e.g. files) will not be.
918 Raises
919 ------
920 TypeError
921 Raised if one or more collections are not of type
922 `~CollectionType.RUN`.
923 """
924 raise NotImplementedError()
926 @abstractmethod
927 def ingest(
928 self,
929 *datasets: FileDataset,
930 transfer: str | None = "auto",
931 run: str | None = None,
932 idGenerationMode: DatasetIdGenEnum | None = None,
933 record_validation_info: bool = True,
934 ) -> None:
935 """Store and register one or more datasets that already exist on disk.
937 Parameters
938 ----------
939 datasets : `FileDataset`
940 Each positional argument is a struct containing information about
941 a file to be ingested, including its URI (either absolute or
942 relative to the datastore root, if applicable), a resolved
943 `DatasetRef`, and optionally a formatter class or its
944 fully-qualified string name. If a formatter is not provided, the
945 formatter that would be used for `put` is assumed. On successful
946 ingest all `FileDataset.formatter` attributes will be set to the
947 formatter class used. `FileDataset.path` attributes may be modified
948 to put paths in whatever the datastore considers a standardized
949 form.
950 transfer : `str`, optional
951 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
952 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
953 transfer the file.
954 run : `str`, optional
955 The name of the run ingested datasets should be added to,
956 overriding ``self.run``. This parameter is now deprecated since
957 the run is encoded in the ``FileDataset``.
958 idGenerationMode : `DatasetIdGenEnum`, optional
959 Specifies option for generating dataset IDs. Parameter is
960 deprecated.
961 record_validation_info : `bool`, optional
962 If `True`, the default, the datastore can record validation
963 information associated with the file. If `False` the datastore
964 will not attempt to track any information such as checksums
965 or file sizes. This can be useful if such information is tracked
966 in an external system or if the file is to be compressed in place.
967 It is up to the datastore whether this parameter is relevant.
969 Raises
970 ------
971 TypeError
972 Raised if the butler is read-only or if no run was provided.
973 NotImplementedError
974 Raised if the `Datastore` does not support the given transfer mode.
975 DatasetTypeNotSupportedError
976 Raised if one or more files to be ingested have a dataset type that
977 is not supported by the `Datastore`.
978 FileNotFoundError
979 Raised if one of the given files does not exist.
980 FileExistsError
981 Raised if transfer is not `None` but the (internal) location the
982 file would be moved to is already occupied.
984 Notes
985 -----
986 This operation is not fully exception safe: if a database operation
987 fails, the given `FileDataset` instances may be only partially updated.
989 It is atomic in terms of database operations (they will either all
990 succeed or all fail) providing the database engine implements
991 transactions correctly. It will attempt to be atomic in terms of
992 filesystem operations as well, but this cannot be implemented
993 rigorously for most datastores.
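
Examples
--------
A schematic sketch; it assumes ``ref`` is a resolved `DatasetRef`
(obtained elsewhere, e.g. from a registry query) whose run already
exists, and the file path shown is illustrative only::

    dataset = FileDataset(path="/data/raw/HSC-0903334-042.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy")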
994 """
995 raise NotImplementedError()
997 @abstractmethod
998 def export(
999 self,
1000 *,
1001 directory: str | None = None,
1002 filename: str | None = None,
1003 format: str | None = None,
1004 transfer: str | None = None,
1005 ) -> AbstractContextManager[RepoExportContext]:
1006 """Export datasets from the repository represented by this `Butler`.
1008 This method is a context manager that returns a helper object
1009 (`RepoExportContext`) that is used to indicate what information from
1010 the repository should be exported.
1012 Parameters
1013 ----------
1014 directory : `str`, optional
1015 Directory dataset files should be written to if ``transfer`` is not
1016 `None`.
1017 filename : `str`, optional
1018 Name for the file that will include database information associated
1019 with the exported datasets. If this is not an absolute path and
1020 ``directory`` is not `None`, it will be written to ``directory``
1021 instead of the current working directory. Defaults to
1022 "export.{format}".
1023 format : `str`, optional
1024 File format for the database information file. If `None`, the
1025 extension of ``filename`` will be used.
1026 transfer : `str`, optional
1027 Transfer mode passed to `Datastore.export`.
1029 Raises
1030 ------
1031 TypeError
1032 Raised if the set of arguments passed is inconsistent.
1034 Examples
1035 --------
1036 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1037 methods are used to provide the iterables over data IDs and/or datasets
1038 to be exported::
1040 with butler.export("exports.yaml") as export:
1041 # Export all flats, but none of the dimension element rows
1042 # (i.e. data ID information) associated with them.
1043 export.saveDatasets(butler.registry.queryDatasets("flat"),
1044 elements=())
1045 # Export all datasets that start with "deepCoadd_" and all of
1046 # their associated data ID information.
1047 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1048 """
1049 raise NotImplementedError()
1051 @abstractmethod
1052 def import_(
1053 self,
1054 *,
1055 directory: ResourcePathExpression | None = None,
1056 filename: ResourcePathExpression | TextIO | None = None,
1057 format: str | None = None,
1058 transfer: str | None = None,
1059 skip_dimensions: set | None = None,
1060 ) -> None:
1061 """Import datasets into this repository that were exported from a
1062 different butler repository via `~lsst.daf.butler.Butler.export`.
1064 Parameters
1065 ----------
1066 directory : `~lsst.resources.ResourcePathExpression`, optional
1067 Directory containing dataset files to import from. If `None`,
1068 ``filename`` and all dataset file paths specified therein must
1069 be absolute.
1070 filename : `~lsst.resources.ResourcePathExpression` or `TextIO`
1071 A stream or name of file that contains database information
1072 associated with the exported datasets, typically generated by
1073 `~lsst.daf.butler.Butler.export`. If this is a string (name) or
1074 `~lsst.resources.ResourcePath` and is not an absolute path,
1075 it will first be looked for relative to ``directory`` and if not
1076 found there it will be looked for in the current working
1077 directory. Defaults to "export.{format}".
1078 format : `str`, optional
1079 File format for ``filename``. If `None`, the extension of
1080 ``filename`` will be used.
1081 transfer : `str`, optional
1082 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1083 skip_dimensions : `set`, optional
1084 Names of dimensions that should be skipped and not imported.
1086 Raises
1087 ------
1088 TypeError
1089 Raised if the set of arguments passed is inconsistent, or if the
1090 butler is read-only.
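
Examples
--------
A schematic sketch of importing a previously exported repository subset;
the directory and file name are illustrative and assume ``export.yaml``
was produced by `~lsst.daf.butler.Butler.export`::

    butler.import_(directory="/path/to/exports", filename="export.yaml",
                   transfer="auto")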
1091 """
1092 raise NotImplementedError()
1094 @abstractmethod
1095 def transfer_from(
1096 self,
1097 source_butler: LimitedButler,
1098 source_refs: Iterable[DatasetRef],
1099 transfer: str = "auto",
1100 skip_missing: bool = True,
1101 register_dataset_types: bool = False,
1102 transfer_dimensions: bool = False,
1103 ) -> Collection[DatasetRef]:
1104 """Transfer datasets to this Butler from a run in another Butler.
1106 Parameters
1107 ----------
1108 source_butler : `LimitedButler`
1109 Butler from which the datasets are to be transferred. If data IDs
1110 in ``source_refs`` are not expanded then this has to be a full
1111 `Butler` whose registry will be used to expand data IDs.
1112 source_refs : iterable of `DatasetRef`
1113 Datasets defined in the source butler that should be transferred to
1114 this butler. In most circumstances, ``transfer_from`` is faster if
1115 the dataset refs are expanded.
1116 transfer : `str`, optional
1117 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1118 skip_missing : `bool`
1119 If `True`, datasets with no datastore artifact associated with
1120 them are not transferred. If `False` a registry entry will be
1121 created even if no datastore record is created (and so will
1122 look equivalent to the dataset being unstored).
1123 register_dataset_types : `bool`
1124 If `True` any missing dataset types are registered. Otherwise
1125 an exception is raised.
1126 transfer_dimensions : `bool`, optional
1127 If `True`, dimension record data associated with the new datasets
1128 will be transferred.
1130 Returns
1131 -------
1132 refs : `list` of `DatasetRef`
1133 The refs added to this Butler.
1135 Notes
1136 -----
1137 The datastore artifact has to exist for a transfer
1138 to be made but non-existence is not an error.
1140 Datasets that already exist in this run will be skipped.
1142 The datasets are imported as part of a transaction, although
1143 dataset types are registered before the transaction is started.
1144 This means that it is possible for a dataset type to be registered
1145 even though transfer has failed.
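
Examples
--------
A schematic sketch; the source repository, collection, and dataset type
are illustrative. The refs are queried from the source butler and then
copied into this one::

    source = Butler.from_config("/path/to/source/repo")
    refs = source.registry.queryDatasets("calexp", collections="HSC/runs/RC2")
    transferred = butler.transfer_from(source, refs, transfer="copy",
                                       register_dataset_types=True)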
1146 """
1147 raise NotImplementedError()
1149 @abstractmethod
1150 def validateConfiguration(
1151 self,
1152 logFailures: bool = False,
1153 datasetTypeNames: Iterable[str] | None = None,
1154 ignore: Iterable[str] | None = None,
1155 ) -> None:
1156 """Validate butler configuration.
1158 Checks that each `DatasetType` can be stored in the `Datastore`.
1160 Parameters
1161 ----------
1162 logFailures : `bool`, optional
1163 If `True`, output a log message for every validation error
1164 detected.
1165 datasetTypeNames : iterable of `str`, optional
1166 The `DatasetType` names that should be checked. This allows
1167 only a subset to be selected.
1168 ignore : iterable of `str`, optional
1169 Names of DatasetTypes to skip over. This can be used to skip
1170 known problems. If a named `DatasetType` corresponds to a
1171 composite, all components of that `DatasetType` will also be
1172 ignored.
1174 Raises
1175 ------
1176 ButlerValidationError
1177 Raised if there is some inconsistency with how this Butler
1178 is configured.
1179 """
1180 raise NotImplementedError()
1182 @property
1183 @abstractmethod
1184 def collections(self) -> Sequence[str]:
1185 """The collections to search by default, in order
1186 (`~collections.abc.Sequence` [ `str` ]).
1187 """
1188 raise NotImplementedError()
1190 @property
1191 @abstractmethod
1192 def run(self) -> str | None:
1193 """Name of the run this butler writes outputs to by default (`str` or
1194 `None`).
1195 """
1196 raise NotImplementedError()
1198 @property
1199 @abstractmethod
1200 def registry(self) -> Registry:
1201 """The object that manages dataset metadata and relationships
1202 (`Registry`).
1204 Many operations that don't involve reading or writing butler datasets
1205 are accessible only via `Registry` methods. Eventually these methods
1206 will be replaced by equivalent `Butler` methods.
1207 """
1208 raise NotImplementedError()