Coverage for python / lsst / daf / butler / _butler.py: 31%
321 statements
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ["Butler", "ParsedButlerDatasetURI", "SpecificButlerDataset"]
32import dataclasses
33import urllib.parse
34import uuid
35import warnings
36from abc import abstractmethod
37from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
38from contextlib import AbstractContextManager
39from types import EllipsisType
40from typing import TYPE_CHECKING, Any, TextIO
42from lsst.resources import ResourcePath, ResourcePathExpression
43from lsst.utils import doImportType
44from lsst.utils.iteration import ensure_iterable
45from lsst.utils.logging import getLogger
47from ._butler_collections import ButlerCollections
48from ._butler_config import ButlerConfig, ButlerType
49from ._butler_instance_options import ButlerInstanceOptions
50from ._butler_metrics import ButlerMetrics
51from ._butler_repo_index import ButlerRepoIndex
52from ._config import Config, ConfigSubset
53from ._exceptions import EmptyQueryResultError, InvalidQueryError
54from ._limited_butler import LimitedButler
55from ._query_all_datasets import QueryAllDatasetsParameters
56from .datastore import Datastore
57from .dimensions import DataCoordinate, DimensionConfig
58from .registry import RegistryConfig, _RegistryFactory
59from .repo_relocation import BUTLER_ROOT_TAG
60from .utils import has_globs
62if TYPE_CHECKING:
63 from ._dataset_existence import DatasetExistence
64 from ._dataset_provenance import DatasetProvenance
65 from ._dataset_ref import DatasetId, DatasetRef
66 from ._dataset_type import DatasetType
67 from ._deferredDatasetHandle import DeferredDatasetHandle
68 from ._file_dataset import FileDataset
69 from ._labeled_butler_factory import LabeledButlerFactoryProtocol
70 from ._storage_class import StorageClass
71 from ._timespan import Timespan
72 from .datastore import DatasetRefURIs
73 from .dimensions import DataId, DimensionGroup, DimensionRecord
74 from .queries import Query
75 from .registry import CollectionArgType, Registry
76 from .transfers import RepoExportContext
78_LOG = getLogger(__name__)
81@dataclasses.dataclass
82class ParsedButlerDatasetURI:
83 """Representation of the contents of an IVOA IVOID or dataset URI."""
85 label: str
86 """Label of the associated butler repository. (`str`)"""
87 dataset_id: uuid.UUID
88 """Dataset ID of the referenced dataset within the labeled repository.
89 (`uuid.UUID`)"""
90 uri: str
91 """The original URI that was parsed (`str`)."""
94@dataclasses.dataclass
95class SpecificButlerDataset:
96 """A dataset ref associated with a specific butler."""
98 butler: Butler
99 """A specific butler repository (`Butler`)."""
100 dataset: DatasetRef | None
101 """The reference of a specific dataset in that butler (`DatasetRef`)."""
104class _DeprecatedDefault:
105 """Default value for a deprecated parameter."""
108class Butler(LimitedButler): # numpydoc ignore=PR02
109 """Interface for data butler and factory for Butler instances.
111 Parameters
112 ----------
113 config : `ButlerConfig`, `Config` or `str`, optional
114 Configuration. Anything acceptable to the `ButlerConfig` constructor.
115 If a directory path is given the configuration will be read from a
116 ``butler.yaml`` file in that location. If `None` is given default
117 values will be used. If ``config`` contains a "cls" key then its value is
118 used as the name of the butler class, which must be a sub-class of this
119 class; otherwise `DirectButler` is instantiated.
120 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
121 An expression specifying the collections to be searched (in order) when
122 reading datasets.
123 This may be a `str` collection name or an iterable thereof.
124 See :ref:`daf_butler_collection_expressions` for more information.
125 These collections are not registered automatically and must be
126 manually registered before they are used by any method, but they may be
127 manually registered after the `Butler` is initialized.
128 run : `str`, optional
129 Name of the `~CollectionType.RUN` collection new datasets should be
130 inserted into. If ``collections`` is `None` and ``run`` is not `None`,
131 ``collections`` will be set to ``[run]``. If not `None`, this
132 collection will automatically be registered. If this is not set (and
133 ``writeable`` is not set either), a read-only butler will be created.
134 searchPaths : `list` of `str`, optional
135 Directory paths to search when calculating the full Butler
136 configuration. Not used if the supplied config is already a
137 `ButlerConfig`.
138 writeable : `bool`, optional
139 Explicitly sets whether the butler supports write operations. If not
140 provided, a read-write butler is created if any of ``run``, ``tags``,
141 or ``chains`` is non-empty.
142 inferDefaults : `bool`, optional
143 If `True` (default) infer default data ID values from the values
144 present in the datasets in ``collections``: if all collections have the
145 same value (or no value) for a governor dimension, that value will be
146 the default for that dimension. Nonexistent collections are ignored.
147 If a default value is provided explicitly for a governor dimension via
148 ``**kwargs``, no default will be inferred for that dimension.
149 without_datastore : `bool`, optional
150 If `True` do not attach a datastore to this butler. Any attempts
151 to use a datastore will fail.
152 metrics : `ButlerMetrics` or `None`
153 External metrics object to be used for tracking butler usage. If `None`
154 a new metrics object is created.
155 **kwargs : `typing.Any`
156 Additional keyword arguments passed to the constructor of the actual
157 butler class.
159 Notes
160 -----
161 The preferred way to instantiate Butler is via the `from_config` method.
162 The call to ``Butler(...)`` is equivalent to ``Butler.from_config(...)``,
163 but ``mypy`` will complain about the former.
164 """
166 def __new__(
167 cls,
168 config: Config | ResourcePathExpression | None = None,
169 *,
170 collections: Any = None,
171 run: str | None = None,
172 searchPaths: Sequence[ResourcePathExpression] | None = None,
173 writeable: bool | None = None,
174 inferDefaults: bool = True,
175 without_datastore: bool = False,
176 metrics: ButlerMetrics | None = None,
177 **kwargs: Any,
178 ) -> Butler:
179 if cls is Butler:
180 return Butler.from_config(
181 config=config,
182 collections=collections,
183 run=run,
184 searchPaths=searchPaths,
185 writeable=writeable,
186 inferDefaults=inferDefaults,
187 without_datastore=without_datastore,
188 metrics=metrics,
189 **kwargs,
190 )
192 # Note: we do not pass any parameters to __new__; Python will pass them
193 # to __init__ after __new__ returns the sub-class instance.
194 return super().__new__(cls)
196 @classmethod
197 def from_config(
198 cls,
199 config: Config | ResourcePathExpression | None = None,
200 *,
201 collections: Any = None,
202 run: str | None = None,
203 searchPaths: Sequence[ResourcePathExpression] | None = None,
204 writeable: bool | None = None,
205 inferDefaults: bool = True,
206 without_datastore: bool = False,
207 metrics: ButlerMetrics | None = None,
208 **kwargs: Any,
209 ) -> Butler:
210 """Create butler instance from configuration.
212 Parameters
213 ----------
214 config : `ButlerConfig`, `Config` or `str`, optional
215 Configuration. Anything acceptable to the `ButlerConfig`
216 constructor. If a directory path is given the configuration will be
217 read from a ``butler.yaml`` file in that location. If `None` is
218 given default values will be used. If ``config`` contains a "cls" key
219 then its value is used as the name of the butler class, which must be a
220 sub-class of this class; otherwise `DirectButler` is instantiated.
221 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
222 An expression specifying the collections to be searched (in order)
223 when reading datasets.
224 This may be a `str` collection name or an iterable thereof.
225 See :ref:`daf_butler_collection_expressions` for more information.
226 These collections are not registered automatically and must be
227 manually registered before they are used by any method, but they
228 may be manually registered after the `Butler` is initialized.
229 run : `str`, optional
230 Name of the `~CollectionType.RUN` collection new datasets should be
231 inserted into. If ``collections`` is `None` and ``run`` is not
232 `None`, ``collections`` will be set to ``[run]``. If not `None`,
233 this collection will automatically be registered. If this is not
234 set (and ``writeable`` is not set either), a read-only butler will
235 be created.
236 searchPaths : `list` of `str`, optional
237 Directory paths to search when calculating the full Butler
238 configuration. Not used if the supplied config is already a
239 `ButlerConfig`.
240 writeable : `bool`, optional
241 Explicitly sets whether the butler supports write operations. If
242 not provided, a read-write butler is created if any of ``run``,
243 ``tags``, or ``chains`` is non-empty.
244 inferDefaults : `bool`, optional
245 If `True` (default) infer default data ID values from the values
246 present in the datasets in ``collections``: if all collections have
247 the same value (or no value) for a governor dimension, that value
248 will be the default for that dimension. Nonexistent collections
249 are ignored. If a default value is provided explicitly for a
250 governor dimension via ``**kwargs``, no default will be inferred
251 for that dimension.
252 without_datastore : `bool`, optional
253 If `True` do not attach a datastore to this butler. Any attempts
254 to use a datastore will fail.
255 metrics : `ButlerMetrics` or `None`, optional
256 Metrics object to record butler usage statistics.
257 **kwargs : `typing.Any`
258 Default data ID key-value pairs. These may only identify
259 "governor" dimensions like ``instrument`` and ``skymap``.
261 Returns
262 -------
263 butler : `Butler`
264 A `Butler` constructed from the given configuration.
266 Notes
267 -----
268 Calling this factory method is identical to calling
269 ``Butler(config, ...)``. Its only raison d'être is that ``mypy``
270 complains about the ``Butler()`` call.
272 Examples
273 --------
274 While there are many ways to control exactly how a `Butler` interacts
275 with the collections in its `Registry`, the most common cases are still
276 simple.
278 For a read-only `Butler` that searches one collection, do::
280 butler = Butler.from_config(
281 "/path/to/repo", collections=["u/alice/DM-50000"]
282 )
284 For a read-write `Butler` that writes to and reads from a
285 `~CollectionType.RUN` collection::
287 butler = Butler.from_config(
288 "/path/to/repo", run="u/alice/DM-50000/a"
289 )
291 The `Butler` passed to a ``PipelineTask`` is often much more complex,
292 because we want to write to one `~CollectionType.RUN` collection but
293 read from several others (as well)::
295 butler = Butler.from_config(
296 "/path/to/repo",
297 run="u/alice/DM-50000/a",
298 collections=[
299 "u/alice/DM-50000/a",
300 "u/bob/DM-49998",
301 "HSC/defaults",
302 ],
303 )
305 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
306 Datasets will be read first from that run (since it appears first in
307 the chain), and then from ``u/bob/DM-49998`` and finally
308 ``HSC/defaults``.
310 Finally, one can always create a `Butler` with no collections::
312 butler = Butler.from_config("/path/to/repo", writeable=True)
314 This can be extremely useful when you just want to use
315 ``butler.registry``, e.g. for inserting dimension data or managing
316 collections, or when the collections you want to use with the butler
317 are not consistent. Passing ``writeable`` explicitly here is only
318 necessary if you want to be able to make changes to the repo; usually
319 the value for ``writeable`` can be guessed from the collection
320 arguments provided, but it defaults to `False` when there are no
321 collection arguments.
322 """
323 # DirectButler used to have a way to specify a "copy constructor" by
324 # passing the "butler" parameter to its constructor. This has
325 # been moved out of the constructor into Butler.clone().
326 butler = kwargs.pop("butler", None)
327 metrics = metrics if metrics is not None else ButlerMetrics()
328 if butler is not None:
329 if not isinstance(butler, Butler):
330 raise TypeError("'butler' parameter must be a Butler instance")
331 if config is not None or searchPaths is not None or writeable is not None:
332 raise TypeError(
333 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
334 )
335 return butler.clone(
336 collections=collections, run=run, inferDefaults=inferDefaults, metrics=metrics, dataId=kwargs
337 )
339 options = ButlerInstanceOptions(
340 collections=collections,
341 run=run,
342 writeable=writeable,
343 inferDefaults=inferDefaults,
344 metrics=metrics,
345 kwargs=kwargs,
346 )
348 # Load the Butler configuration. This may involve searching the
349 # environment to locate a configuration file.
350 butler_config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore)
351 butler_type = butler_config.get_butler_type()
353 # Make DirectButler if class is not specified.
354 match butler_type:
355 case ButlerType.DIRECT:
356 from .direct_butler import DirectButler
358 return DirectButler.create_from_config(
359 butler_config,
360 options=options,
361 without_datastore=without_datastore,
362 )
363 case ButlerType.REMOTE:
364 from .remote_butler._factory import RemoteButlerFactory
366 # Assume this is being created by a client who would like
367 # default caching of remote datasets.
368 factory = RemoteButlerFactory.create_factory_from_config(butler_config)
369 return factory.create_butler_with_credentials_from_environment(
370 butler_options=options, enable_datastore_cache=True
371 )
372 case _:
373 raise TypeError(f"Unknown Butler type '{butler_type}'")
375 @staticmethod
376 def makeRepo(
377 root: ResourcePathExpression,
378 config: Config | str | None = None,
379 dimensionConfig: Config | str | None = None,
380 standalone: bool = False,
381 searchPaths: list[str] | None = None,
382 forceConfigRoot: bool = True,
383 outfile: ResourcePathExpression | None = None,
384 overwrite: bool = False,
385 ) -> Config:
386 """Create an empty data repository by adding a butler.yaml config
387 to a repository root directory.
389 Parameters
390 ----------
391 root : `lsst.resources.ResourcePathExpression`
392 Path or URI to the root location of the new repository. Will be
393 created if it does not exist.
394 config : `Config` or `str`, optional
395 Configuration to write to the repository, after setting any
396 root-dependent Registry or Datastore config options. Can not
397 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
398 configuration will be used. Root-dependent config options
399 specified in this config are overwritten if ``forceConfigRoot``
400 is `True`.
401 dimensionConfig : `Config` or `str`, optional
402 Configuration for dimensions, will be used to initialize registry
403 database.
404 standalone : `bool`
405 If `True`, write all expanded defaults, not just customized or
406 repository-specific settings.
407 This (mostly) decouples the repository from the default
408 configuration, insulating it from changes to the defaults (which
409 may be good or bad, depending on the nature of the changes).
410 Future *additions* to the defaults will still be picked up when
411 initializing a `Butler` for repos created with ``standalone=True``.
412 searchPaths : `list` of `str`, optional
413 Directory paths to search when calculating the full butler
414 configuration.
415 forceConfigRoot : `bool`, optional
416 If `False`, any values present in the supplied ``config`` that
417 would normally be reset are not overridden and will appear
418 directly in the output config. This allows non-standard overrides
419 of the root directory for a datastore or registry to be given.
420 If this parameter is `True` the values for ``root`` will be
421 forced into the resulting config if appropriate.
422 outfile : `lsst.resources.ResourcePathExpression`, optional
423 If not-`None`, the output configuration will be written to this
424 location rather than into the repository itself. Can be a URI
425 string. Can refer to a directory that will be used to write
426 ``butler.yaml``.
427 overwrite : `bool`, optional
428 Create a new configuration file even if one already exists
429 in the specified output location. Default is to raise
430 an exception.
432 Returns
433 -------
434 config : `Config`
435 The updated `Config` instance written to the repo.
437 Raises
438 ------
439 ValueError
440 Raised if a ButlerConfig or ConfigSubset is passed instead of a
441 regular Config (as these subclasses would make it impossible to
442 support ``standalone=False``).
443 FileExistsError
444 Raised if the output config file already exists.
445 os.error
446 Raised if the directory does not exist, exists but is not a
447 directory, or cannot be created.
449 Notes
450 -----
451 Note that when ``standalone=False`` (the default), the configuration
452 search path (see `ConfigSubset.defaultSearchPaths`) that was used to
453 construct the repository should also be used to construct any Butlers
454 to avoid configuration inconsistencies.
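Examples
--------
A minimal sketch; the repository path is purely illustrative::

    from lsst.daf.butler import Butler

    # Write a default butler.yaml and create the registry schema.
    config = Butler.makeRepo("/tmp/my_repo")

    # The new repository can then be opened for writing.
    butler = Butler.from_config("/tmp/my_repo", writeable=True)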
455 """
456 if isinstance(config, ButlerConfig | ConfigSubset):
457 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
459 # Ensure that the root of the repository exists or can be made
460 root_uri = ResourcePath(root, forceDirectory=True)
461 root_uri.mkdir()
463 config = Config(config)
465 # If we are creating a new repo from scratch with relative roots,
466 # do not propagate an explicit root from the config file
467 if "root" in config:
468 del config["root"]
470 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
471 imported_class = doImportType(full["datastore", "cls"])
472 if not issubclass(imported_class, Datastore):
473 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
474 datastoreClass: type[Datastore] = imported_class
475 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
477 # if key exists in given config, parse it, otherwise parse the defaults
478 # in the expanded config
479 if config.get(("registry", "db")):
480 registryConfig = RegistryConfig(config)
481 else:
482 registryConfig = RegistryConfig(full)
483 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
484 if defaultDatabaseUri is not None:
485 Config.updateParameters(
486 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
487 )
488 else:
489 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
491 if standalone:
492 config.merge(full)
493 else:
494 # Always expand the registry.managers section into the per-repo
495 # config, because after the database schema is created, it's not
496 # allowed to change anymore. Note that in the standalone=True
497 # branch, _everything_ in the config is expanded, so there's no
498 # need to special case this.
499 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
500 configURI: ResourcePathExpression
501 if outfile is not None:
502 # When writing to a separate location we must include
503 # the root of the butler repo in the config else it won't know
504 # where to look.
505 config["root"] = root_uri.geturl()
506 configURI = outfile
507 else:
508 configURI = root_uri
509 # Check that if the obscore key is present then its config must be there
510 # too; this is to avoid a common mistake where people copy butler.yaml
511 # from an existing repo with obscore but do not fill in its config.
512 if (obscore_key := ("registry", "managers", "obscore")) in config:
513 obscore_config_key = ("registry", "managers", "obscore", "config")
514 if obscore_config_key not in config or not config[obscore_config_key]:
515 warnings.warn(
516 "Obscore manager is declared in registry configuration, "
517 "but obscore configuration is missing, obscore manager will be removed.",
518 stacklevel=2,
519 )
520 del config[obscore_key]
521 # Strip obscore configuration, if it is present, before writing config
522 # to a file; the obscore config will be stored in the registry.
523 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
524 config_to_write = config.copy()
525 del config_to_write[obscore_config_key]
526 config_to_write.dumpToUri(configURI, overwrite=overwrite)
527 # configFile attribute is updated, need to copy it to original.
528 config.configFile = config_to_write.configFile
529 else:
530 config.dumpToUri(configURI, overwrite=overwrite)
532 # Create Registry and populate tables
533 registryConfig = RegistryConfig(config.get("registry"))
534 dimensionConfig = DimensionConfig(dimensionConfig)
535 registry = _RegistryFactory(registryConfig).create_from_config(
536 dimensionConfig=dimensionConfig, butlerRoot=root_uri
537 )
538 registry.close()
540 _LOG.verbose("Wrote new Butler configuration file to %s", configURI)
542 return config
544 @classmethod
545 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
546 """Look up the label in a butler repository index.
548 Parameters
549 ----------
550 label : `str`
551 Label of the Butler repository to look up.
552 return_label : `bool`, optional
553 If ``label`` cannot be found in the repository index (either
554 because the index is not defined or ``label`` is not in the index) and
555 ``return_label`` is `True` then return ``ResourcePath(label)``.
556 If ``return_label`` is `False` (default) then an exception will be
557 raised instead.
559 Returns
560 -------
561 uri : `lsst.resources.ResourcePath`
562 URI to the Butler repository associated with the given label or
563 default value if it is provided.
565 Raises
566 ------
567 KeyError
568 Raised if the label is not found in the index, or if an index
569 is not defined, and ``return_label`` is `False`.
571 Notes
572 -----
573 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
574 information is discovered.
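Examples
--------
A sketch, assuming a label ``dp02`` has been defined in the butler
repository index (see `~lsst.daf.butler.ButlerRepoIndex`)::

    uri = Butler.get_repo_uri("dp02")
    butler = Butler.from_config(uri)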
575 """
576 return ButlerRepoIndex.get_repo_uri(label, return_label)
578 @classmethod
579 def get_known_repos(cls) -> set[str]:
580 """Retrieve the list of known repository labels.
582 Returns
583 -------
584 repos : `set` of `str`
585 All the known labels. Can be empty if no index can be found.
587 Notes
588 -----
589 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
590 information is discovered.
591 """
592 return ButlerRepoIndex.get_known_repos()
594 @classmethod
595 def parse_dataset_uri(cls, uri: str) -> ParsedButlerDatasetURI:
596 """Extract the butler label and dataset ID from a dataset URI.
598 Parameters
599 ----------
600 uri : `str`
601 The dataset URI to parse.
603 Returns
604 -------
605 parsed : `ParsedButlerDatasetURI`
606 The label associated with the butler repository from which this
607 dataset originates and the ID of the dataset.
609 Notes
610 -----
611 Supports dataset URIs of the forms
612 ``ivo://org.rubinobs/usdac/dr1?repo=butler_label&id=UUID`` (see
613 DMTN-302) and ``butler://butler_label/UUID``. The ``butler`` URI is
614 deprecated and cannot include ``/`` in the label string. ``ivo`` URIs
615 can include anything supported by the `Butler` constructor, including
616 paths to repositories and alias labels.
618 ivo://org.rubinobs/dr1?repo=/repo/main&id=UUID
620 will return a label of ``/repo/main``.
622 This method does not attempt to check that the dataset exists in the
623 labeled butler.
625 Since the IVOID can be issued by any publisher to represent a Butler
626 dataset there is no validation of the path or netloc component of the
627 URI. The only requirement is that there are ``id`` and ``repo`` keys
628 in the ``ivo`` URI query component.
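Examples
--------
An illustrative call; the repository label and UUID below are made up::

    parsed = Butler.parse_dataset_uri(
        "ivo://org.rubinobs/dr1?repo=/repo/main"
        "&id=58c8e905-4f5c-4ce3-9b32-46c7e285b7a1"
    )
    print(parsed.label)       # "/repo/main"
    print(parsed.dataset_id)  # uuid.UUID instance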
629 """
630 parsed = urllib.parse.urlparse(uri)
631 parsed_scheme = parsed.scheme.lower()
632 if parsed_scheme == "ivo":
633 # Do not validate the netloc or the path values.
634 qs = urllib.parse.parse_qs(parsed.query)
635 if "repo" not in qs or "id" not in qs:
636 raise ValueError(f"Missing 'repo' and/or 'id' query parameters in IVOID {uri}.")
637 if len(qs["repo"]) != 1 or len(qs["id"]) != 1:
638 raise ValueError(f"Butler IVOID only supports a single value of repo and id, got {uri}")
639 label = qs["repo"][0]
640 id_ = qs["id"][0]
641 elif parsed_scheme == "butler":
642 label = parsed.netloc # Butler label is case sensitive.
643 # Need to strip the leading /.
644 id_ = parsed.path[1:]
645 else:
646 raise ValueError(f"Unrecognized URI scheme: {uri!r}")
647 # Strip trailing/leading whitespace from label.
648 label = label.strip()
649 if not label:
650 raise ValueError(f"No butler repository label found in uri {uri!r}")
651 try:
652 dataset_id = uuid.UUID(hex=id_)
653 except Exception as e:
654 e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}")
655 raise
657 return ParsedButlerDatasetURI(label=label, dataset_id=dataset_id, uri=uri)
659 @classmethod
660 def get_dataset_from_uri(
661 cls, uri: str, factory: LabeledButlerFactoryProtocol | None = None
662 ) -> SpecificButlerDataset:
663 """Get the dataset associated with the given dataset URI.
665 Parameters
666 ----------
667 uri : `str`
668 The URI associated with a dataset.
669 factory : `LabeledButlerFactoryProtocol` or `None`, optional
670 Bound factory function that will be given the butler label
671 and receive a `Butler`. If this is not provided the label
672 will be tried directly.
674 Returns
675 -------
676 result : `SpecificButlerDataset`
677 The butler associated with this URI and the dataset itself.
678 The dataset can be `None` if the UUID is valid but the dataset
679 is not known to this butler.
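Examples
--------
A sketch of typical use; the URI contents are illustrative and the
dataset may legitimately be absent::

    result = Butler.get_dataset_from_uri(
        "ivo://org.rubinobs/dr1?repo=/repo/main"
        "&id=58c8e905-4f5c-4ce3-9b32-46c7e285b7a1"
    )
    if result.dataset is not None:
        data = result.butler.get(result.dataset)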
680 """
681 parsed = cls.parse_dataset_uri(uri)
682 butler: Butler | None = None
683 if factory is not None:
684 # If the label is not recognized, it might be a path.
685 try:
686 butler = factory(parsed.label)
687 except KeyError:
688 pass
689 if butler is None:
690 butler = cls.from_config(parsed.label)
691 return SpecificButlerDataset(butler=butler, dataset=butler.get_dataset(parsed.dataset_id))
693 @abstractmethod
694 def _caching_context(self) -> AbstractContextManager[None]:
695 """Context manager that enables caching."""
696 raise NotImplementedError()
698 @abstractmethod
699 def transaction(self) -> AbstractContextManager[None]:
700 """Context manager supporting `Butler` transactions.
702 Transactions can be nested.
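Examples
--------
A sketch, assuming ``butler`` is a writeable `Butler` and ``obj_a`` and
``obj_b`` are in-memory datasets; the dataset type, data IDs, and run
name are illustrative::

    with butler.transaction():
        butler.put(obj_a, "bias", instrument="HSC", detector=10, run="u/alice/calib")
        butler.put(obj_b, "bias", instrument="HSC", detector=11, run="u/alice/calib")
    # If either put raises, the registry changes from both are rolled back.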
703 """
704 raise NotImplementedError()
706 @abstractmethod
707 def put(
708 self,
709 obj: Any,
710 datasetRefOrType: DatasetRef | DatasetType | str,
711 /,
712 dataId: DataId | None = None,
713 *,
714 run: str | None = None,
715 provenance: DatasetProvenance | None = None,
716 **kwargs: Any,
717 ) -> DatasetRef:
718 """Store and register a dataset.
720 Parameters
721 ----------
722 obj : `object`
723 The dataset.
724 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
725 When `DatasetRef` is provided, ``dataId`` should be `None`.
726 Otherwise the `DatasetType` or name thereof. If a fully resolved
727 `DatasetRef` is given the run and ID are used directly.
728 dataId : `dict` or `DataCoordinate`
729 A `dict` of `Dimension` link name, value pairs that label the
730 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
731 should be provided as the second argument.
732 run : `str`, optional
733 The name of the run the dataset should be added to, overriding
734 ``self.run``. Not used if a resolved `DatasetRef` is provided.
735 provenance : `DatasetProvenance` or `None`, optional
736 Any provenance that should be attached to the serialized dataset.
737 Not supported by all serialization mechanisms.
738 **kwargs
739 Additional keyword arguments used to augment or construct a
740 `DataCoordinate`. See `DataCoordinate.standardize`
741 parameters. Not used if a resolved `DatasetRef` is provided.
743 Returns
744 -------
745 ref : `DatasetRef`
746 A reference to the stored dataset, updated with the correct id if
747 given.
749 Raises
750 ------
751 TypeError
752 Raised if the butler is read-only or if no run has been provided.
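Examples
--------
A sketch of a typical call; the dataset type, data ID values, and run
name are illustrative::

    ref = butler.put(
        exposure,
        "calexp",
        instrument="HSC",
        visit=903334,
        detector=16,
        run="u/alice/DM-50000/a",
    )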
753 """
754 raise NotImplementedError()
756 @abstractmethod
757 def getDeferred(
758 self,
759 datasetRefOrType: DatasetRef | DatasetType | str,
760 /,
761 dataId: DataId | None = None,
762 *,
763 parameters: dict | None = None,
764 collections: Any = None,
765 storageClass: str | StorageClass | None = None,
766 timespan: Timespan | None = None,
767 **kwargs: Any,
768 ) -> DeferredDatasetHandle:
769 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
770 after an immediate registry lookup.
772 Parameters
773 ----------
774 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
775 When `DatasetRef` the `dataId` should be `None`.
776 Otherwise the `DatasetType` or name thereof.
777 dataId : `dict` or `DataCoordinate`, optional
778 A `dict` of `Dimension` link name, value pairs that label the
779 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
780 should be provided as the first argument.
781 parameters : `dict`
782 Additional StorageClass-defined options to control reading,
783 typically used to efficiently read only a subset of the dataset.
784 collections : Any, optional
785 Collections to be searched, overriding ``self.collections``.
786 Can be any of the types supported by the ``collections`` argument
787 to butler construction.
788 storageClass : `StorageClass` or `str`, optional
789 The storage class to be used to override the Python type
790 returned by this method. By default the returned type matches
791 the dataset type definition for this dataset. Specifying a
792 read `StorageClass` can force a different type to be returned.
793 This type must be compatible with the original type.
794 timespan : `Timespan` or `None`, optional
795 A timespan that the validity range of the dataset must overlap.
796 If not provided and this is a calibration dataset type, an attempt
797 will be made to find the timespan from any temporal coordinate
798 in the data ID.
799 **kwargs
800 Additional keyword arguments used to augment or construct a
801 `DataId`. See `DataId` parameters.
803 Returns
804 -------
805 obj : `DeferredDatasetHandle`
806 A handle which can be used to retrieve a dataset at a later time.
808 Raises
809 ------
810 LookupError
811 Raised if no matching dataset exists in the `Registry` or
812 datastore.
813 ValueError
814 Raised if a resolved `DatasetRef` was passed as an input, but it
815 differs from the one found in the registry.
816 TypeError
817 Raised if no collections were provided.
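Examples
--------
A sketch of deferred retrieval; the dataset type and data ID values are
illustrative::

    handle = butler.getDeferred(
        "calexp", instrument="HSC", visit=903334, detector=16
    )
    # The registry lookup has already happened; the datastore read only
    # happens when the handle is used.
    exposure = handle.get()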
818 """
819 raise NotImplementedError()
821 @abstractmethod
822 def get(
823 self,
824 datasetRefOrType: DatasetRef | DatasetType | str,
825 /,
826 dataId: DataId | None = None,
827 *,
828 parameters: dict[str, Any] | None = None,
829 collections: Any = None,
830 storageClass: StorageClass | str | None = None,
831 timespan: Timespan | None = None,
832 **kwargs: Any,
833 ) -> Any:
834 """Retrieve a stored dataset.
836 Parameters
837 ----------
838 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
839 When `DatasetRef` the `dataId` should be `None`.
840 Otherwise the `DatasetType` or name thereof.
841 If a resolved `DatasetRef`, the associated dataset
842 is returned directly without additional querying.
843 dataId : `dict` or `DataCoordinate`
844 A `dict` of `Dimension` link name, value pairs that label the
845 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
846 should be provided as the first argument.
847 parameters : `dict`
848 Additional StorageClass-defined options to control reading,
849 typically used to efficiently read only a subset of the dataset.
850 collections : Any, optional
851 Collections to be searched, overriding ``self.collections``.
852 Can be any of the types supported by the ``collections`` argument
853 to butler construction.
854 storageClass : `StorageClass` or `str`, optional
855 The storage class to be used to override the Python type
856 returned by this method. By default the returned type matches
857 the dataset type definition for this dataset. Specifying a
858 read `StorageClass` can force a different type to be returned.
859 This type must be compatible with the original type.
860 timespan : `Timespan` or `None`, optional
861 A timespan that the validity range of the dataset must overlap.
862 If not provided and this is a calibration dataset type, an attempt
863 will be made to find the timespan from any temporal coordinate
864 in the data ID.
865 **kwargs
866 Additional keyword arguments used to augment or construct a
867 `DataCoordinate`. See `DataCoordinate.standardize`
868 parameters.
870 Returns
871 -------
872 obj : `object`
873 The dataset.
875 Raises
876 ------
877 LookupError
878 Raised if no matching dataset exists in the `Registry`.
879 TypeError
880 Raised if no collections were provided.
882 Notes
883 -----
884 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
885 this method requires that the given data ID include temporal dimensions
886 beyond the dimensions of the dataset type itself, in order to find the
887 dataset with the appropriate validity range. For example, a "bias"
888 dataset with native dimensions ``{instrument, detector}`` could be
889 fetched with a ``{instrument, detector, exposure}`` data ID, because
890 ``exposure`` is a temporal dimension.
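Examples
--------
Illustrative calls; the data ID values and collection name are made up::

    calexp = butler.get(
        "calexp",
        instrument="HSC",
        visit=903334,
        detector=16,
        collections="HSC/runs/RC2",
    )

    # Calibration lookup: the extra ``exposure`` dimension selects the
    # matching validity range, as described in the Notes above.
    bias = butler.get("bias", instrument="HSC", exposure=903334, detector=16)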
891 """
892 raise NotImplementedError()
894 @abstractmethod
895 def getURIs(
896 self,
897 datasetRefOrType: DatasetRef | DatasetType | str,
898 /,
899 dataId: DataId | None = None,
900 *,
901 predict: bool = False,
902 collections: Any = None,
903 run: str | None = None,
904 **kwargs: Any,
905 ) -> DatasetRefURIs:
906 """Return the URIs associated with the dataset.
908 Parameters
909 ----------
910 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
911 When `DatasetRef` the `dataId` should be `None`.
912 Otherwise the `DatasetType` or name thereof.
913 dataId : `dict` or `DataCoordinate`
914 A `dict` of `Dimension` link name, value pairs that label the
915 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
916 should be provided as the first argument.
917 predict : `bool`
918 If `True`, allow URIs to be returned of datasets that have not
919 been written.
920 collections : Any, optional
921 Collections to be searched, overriding ``self.collections``.
922 Can be any of the types supported by the ``collections`` argument
923 to butler construction.
924 run : `str`, optional
925 Run to use for predictions, overriding ``self.run``.
926 **kwargs
927 Additional keyword arguments used to augment or construct a
928 `DataCoordinate`. See `DataCoordinate.standardize`
929 parameters.
931 Returns
932 -------
933 uris : `DatasetRefURIs`
934 The URI to the primary artifact associated with this dataset (if
935 the dataset was disassembled within the datastore this may be
936 `None`), and the URIs to any components associated with the dataset
937 artifact (can be empty if there are no components).
938 """
939 raise NotImplementedError()
941 def getURI(
942 self,
943 datasetRefOrType: DatasetRef | DatasetType | str,
944 /,
945 dataId: DataId | None = None,
946 *,
947 predict: bool = False,
948 collections: Any = None,
949 run: str | None = None,
950 **kwargs: Any,
951 ) -> ResourcePath:
952 """Return the URI to the Dataset.
954 Parameters
955 ----------
956 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
957 When `DatasetRef` the `dataId` should be `None`.
958 Otherwise the `DatasetType` or name thereof.
959 dataId : `dict` or `DataCoordinate`
960 A `dict` of `Dimension` link name, value pairs that label the
961 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
962 should be provided as the first argument.
963 predict : `bool`
964 If `True`, allow URIs to be returned of datasets that have not
965 been written.
966 collections : Any, optional
967 Collections to be searched, overriding ``self.collections``.
968 Can be any of the types supported by the ``collections`` argument
969 to butler construction.
970 run : `str`, optional
971 Run to use for predictions, overriding ``self.run``.
972 **kwargs
973 Additional keyword arguments used to augment or construct a
974 `DataCoordinate`. See `DataCoordinate.standardize`
975 parameters.
977 Returns
978 -------
979 uri : `lsst.resources.ResourcePath`
980 URI pointing to the Dataset within the datastore. If the
981 Dataset does not exist in the datastore, and if ``predict`` is
982 `True`, the URI will be a prediction and will include a URI
983 fragment "#predicted".
984 If the datastore does not have entities that relate well
985 to the concept of a URI the returned URI string will be
986 descriptive. The returned URI is not guaranteed to be obtainable.
988 Raises
989 ------
990 LookupError
991 Raised if a URI has been requested for a dataset that does not
992 exist and guessing is not allowed.
993 ValueError
994 Raised if a resolved `DatasetRef` was passed as an input, but it
995 differs from the one found in the registry.
996 TypeError
997 Raised if no collections were provided.
998 RuntimeError
999 Raised if a URI is requested for a dataset that consists of
1000 multiple artifacts.
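Examples
--------
An illustrative lookup; the dataset type and data ID values are made up::

    uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=16)
    print(uri.geturl())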
1001 """
1002 primary, components = self.getURIs(
1003 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
1004 )
1006 if primary is None or components:
1007 raise RuntimeError(
1008 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
1009 "Use Butler.getURIs() instead."
1010 )
1011 return primary
1013 @abstractmethod
1014 def get_dataset_type(self, name: str) -> DatasetType:
1015 """Get the `DatasetType`.
1017 Parameters
1018 ----------
1019 name : `str`
1020 Name of the type.
1022 Returns
1023 -------
1024 type : `DatasetType`
1025 The `DatasetType` associated with the given name.
1027 Raises
1028 ------
1029 lsst.daf.butler.MissingDatasetTypeError
1030 Raised if the requested dataset type has not been registered.
1032 Notes
1033 -----
1034 This method handles component dataset types automatically, though most
1035 other operations do not.
1036 """
1037 raise NotImplementedError()
1039 @abstractmethod
1040 def get_dataset(
1041 self,
1042 id: DatasetId | str,
1043 *,
1044 storage_class: str | StorageClass | None = None,
1045 dimension_records: bool = False,
1046 datastore_records: bool = False,
1047 ) -> DatasetRef | None:
1048 """Retrieve a Dataset entry.
1050 Parameters
1051 ----------
1052 id : `DatasetId` or `str`
1053 The unique identifier for the dataset, as an instance of
1054 `uuid.UUID` or a string containing a hexadecimal number.
1055 storage_class : `str` or `StorageClass` or `None`
1056 A storage class to use when creating the returned entry. If given
1057 it must be compatible with the default storage class.
1058 dimension_records : `bool`, optional
1059 If `True` the ref will be expanded and contain dimension records.
1060 datastore_records : `bool`, optional
1061 If `True` the ref will contain associated datastore records.
1063 Returns
1064 -------
1065 ref : `DatasetRef` or `None`
1066 A ref to the Dataset, or `None` if no matching Dataset
1067 was found.
1068 """
1069 raise NotImplementedError()
1071 @abstractmethod
1072 def get_many_datasets(self, ids: Iterable[DatasetId | str]) -> list[DatasetRef]:
1073 """Retrieve a list of dataset entries.
1075 Parameters
1076 ----------
1077 ids : `~collections.abc.Iterable` [ `DatasetId` or `str` ]
1078 The unique identifiers for the datasets, as instances of
1079 `uuid.UUID` or strings containing a hexadecimal number.
1081 Returns
1082 -------
1083 refs : `list` [ `DatasetRef` ]
1084 A list containing a `DatasetRef` for each of the given dataset IDs.
1085 If a dataset was not found, no error is thrown -- it is just not
1086 included in the list. The returned datasets are in no particular
1087 order.
1088 """
1089 raise NotImplementedError()
1091 @abstractmethod
1092 def find_dataset(
1093 self,
1094 dataset_type: DatasetType | str,
1095 data_id: DataId | None = None,
1096 *,
1097 collections: str | Sequence[str] | None = None,
1098 timespan: Timespan | None = None,
1099 storage_class: str | StorageClass | None = None,
1100 dimension_records: bool = False,
1101 datastore_records: bool = False,
1102 **kwargs: Any,
1103 ) -> DatasetRef | None:
1104 """Find a dataset given its `DatasetType` and data ID.
1106 This can be used to obtain a `DatasetRef` that permits the dataset to
1107 be read from a `Datastore`. If the dataset is a component and can not
1108 be found using the provided dataset type, a dataset ref for the parent
1109 will be returned instead but with the correct dataset type.
1111 Parameters
1112 ----------
1113 dataset_type : `DatasetType` or `str`
1114 A `DatasetType` or the name of one. If this is a `DatasetType`
1115 instance, its storage class will be respected and propagated to
1116 the output, even if it differs from the dataset type definition
1117 in the registry, as long as the storage classes are convertible.
1118 data_id : `dict` or `DataCoordinate`, optional
1119 A `dict`-like object containing the `Dimension` links that identify
1120 the dataset within a collection. If it is a `dict` the dataId
1121 can include dimension record values such as ``day_obs`` and
1122 ``seq_num`` or ``full_name`` that can be used to derive the
1123 primary dimension.
1124 collections : `str` or `list` [`str`], optional
1125 An ordered list of collections to search for the dataset.
1126 Defaults to ``self.defaults.collections``.
1127 timespan : `Timespan`, optional
1128 A timespan that the validity range of the dataset must overlap.
1129 If not provided, any `~CollectionType.CALIBRATION` collections
1130 matched by the ``collections`` argument will not be searched.
1131 storage_class : `str` or `StorageClass` or `None`
1132 A storage class to use when creating the returned entry. If given
1133 it must be compatible with the default storage class.
1134 dimension_records : `bool`, optional
1135 If `True` the ref will be expanded and contain dimension records.
1136 datastore_records : `bool`, optional
1137 If `True` the ref will contain associated datastore records.
1138 **kwargs
1139 Additional keyword arguments passed to
1140 `DataCoordinate.standardize` to convert ``dataId`` to a true
1141 `DataCoordinate` or augment an existing one. This can also include
1142 dimension record metadata that can be used to derive a primary
1143 dimension value.
1145 Returns
1146 -------
1147 ref : `DatasetRef` or `None`
1148 A reference to the dataset, or `None` if no matching Dataset
1149 was found.
1151 Raises
1152 ------
1153 lsst.daf.butler.NoDefaultCollectionError
1154 Raised if ``collections`` is `None` and
1155 ``self.collections`` is `None`.
1156 LookupError
1157 Raised if one or more data ID keys are missing.
1158 lsst.daf.butler.MissingDatasetTypeError
1159 Raised if the dataset type does not exist.
1160 lsst.daf.butler.MissingCollectionError
1161 Raised if any of ``collections`` does not exist in the registry.
1163 Notes
1164 -----
1165 This method simply returns `None` and does not raise an exception even
1166 when the set of collections searched is intrinsically incompatible with
1167 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
1168 only `~CollectionType.CALIBRATION` collections are being searched.
1169 This may make it harder to debug some lookup failures, but the behavior
1170 is intentional; we consider it more important that failed searches are
1171 reported consistently, regardless of the reason, and that adding
1172 additional collections that do not contain a match to the search path
1173 never changes the behavior.
1175 This method handles component dataset types automatically, though most
1176 other query operations do not.
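Examples
--------
A sketch of a lookup by dataset type and data ID; the values and the
collection name are illustrative::

    ref = butler.find_dataset(
        "calexp",
        instrument="HSC",
        visit=903334,
        detector=16,
        collections="HSC/runs/RC2",
    )
    if ref is None:
        print("no matching dataset")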
1177 """
1178 raise NotImplementedError()
1180 @abstractmethod
1181 def retrieve_artifacts_zip(
1182 self,
1183 refs: Iterable[DatasetRef],
1184 destination: ResourcePathExpression,
1185 overwrite: bool = True,
1186 ) -> ResourcePath:
1187 """Retrieve artifacts from a Butler and place in ZIP file.
1189 Parameters
1190 ----------
1191 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1192 The datasets to be included in the zip file.
1193 destination : `lsst.resources.ResourcePathExpression`
1194 Directory to write the new ZIP file. This directory will
1195 also be used as a staging area for the datasets being downloaded
1196 from the datastore.
1197 overwrite : `bool`, optional
1198 If `False` the output Zip will not be written if a file of the
1199 same name is already present in ``destination``.
1201 Returns
1202 -------
1203 zip_file : `lsst.resources.ResourcePath`
1204 The path to the new ZIP file.
1206 Raises
1207 ------
1208 ValueError
1209 Raised if there are no refs to retrieve.
1210 """
1211 raise NotImplementedError()
1213 @abstractmethod
1214 def retrieveArtifacts(
1215 self,
1216 refs: Iterable[DatasetRef],
1217 destination: ResourcePathExpression,
1218 transfer: str = "auto",
1219 preserve_path: bool = True,
1220 overwrite: bool = False,
1221 ) -> list[ResourcePath]:
1222 """Retrieve the artifacts associated with the supplied refs.
1224 Parameters
1225 ----------
1226 refs : `~collections.abc.Iterable` of `DatasetRef`
1227 The datasets for which artifacts are to be retrieved.
1228 A single ref can result in multiple artifacts. The refs must
1229 be resolved.
1230 destination : `lsst.resources.ResourcePath` or `str`
1231 Location to write the artifacts.
1232 transfer : `str`, optional
1233 Method to use to transfer the artifacts. Must be one of the options
1234 supported by `~lsst.resources.ResourcePath.transfer_from`.
1235 "move" is not allowed.
1236 preserve_path : `bool`, optional
1237 If `True` the full path of the artifact within the datastore
1238 is preserved. If `False` the final file component of the path
1239 is used.
1240 overwrite : `bool`, optional
1241 If `True` allow transfers to overwrite existing files at the
1242 destination.
1244 Returns
1245 -------
1246 targets : `list` of `lsst.resources.ResourcePath`
1247 URIs of file artifacts in destination location. Order is not
1248 preserved.
1250 Notes
1251 -----
1252 For non-file datastores the artifacts written to the destination
1253 may not match the representation inside the datastore. For example
1254 a hierarchical data structure in a NoSQL database may well be stored
1255 as a JSON file.
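Examples
--------
A sketch of copying file artifacts out of the datastore, assuming ``refs``
is an iterable of resolved `DatasetRef` objects; the destination path is
illustrative::

    paths = butler.retrieveArtifacts(refs, "/tmp/export", transfer="copy")
    for path in paths:
        print(path.geturl())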
1256 """
1257 raise NotImplementedError()
1259 @abstractmethod
1260 def exists(
1261 self,
1262 dataset_ref_or_type: DatasetRef | DatasetType | str,
1263 /,
1264 data_id: DataId | None = None,
1265 *,
1266 full_check: bool = True,
1267 collections: Any = None,
1268 **kwargs: Any,
1269 ) -> DatasetExistence:
1270 """Indicate whether a dataset is known to Butler registry and
1271 datastore.
1273 Parameters
1274 ----------
1275 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str`
1276 When `DatasetRef` the `dataId` should be `None`.
1277 Otherwise the `DatasetType` or name thereof.
1278 data_id : `dict` or `DataCoordinate`
1279 A `dict` of `Dimension` link name, value pairs that label the
1280 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1281 should be provided as the first argument.
1282 full_check : `bool`, optional
1283 If `True`, a check will be made for the actual existence of a
1284 dataset artifact. This will involve additional overhead due to
1285 the need to query an external system. If `False`, this check will
1286 be omitted, and the registry and datastore will solely be asked
1287 if they know about the dataset but no direct check for the
1288 artifact will be performed.
1289 collections : Any, optional
1290 Collections to be searched, overriding ``self.collections``.
1291 Can be any of the types supported by the ``collections`` argument
1292 to butler construction.
1293 **kwargs
1294 Additional keyword arguments used to augment or construct a
1295 `DataCoordinate`. See `DataCoordinate.standardize`
1296 parameters.
1298 Returns
1299 -------
1300 existence : `DatasetExistence`
1301 Object indicating whether the dataset is known to registry and
1302 datastore. Evaluates to `True` if the dataset is present and known
1303 to both.
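Examples
--------
A sketch of an existence check; the dataset type and data ID values are
illustrative::

    existence = butler.exists("calexp", instrument="HSC", visit=903334, detector=16)
    if existence:
        print("dataset is known to registry and datastore")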
1304 """
1305 raise NotImplementedError()
1307 @abstractmethod
1308 def _exists_many(
1309 self,
1310 refs: Iterable[DatasetRef],
1311 /,
1312 *,
1313 full_check: bool = True,
1314 ) -> dict[DatasetRef, DatasetExistence]:
1315 """Indicate whether multiple datasets are known to Butler registry and
1316 datastore.
1318 This is an experimental API that may change at any moment.
1320 Parameters
1321 ----------
1322 refs : `~collections.abc.Iterable` of `DatasetRef`
1323 The datasets to be checked.
1324 full_check : `bool`, optional
1325 If `True`, a check will be made for the actual existence of each
1326 dataset artifact. This will involve additional overhead due to
1327 the need to query an external system. If `False`, this check will
1328 be omitted, and the registry and datastore will solely be asked
1329 if they know about the dataset(s) but no direct check for the
1330 artifact(s) will be performed.
1332 Returns
1333 -------
1334 existence : `dict` [`DatasetRef`, `DatasetExistence`]
1335 Mapping from the given dataset refs to an enum indicating the
1336 status of the dataset in registry and datastore.
1337 Each value evaluates to `True` if the dataset is present and known
1338 to both.
1339 """
1340 raise NotImplementedError()
1342 @abstractmethod
1343 def removeRuns(
1344 self,
1345 names: Iterable[str],
1346 unstore: bool | type[_DeprecatedDefault] = _DeprecatedDefault,
1347 *,
1348 unlink_from_chains: bool = False,
1349 ) -> None:
1350 """Remove one or more `~CollectionType.RUN` collections and the
1351 datasets within them.
1353 Parameters
1354 ----------
1355 names : `~collections.abc.Iterable` [ `str` ]
1356 The names of the collections to remove.
1357 unstore : `bool`, optional
1358 If `True` (default), delete datasets from all datastores in which
1359 they are present, and attempt to rollback the registry deletions if
1360 datastore deletions fail (which may not always be possible). If
1361 `False`, datastore records for these datasets are still removed,
1362 but any artifacts (e.g. files) will not be. This parameter is now
1363 deprecated and no longer has any effect. Files are always deleted
1364 from datastores unless they were ingested using full URIs.
1365 unlink_from_chains : `bool`, optional
1366 If `True` remove the RUN collection from any chains prior to
1367 removing the RUN. If `False` the removal will fail if any chains
1368 still refer to the RUN.
1370 Raises
1371 ------
1372 TypeError
1373 Raised if one or more collections are not of type
1374 `~CollectionType.RUN`.
1375 """
1376 raise NotImplementedError()
1378 @abstractmethod
1379 def ingest(
1380 self,
1381 *datasets: FileDataset,
1382 transfer: str | None = "auto",
1383 record_validation_info: bool = True,
1384 skip_existing: bool = False,
1385 ) -> None:
1386 """Store and register one or more datasets that already exist on disk.
1388 Parameters
1389 ----------
1390 *datasets : `FileDataset`
1391 Each positional argument is a struct containing information about
1392 a file to be ingested, including its URI (either absolute or
1393 relative to the datastore root, if applicable), a resolved
1394 `DatasetRef`, and optionally a formatter class or its
1395 fully-qualified string name. If a formatter is not provided, the
1396 formatter that would be used for `put` is assumed. On successful
1397 ingest all `FileDataset.formatter` attributes will be set to the
1398 formatter class used. `FileDataset.path` attributes may be modified
1399 to put paths in whatever the datastore considers a standardized
1400 form.
1401 transfer : `str`, optional
1402 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1403 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1404 transfer the file.
1405 record_validation_info : `bool`, optional
1406 If `True`, the default, the datastore can record validation
1407 information associated with the file. If `False` the datastore
1408 will not attempt to track any information such as checksums
1409 or file sizes. This can be useful if such information is tracked
1410 in an external system or if the file is to be compressed in place.
1411 It is up to the datastore whether this parameter is relevant.
1412 skip_existing : `bool`, optional
1413 If `True`, a dataset will not be ingested if a dataset with the
1414 same dataset ID already exists in the datastore.
1415 If `False` (the default), a `ConflictingDefinitionError` will be
1416 raised if any datasets with the same dataset ID already exist
1417 in the datastore.
1419 Returns
1420 -------
1421 None
1423 Raises
1424 ------
1425 TypeError
1426 Raised if the butler is read-only or if no run was provided.
1427 NotImplementedError
1428 Raised if the `Datastore` does not support the given transfer mode.
1429 DatasetTypeNotSupportedError
1430 Raised if one or more files to be ingested have a dataset type that
1431 is not supported by the `Datastore`.
1432 FileNotFoundError
1433 Raised if one of the given files does not exist.
1434 FileExistsError
1435 Raised if transfer is not `None` but the (internal) location the
1436 file would be moved to is already occupied.
1437 ConflictingDefinitionError
1438 Raised if a dataset already exists in the repository and
1439 ``skip_existing`` is `False`.
1441 Notes
1442 -----
1443 This operation is not fully exception safe: if a database operation
1444 fails, the given `FileDataset` instances may be only partially updated.
1446 It is atomic in terms of database operations (they will either all
1447 succeed or all fail) providing the database engine implements
1448 transactions correctly. It will attempt to be atomic in terms of
1449 filesystem operations as well, but this cannot be implemented
1450 rigorously for most datastores.
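Examples
--------
A sketch of ingesting a single existing file, assuming ``ref`` is a
resolved `DatasetRef` obtained elsewhere; the file path is illustrative::

    from lsst.daf.butler import FileDataset

    dataset = FileDataset(path="/data/raw/HSC-903334-016.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy")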
1451 """
1452 raise NotImplementedError()
1454 @abstractmethod
1455 def ingest_zip(
1456 self,
1457 zip_file: ResourcePathExpression,
1458 transfer: str = "auto",
1459 *,
1460 transfer_dimensions: bool = False,
1461 dry_run: bool = False,
1462 skip_existing: bool = False,
1463 ) -> None:
1464 """Ingest a Zip file into this butler.
1466 The Zip file must have been created by `retrieve_artifacts_zip`.
1468 Parameters
1469 ----------
1470 zip_file : `lsst.resources.ResourcePathExpression`
1471 Path to the Zip file.
1472 transfer : `str`, optional
1473 Method to use to transfer the Zip into the datastore.
1474 transfer_dimensions : `bool`, optional
1475 If `True`, dimension record data associated with the new datasets
1476 will be transferred from the Zip file, if present.
1477 dry_run : `bool`, optional
1478 If `True` the ingest will be processed without any modifications
1479 made to the target butler and as if the target butler did not
1480 have any of the datasets.
1481 skip_existing : `bool`, optional
1482 If `True`, a zip will not be ingested if the dataset entries listed
1483 in the index (matched by dataset ID) already exist in the butler.
1484 If `False` (the default), a `ConflictingDefinitionError` will be
1485 raised if any datasets with the same dataset ID already exist
1486 in the repository. If, somehow, some datasets are known to the
1487 butler and some are not, this is currently treated as an error
1488 rather than attempting a partial ingest.
1490 Notes
1491 -----
1492 Run collections and dataset types are created as needed.
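Examples
--------
A minimal sketch; the Zip file name is illustrative and the file must
have been written by `retrieve_artifacts_zip`::
    butler.ingest_zip("artifacts.zip", transfer="copy")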
1493 """
1494 raise NotImplementedError()
1496 @abstractmethod
1497 def export(
1498 self,
1499 *,
1500 directory: str | None = None,
1501 filename: str | None = None,
1502 format: str | None = None,
1503 transfer: str | None = None,
1504 ) -> AbstractContextManager[RepoExportContext]:
1505 """Export datasets from the repository represented by this `Butler`.
1507 This method is a context manager that returns a helper object
1508 (`RepoExportContext`) that is used to indicate what information from
1509 the repository should be exported.
1511 Parameters
1512 ----------
1513 directory : `str`, optional
1514 Directory dataset files should be written to if ``transfer`` is not
1515 `None`.
1516 filename : `str`, optional
1517 Name for the file that will include database information associated
1518 with the exported datasets. If this is not an absolute path and
1519 ``directory`` is not `None`, it will be written to ``directory``
1520 instead of the current working directory. Defaults to
1521 "export.{format}".
1522 format : `str`, optional
1523 File format for the database information file. If `None`, the
1524 extension of ``filename`` will be used.
1525 transfer : `str`, optional
1526 Transfer mode passed to `Datastore.export`.
1528 Raises
1529 ------
1530 TypeError
1531 Raised if the set of arguments passed is inconsistent.
1533 Examples
1534 --------
1535 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1536 methods are used to provide the iterables over data IDs and/or datasets
1537 to be exported::
1539 with butler.export(filename="exports.yaml") as export:
1540 # Export all flats, but none of the dimension element rows
1541 # (i.e. data ID information) associated with them.
1542 export.saveDatasets(
1543 butler.registry.queryDatasets("flat"), elements=()
1544 )
1545 # Export all datasets that start with "deepCoadd_" and all of
1546 # their associated data ID information.
1547 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1548 """
1549 raise NotImplementedError()
1551 @abstractmethod
1552 def import_(
1553 self,
1554 *,
1555 directory: ResourcePathExpression | None = None,
1556 filename: ResourcePathExpression | TextIO | None = None,
1557 format: str | None = None,
1558 transfer: str | None = None,
1559 skip_dimensions: set | None = None,
1560 record_validation_info: bool = True,
1561 without_datastore: bool = False,
1562 ) -> None:
1563 """Import datasets into this repository that were exported from a
1564 different butler repository via `~lsst.daf.butler.Butler.export`.
1566 Parameters
1567 ----------
1568 directory : `~lsst.resources.ResourcePathExpression`, optional
1569 Directory containing dataset files to import from. If `None`,
1570 ``filename`` and all dataset file paths specified therein must
1571 be absolute.
1572 filename : `~lsst.resources.ResourcePathExpression` or `typing.TextIO`
1573 A stream or name of file that contains database information
1574 associated with the exported datasets, typically generated by
1575 `~lsst.daf.butler.Butler.export`. If this is a string (name) or
1576 `~lsst.resources.ResourcePath` and is not an absolute path,
1577 it will first be looked for relative to ``directory`` and if not
1578 found there it will be looked for in the current working
1579 directory. Defaults to "export.{format}".
1580 format : `str`, optional
1581 File format for ``filename``. If `None`, the extension of
1582 ``filename`` will be used.
1583 transfer : `str`, optional
1584 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1585 skip_dimensions : `set`, optional
1586 Names of dimensions that should be skipped and not imported.
1587 record_validation_info : `bool`, optional
1588 If `True`, the default, the datastore can record validation
1589 information associated with the file. If `False` the datastore
1590 will not attempt to track any information such as checksums
1591 or file sizes. This can be useful if such information is tracked
1592 in an external system or if the file is to be compressed in place.
1593 It is up to the datastore whether this parameter is relevant.
1594 without_datastore : `bool`, optional
1595 If `True` only registry records will be imported and the datastore
1596 will be ignored.
1598 Raises
1599 ------
1600 TypeError
1601 Raised if the set of arguments passed is inconsistent, or if the
1602 butler is read-only.
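Examples
--------
A minimal sketch; the directory and file names are illustrative and
assume a previous `~lsst.daf.butler.Butler.export` wrote them::
    butler.import_(
        directory="/path/to/exported/data",
        filename="export.yaml",
        transfer="copy",
    )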
1603 """
1604 raise NotImplementedError()
1606 @abstractmethod
1607 def transfer_dimension_records_from(
1608 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
1609 ) -> None:
1610 """Transfer dimension records to this Butler from another Butler.
1612 Parameters
1613 ----------
1614 source_butler : `LimitedButler` or `Butler`
1615 Butler from which the records are to be transferred. If data IDs
1616 in ``source_refs`` are not expanded then this has to be a full
1617 `Butler` whose registry will be used to expand data IDs. If the
1618 source refs contain coordinates that are used to populate other
1619 records then this will also need to be a full `Butler`.
1620 source_refs : `~collections.abc.Iterable` [`DatasetRef` |\
1621 `DataCoordinate`]
1622 Datasets or data IDs defined in the source butler whose dimension
1623 records should be transferred to this butler.
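Examples
--------
A minimal sketch, assuming ``source_butler`` is another butler and
``refs`` is an iterable of `DatasetRef` instances obtained from it::
    butler.transfer_dimension_records_from(source_butler, refs)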
1624 """
1625 raise NotImplementedError()
1627 @abstractmethod
1628 def transfer_from(
1629 self,
1630 source_butler: LimitedButler,
1631 source_refs: Iterable[DatasetRef],
1632 transfer: str = "auto",
1633 skip_missing: bool = True,
1634 register_dataset_types: bool = False,
1635 transfer_dimensions: bool = False,
1636 dry_run: bool = False,
1637 ) -> Collection[DatasetRef]:
1638 """Transfer datasets to this Butler from a run in another Butler.
1640 Parameters
1641 ----------
1642 source_butler : `LimitedButler`
1643 Butler from which the datasets are to be transferred. If data IDs
1644 in ``source_refs`` are not expanded then this has to be a full
1645 `Butler` whose registry will be used to expand data IDs.
1646 source_refs : `~collections.abc.Iterable` of `DatasetRef`
1647 Datasets defined in the source butler that should be transferred to
1648 this butler. In most circumstances, ``transfer_from`` is faster if
1649 the dataset refs are expanded.
1650 transfer : `str`, optional
1651 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1652 skip_missing : `bool`
1653 If `True`, datasets with no datastore artifact associated with
1654 them are not transferred. If `False` a registry entry will be
1655 created even if no datastore record is created (and so will
1656 look equivalent to the dataset being unstored).
1657 register_dataset_types : `bool`
1658 If `True` any missing dataset types are registered. Otherwise
1659 an exception is raised.
1660 transfer_dimensions : `bool`, optional
1661 If `True`, dimension record data associated with the new datasets
1662 will be transferred.
1663 dry_run : `bool`, optional
1664 If `True` the transfer will be processed without any modifications
1665 made to the target butler and as if the target butler did not
1666 have any of the datasets.
1668 Returns
1669 -------
1670 refs : `list` of `DatasetRef`
1671 The refs added to this Butler.
1673 Notes
1674 -----
1675 A datastore artifact must exist for a dataset to be transferred,
1676 but a missing artifact is not treated as an error.
1678 Datasets that already exist in this run will be skipped.
1680 The datasets are imported as part of a transaction, although
1681 dataset types are registered before the transaction is started.
1682 This means that it is possible for a dataset type to be registered
1683 even though transfer has failed.
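Examples
--------
A minimal sketch, assuming ``source_butler`` is another butler and
``source_refs`` holds `DatasetRef` instances defined in it::
    transferred = butler.transfer_from(
        source_butler,
        source_refs,
        transfer="copy",
        register_dataset_types=True,
    )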
1684 """
1685 raise NotImplementedError()
1687 @abstractmethod
1688 def validateConfiguration(
1689 self,
1690 logFailures: bool = False,
1691 datasetTypeNames: Iterable[str] | None = None,
1692 ignore: Iterable[str] | None = None,
1693 ) -> None:
1694 """Validate butler configuration.
1696 Checks that each `DatasetType` can be stored in the `Datastore`.
1698 Parameters
1699 ----------
1700 logFailures : `bool`, optional
1701 If `True`, output a log message for every validation error
1702 detected.
1703 datasetTypeNames : `~collections.abc.Iterable` of `str`, optional
1704 The `DatasetType` names that should be checked. This allows
1705 only a subset to be selected.
1706 ignore : `~collections.abc.Iterable` of `str`, optional
1707 Names of DatasetTypes to skip over. This can be used to skip
1708 known problems. If a named `DatasetType` corresponds to a
1709 composite, all components of that `DatasetType` will also be
1710 ignored.
1712 Raises
1713 ------
1714 ButlerValidationError
1715 Raised if there is some inconsistency with how this Butler
1716 is configured.
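Examples
--------
A minimal sketch::
    butler.validateConfiguration(logFailures=True)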
1717 """
1718 raise NotImplementedError()
1720 @property
1721 @abstractmethod
1722 def collection_chains(self) -> ButlerCollections:
1723 """Object with methods for modifying collection chains
1724 (`~lsst.daf.butler.ButlerCollections`).
1726 Deprecated. Replaced with ``collections`` property.
1727 """
1728 raise NotImplementedError()
1730 @property
1731 @abstractmethod
1732 def collections(self) -> ButlerCollections:
1733 """Object with methods for modifying and querying collections
1734 (`~lsst.daf.butler.ButlerCollections`).
1736 Use of this object is preferred over `registry` wherever possible.
1737 """
1738 raise NotImplementedError()
1740 @property
1741 @abstractmethod
1742 def run(self) -> str | None:
1743 """Name of the run this butler writes outputs to by default (`str` or
1744 `None`).
1745 """
1746 raise NotImplementedError()
1748 @property
1749 @abstractmethod
1750 def registry(self) -> Registry:
1751 """The object that manages dataset metadata and relationships
1752 (`Registry`).
1754 Many operations that don't involve reading or writing butler datasets
1755 are accessible only via `Registry` methods. Eventually these methods
1756 will be replaced by equivalent `Butler` methods.
1757 """
1758 raise NotImplementedError()
1760 @abstractmethod
1761 def query(self) -> AbstractContextManager[Query]:
1762 """Context manager returning a `.queries.Query` object used for
1763 construction and execution of complex queries.
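A minimal sketch; the instrument name is illustrative::
    with butler.query() as q:
        data_ids = list(
            q.data_ids(["visit", "detector"]).where("instrument = 'LSSTCam'")
        )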
1764 """
1765 raise NotImplementedError()
1767 def query_data_ids(
1768 self,
1769 dimensions: DimensionGroup | Iterable[str] | str,
1770 *,
1771 data_id: DataId | None = None,
1772 where: str = "",
1773 bind: Mapping[str, Any] | None = None,
1774 with_dimension_records: bool = False,
1775 order_by: Iterable[str] | str | None = None,
1776 limit: int | None = -20_000,
1777 explain: bool = True,
1778 **kwargs: Any,
1779 ) -> list[DataCoordinate]:
1780 """Query for data IDs matching user-provided criteria.
1782 Parameters
1783 ----------
1784 dimensions : `DimensionGroup`, `str`, or \
1785 `~collections.abc.Iterable` [`str`]
1786 The dimensions of the data IDs to yield, as either `DimensionGroup`
1787 instances or `str`. Will be automatically expanded to a complete
1788 `DimensionGroup`.
1789 data_id : `dict` or `DataCoordinate`, optional
1790 A data ID whose key-value pairs are used as equality constraints
1791 in the query.
1792 where : `str`, optional
1793 A string expression similar to a SQL WHERE clause. May involve
1794 any column of a dimension table or (as a shortcut for the primary
1795 key column of a dimension table) dimension name. See
1796 :ref:`daf_butler_dimension_expressions` for more information.
1797 bind : `~collections.abc.Mapping`, optional
1798 Mapping containing literal values that should be injected into the
1799 ``where`` expression, keyed by the identifiers they replace.
1800 Values of collection type can be expanded in some cases; see
1801 :ref:`daf_butler_dimension_expressions_identifiers` for more
1802 information.
1803 with_dimension_records : `bool`, optional
1804 If `True` (default is `False`) then returned data IDs will have
1805 dimension records.
1806 order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
1807 Names of the columns/dimensions to use for ordering returned data
1808 IDs. Column name can be prefixed with minus (``-``) to use
1809 descending ordering.
1810 limit : `int` or `None`, optional
1811 Upper limit on the number of returned records. `None` can be used
1812 if no limit is wanted. A limit of ``0`` means that the query will
1813 be executed and validated but no results will be returned. In this
1814 case there will be no exception even if ``explain`` is `True`.
1815 If a negative value is given a warning will be issued if the number
1816 of results is capped by that limit.
1817 explain : `bool`, optional
1818 If `True` (default) then `EmptyQueryResultError` exception is
1819 raised when resulting list is empty. The exception contains
1820 non-empty list of strings explaining possible causes for empty
1821 result.
1822 **kwargs
1823 Additional keyword arguments are forwarded to
1824 `DataCoordinate.standardize` when processing the ``data_id``
1825 argument (and may be used to provide a constraining data ID even
1826 when the ``data_id`` argument is `None`).
1828 Returns
1829 -------
1830 dataIds : `list` [`DataCoordinate`]
1831 Data IDs matching the given query parameters. These are always
1832 guaranteed to identify all dimensions (`DataCoordinate.hasFull`
1833 returns `True`).
1835 Raises
1836 ------
1837 lsst.daf.butler.registry.DataIdError
1838 Raised when ``data_id`` or keyword arguments specify unknown
1839 dimensions or values, or when they contain inconsistent values.
1840 lsst.daf.butler.registry.UserExpressionError
1841 Raised when ``where`` expression is invalid.
1842 lsst.daf.butler.EmptyQueryResultError
1843 Raised when query generates empty result and ``explain`` is set to
1844 `True`.
1845 TypeError
1846 Raised when the arguments are incompatible.
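Examples
--------
A minimal sketch; the instrument name and exposure constraint are
illustrative::
    data_ids = butler.query_data_ids(
        ["exposure", "detector"],
        where="instrument = 'LSSTCam' AND exposure > 100",
    )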
1847 """
1848 if data_id is None:
1849 data_id = DataCoordinate.make_empty(self.dimensions)
1850 if order_by is None:
1851 order_by = []
1852 query_limit = limit
1853 warn_limit = False
1854 if limit is not None and limit < 0:
1855 query_limit = abs(limit) + 1
1856 warn_limit = True
1857 with self.query() as query:
1858 result = (
1859 query.data_ids(dimensions)
1860 .where(data_id, where, bind=bind, **kwargs)
1861 .order_by(*ensure_iterable(order_by))
1862 .limit(query_limit)
1863 )
1864 if with_dimension_records:
1865 result = result.with_dimension_records()
1866 data_ids = list(result)
1867 if warn_limit and len(data_ids) == query_limit:
1868 # We asked for one too many so must remove that from the list.
1869 data_ids.pop(-1)
1870 assert limit is not None # For mypy.
1871 _LOG.warning("More data IDs are available than the requested limit of %d.", abs(limit))
1872 if explain and (limit is None or limit != 0) and not data_ids:
1873 raise EmptyQueryResultError(list(result.explain_no_results()))
1874 return data_ids
1876 def query_datasets(
1877 self,
1878 dataset_type: str | DatasetType,
1879 collections: str | Iterable[str] | None = None,
1880 *,
1881 find_first: bool = True,
1882 data_id: DataId | None = None,
1883 where: str = "",
1884 bind: Mapping[str, Any] | None = None,
1885 with_dimension_records: bool = False,
1886 order_by: Iterable[str] | str | None = None,
1887 limit: int | None = -20_000,
1888 explain: bool = True,
1889 **kwargs: Any,
1890 ) -> list[DatasetRef]:
1891 """Query for dataset references matching user-provided criteria.
1893 Parameters
1894 ----------
1895 dataset_type : `str` or `DatasetType`
1896 Dataset type object or name to search for.
1897 collections : collection expression, optional
1898 A collection name or iterable of collection names to search. If not
1899 provided, the default collections are used. Can be a wildcard if
1900 ``find_first`` is `False` (if find first is requested the order
1901 of collections matters and wildcards make the order indeterminate).
1902 See :ref:`daf_butler_collection_expressions` for more information.
1903 find_first : `bool`, optional
1904 If `True` (default), for each result data ID, only yield one
1905 `DatasetRef` of each `DatasetType`, from the first collection in
1906 which a dataset of that dataset type appears (according to the
1907 order of ``collections`` passed in). If `True`, ``collections``
1908 must not contain wildcards.
1909 data_id : `dict` or `DataCoordinate`, optional
1910 A data ID whose key-value pairs are used as equality constraints in
1911 the query.
1912 where : `str`, optional
1913 A string expression similar to a SQL WHERE clause. May involve any
1914 column of a dimension table or (as a shortcut for the primary key
1915 column of a dimension table) dimension name. See
1916 :ref:`daf_butler_dimension_expressions` for more information.
1917 bind : `~collections.abc.Mapping`, optional
1918 Mapping containing literal values that should be injected into the
1919 ``where`` expression, keyed by the identifiers they replace. Values
1920 of collection type can be expanded in some cases; see
1921 :ref:`daf_butler_dimension_expressions_identifiers` for more
1922 information.
1923 with_dimension_records : `bool`, optional
1924 If `True` (default is `False`) then returned data IDs will have
1925 dimension records.
1926 order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
1927 Names of the columns/dimensions to use for ordering returned data
1928 IDs. Column name can be prefixed with minus (``-``) to use
1929 descending ordering.
1930 limit : `int` or `None`, optional
1931 Upper limit on the number of returned records. `None` can be used
1932 if no limit is wanted. A limit of ``0`` means that the query will
1933 be executed and validated but no results will be returned. In this
1934 case there will be no exception even if ``explain`` is `True`.
1935 If a negative value is given a warning will be issued if the number
1936 of results is capped by that limit.
1937 explain : `bool`, optional
1938 If `True` (default) then `EmptyQueryResultError` exception is
1939 raised when resulting list is empty. The exception contains
1940 non-empty list of strings explaining possible causes for empty
1941 result.
1942 **kwargs
1943 Additional keyword arguments are forwarded to
1944 `DataCoordinate.standardize` when processing the ``data_id``
1945 argument (and may be used to provide a constraining data ID even
1946 when the ``data_id`` argument is `None`).
1948 Returns
1949 -------
1950 refs : `list` [ `DatasetRef` ]
1951 Dataset references matching the given query criteria. Nested data
1952 IDs are guaranteed to include values for all implied dimensions
1953 (i.e. `DataCoordinate.hasFull` will return `True`).
1955 Raises
1956 ------
1957 lsst.daf.butler.registry.DatasetTypeExpressionError
1958 Raised when ``dataset_type`` expression is invalid.
1959 lsst.daf.butler.registry.DataIdError
1960 Raised when ``data_id`` or keyword arguments specify unknown
1961 dimensions or values, or when they contain inconsistent values.
1962 lsst.daf.butler.registry.UserExpressionError
1963 Raised when ``where`` expression is invalid.
1964 lsst.daf.butler.EmptyQueryResultError
1965 Raised when query generates empty result and ``explain`` is set to
1966 `True`.
1967 TypeError
1968 Raised when the arguments are incompatible, such as when a
1969 collection wildcard is passed when ``find_first`` is `True`, or
1970 when ``collections`` is `None` and default butler collections are
1971 not defined.
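Examples
--------
A minimal sketch; the dataset type, collection name, and data ID values
are illustrative::
    refs = butler.query_datasets(
        "calexp",
        collections="u/someone/processing-run",
        where="visit = 12345 AND detector = 42",
    )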
1972 """
1973 if data_id is None:
1974 data_id = DataCoordinate.make_empty(self.dimensions)
1975 if order_by is None:
1976 order_by = []
1977 if collections and has_globs(collections):
1978 # Wild cards need to be expanded but can only be allowed if
1979 # find_first=False because expanding wildcards does not return
1980 # a guaranteed ordering. Querying collection registry to expand
1981 # collections when we do not have wildcards is expensive so only
1982 # do it if we need it.
1983 if find_first:
1984 raise InvalidQueryError(
1985 f"Can not use wildcards in collections when find_first=True (given {collections})"
1986 )
1987 collections = self.collections.query(collections)
1988 query_limit = limit
1989 warn_limit = False
1990 if limit is not None and limit < 0:
1991 query_limit = abs(limit) + 1
1992 warn_limit = True
1993 with self.query() as query:
1994 result = (
1995 query.datasets(dataset_type, collections=collections, find_first=find_first)
1996 .where(data_id, where, bind=bind, **kwargs)
1997 .order_by(*ensure_iterable(order_by))
1998 .limit(query_limit)
1999 )
2000 if with_dimension_records:
2001 result = result.with_dimension_records()
2002 refs = list(result)
2003 if warn_limit and len(refs) == query_limit:
2004 # We asked for one too many so must remove that from the list.
2005 refs.pop(-1)
2006 assert limit is not None # For mypy.
2007 _LOG.warning("More datasets are available than the requested limit of %d.", abs(limit))
2008 if explain and (limit is None or limit != 0) and not refs:
2009 raise EmptyQueryResultError(list(result.explain_no_results()))
2010 return refs
2012 def query_dimension_records(
2013 self,
2014 element: str,
2015 *,
2016 data_id: DataId | None = None,
2017 where: str = "",
2018 bind: Mapping[str, Any] | None = None,
2019 order_by: Iterable[str] | str | None = None,
2020 limit: int | None = -20_000,
2021 explain: bool = True,
2022 **kwargs: Any,
2023 ) -> list[DimensionRecord]:
2024 """Query for dimension information matching user-provided criteria.
2026 Parameters
2027 ----------
2028 element : `str`
2029 The name of a dimension element to obtain records for.
2030 data_id : `dict` or `DataCoordinate`, optional
2031 A data ID whose key-value pairs are used as equality constraints
2032 in the query.
2033 where : `str`, optional
2034 A string expression similar to a SQL WHERE clause. See
2035 `Registry.queryDataIds` and :ref:`daf_butler_dimension_expressions`
2036 for more information.
2037 bind : `~collections.abc.Mapping`, optional
2038 Mapping containing literal values that should be injected into the
2039 ``where`` expression, keyed by the identifiers they replace.
2040 Values of collection type can be expanded in some cases; see
2041 :ref:`daf_butler_dimension_expressions_identifiers` for more
2042 information.
2043 order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
2044 Names of the columns/dimensions to use for ordering returned data
2045 IDs. Column name can be prefixed with minus (``-``) to use
2046 descending ordering.
2047 limit : `int` or `None`, optional
2048 Upper limit on the number of returned records. `None` can be used
2049 if no limit is wanted. A limit of ``0`` means that the query will
2050 be executed and validated but no results will be returned. In this
2051 case there will be no exception even if ``explain`` is `True`.
2052 If a negative value is given a warning will be issued if the number
2053 of results is capped by that limit.
2054 explain : `bool`, optional
2055 If `True` (default) then `EmptyQueryResultError` exception is
2056 raised when resulting list is empty. The exception contains
2057 non-empty list of strings explaining possible causes for empty
2058 result.
2059 **kwargs
2060 Additional keyword arguments are forwarded to
2061 `DataCoordinate.standardize` when processing the ``data_id``
2062 argument (and may be used to provide a constraining data ID even
2063 when the ``data_id`` argument is `None`).
2065 Returns
2066 -------
2067 records : `list` [`DimensionRecord`]
2068 Dimension records matching the given query parameters.
2070 Raises
2071 ------
2072 lsst.daf.butler.registry.DataIdError
2073 Raised when ``data_id`` or keyword arguments specify unknown
2074 dimensions or values, or when they contain inconsistent values.
2075 lsst.daf.butler.registry.UserExpressionError
2076 Raised when ``where`` expression is invalid.
2077 lsst.daf.butler.EmptyQueryResultError
2078 Raised when query generates empty result and ``explain`` is set to
2079 `True`.
2080 TypeError
2081 Raised when the arguments are incompatible.
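Examples
--------
A minimal sketch; the instrument name is illustrative::
    records = butler.query_dimension_records(
        "detector",
        where="instrument = 'LSSTCam'",
    )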
2085 """
2086 if data_id is None:
2087 data_id = DataCoordinate.make_empty(self.dimensions)
2088 if order_by is None:
2089 order_by = []
2090 query_limit = limit
2091 warn_limit = False
2092 if limit is not None and limit < 0:
2093 query_limit = abs(limit) + 1
2094 warn_limit = True
2095 with self.query() as query:
2096 result = (
2097 query.dimension_records(element)
2098 .where(data_id, where, bind=bind, **kwargs)
2099 .order_by(*ensure_iterable(order_by))
2100 .limit(query_limit)
2101 )
2102 dimension_records = list(result)
2103 if warn_limit and len(dimension_records) == query_limit:
2104 # We asked for one too many so must remove that from the list.
2105 dimension_records.pop(-1)
2106 assert limit is not None # For mypy.
2107 _LOG.warning(
2108 "More dimension records are available than the requested limit of %d.", abs(limit)
2109 )
2110 if explain and (limit is None or limit != 0) and not dimension_records:
2111 raise EmptyQueryResultError(list(result.explain_no_results()))
2112 return dimension_records
2114 def query_all_datasets(
2115 self,
2116 collections: str | Iterable[str] | None = None,
2117 *,
2118 name: str | Iterable[str] = "*",
2119 find_first: bool = True,
2120 data_id: DataId | None = None,
2121 where: str = "",
2122 bind: Mapping[str, Any] | None = None,
2123 limit: int | None = -20_000,
2124 **kwargs: Any,
2125 ) -> list[DatasetRef]:
2126 """Query for datasets of potentially multiple types.
2128 Parameters
2129 ----------
2130 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
2131 The collection or collections to search, in order. If not provided
2132 or `None`, the default collection search path for this butler is
2133 used.
2134 name : `str` or `~collections.abc.Iterable` [ `str` ], optional
2135 Names or name patterns (glob-style) that returned dataset type
2136 names must match. If an iterable, items are OR'd together. The
2137 default is to include all dataset types in the given collections.
2138 find_first : `bool`, optional
2139 If `True` (default), for each result data ID, only yield one
2140 `DatasetRef` of each `DatasetType`, from the first collection in
2141 which a dataset of that dataset type appears (according to the
2142 order of ``collections`` passed in).
2143 data_id : `dict` or `DataCoordinate`, optional
2144 A data ID whose key-value pairs are used as equality constraints in
2145 the query.
2146 where : `str`, optional
2147 A string expression similar to a SQL WHERE clause. May involve any
2148 column of a dimension table or (as a shortcut for the primary key
2149 column of a dimension table) dimension name. See
2150 :ref:`daf_butler_dimension_expressions` for more information.
2151 bind : `~collections.abc.Mapping`, optional
2152 Mapping containing literal values that should be injected into the
2153 ``where`` expression, keyed by the identifiers they replace. Values
2154 of collection type can be expanded in some cases; see
2155 :ref:`daf_butler_dimension_expressions_identifiers` for more
2156 information.
2157 limit : `int` or `None`, optional
2158 Upper limit on the number of returned records. `None` can be used
2159 if no limit is wanted. A limit of ``0`` means that the query will
2160 be executed and validated but no results will be returned.
2161 If a negative value is given a warning will be issued if the number
2162 of results is capped by that limit. If no limit is provided, by
2163 default a maximum of 20,000 records will be returned.
2164 **kwargs
2165 Additional keyword arguments are forwarded to
2166 `DataCoordinate.standardize` when processing the ``data_id``
2167 argument (and may be used to provide a constraining data ID even
2168 when the ``data_id`` argument is `None`).
2170 Raises
2171 ------
2172 MissingDatasetTypeError
2173 When no dataset types match ``name``, or an explicit (non-glob)
2174 dataset type in ``name`` does not exist.
2175 InvalidQueryError
2176 If the parameters to the query are inconsistent or malformed.
2177 MissingCollectionError
2178 If a given collection is not found.
2180 Returns
2181 -------
2182 refs : `list` [ `DatasetRef` ]
2183 Dataset references matching the given query criteria. Nested data
2184 IDs are guaranteed to include values for all implied dimensions
2185 (i.e. `DataCoordinate.hasFull` will return `True`), but will not
2186 include dimension records (`DataCoordinate.hasRecords` will be
2187 `False`).
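Examples
--------
A minimal sketch; the collection name and dataset type pattern are
illustrative::
    refs = butler.query_all_datasets(
        "u/someone/processing-run",
        name="*_metadata",
    )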
2188 """
2189 if collections is None:
2190 collections = list(self.collections.defaults)
2191 else:
2192 collections = list(ensure_iterable(collections))
2194 if bind is None:
2195 bind = {}
2196 if data_id is None:
2197 data_id = {}
2199 warn_limit = False
2200 if limit is not None and limit < 0:
2201 # Add one to the limit so we can detect if we have exceeded it.
2202 limit = abs(limit) + 1
2203 warn_limit = True
2205 args = QueryAllDatasetsParameters(
2206 collections=collections,
2207 name=list(ensure_iterable(name)),
2208 find_first=find_first,
2209 data_id=data_id,
2210 where=where,
2211 limit=limit,
2212 bind=bind,
2213 kwargs=kwargs,
2214 with_dimension_records=False,
2215 )
2216 with self._query_all_datasets_by_page(args) as pages:
2217 result = []
2218 for page in pages:
2219 result.extend(page)
2221 if warn_limit and limit is not None and len(result) >= limit:
2222 # Remove the extra dataset we added for the limit check.
2223 result.pop()
2224 _LOG.warning("More datasets are available than the requested limit of %d.", limit - 1)
2226 return result
2228 @abstractmethod
2229 def _query_all_datasets_by_page(
2230 self, args: QueryAllDatasetsParameters
2231 ) -> AbstractContextManager[Iterator[list[DatasetRef]]]:
2232 raise NotImplementedError()
2234 def clone(
2235 self,
2236 *,
2237 collections: CollectionArgType | None | EllipsisType = ...,
2238 run: str | None | EllipsisType = ...,
2239 inferDefaults: bool | EllipsisType = ...,
2240 dataId: dict[str, str] | EllipsisType = ...,
2241 metrics: ButlerMetrics | None = None,
2242 ) -> Butler:
2243 """Return a new Butler instance connected to the same repository
2244 as this one, optionally overriding ``collections``, ``run``,
2245 ``inferDefaults``, and default data ID.
2247 Parameters
2248 ----------
2249 collections : `~lsst.daf.butler.registry.CollectionArgType` or `None`,\
2250 optional
2251 Same as constructor. If omitted, uses value from original object.
2252 run : `str` or `None`, optional
2253 Same as constructor. If `None`, no default run is used. If
2254 omitted, copies value from original object.
2255 inferDefaults : `bool`, optional
2256 Same as constructor. If omitted, copies value from original
2257 object.
2258 dataId : `dict` [`str`, `str`], optional
2259 Same as ``kwargs`` passed to the constructor. If omitted, copies
2260 values from original object.
2261 metrics : `ButlerMetrics` or `None`, optional
2262 Metrics object to record butler statistics.
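Examples
--------
A minimal sketch; the run name is illustrative::
    writer = butler.clone(run="u/someone/scratch-run")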
2263 """
2264 raise NotImplementedError()
2266 @abstractmethod
2267 def close(self) -> None:
2268 raise NotImplementedError()
2270 @abstractmethod
2271 def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
2272 raise NotImplementedError()