# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["Butler", "ParsedButlerDatasetURI", "SpecificButlerDataset"]

import dataclasses
import urllib.parse
import uuid
import warnings
from abc import abstractmethod
from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
from contextlib import AbstractContextManager
from types import EllipsisType
from typing import TYPE_CHECKING, Any, TextIO

from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable
from lsst.utils.logging import getLogger

from ._butler_collections import ButlerCollections
from ._butler_config import ButlerConfig, ButlerType
from ._butler_instance_options import ButlerInstanceOptions
from ._butler_metrics import ButlerMetrics
from ._butler_repo_index import ButlerRepoIndex
from ._config import Config, ConfigSubset
from ._exceptions import EmptyQueryResultError, InvalidQueryError
from ._limited_butler import LimitedButler
from ._query_all_datasets import QueryAllDatasetsParameters
from .datastore import Datastore
from .dimensions import DataCoordinate, DimensionConfig
from .registry import RegistryConfig, _RegistryFactory
from .repo_relocation import BUTLER_ROOT_TAG
from .utils import has_globs
if TYPE_CHECKING:
    from ._dataset_existence import DatasetExistence
    from ._dataset_provenance import DatasetProvenance
    from ._dataset_ref import DatasetId, DatasetRef
    from ._dataset_type import DatasetType
    from ._deferredDatasetHandle import DeferredDatasetHandle
    from ._file_dataset import FileDataset
    from ._labeled_butler_factory import LabeledButlerFactoryProtocol
    from ._storage_class import StorageClass
    from ._timespan import Timespan
    from .datastore import DatasetRefURIs
    from .dimensions import DataId, DimensionGroup, DimensionRecord
    from .queries import Query
    from .registry import CollectionArgType, Registry
    from .transfers import RepoExportContext

_LOG = getLogger(__name__)


@dataclasses.dataclass
class ParsedButlerDatasetURI:
    """Representation of the contents of an IVOA IVOID or dataset URI."""

    label: str
    """Label of the associated butler repository. (`str`)"""
    dataset_id: uuid.UUID
    """Dataset ID of the referenced dataset within the labeled repository.
    (`uuid.UUID`)"""
    uri: str
    """The original URI that was parsed (`str`)."""


@dataclasses.dataclass
class SpecificButlerDataset:
    """A dataset ref associated with a specific butler."""

    butler: Butler
    """A specific butler repository (`Butler`)."""
    dataset: DatasetRef | None
    """The reference of a specific dataset in that butler (`DatasetRef`)."""


class _DeprecatedDefault:
    """Default value for a deprecated parameter."""


class Butler(LimitedButler):  # numpydoc ignore=PR02
    """Interface for data butler and factory for Butler instances.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration. Anything acceptable to the `ButlerConfig` constructor.
        If a directory path is given the configuration will be read from a
        ``butler.yaml`` file in that location. If `None` is given default
        values will be used. If ``config`` contains a "cls" key then its
        value is used as the name of the butler class, which must be a
        subclass of this class; otherwise `DirectButler` is instantiated.
    collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order)
        when reading datasets.
        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.
        These collections are not registered automatically and must be
        manually registered before they are used by any method, but they may
        be manually registered after the `Butler` is initialized.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into. If ``collections`` is `None` and ``run`` is not `None`,
        ``collections`` will be set to ``[run]``. If not `None`, this
        collection will automatically be registered. If this is not set (and
        ``writeable`` is not set either), a read-only butler will be created.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration. Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations. If not
        provided, a read-write butler is created if any of ``run``, ``tags``,
        or ``chains`` is non-empty.
    inferDefaults : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have
        the same value (or no value) for a governor dimension, that value
        will be the default for that dimension. Nonexistent collections are
        ignored. If a default value is provided explicitly for a governor
        dimension via ``**kwargs``, no default will be inferred for that
        dimension.
    without_datastore : `bool`, optional
        If `True` do not attach a datastore to this butler. Any attempts
        to use a datastore will fail.
    metrics : `ButlerMetrics` or `None`
        External metrics object to be used for tracking butler usage. If
        `None` a new metrics object is created.
    **kwargs : `typing.Any`
        Additional keyword arguments passed to the constructor of the
        actual butler class.

    Notes
    -----
    The preferred way to instantiate Butler is via the `from_config` method.
    The call to ``Butler(...)`` is equivalent to ``Butler.from_config(...)``,
    but ``mypy`` will complain about the former.
    """

    def __new__(
        cls,
        config: Config | ResourcePathExpression | None = None,
        *,
        collections: Any = None,
        run: str | None = None,
        searchPaths: Sequence[ResourcePathExpression] | None = None,
        writeable: bool | None = None,
        inferDefaults: bool = True,
        without_datastore: bool = False,
        metrics: ButlerMetrics | None = None,
        **kwargs: Any,
    ) -> Butler:
        if cls is Butler:
            return Butler.from_config(
                config=config,
                collections=collections,
                run=run,
                searchPaths=searchPaths,
                writeable=writeable,
                inferDefaults=inferDefaults,
                without_datastore=without_datastore,
                metrics=metrics,
                **kwargs,
            )

        # Note: we do not pass any parameters to __new__, Python will pass
        # them to __init__ after __new__ returns sub-class instance.
        return super().__new__(cls)

    @classmethod
    def from_config(
        cls,
        config: Config | ResourcePathExpression | None = None,
        *,
        collections: Any = None,
        run: str | None = None,
        searchPaths: Sequence[ResourcePathExpression] | None = None,
        writeable: bool | None = None,
        inferDefaults: bool = True,
        without_datastore: bool = False,
        metrics: ButlerMetrics | None = None,
        **kwargs: Any,
    ) -> Butler:
        """Create butler instance from configuration.

        Parameters
        ----------
        config : `ButlerConfig`, `Config` or `str`, optional
            Configuration. Anything acceptable to the `ButlerConfig`
            constructor. If a directory path is given the configuration will
            be read from a ``butler.yaml`` file in that location. If `None` is
            given default values will be used. If ``config`` contains a
            "cls" key then its value is used as the name of the butler class,
            which must be a subclass of this class; otherwise `DirectButler`
            is instantiated.
        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
            An expression specifying the collections to be searched (in order)
            when reading datasets.
            This may be a `str` collection name or an iterable thereof.
            See :ref:`daf_butler_collection_expressions` for more information.
            These collections are not registered automatically and must be
            manually registered before they are used by any method, but they
            may be manually registered after the `Butler` is initialized.
        run : `str`, optional
            Name of the `~CollectionType.RUN` collection new datasets should
            be inserted into. If ``collections`` is `None` and ``run`` is not
            `None`, ``collections`` will be set to ``[run]``. If not `None`,
            this collection will automatically be registered. If this is not
            set (and ``writeable`` is not set either), a read-only butler will
            be created.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full Butler
            configuration. Not used if the supplied config is already a
            `ButlerConfig`.
        writeable : `bool`, optional
            Explicitly sets whether the butler supports write operations. If
            not provided, a read-write butler is created if any of ``run``,
            ``tags``, or ``chains`` is non-empty.
        inferDefaults : `bool`, optional
            If `True` (default) infer default data ID values from the values
            present in the datasets in ``collections``: if all collections
            have the same value (or no value) for a governor dimension, that
            value will be the default for that dimension. Nonexistent
            collections are ignored. If a default value is provided
            explicitly for a governor dimension via ``**kwargs``, no default
            will be inferred for that dimension.
        without_datastore : `bool`, optional
            If `True` do not attach a datastore to this butler. Any attempts
            to use a datastore will fail.
        metrics : `ButlerMetrics` or `None`, optional
            Metrics object to record butler usage statistics.
        **kwargs : `typing.Any`
            Default data ID key-value pairs. These may only identify
            "governor" dimensions like ``instrument`` and ``skymap``.

        Returns
        -------
        butler : `Butler`
            A `Butler` constructed from the given configuration.

        Notes
        -----
        Calling this factory method is identical to calling
        ``Butler(config, ...)``. Its only raison d'être is that ``mypy``
        complains about the ``Butler()`` call.

        Examples
        --------
        While there are many ways to control exactly how a `Butler` interacts
        with the collections in its `Registry`, the most common cases are
        still simple.

        For a read-only `Butler` that searches one collection, do::

            butler = Butler.from_config(
                "/path/to/repo", collections=["u/alice/DM-50000"]
            )

        For a read-write `Butler` that writes to and reads from a
        `~CollectionType.RUN` collection::

            butler = Butler.from_config(
                "/path/to/repo", run="u/alice/DM-50000/a"
            )

        The `Butler` passed to a ``PipelineTask`` is often much more complex,
        because we want to write to one `~CollectionType.RUN` collection but
        read from several others (as well)::

            butler = Butler.from_config(
                "/path/to/repo",
                run="u/alice/DM-50000/a",
                collections=[
                    "u/alice/DM-50000/a",
                    "u/bob/DM-49998",
                    "HSC/defaults",
                ],
            )

        This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
        Datasets will be read first from that run (since it appears first in
        the chain), and then from ``u/bob/DM-49998`` and finally
        ``HSC/defaults``.

        Finally, one can always create a `Butler` with no collections::

            butler = Butler.from_config("/path/to/repo", writeable=True)

        This can be extremely useful when you just want to use
        ``butler.registry``, e.g. for inserting dimension data or managing
        collections, or when the collections you want to use with the butler
        are not consistent. Passing ``writeable`` explicitly here is only
        necessary if you want to be able to make changes to the repo - usually
        the value for ``writeable`` can be guessed from the collection
        arguments provided, but it defaults to `False` when there are no
        collection arguments.
322 """
323 # DirectButler used to have a way to specify a "copy constructor" by
324 # passing the "butler" parameter to its constructor. This has
325 # been moved out of the constructor into Butler.clone().
326 butler = kwargs.pop("butler", None)
327 metrics = metrics if metrics is not None else ButlerMetrics()
328 if butler is not None:
329 if not isinstance(butler, Butler):
330 raise TypeError("'butler' parameter must be a Butler instance")
331 if config is not None or searchPaths is not None or writeable is not None:
332 raise TypeError(
333 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
334 )
335 return butler.clone(
336 collections=collections, run=run, inferDefaults=inferDefaults, metrics=metrics, dataId=kwargs
337 )
339 options = ButlerInstanceOptions(
340 collections=collections,
341 run=run,
342 writeable=writeable,
343 inferDefaults=inferDefaults,
344 metrics=metrics,
345 kwargs=kwargs,
346 )
348 # Load the Butler configuration. This may involve searching the
349 # environment to locate a configuration file.
350 butler_config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore)
351 butler_type = butler_config.get_butler_type()
353 # Make DirectButler if class is not specified.
354 match butler_type:
355 case ButlerType.DIRECT:
356 from .direct_butler import DirectButler
358 return DirectButler.create_from_config(
359 butler_config,
360 options=options,
361 without_datastore=without_datastore,
362 )
363 case ButlerType.REMOTE:
364 from .remote_butler._factory import RemoteButlerFactory
366 # Assume this is being created by a client who would like
367 # default caching of remote datasets.
368 factory = RemoteButlerFactory.create_factory_from_config(butler_config)
369 return factory.create_butler_with_credentials_from_environment(
370 butler_options=options, enable_datastore_cache=True
371 )
372 case _:
373 raise TypeError(f"Unknown Butler type '{butler_type}'")
375 @staticmethod
376 def has_repo_config(root: ResourcePathExpression) -> bool:
377 """Check whether the given directory path contains a Butler
378 configuration or not.
380 Parameters
381 ----------
382 root : `lsst.resources.ResourcePathExpression`
383 The directory URI to check.
385 Returns
386 -------
387 is_root : `bool`
388 `True` if this is a directory containing a butler configuration.
389 """
390 root_uri = ResourcePath(root, forceDirectory=True)
391 return root_uri.join("butler.yaml").exists()
    @staticmethod
    def makeRepo(
        root: ResourcePathExpression,
        config: Config | str | None = None,
        dimensionConfig: Config | str | None = None,
        standalone: bool = False,
        searchPaths: list[str] | None = None,
        forceConfigRoot: bool = True,
        outfile: ResourcePathExpression | None = None,
        overwrite: bool = False,
    ) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `lsst.resources.ResourcePathExpression`
            Path or URI to the root location of the new repository. Will be
            created if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options. Can not
            be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
            configuration will be used. Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        dimensionConfig : `Config` or `str`, optional
            Configuration for dimensions, will be used to initialize registry
            database.
        standalone : `bool`
            If `True`, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing a `Butler` for repos created with
            ``standalone=True``.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config. This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `lsst.resources.ResourcePathExpression`, optional
            If not `None`, the output configuration will be written to this
            location rather than into the repository itself. Can be a URI
            string. Can refer to a directory that will be used to write
            ``butler.yaml``.
        overwrite : `bool`, optional
            Create a new configuration file even if one already exists
            in the specified output location. Default is to raise
            an exception.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        FileExistsError
            Raised if the output config file already exists.
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
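
        Examples
        --------
        A minimal sketch of creating a new repository and then connecting to
        it; the path here is purely illustrative::

            config = Butler.makeRepo("/path/to/new/repo")
            butler = Butler.from_config("/path/to/new/repo", writeable=True)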
473 """
474 if isinstance(config, ButlerConfig | ConfigSubset):
475 raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
477 # Ensure that the root of the repository exists or can be made
478 root_uri = ResourcePath(root, forceDirectory=True)
479 root_uri.mkdir()
481 config = Config(config)
483 # If we are creating a new repo from scratch with relative roots,
484 # do not propagate an explicit root from the config file
485 if "root" in config:
486 del config["root"]
488 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults
489 imported_class = doImportType(full["datastore", "cls"])
490 if not issubclass(imported_class, Datastore):
491 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
492 datastoreClass: type[Datastore] = imported_class
493 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)
495 # if key exists in given config, parse it, otherwise parse the defaults
496 # in the expanded config
497 if config.get(("registry", "db")):
498 registryConfig = RegistryConfig(config)
499 else:
500 registryConfig = RegistryConfig(full)
501 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
502 if defaultDatabaseUri is not None:
503 Config.updateParameters(
504 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
505 )
506 else:
507 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)
509 if standalone:
510 config.merge(full)
511 else:
512 # Always expand the registry.managers section into the per-repo
513 # config, because after the database schema is created, it's not
514 # allowed to change anymore. Note that in the standalone=True
515 # branch, _everything_ in the config is expanded, so there's no
516 # need to special case this.
517 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
518 configURI: ResourcePathExpression
519 if outfile is not None:
520 # When writing to a separate location we must include
521 # the root of the butler repo in the config else it won't know
522 # where to look.
523 config["root"] = root_uri.geturl()
524 configURI = outfile
525 else:
526 configURI = root_uri
527 # Check that if obscore key is present then its config must be there
528 # too, this is to avoid common mistake when people copy butler.yaml
529 # from existing repo with obscore but do not fill its config.
530 if (obscore_key := ("registry", "managers", "obscore")) in config:
531 obscore_config_key = ("registry", "managers", "obscore", "config")
532 if obscore_config_key not in config or not config[obscore_config_key]:
533 warnings.warn(
534 "Obscore manager is declared in registry configuration, "
535 "but obscore configuration is missing, obscore manager will be removed.",
536 stacklevel=2,
537 )
538 del config[obscore_key]
539 # Strip obscore configuration, if it is present, before writing config
540 # to a file, obscore config will be stored in registry.
541 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
542 config_to_write = config.copy()
543 del config_to_write[obscore_config_key]
544 config_to_write.dumpToUri(configURI, overwrite=overwrite)
545 # configFile attribute is updated, need to copy it to original.
546 config.configFile = config_to_write.configFile
547 else:
548 config.dumpToUri(configURI, overwrite=overwrite)
550 # Create Registry and populate tables
551 registryConfig = RegistryConfig(config.get("registry"))
552 dimensionConfig = DimensionConfig(dimensionConfig)
553 registry = _RegistryFactory(registryConfig).create_from_config(
554 dimensionConfig=dimensionConfig, butlerRoot=root_uri
555 )
556 registry.close()
558 _LOG.verbose("Wrote new Butler configuration file to %s", configURI)
560 return config
562 @classmethod
563 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
564 """Look up the label in a butler repository index.
566 Parameters
567 ----------
568 label : `str`
569 Label of the Butler repository to look up.
570 return_label : `bool`, optional
571 If ``label`` cannot be found in the repository index (either
572 because index is not defined or ``label`` is not in the index) and
573 ``return_label`` is `True` then return ``ResourcePath(label)``.
574 If ``return_label`` is `False` (default) then an exception will be
575 raised instead.
577 Returns
578 -------
579 uri : `lsst.resources.ResourcePath`
580 URI to the Butler repository associated with the given label or
581 default value if it is provided.
583 Raises
584 ------
585 KeyError
586 Raised if the label is not found in the index, or if an index
587 is not defined, and ``return_label`` is `False`.
589 Notes
590 -----
591 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
592 information is discovered.
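
        Examples
        --------
        An illustrative sketch; the ``dr1`` label is hypothetical and must
        exist in the repository index for the lookup to succeed::

            uri = Butler.get_repo_uri("dr1")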
593 """
594 return ButlerRepoIndex.get_repo_uri(label, return_label)
596 @classmethod
597 def get_known_repos(cls) -> set[str]:
598 """Retrieve the list of known repository labels.
600 Returns
601 -------
602 repos : `set` of `str`
603 All the known labels. Can be empty if no index can be found.
605 Notes
606 -----
607 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
608 information is discovered.
609 """
610 return ButlerRepoIndex.get_known_repos()
612 @classmethod
613 def parse_dataset_uri(cls, uri: str) -> ParsedButlerDatasetURI:
614 """Extract the butler label and dataset ID from a dataset URI.
616 Parameters
617 ----------
618 uri : `str`
619 The dataset URI to parse.
621 Returns
622 -------
623 parsed : `ParsedButlerDatasetURI`
624 The label associated with the butler repository from which this
625 dataset originates and the ID of the dataset.
627 Notes
628 -----
629 Supports dataset URIs of the forms
630 ``ivo://org.rubinobs/usdac/dr1?repo=butler_label&id=UUID`` (see
631 DMTN-302) and ``butler://butler_label/UUID``. The ``butler`` URI is
632 deprecated and can not include ``/`` in the label string. ``ivo`` URIs
633 can include anything supported by the `Butler` constructor, including
634 paths to repositories and alias labels.
636 ivo://org.rubinobs/dr1?repo=/repo/main&id=UUID
638 will return a label of ``/repo/main``.
640 This method does not attempt to check that the dataset exists in the
641 labeled butler.
643 Since the IVOID can be issued by any publisher to represent a Butler
644 dataset there is no validation of the path or netloc component of the
645 URI. The only requirement is that there are ``id`` and ``repo`` keys
646 in the ``ivo`` URI query component.
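
        Examples
        --------
        An illustrative sketch; the label and UUID are hypothetical::

            parsed = Butler.parse_dataset_uri(
                "ivo://org.rubinobs/dr1?repo=/repo/main"
                "&id=3f452615-9a46-4fe5-8a1a-e21ad42f7ad0"
            )
            assert parsed.label == "/repo/main"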
647 """
648 parsed = urllib.parse.urlparse(uri)
649 parsed_scheme = parsed.scheme.lower()
650 if parsed_scheme == "ivo":
651 # Do not validate the netloc or the path values.
652 qs = urllib.parse.parse_qs(parsed.query)
653 if "repo" not in qs or "id" not in qs:
654 raise ValueError(f"Missing 'repo' and/or 'id' query parameters in IVOID {uri}.")
655 if len(qs["repo"]) != 1 or len(qs["id"]) != 1:
656 raise ValueError(f"Butler IVOID only supports a single value of repo and id, got {uri}")
657 label = qs["repo"][0]
658 id_ = qs["id"][0]
659 elif parsed_scheme == "butler":
660 label = parsed.netloc # Butler label is case sensitive.
661 # Need to strip the leading /.
662 id_ = parsed.path[1:]
663 else:
664 raise ValueError(f"Unrecognized URI scheme: {uri!r}")
665 # Strip trailing/leading whitespace from label.
666 label = label.strip()
667 if not label:
668 raise ValueError(f"No butler repository label found in uri {uri!r}")
669 try:
670 dataset_id = uuid.UUID(hex=id_)
671 except Exception as e:
672 e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}")
673 raise
675 return ParsedButlerDatasetURI(label=label, dataset_id=dataset_id, uri=uri)
677 @classmethod
678 def get_dataset_from_uri(
679 cls, uri: str, factory: LabeledButlerFactoryProtocol | None = None
680 ) -> SpecificButlerDataset:
681 """Get the dataset associated with the given dataset URI.
683 Parameters
684 ----------
685 uri : `str`
686 The URI associated with a dataset.
        factory : `LabeledButlerFactoryProtocol` or `None`, optional
            Bound factory function that will be given the butler label and
            will return a `Butler`. If this is not provided the label will be
            tried directly.

        Returns
        -------
        result : `SpecificButlerDataset`
            The butler associated with this URI and the dataset itself.
            The dataset can be `None` if the UUID is valid but the dataset
            is not known to this butler.
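
        Examples
        --------
        An illustrative sketch; given a dataset URI string ``uri``, fetch
        the dataset contents if the dataset is known::

            result = Butler.get_dataset_from_uri(uri)
            if result.dataset is not None:
                data = result.butler.get(result.dataset)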
698 """
699 parsed = cls.parse_dataset_uri(uri)
700 butler: Butler | None = None
701 if factory is not None:
702 # If the label is not recognized, it might be a path.
703 try:
704 butler = factory(parsed.label)
705 except KeyError:
706 pass
707 if butler is None:
708 butler = cls.from_config(parsed.label)
709 return SpecificButlerDataset(butler=butler, dataset=butler.get_dataset(parsed.dataset_id))
711 @abstractmethod
712 def _caching_context(self) -> AbstractContextManager[None]:
713 """Context manager that enables caching."""
714 raise NotImplementedError()
716 @abstractmethod
717 def transaction(self) -> AbstractContextManager[None]:
718 """Context manager supporting `Butler` transactions.
720 Transactions can be nested.
721 """
722 raise NotImplementedError()
    @abstractmethod
    def put(
        self,
        obj: Any,
        datasetRefOrType: DatasetRef | DatasetType | str,
        /,
        dataId: DataId | None = None,
        *,
        run: str | None = None,
        provenance: DatasetProvenance | None = None,
        **kwargs: Any,
    ) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof. If a fully resolved
            `DatasetRef` is given the run and ID are used directly.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``. Not used if a resolved `DatasetRef` is provided.
        provenance : `DatasetProvenance` or `None`, optional
            Any provenance that should be attached to the serialized dataset.
            Not supported by all serialization mechanisms.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters. Not used if a resolved `DatasetRef` is provided.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
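
        Examples
        --------
        A hedged sketch; the run, dataset type, and data ID values are
        illustrative only::

            butler = Butler.from_config("/path/to/repo", run="u/alice/run")
            ref = butler.put(my_object, "someDatasetType", instrument="HSC")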
771 """
772 raise NotImplementedError()
774 @abstractmethod
775 def getDeferred(
776 self,
777 datasetRefOrType: DatasetRef | DatasetType | str,
778 /,
779 dataId: DataId | None = None,
780 *,
781 parameters: dict | None = None,
782 collections: Any = None,
783 storageClass: str | StorageClass | None = None,
784 timespan: Timespan | None = None,
785 **kwargs: Any,
786 ) -> DeferredDatasetHandle:
787 """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
788 after an immediate registry lookup.
790 Parameters
791 ----------
792 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
793 When `DatasetRef` the `dataId` should be `None`.
794 Otherwise the `DatasetType` or name thereof.
795 dataId : `dict` or `DataCoordinate`, optional
796 A `dict` of `Dimension` link name, value pairs that label the
797 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
798 should be provided as the first argument.
799 parameters : `dict`
800 Additional StorageClass-defined options to control reading,
801 typically used to efficiently read only a subset of the dataset.
802 collections : Any, optional
803 Collections to be searched, overriding ``self.collections``.
804 Can be any of the types supported by the ``collections`` argument
805 to butler construction.
806 storageClass : `StorageClass` or `str`, optional
807 The storage class to be used to override the Python type
808 returned by this method. By default the returned type matches
809 the dataset type definition for this dataset. Specifying a
810 read `StorageClass` can force a different type to be returned.
811 This type must be compatible with the original type.
812 timespan : `Timespan` or `None`, optional
813 A timespan that the validity range of the dataset must overlap.
814 If not provided and this is a calibration dataset type, an attempt
815 will be made to find the timespan from any temporal coordinate
816 in the data ID.
817 **kwargs
818 Additional keyword arguments used to augment or construct a
819 `DataId`. See `DataId` parameters.
821 Returns
822 -------
823 obj : `DeferredDatasetHandle`
824 A handle which can be used to retrieve a dataset at a later time.
826 Raises
827 ------
828 LookupError
829 Raised if no matching dataset exists in the `Registry` or
830 datastore.
831 ValueError
832 Raised if a resolved `DatasetRef` was passed as an input, but it
833 differs from the one found in the registry.
834 TypeError
835 Raised if no collections were provided.
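
        Examples
        --------
        An illustrative sketch: the registry lookup happens immediately, but
        the (hypothetical) dataset is only read when ``get`` is called on
        the handle::

            handle = butler.getDeferred("calexp", data_id)
            image = handle.get()  # The datastore read happens here.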
836 """
837 raise NotImplementedError()
839 @abstractmethod
840 def get(
841 self,
842 datasetRefOrType: DatasetRef | DatasetType | str,
843 /,
844 dataId: DataId | None = None,
845 *,
846 parameters: dict[str, Any] | None = None,
847 collections: Any = None,
848 storageClass: StorageClass | str | None = None,
849 timespan: Timespan | None = None,
850 **kwargs: Any,
851 ) -> Any:
852 """Retrieve a stored dataset.
854 Parameters
855 ----------
856 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
857 When `DatasetRef` the `dataId` should be `None`.
858 Otherwise the `DatasetType` or name thereof.
859 If a resolved `DatasetRef`, the associated dataset
860 is returned directly without additional querying.
861 dataId : `dict` or `DataCoordinate`
862 A `dict` of `Dimension` link name, value pairs that label the
863 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
864 should be provided as the first argument.
865 parameters : `dict`
866 Additional StorageClass-defined options to control reading,
867 typically used to efficiently read only a subset of the dataset.
868 collections : Any, optional
869 Collections to be searched, overriding ``self.collections``.
870 Can be any of the types supported by the ``collections`` argument
871 to butler construction.
872 storageClass : `StorageClass` or `str`, optional
873 The storage class to be used to override the Python type
874 returned by this method. By default the returned type matches
875 the dataset type definition for this dataset. Specifying a
876 read `StorageClass` can force a different type to be returned.
877 This type must be compatible with the original type.
878 timespan : `Timespan` or `None`, optional
879 A timespan that the validity range of the dataset must overlap.
880 If not provided and this is a calibration dataset type, an attempt
881 will be made to find the timespan from any temporal coordinate
882 in the data ID.
883 **kwargs
884 Additional keyword arguments used to augment or construct a
885 `DataCoordinate`. See `DataCoordinate.standardize`
886 parameters.
888 Returns
889 -------
890 obj : `object`
891 The dataset.
893 Raises
894 ------
895 LookupError
896 Raised if no matching dataset exists in the `Registry`.
897 TypeError
898 Raised if no collections were provided.
900 Notes
901 -----
902 When looking up datasets in a `~CollectionType.CALIBRATION` collection,
903 this method requires that the given data ID include temporal dimensions
904 beyond the dimensions of the dataset type itself, in order to find the
905 dataset with the appropriate validity range. For example, a "bias"
906 dataset with native dimensions ``{instrument, detector}`` could be
907 fetched with a ``{instrument, detector, exposure}`` data ID, because
908 ``exposure`` is a temporal dimension.
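
        Examples
        --------
        A hedged sketch of the calibration lookup described above; the
        data ID values are illustrative::

            bias = butler.get(
                "bias", instrument="HSC", detector=10, exposure=1234
            )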
909 """
910 raise NotImplementedError()
912 @abstractmethod
913 def getURIs(
914 self,
915 datasetRefOrType: DatasetRef | DatasetType | str,
916 /,
917 dataId: DataId | None = None,
918 *,
919 predict: bool = False,
920 collections: Any = None,
921 run: str | None = None,
922 **kwargs: Any,
923 ) -> DatasetRefURIs:
924 """Return the URIs associated with the dataset.
926 Parameters
927 ----------
928 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
929 When `DatasetRef` the `dataId` should be `None`.
930 Otherwise the `DatasetType` or name thereof.
931 dataId : `dict` or `DataCoordinate`
932 A `dict` of `Dimension` link name, value pairs that label the
933 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
934 should be provided as the first argument.
935 predict : `bool`
            If `True`, allow URIs to be returned for datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        uris : `DatasetRefURIs`
            The URI to the primary artifact associated with this dataset (if
            the dataset was disassembled within the datastore this may be
            `None`), and the URIs to any components associated with the
            dataset artifact (can be empty if there are no components).
        """
        raise NotImplementedError()

    def getURI(
        self,
        datasetRefOrType: DatasetRef | DatasetType | str,
        /,
        dataId: DataId | None = None,
        *,
        predict: bool = False,
        collections: Any = None,
        run: str | None = None,
        **kwargs: Any,
    ) -> ResourcePath:
        """Return the URI to the Dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned for datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        LookupError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        RuntimeError
            Raised if a URI is requested for a dataset that consists of
            multiple artifacts.
        """
        primary, components = self.getURIs(
            datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
        )

        if primary is None or components:
            raise RuntimeError(
                f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
                "Use Butler.getURIs() instead."
            )
        return primary

    @abstractmethod
    def get_dataset_type(self, name: str) -> DatasetType:
        """Get the `DatasetType`.

        Parameters
        ----------
        name : `str`
            Name of the type.

        Returns
        -------
        type : `DatasetType`
            The `DatasetType` associated with the given name.

        Raises
        ------
        lsst.daf.butler.MissingDatasetTypeError
            Raised if the requested dataset type has not been registered.

        Notes
        -----
        This method handles component dataset types automatically, though
        most other operations do not.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_dataset(
        self,
        id: DatasetId | str,
        *,
        storage_class: str | StorageClass | None = None,
        dimension_records: bool = False,
        datastore_records: bool = False,
    ) -> DatasetRef | None:
        """Retrieve a Dataset entry.

        Parameters
        ----------
        id : `DatasetId`
            The unique identifier for the dataset, as an instance of
            `uuid.UUID` or a string containing a hexadecimal number.
        storage_class : `str` or `StorageClass` or `None`
            A storage class to use when creating the returned entry. If given
            it must be compatible with the default storage class.
        dimension_records : `bool`, optional
            If `True` the ref will be expanded and contain dimension records.
        datastore_records : `bool`, optional
            If `True` the ref will contain associated datastore records.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A ref to the Dataset, or `None` if no matching Dataset
            was found.
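
        Examples
        --------
        An illustrative sketch; the UUID is hypothetical::

            ref = butler.get_dataset(
                uuid.UUID("3f452615-9a46-4fe5-8a1a-e21ad42f7ad0")
            )
            if ref is None:
                print("No dataset with that ID in this repository.")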
1086 """
1087 raise NotImplementedError()
1089 @abstractmethod
1090 def get_many_datasets(self, ids: Iterable[DatasetId | str]) -> list[DatasetRef]:
1091 """Retrieve a list of dataset entries.
1093 Parameters
1094 ----------
1095 ids : `~collections.abc.Iterable` [ `DatasetId` or `str` ]
1096 The unique identifiers for the datasets, as instances of
1097 `uuid.UUID` or strings containing a hexadecimal number.
1099 Returns
1100 -------
1101 refs : `list` [ `DatasetRef` ]
1102 A list containing a `DatasetRef` for each of the given dataset IDs.
1103 If a dataset was not found, no error is thrown -- it is just not
1104 included in the list. The returned datasets are in no particular
1105 order.
1106 """
1107 raise NotImplementedError()
1109 @abstractmethod
1110 def find_dataset(
1111 self,
1112 dataset_type: DatasetType | str,
1113 data_id: DataId | None = None,
1114 *,
1115 collections: str | Sequence[str] | None = None,
1116 timespan: Timespan | None = None,
1117 storage_class: str | StorageClass | None = None,
1118 dimension_records: bool = False,
1119 datastore_records: bool = False,
1120 **kwargs: Any,
1121 ) -> DatasetRef | None:
1122 """Find a dataset given its `DatasetType` and data ID.
1124 This can be used to obtain a `DatasetRef` that permits the dataset to
1125 be read from a `Datastore`. If the dataset is a component and can not
1126 be found using the provided dataset type, a dataset ref for the parent
1127 will be returned instead but with the correct dataset type.
1129 Parameters
1130 ----------
1131 dataset_type : `DatasetType` or `str`
1132 A `DatasetType` or the name of one. If this is a `DatasetType`
1133 instance, its storage class will be respected and propagated to
1134 the output, even if it differs from the dataset type definition
1135 in the registry, as long as the storage classes are convertible.
1136 data_id : `dict` or `DataCoordinate`, optional
1137 A `dict`-like object containing the `Dimension` links that identify
1138 the dataset within a collection. If it is a `dict` the dataId
1139 can include dimension record values such as ``day_obs`` and
1140 ``seq_num`` or ``full_name`` that can be used to derive the
1141 primary dimension.
1142 collections : `str` or `list` [`str`], optional
            An ordered list of collections to search for the dataset.
            Defaults to ``self.defaults.collections``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            If not provided, any `~CollectionType.CALIBRATION` collections
            matched by the ``collections`` argument will not be searched.
        storage_class : `str` or `StorageClass` or `None`
            A storage class to use when creating the returned entry. If given
            it must be compatible with the default storage class.
        dimension_records : `bool`, optional
            If `True` the ref will be expanded and contain dimension records.
        datastore_records : `bool`, optional
            If `True` the ref will contain associated datastore records.
        **kwargs
            Additional keyword arguments passed to
            `DataCoordinate.standardize` to convert ``dataId`` to a true
            `DataCoordinate` or augment an existing one. This can also
            include dimension record metadata that can be used to derive a
            primary dimension value.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset, or `None` if no matching Dataset
            was found.

        Raises
        ------
        lsst.daf.butler.NoDefaultCollectionError
            Raised if ``collections`` is `None` and
            ``self.collections`` is `None`.
        LookupError
            Raised if one or more data ID keys are missing.
        lsst.daf.butler.MissingDatasetTypeError
            Raised if the dataset type does not exist.
        lsst.daf.butler.MissingCollectionError
            Raised if any of ``collections`` does not exist in the registry.

        Notes
        -----
        This method simply returns `None` and does not raise an exception
        even when the set of collections searched is intrinsically
        incompatible with the dataset type, e.g. if
        ``datasetType.isCalibration() is False``, but only
        `~CollectionType.CALIBRATION` collections are being searched.
        This may make it harder to debug some lookup failures, but the
        behavior is intentional; we consider it more important that failed
        searches are reported consistently, regardless of the reason, and
        that adding additional collections that do not contain a match to the
        search path never changes the behavior.

        This method handles component dataset types automatically, though
        most other query operations do not.
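
        Examples
        --------
        A hedged sketch; the dataset type, data ID values, and collection
        name are illustrative::

            ref = butler.find_dataset(
                "calexp",
                instrument="HSC",
                visit=12345,
                detector=42,
                collections="HSC/runs/example",
            )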
1195 """
1196 raise NotImplementedError()
1198 @abstractmethod
1199 def retrieve_artifacts_zip(
1200 self,
1201 refs: Iterable[DatasetRef],
1202 destination: ResourcePathExpression,
1203 overwrite: bool = True,
1204 ) -> ResourcePath:
1205 """Retrieve artifacts from a Butler and place in ZIP file.
1207 Parameters
1208 ----------
1209 refs : `~collections.abc.Iterable` [ `DatasetRef` ]
1210 The datasets to be included in the zip file.
1211 destination : `lsst.resources.ResourcePathExpression`
1212 Directory to write the new ZIP file. This directory will
1213 also be used as a staging area for the datasets being downloaded
1214 from the datastore.
1215 overwrite : `bool`, optional
1216 If `False` the output Zip will not be written if a file of the
1217 same name is already present in ``destination``.
1219 Returns
1220 -------
1221 zip_file : `lsst.resources.ResourcePath`
1222 The path to the new ZIP file.
1224 Raises
1225 ------
1226 ValueError
1227 Raised if there are no refs to retrieve.
1228 """
1229 raise NotImplementedError()
1231 @abstractmethod
1232 def retrieveArtifacts(
1233 self,
1234 refs: Iterable[DatasetRef],
1235 destination: ResourcePathExpression,
1236 transfer: str = "auto",
1237 preserve_path: bool = True,
1238 overwrite: bool = False,
1239 ) -> list[ResourcePath]:
1240 """Retrieve the artifacts associated with the supplied refs.
1242 Parameters
1243 ----------
1244 refs : `~collections.abc.Iterable` of `DatasetRef`
1245 The datasets for which artifacts are to be retrieved.
1246 A single ref can result in multiple artifacts. The refs must
1247 be resolved.
1248 destination : `lsst.resources.ResourcePath` or `str`
1249 Location to write the artifacts.
1250 transfer : `str`, optional
1251 Method to use to transfer the artifacts. Must be one of the options
1252 supported by `~lsst.resources.ResourcePath.transfer_from`.
1253 "move" is not allowed.
1254 preserve_path : `bool`, optional
1255 If `True` the full path of the artifact within the datastore
1256 is preserved. If `False` the final file component of the path
1257 is used.
1258 overwrite : `bool`, optional
1259 If `True` allow transfers to overwrite existing files at the
1260 destination.
1262 Returns
1263 -------
1264 targets : `list` of `lsst.resources.ResourcePath`
1265 URIs of file artifacts in destination location. Order is not
1266 preserved.
1268 Notes
1269 -----
1270 For non-file datastores the artifacts written to the destination
1271 may not match the representation inside the datastore. For example
1272 a hierarchical data structure in a NoSQL database may well be stored
1273 as a JSON file.
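
        Examples
        --------
        An illustrative sketch that copies the file artifacts for some
        previously obtained ``refs`` into a local directory::

            paths = butler.retrieveArtifacts(
                refs, "/tmp/export", transfer="copy", preserve_path=False
            )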
1274 """
1275 raise NotImplementedError()
1277 @abstractmethod
1278 def exists(
1279 self,
1280 dataset_ref_or_type: DatasetRef | DatasetType | str,
1281 /,
1282 data_id: DataId | None = None,
1283 *,
1284 full_check: bool = True,
1285 collections: Any = None,
1286 **kwargs: Any,
1287 ) -> DatasetExistence:
1288 """Indicate whether a dataset is known to Butler registry and
1289 datastore.
1291 Parameters
1292 ----------
1293 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str`
1294 When `DatasetRef` the `dataId` should be `None`.
1295 Otherwise the `DatasetType` or name thereof.
1296 data_id : `dict` or `DataCoordinate`
1297 A `dict` of `Dimension` link name, value pairs that label the
1298 `DatasetRef` within a Collection. When `None`, a `DatasetRef`
1299 should be provided as the first argument.
1300 full_check : `bool`, optional
1301 If `True`, a check will be made for the actual existence of a
1302 dataset artifact. This will involve additional overhead due to
1303 the need to query an external system. If `False`, this check will
1304 be omitted, and the registry and datastore will solely be asked
1305 if they know about the dataset but no direct check for the
1306 artifact will be performed.
1307 collections : Any, optional
1308 Collections to be searched, overriding ``self.collections``.
1309 Can be any of the types supported by the ``collections`` argument
1310 to butler construction.
1311 **kwargs
1312 Additional keyword arguments used to augment or construct a
1313 `DataCoordinate`. See `DataCoordinate.standardize`
1314 parameters.
1316 Returns
1317 -------
1318 existence : `DatasetExistence`
1319 Object indicating whether the dataset is known to registry and
1320 datastore. Evaluates to `True` if the dataset is present and known
1321 to both.
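
        Examples
        --------
        An illustrative sketch; the dataset type, data ID, and collection
        name are hypothetical::

            if butler.exists("calexp", data_id, collections="HSC/defaults"):
                print("Dataset is known to registry and datastore.")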
1322 """
1323 raise NotImplementedError()
1325 @abstractmethod
1326 def _exists_many(
1327 self,
1328 refs: Iterable[DatasetRef],
1329 /,
1330 *,
1331 full_check: bool = True,
1332 ) -> dict[DatasetRef, DatasetExistence]:
1333 """Indicate whether multiple datasets are known to Butler registry and
1334 datastore.
1336 This is an experimental API that may change at any moment.
1338 Parameters
1339 ----------
1340 refs : `~collections.abc.Iterable` of `DatasetRef`
1341 The datasets to be checked.
1342 full_check : `bool`, optional
1343 If `True`, a check will be made for the actual existence of each
1344 dataset artifact. This will involve additional overhead due to
1345 the need to query an external system. If `False`, this check will
1346 be omitted, and the registry and datastore will solely be asked
1347 if they know about the dataset(s) but no direct check for the
1348 artifact(s) will be performed.
1350 Returns
1351 -------
1352 existence : `dict` [`DatasetRef`, `DatasetExistence`]
1353 Mapping from the given dataset refs to an enum indicating the
1354 status of the dataset in registry and datastore.
1355 Each value evaluates to `True` if the dataset is present and known
1356 to both.
1357 """
1358 raise NotImplementedError()
1360 @abstractmethod
1361 def removeRuns(
1362 self,
1363 names: Iterable[str],
1364 unstore: bool | type[_DeprecatedDefault] = _DeprecatedDefault,
1365 *,
1366 unlink_from_chains: bool = False,
1367 ) -> None:
1368 """Remove one or more `~CollectionType.RUN` collections and the
1369 datasets within them.
1371 Parameters
1372 ----------
1373 names : `~collections.abc.Iterable` [ `str` ]
1374 The names of the collections to remove.
1375 unstore : `bool`, optional
1376 If `True` (default), delete datasets from all datastores in which
1377 they are present, and attempt to rollback the registry deletions if
1378 datastore deletions fail (which may not always be possible). If
1379 `False`, datastore records for these datasets are still removed,
1380 but any artifacts (e.g. files) will not be. This parameter is now
1381 deprecated and no longer has any effect. Files are always deleted
1382 from datastores unless they were ingested using full URIs.
1383 unlink_from_chains : `bool`, optional
1384 If `True` remove the RUN collection from any chains prior to
1385 removing the RUN. If `False` the removal will fail if any chains
1386 still refer to the RUN.
1388 Raises
1389 ------
1390 TypeError
1391 Raised if one or more collections are not of type
1392 `~CollectionType.RUN`.
1393 """
1394 raise NotImplementedError()
1396 @abstractmethod
1397 def ingest(
1398 self,
1399 *datasets: FileDataset,
1400 transfer: str | None = "auto",
1401 record_validation_info: bool = True,
1402 skip_existing: bool = False,
1403 ) -> None:
1404 """Store and register one or more datasets that already exist on disk.
1406 Parameters
1407 ----------
1408 *datasets : `FileDataset`
1409 Each positional argument is a struct containing information about
1410 a file to be ingested, including its URI (either absolute or
1411 relative to the datastore root, if applicable), a resolved
1412 `DatasetRef`, and optionally a formatter class or its
1413 fully-qualified string name. If a formatter is not provided, the
1414 formatter that would be used for `put` is assumed. On successful
1415 ingest all `FileDataset.formatter` attributes will be set to the
1416 formatter class used. `FileDataset.path` attributes may be modified
1417 to put paths in whatever the datastore considers a standardized
1418 form.
1419 transfer : `str`, optional
1420 If not `None`, must be one of 'auto', 'move', 'copy', 'direct',
1421 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to
1422 transfer the file.
1423 record_validation_info : `bool`, optional
1424 If `True`, the default, the datastore can record validation
1425 information associated with the file. If `False` the datastore
1426 will not attempt to track any information such as checksums
1427 or file sizes. This can be useful if such information is tracked
1428 in an external system or if the file is to be compressed in place.
1429 It is up to the datastore whether this parameter is relevant.
1430 skip_existing : `bool`, optional
1431 If `True`, a dataset will not be ingested if a dataset with the
1432 same dataset ID already exists in the datastore.
1433 If `False` (the default), a `ConflictingDefinitionError` will be
1434 raised if any datasets with the same dataset ID already exist
1435 in the datastore.
1437 Returns
1438 -------
1439 None
1441 Raises
1442 ------
1443 TypeError
1444 Raised if the butler is read-only or if no run was provided.
1445 NotImplementedError
1446 Raised if the `Datastore` does not support the given transfer mode.
1447 DatasetTypeNotSupportedError
1448 Raised if one or more files to be ingested have a dataset type that
            that is not supported by the `Datastore`.
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.
        ConflictingDefinitionError
            Raised if a dataset already exists in the repository and
            ``skip_existing`` is `False`.

        Notes
        -----
        This operation is not fully exception safe: if a database operation
        fails, the given `FileDataset` instances may be only partially
        updated.

        It is atomic in terms of database operations (they will either all
        succeed or all fail) providing the database engine implements
        transactions correctly. It will attempt to be atomic in terms of
        filesystem operations as well, but this cannot be implemented
        rigorously for most datastores.
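
        Examples
        --------
        A hedged sketch ingesting one existing file; the path and the
        pre-resolved ``ref`` are illustrative::

            from lsst.daf.butler import FileDataset

            butler.ingest(
                FileDataset(path="/data/raw.fits", refs=ref),
                transfer="copy",
            )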
1469 """
1470 raise NotImplementedError()
1472 @abstractmethod
1473 def ingest_zip(
1474 self,
1475 zip_file: ResourcePathExpression,
1476 transfer: str = "auto",
1477 *,
1478 transfer_dimensions: bool = False,
1479 dry_run: bool = False,
1480 skip_existing: bool = False,
1481 ) -> None:
1482 """Ingest a Zip file into this butler.
1484 The Zip file must have been created by `retrieve_artifacts_zip`.
1486 Parameters
1487 ----------
1488 zip_file : `lsst.resources.ResourcePathExpression`
1489 Path to the Zip file.
1490 transfer : `str`, optional
1491 Method to use to transfer the Zip into the datastore.
1492 transfer_dimensions : `bool`, optional
1493 If `True`, dimension record data associated with the new datasets
1494 will be transferred from the Zip file, if present.
1495 dry_run : `bool`, optional
1496 If `True`, the ingest will be processed without making any
1497 modifications to the target butler, as if the target butler did
1498 not have any of the datasets.
1499 skip_existing : `bool`, optional
1500 If `True`, the Zip file will not be ingested if the dataset
1501 entries listed in its index have dataset IDs that already exist
1502 in the butler. If `False` (the default), a
1503 `ConflictingDefinitionError` will be raised if any datasets with
1504 the same dataset ID already exist in the repository. If some of
1505 the datasets are known to the butler and some are not, this is
1506 currently treated as an error rather than as a partial ingest.
1508 Notes
1509 -----
1510 Run collections and dataset types are created as needed.
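Examples
--------
A sketch of a typical round trip, assuming ``artifacts.zip`` was
previously written by `retrieve_artifacts_zip`::

    butler.ingest_zip("artifacts.zip", transfer="copy", skip_existing=True)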
1511 """
1512 raise NotImplementedError()
1514 @abstractmethod
1515 def export(
1516 self,
1517 *,
1518 directory: str | None = None,
1519 filename: str | None = None,
1520 format: str | None = None,
1521 transfer: str | None = None,
1522 ) -> AbstractContextManager[RepoExportContext]:
1523 """Export datasets from the repository represented by this `Butler`.
1525 This method is a context manager that returns a helper object
1526 (`RepoExportContext`) that is used to indicate what information from
1527 the repository should be exported.
1529 Parameters
1530 ----------
1531 directory : `str`, optional
1532 Directory dataset files should be written to if ``transfer`` is not
1533 `None`.
1534 filename : `str`, optional
1535 Name for the file that will include database information associated
1536 with the exported datasets. If this is not an absolute path and
1537 ``directory`` is not `None`, it will be written to ``directory``
1538 instead of the current working directory. Defaults to
1539 "export.{format}".
1540 format : `str`, optional
1541 File format for the database information file. If `None`, the
1542 extension of ``filename`` will be used.
1543 transfer : `str`, optional
1544 Transfer mode passed to `Datastore.export`.
1546 Raises
1547 ------
1548 TypeError
1549 Raised if the set of arguments passed is inconsistent.
1551 Examples
1552 --------
1553 Typically the `Registry.queryDataIds` and `Registry.queryDatasets`
1554 methods are used to provide the iterables over data IDs and/or datasets
1555 to be exported::
1557 with butler.export(filename="exports.yaml") as export:
1558 # Export all flats, but none of the dimension element rows
1559 # (i.e. data ID information) associated with them.
1560 export.saveDatasets(
1561 butler.registry.queryDatasets("flat"), elements=()
1562 )
1563 # Export all datasets that start with "deepCoadd_" and all of
1564 # their associated data ID information.
1565 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
1566 """
1567 raise NotImplementedError()
1569 @abstractmethod
1570 def import_(
1571 self,
1572 *,
1573 directory: ResourcePathExpression | None = None,
1574 filename: ResourcePathExpression | TextIO | None = None,
1575 format: str | None = None,
1576 transfer: str | None = None,
1577 skip_dimensions: set | None = None,
1578 record_validation_info: bool = True,
1579 without_datastore: bool = False,
1580 ) -> None:
1581 """Import datasets into this repository that were exported from a
1582 different butler repository via `~lsst.daf.butler.Butler.export`.
1584 Parameters
1585 ----------
1586 directory : `~lsst.resources.ResourcePathExpression`, optional
1587 Directory containing dataset files to import from. If `None`,
1588 ``filename`` and all dataset file paths specified therein must
1589 be absolute.
1590 filename : `~lsst.resources.ResourcePathExpression` or `typing.TextIO`
1591 A stream or the name of a file that contains database information
1592 associated with the exported datasets, typically generated by
1593 `~lsst.daf.butler.Butler.export`. If this is a string (name) or
1594 `~lsst.resources.ResourcePath` and is not an absolute path,
1595 it will first be looked for relative to ``directory`` and if not
1596 found there it will be looked for in the current working
1597 directory. Defaults to "export.{format}".
1598 format : `str`, optional
1599 File format for ``filename``. If `None`, the extension of
1600 ``filename`` will be used.
1601 transfer : `str`, optional
1602 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`.
1603 skip_dimensions : `set`, optional
1604 Names of dimensions that should be skipped and not imported.
1605 record_validation_info : `bool`, optional
1606 If `True`, the default, the datastore can record validation
1607 information associated with the file. If `False` the datastore
1608 will not attempt to track any information such as checksums
1609 or file sizes. This can be useful if such information is tracked
1610 in an external system or if the file is to be compressed in place.
1611 It is up to the datastore whether this parameter is relevant.
1612 without_datastore : `bool`, optional
1613 If `True` only registry records will be imported and the datastore
1614 will be ignored.
1616 Raises
1617 ------
1618 TypeError
1619 Raised if the set of arguments passed is inconsistent, or if the
1620 butler is read-only.
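Examples
--------
A sketch of importing a previously exported repository subset; the
directory name is illustrative and is assumed to contain an
``export.yaml`` file written by `~lsst.daf.butler.Butler.export`::

    butler.import_(
        directory="exports",
        filename="export.yaml",
        transfer="copy",
    )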
1621 """
1622 raise NotImplementedError()
1624 @abstractmethod
1625 def transfer_dimension_records_from(
1626 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef | DataCoordinate]
1627 ) -> None:
1628 """Transfer dimension records to this Butler from another Butler.
1630 Parameters
1631 ----------
1632 source_butler : `LimitedButler` or `Butler`
1633 Butler from which the records are to be transferred. If data IDs
1634 in ``source_refs`` are not expanded then this has to be a full
1635 `Butler` whose registry will be used to expand data IDs. If the
1636 source refs contain coordinates that are used to populate other
1637 records then this will also need to be a full `Butler`.
1638 source_refs : `~collections.abc.Iterable` [`DatasetRef` |\
1639 `DataCoordinate`]
1640 Datasets or data IDs defined in the source butler whose dimension
1641 records should be transferred to this butler.
1642 """
1643 raise NotImplementedError()
1645 @abstractmethod
1646 def transfer_from(
1647 self,
1648 source_butler: LimitedButler,
1649 source_refs: Iterable[DatasetRef],
1650 transfer: str = "auto",
1651 skip_missing: bool = True,
1652 register_dataset_types: bool = False,
1653 transfer_dimensions: bool = False,
1654 dry_run: bool = False,
1655 ) -> Collection[DatasetRef]:
1656 """Transfer datasets to this Butler from a run in another Butler.
1658 Parameters
1659 ----------
1660 source_butler : `LimitedButler`
1661 Butler from which the datasets are to be transferred. If data IDs
1662 in ``source_refs`` are not expanded then this has to be a full
1663 `Butler` whose registry will be used to expand data IDs.
1664 source_refs : `~collections.abc.Iterable` of `DatasetRef`
1665 Datasets defined in the source butler that should be transferred to
1666 this butler. In most circumstances, ``transfer_from`` is faster if
1667 the dataset refs are expanded.
1668 transfer : `str`, optional
1669 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`.
1670 skip_missing : `bool`, optional
1671 If `True`, datasets with no datastore artifact associated with
1672 them are not transferred. If `False` a registry entry will be
1673 created even if no datastore record is created (and so will
1674 look equivalent to the dataset being unstored).
1675 register_dataset_types : `bool`, optional
1676 If `True` any missing dataset types are registered. Otherwise
1677 an exception is raised.
1678 transfer_dimensions : `bool`, optional
1679 If `True`, dimension record data associated with the new datasets
1680 will be transferred.
1681 dry_run : `bool`, optional
1682 If `True`, the transfer will be processed without making any
1683 modifications to the target butler, as if the target butler did
1684 not have any of the datasets.
1686 Returns
1687 -------
1688 refs : `~collections.abc.Collection` of `DatasetRef`
1689 The refs added to this Butler.
1691 Notes
1692 -----
1693 A datastore artifact must exist for a dataset to be transferred,
1694 but a missing artifact is not treated as an error (see ``skip_missing``).
1696 Datasets that already exist in this run will be skipped.
1698 The datasets are imported as part of a transaction, although
1699 dataset types are registered before the transaction is started.
1700 This means that it is possible for a dataset type to be registered
1701 even though transfer has failed.
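Examples
--------
A sketch of transferring datasets between two repositories; the
repository label, dataset type, and collection name are illustrative::

    from lsst.daf.butler import Butler

    source = Butler.from_config("source-repo")
    refs = source.query_datasets("calexp", collections="HSC/runs/example")
    transferred = butler.transfer_from(
        source,
        refs,
        transfer="copy",
        register_dataset_types=True,
        transfer_dimensions=True,
    )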
1702 """
1703 raise NotImplementedError()
1705 @abstractmethod
1706 def validateConfiguration(
1707 self,
1708 logFailures: bool = False,
1709 datasetTypeNames: Iterable[str] | None = None,
1710 ignore: Iterable[str] | None = None,
1711 ) -> None:
1712 """Validate butler configuration.
1714 Checks that each `DatasetType` can be stored in the `Datastore`.
1716 Parameters
1717 ----------
1718 logFailures : `bool`, optional
1719 If `True`, output a log message for every validation error
1720 detected.
1721 datasetTypeNames : `~collections.abc.Iterable` of `str`, optional
1722 The `DatasetType` names that should be checked. This allows
1723 only a subset to be selected.
1724 ignore : `~collections.abc.Iterable` of `str`, optional
1725 Names of DatasetTypes to skip over. This can be used to skip
1726 known problems. If a named `DatasetType` corresponds to a
1727 composite, all components of that `DatasetType` will also be
1728 ignored.
1730 Raises
1731 ------
1732 ButlerValidationError
1733 Raised if there is some inconsistency with how this Butler
1734 is configured.
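Examples
--------
A sketch of validating a subset of dataset types while skipping a
known problem; all names are illustrative::

    butler.validateConfiguration(
        logFailures=True,
        datasetTypeNames=["calexp", "src"],
        ignore=["packages"],
    )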
1735 """
1736 raise NotImplementedError()
1738 @property
1739 @abstractmethod
1740 def collection_chains(self) -> ButlerCollections:
1741 """Object with methods for modifying collection chains
1742 (`~lsst.daf.butler.ButlerCollections`).
1744 Deprecated. Replaced with ``collections`` property.
1745 """
1746 raise NotImplementedError()
1748 @property
1749 @abstractmethod
1750 def collections(self) -> ButlerCollections:
1751 """Object with methods for modifying and querying collections
1752 (`~lsst.daf.butler.ButlerCollections`).
1754 Use of this object is preferred over `registry` wherever possible.
1755 """
1756 raise NotImplementedError()
1758 @property
1759 @abstractmethod
1760 def run(self) -> str | None:
1761 """Name of the run this butler writes outputs to by default (`str` or
1762 `None`).
1763 """
1764 raise NotImplementedError()
1766 @property
1767 @abstractmethod
1768 def registry(self) -> Registry:
1769 """The object that manages dataset metadata and relationships
1770 (`Registry`).
1772 Many operations that don't involve reading or writing butler datasets
1773 are accessible only via `Registry` methods. Eventually these methods
1774 will be replaced by equivalent `Butler` methods.
1775 """
1776 raise NotImplementedError()
1778 @abstractmethod
1779 def query(self) -> AbstractContextManager[Query]:
1780 """Context manager returning a `.queries.Query` object used for
1781 construction and execution of complex queries.
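Examples
--------
A sketch of the fluent query interface; the dimension names and
instrument value are illustrative::

    with butler.query() as query:
        data_ids = list(
            query.data_ids(["visit", "detector"]).where(instrument="HSC")
        )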
1782 """
1783 raise NotImplementedError()
1785 def query_data_ids(
1786 self,
1787 dimensions: DimensionGroup | Iterable[str] | str,
1788 *,
1789 data_id: DataId | None = None,
1790 where: str = "",
1791 bind: Mapping[str, Any] | None = None,
1792 with_dimension_records: bool = False,
1793 order_by: Iterable[str] | str | None = None,
1794 limit: int | None = -20_000,
1795 explain: bool = True,
1796 **kwargs: Any,
1797 ) -> list[DataCoordinate]:
1798 """Query for data IDs matching user-provided criteria.
1800 Parameters
1801 ----------
1802 dimensions : `DimensionGroup`, `str`, or \
1803 `~collections.abc.Iterable` [`str`]
1804 The dimensions of the data IDs to yield, as either `DimensionGroup`
1805 instances or `str`. Will be automatically expanded to a complete
1806 `DimensionGroup`.
1807 data_id : `dict` or `DataCoordinate`, optional
1808 A data ID whose key-value pairs are used as equality constraints
1809 in the query.
1810 where : `str`, optional
1811 A string expression similar to a SQL WHERE clause. May involve
1812 any column of a dimension table or (as a shortcut for the primary
1813 key column of a dimension table) dimension name. See
1814 :ref:`daf_butler_dimension_expressions` for more information.
1815 bind : `~collections.abc.Mapping`, optional
1816 Mapping containing literal values that should be injected into the
1817 ``where`` expression, keyed by the identifiers they replace.
1818 Values of collection type can be expanded in some cases; see
1819 :ref:`daf_butler_dimension_expressions_identifiers` for more
1820 information.
1821 with_dimension_records : `bool`, optional
1822 If `True` (default is `False`) then returned data IDs will have
1823 dimension records.
1824 order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
1825 Names of the columns/dimensions to use for ordering returned data
1826 IDs. A column name can be prefixed with a minus sign (``-``) to
1827 use descending ordering.
1828 limit : `int` or `None`, optional
1829 Upper limit on the number of returned records. `None` can be used
1830 if no limit is wanted. A limit of ``0`` means that the query will
1831 be executed and validated but no results will be returned. In this
1832 case there will be no exception even if ``explain`` is `True`.
1833 If a negative value is given a warning will be issued if the number
1834 of results is capped by that limit.
1835 explain : `bool`, optional
1836 If `True` (default) then an `EmptyQueryResultError` exception is
1837 raised when the resulting list is empty. The exception contains a
1838 non-empty list of strings explaining possible causes for the
1839 empty result.
1840 **kwargs
1841 Additional keyword arguments are forwarded to
1842 `DataCoordinate.standardize` when processing the ``data_id``
1843 argument (and may be used to provide a constraining data ID even
1844 when the ``data_id`` argument is `None`).
1846 Returns
1847 -------
1848 dataIds : `list` [`DataCoordinate`]
1849 Data IDs matching the given query parameters. These are always
1850 guaranteed to identify all dimensions (`DataCoordinate.hasFull`
1851 returns `True`).
1853 Raises
1854 ------
1855 lsst.daf.butler.registry.DataIdError
1856 Raised when ``data_id`` or keyword arguments specify unknown
1857 dimensions or values, or when they contain inconsistent values.
1858 lsst.daf.butler.registry.UserExpressionError
1859 Raised when ``where`` expression is invalid.
1860 lsst.daf.butler.EmptyQueryResultError
1861 Raised when query generates empty result and ``explain`` is set to
1862 `True`.
1863 TypeError
1864 Raised when the arguments are incompatible.
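Examples
--------
A sketch using ``bind`` to inject literal values into the ``where``
expression; the instrument name and detector cutoff are illustrative::

    data_ids = butler.query_data_ids(
        ["visit", "detector"],
        where="instrument = inst AND detector < max_det",
        bind={"inst": "HSC", "max_det": 10},
    )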
1865 """
1866 if data_id is None:
1867 data_id = DataCoordinate.make_empty(self.dimensions)
1868 if order_by is None:
1869 order_by = []
1870 query_limit = limit
1871 warn_limit = False
1872 if limit is not None and limit < 0:
1873 query_limit = abs(limit) + 1
1874 warn_limit = True
1875 with self.query() as query:
1876 result = (
1877 query.data_ids(dimensions)
1878 .where(data_id, where, bind=bind, **kwargs)
1879 .order_by(*ensure_iterable(order_by))
1880 .limit(query_limit)
1881 )
1882 if with_dimension_records:
1883 result = result.with_dimension_records()
1884 data_ids = list(result)
1885 if warn_limit and len(data_ids) == query_limit:
1886 # We asked for one too many so must remove that from the list.
1887 data_ids.pop(-1)
1888 assert limit is not None # For mypy.
1889 _LOG.warning("More data IDs are available than the requested limit of %d.", abs(limit))
1890 if explain and (limit is None or limit != 0) and not data_ids:
1891 raise EmptyQueryResultError(list(result.explain_no_results()))
1892 return data_ids
1894 def query_datasets(
1895 self,
1896 dataset_type: str | DatasetType,
1897 collections: str | Iterable[str] | None = None,
1898 *,
1899 find_first: bool = True,
1900 data_id: DataId | None = None,
1901 where: str = "",
1902 bind: Mapping[str, Any] | None = None,
1903 with_dimension_records: bool = False,
1904 order_by: Iterable[str] | str | None = None,
1905 limit: int | None = -20_000,
1906 explain: bool = True,
1907 **kwargs: Any,
1908 ) -> list[DatasetRef]:
1909 """Query for dataset references matching user-provided criteria.
1911 Parameters
1912 ----------
1913 dataset_type : `str` or `DatasetType`
1914 Dataset type object or name to search for.
1915 collections : collection expression, optional
1916 A collection name or iterable of collection names to search. If not
1917 provided, the default collections are used. Can be a wildcard if
1918 ``find_first`` is `False` (if find first is requested the order
1919 of collections matters and wildcards make the order indeterminate).
1920 See :ref:`daf_butler_collection_expressions` for more information.
1921 find_first : `bool`, optional
1922 If `True` (default), for each result data ID, only yield one
1923 `DatasetRef` of each `DatasetType`, from the first collection in
1924 which a dataset of that dataset type appears (according to the
1925 order of ``collections`` passed in). When `True`, ``collections``
1926 must not contain wildcards.
1927 data_id : `dict` or `DataCoordinate`, optional
1928 A data ID whose key-value pairs are used as equality constraints in
1929 the query.
1930 where : `str`, optional
1931 A string expression similar to a SQL WHERE clause. May involve any
1932 column of a dimension table or (as a shortcut for the primary key
1933 column of a dimension table) dimension name. See
1934 :ref:`daf_butler_dimension_expressions` for more information.
1935 bind : `~collections.abc.Mapping`, optional
1936 Mapping containing literal values that should be injected into the
1937 ``where`` expression, keyed by the identifiers they replace. Values
1938 of collection type can be expanded in some cases; see
1939 :ref:`daf_butler_dimension_expressions_identifiers` for more
1940 information.
1941 with_dimension_records : `bool`, optional
1942 If `True` (default is `False`) then returned data IDs will have
1943 dimension records.
1944 order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
1945 Names of the columns/dimensions to use for ordering returned data
1946 IDs. A column name can be prefixed with a minus sign (``-``) to
1947 use descending ordering.
1948 limit : `int` or `None`, optional
1949 Upper limit on the number of returned records. `None` can be used
1950 if no limit is wanted. A limit of ``0`` means that the query will
1951 be executed and validated but no results will be returned. In this
1952 case there will be no exception even if ``explain`` is `True`.
1953 If a negative value is given a warning will be issued if the number
1954 of results is capped by that limit.
1955 explain : `bool`, optional
1956 If `True` (default) then an `EmptyQueryResultError` exception is
1957 raised when the resulting list is empty. The exception contains a
1958 non-empty list of strings explaining possible causes for the
1959 empty result.
1960 **kwargs
1961 Additional keyword arguments are forwarded to
1962 `DataCoordinate.standardize` when processing the ``data_id``
1963 argument (and may be used to provide a constraining data ID even
1964 when the ``data_id`` argument is `None`).
1966 Returns
1967 -------
1968 refs : `list` [`DatasetRef`]
1969 Dataset references matching the given query criteria. Nested data
1970 IDs are guaranteed to include values for all implied dimensions
1971 (i.e. `DataCoordinate.hasFull` will return `True`).
1973 Raises
1974 ------
1975 lsst.daf.butler.registry.DatasetTypeExpressionError
1976 Raised when ``dataset_type`` expression is invalid.
1977 lsst.daf.butler.registry.DataIdError
1978 Raised when ``data_id`` or keyword arguments specify unknown
1979 dimensions or values, or when they contain inconsistent values.
1980 lsst.daf.butler.registry.UserExpressionError
1981 Raised when ``where`` expression is invalid.
1982 lsst.daf.butler.EmptyQueryResultError
1983 Raised when query generates empty result and ``explain`` is set to
1984 `True`.
1985 TypeError
1986 Raised when the arguments are incompatible, such as when a
1987 collection wildcard is passed when ``find_first`` is `True`, or
1988 when ``collections`` is `None` and default butler collections are
1989 not defined.
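Examples
--------
A sketch of a find-first search for a single dataset type; the dataset
type, collection, and visit number are illustrative::

    refs = butler.query_datasets(
        "calexp",
        collections="HSC/runs/example",
        where="visit = v",
        bind={"v": 903334},
        limit=10,
    )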
1990 """
1991 if data_id is None:
1992 data_id = DataCoordinate.make_empty(self.dimensions)
1993 if order_by is None:
1994 order_by = []
1995 if collections and has_globs(collections):
1996 # Wildcards need to be expanded but can only be allowed if
1997 # find_first=False because expanding wildcards does not return
1998 # a guaranteed ordering. Querying collection registry to expand
1999 # collections when we do not have wildcards is expensive so only
2000 # do it if we need it.
2001 if find_first:
2002 raise InvalidQueryError(
2003 f"Can not use wildcards in collections when find_first=True (given {collections})"
2004 )
2005 collections = self.collections.query(collections)
2006 query_limit = limit
2007 warn_limit = False
2008 if limit is not None and limit < 0:
2009 query_limit = abs(limit) + 1
2010 warn_limit = True
2011 with self.query() as query:
2012 result = (
2013 query.datasets(dataset_type, collections=collections, find_first=find_first)
2014 .where(data_id, where, bind=bind, **kwargs)
2015 .order_by(*ensure_iterable(order_by))
2016 .limit(query_limit)
2017 )
2018 if with_dimension_records:
2019 result = result.with_dimension_records()
2020 refs = list(result)
2021 if warn_limit and len(refs) == query_limit:
2022 # We asked for one too many so must remove that from the list.
2023 refs.pop(-1)
2024 assert limit is not None # For mypy.
2025 _LOG.warning("More datasets are available than the requested limit of %d.", abs(limit))
2026 if explain and (limit is None or limit != 0) and not refs:
2027 raise EmptyQueryResultError(list(result.explain_no_results()))
2028 return refs
2030 def query_dimension_records(
2031 self,
2032 element: str,
2033 *,
2034 data_id: DataId | None = None,
2035 where: str = "",
2036 bind: Mapping[str, Any] | None = None,
2037 order_by: Iterable[str] | str | None = None,
2038 limit: int | None = -20_000,
2039 explain: bool = True,
2040 **kwargs: Any,
2041 ) -> list[DimensionRecord]:
2042 """Query for dimension information matching user-provided criteria.
2044 Parameters
2045 ----------
2046 element : `str`
2047 The name of a dimension element to obtain records for.
2048 data_id : `dict` or `DataCoordinate`, optional
2049 A data ID whose key-value pairs are used as equality constraints
2050 in the query.
2051 where : `str`, optional
2052 A string expression similar to a SQL WHERE clause. See
2053 `Registry.queryDataIds` and :ref:`daf_butler_dimension_expressions`
2054 for more information.
2055 bind : `~collections.abc.Mapping`, optional
2056 Mapping containing literal values that should be injected into the
2057 ``where`` expression, keyed by the identifiers they replace.
2058 Values of collection type can be expanded in some cases; see
2059 :ref:`daf_butler_dimension_expressions_identifiers` for more
2060 information.
2061 order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
2062 Names of the columns/dimensions to use for ordering returned data
2063 IDs. A column name can be prefixed with a minus sign (``-``) to
2064 use descending ordering.
2065 limit : `int` or `None`, optional
2066 Upper limit on the number of returned records. `None` can be used
2067 if no limit is wanted. A limit of ``0`` means that the query will
2068 be executed and validated but no results will be returned. In this
2069 case there will be no exception even if ``explain`` is `True`.
2070 If a negative value is given a warning will be issued if the number
2071 of results is capped by that limit.
2072 explain : `bool`, optional
2073 If `True` (default) then an `EmptyQueryResultError` exception is
2074 raised when the resulting list is empty. The exception contains a
2075 non-empty list of strings explaining possible causes for the
2076 empty result.
2077 **kwargs
2078 Additional keyword arguments are forwarded to
2079 `DataCoordinate.standardize` when processing the ``data_id``
2080 argument (and may be used to provide a constraining data ID even
2081 when the ``data_id`` argument is `None`).
2083 Returns
2084 -------
2085 records : `list` [`DimensionRecord`]
2086 Dimension records matching the given query parameters.
2088 Raises
2089 ------
2090 lsst.daf.butler.registry.DataIdError
2091 Raised when ``data_id`` or keyword arguments specify unknown
2092 dimensions or values, or when they contain inconsistent values.
2093 lsst.daf.butler.registry.UserExpressionError
2094 Raised when ``where`` expression is invalid.
2095 lsst.daf.butler.EmptyQueryResultError
2096 Raised when query generates empty result and ``explain`` is set to
2097 `True`.
2098 TypeError
2099 Raised when the arguments are incompatible.
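Examples
--------
A sketch of fetching every detector record for one instrument; the
instrument name is illustrative and is forwarded as a data ID
constraint through ``**kwargs``::

    records = butler.query_dimension_records(
        "detector",
        instrument="HSC",
        order_by="detector",
        limit=None,
    )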
2103 """
2104 if data_id is None:
2105 data_id = DataCoordinate.make_empty(self.dimensions)
2106 if order_by is None:
2107 order_by = []
2108 query_limit = limit
2109 warn_limit = False
2110 if limit is not None and limit < 0:
2111 query_limit = abs(limit) + 1
2112 warn_limit = True
2113 with self.query() as query:
2114 result = (
2115 query.dimension_records(element)
2116 .where(data_id, where, bind=bind, **kwargs)
2117 .order_by(*ensure_iterable(order_by))
2118 .limit(query_limit)
2119 )
2120 dimension_records = list(result)
2121 if warn_limit and len(dimension_records) == query_limit:
2122 # We asked for one too many so must remove that from the list.
2123 dimension_records.pop(-1)
2124 assert limit is not None # For mypy.
2125 _LOG.warning(
2126 "More dimension records are available than the requested limit of %d.", abs(limit)
2127 )
2128 if explain and (limit is None or limit != 0) and not dimension_records:
2129 raise EmptyQueryResultError(list(result.explain_no_results()))
2130 return dimension_records
2132 def query_all_datasets(
2133 self,
2134 collections: str | Iterable[str] | None = None,
2135 *,
2136 name: str | Iterable[str] = "*",
2137 find_first: bool = True,
2138 data_id: DataId | None = None,
2139 where: str = "",
2140 bind: Mapping[str, Any] | None = None,
2141 limit: int | None = -20_000,
2142 **kwargs: Any,
2143 ) -> list[DatasetRef]:
2144 """Query for datasets of potentially multiple types.
2146 Parameters
2147 ----------
2148 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
2149 The collection or collections to search, in order. If not provided
2150 or `None`, the default collection search path for this butler is
2151 used.
2152 name : `str` or `~collections.abc.Iterable` [ `str` ], optional
2153 Names or name patterns (glob-style) that returned dataset type
2154 names must match. If an iterable, items are OR'd together. The
2155 default is to include all dataset types in the given collections.
2156 find_first : `bool`, optional
2157 If `True` (default), for each result data ID, only yield one
2158 `DatasetRef` of each `DatasetType`, from the first collection in
2159 which a dataset of that dataset type appears (according to the
2160 order of ``collections`` passed in).
2161 data_id : `dict` or `DataCoordinate`, optional
2162 A data ID whose key-value pairs are used as equality constraints in
2163 the query.
2164 where : `str`, optional
2165 A string expression similar to a SQL WHERE clause. May involve any
2166 column of a dimension table or (as a shortcut for the primary key
2167 column of a dimension table) dimension name. See
2168 :ref:`daf_butler_dimension_expressions` for more information.
2169 bind : `~collections.abc.Mapping`, optional
2170 Mapping containing literal values that should be injected into the
2171 ``where`` expression, keyed by the identifiers they replace. Values
2172 of collection type can be expanded in some cases; see
2173 :ref:`daf_butler_dimension_expressions_identifiers` for more
2174 information.
2175 limit : `int` or `None`, optional
2176 Upper limit on the number of returned records. `None` can be used
2177 if no limit is wanted. A limit of ``0`` means that the query will
2178 be executed and validated but no results will be returned.
2179 If a negative value is given a warning will be issued if the number
2180 of results is capped by that limit. If no limit is provided, by
2181 default a maximum of 20,000 records will be returned.
2182 **kwargs
2183 Additional keyword arguments are forwarded to
2184 `DataCoordinate.standardize` when processing the ``data_id``
2185 argument (and may be used to provide a constraining data ID even
2186 when the ``data_id`` argument is `None`).
2188 Raises
2189 ------
2190 MissingDatasetTypeError
2191 Raised when no dataset types match ``name``, or when an explicit
2192 (non-glob) dataset type in ``name`` does not exist.
2193 InvalidQueryError
2194 Raised if the parameters to the query are inconsistent or malformed.
2195 MissingCollectionError
2196 Raised if a given collection is not found.
2198 Returns
2199 -------
2200 refs : `list` [ `DatasetRef` ]
2201 Dataset references matching the given query criteria. Nested data
2202 IDs are guaranteed to include values for all implied dimensions
2203 (i.e. `DataCoordinate.hasFull` will return `True`), but will not
2204 include dimension records (`DataCoordinate.hasRecords` will be
2205 `False`).
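Examples
--------
A sketch of querying several dataset types at once with a glob
pattern; the collection and dataset type names are illustrative::

    refs = butler.query_all_datasets(
        collections="HSC/defaults",
        name=["calexp", "deepCoadd*"],
        where="instrument = 'HSC'",
    )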
2206 """
2207 if collections is None:
2208 collections = list(self.collections.defaults)
2209 else:
2210 collections = list(ensure_iterable(collections))
2212 if bind is None:
2213 bind = {}
2214 if data_id is None:
2215 data_id = {}
2217 warn_limit = False
2218 if limit is not None and limit < 0:
2219 # Add one to the limit so we can detect if we have exceeded it.
2220 limit = abs(limit) + 1
2221 warn_limit = True
2223 args = QueryAllDatasetsParameters(
2224 collections=collections,
2225 name=list(ensure_iterable(name)),
2226 find_first=find_first,
2227 data_id=data_id,
2228 where=where,
2229 limit=limit,
2230 bind=bind,
2231 kwargs=kwargs,
2232 with_dimension_records=False,
2233 )
2234 with self._query_all_datasets_by_page(args) as pages:
2235 result = []
2236 for page in pages:
2237 result.extend(page)
2239 if warn_limit and limit is not None and len(result) >= limit:
2240 # Remove the extra dataset we added for the limit check.
2241 result.pop()
2242 _LOG.warning("More datasets are available than the requested limit of %d.", limit - 1)
2244 return result
2246 @abstractmethod
2247 def _query_all_datasets_by_page(
2248 self, args: QueryAllDatasetsParameters
2249 ) -> AbstractContextManager[Iterator[list[DatasetRef]]]:
2250 raise NotImplementedError()
2252 def clone(
2253 self,
2254 *,
2255 collections: CollectionArgType | None | EllipsisType = ...,
2256 run: str | None | EllipsisType = ...,
2257 inferDefaults: bool | EllipsisType = ...,
2258 dataId: dict[str, str] | EllipsisType = ...,
2259 metrics: ButlerMetrics | None = None,
2260 ) -> Butler:
2261 """Return a new Butler instance connected to the same repository
2262 as this one, optionally overriding ``collections``, ``run``,
2263 ``inferDefaults``, and default data ID.
2265 Parameters
2266 ----------
2267 collections : `~lsst.daf.butler.registry.CollectionArgType` or `None`,\
2268 optional
2269 Same as constructor. If omitted, uses value from original object.
2270 run : `str` or `None`, optional
2271 Same as constructor. If `None`, no default run is used. If
2272 omitted, copies value from original object.
2273 inferDefaults : `bool`, optional
2274 Same as constructor. If omitted, copies value from original
2275 object.
2276 dataId : `dict` [`str`, `str`], optional
2277 Same as ``kwargs`` passed to the constructor. If omitted, copies
2278 values from the original object.
2279 metrics : `ButlerMetrics` or `None`, optional
2280 Metrics object to record butler statistics.
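Examples
--------
A sketch of creating a writeable sibling of an existing butler; the
run name is illustrative::

    writer = butler.clone(run="u/someone/output-run")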
2281 """
2282 raise NotImplementedError()
2284 @abstractmethod
2285 def close(self) -> None:
2286 raise NotImplementedError()
2288 @abstractmethod
2289 def _expand_data_ids(self, data_ids: Iterable[DataCoordinate]) -> list[DataCoordinate]:
2290 raise NotImplementedError()