Coverage for python/lsst/daf/butler/script/queryDataIds.py: 13% of 85 statements
(coverage.py v7.13.5, created at 2026-04-17 08:49 +0000)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

import logging
from collections.abc import Iterable
from typing import TYPE_CHECKING

import numpy as np
from astropy.table import Table as AstropyTable

from .._butler import Butler
from ..cli.utils import sortAstropyTable
from ..dimensions import DataCoordinate

if TYPE_CHECKING:
    from lsst.daf.butler import DimensionGroup

_LOG = logging.getLogger(__name__)


class _Table:
    """Aggregates DataIds and creates an astropy table with one DataId per
    row. Eliminates duplicate rows.

    Parameters
    ----------
    dataIds : `~collections.abc.Iterable` [ ``DataId`` ]
        The DataIds to add to the table.
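
    Examples
    --------
    A minimal usage sketch (not part of the original source); the repository
    path and the dimension values are assumptions for illustration::

        from lsst.daf.butler import Butler, DataCoordinate

        butler = Butler.from_config("/repo/main")  # hypothetical repo path
        universe = butler.dimensions
        dataIds = [
            DataCoordinate.standardize(
                instrument="HSC", detector=d, universe=universe
            )
            for d in (1, 2, 2, 3)  # duplicate detector 2 collapses to one row
        ]
        table = _Table(dataIds)
        assert len(table) == 3
        print(table.getAstropyTable(order=True))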
    """

    def __init__(self, dataIds: Iterable[DataCoordinate]):
        # Store the dataIds as dict keys: this drops duplicates while
        # preserving insertion order.
        self.dataIds = dict.fromkeys(dataIds)
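        # For example (illustrative values, standing in for real
        # DataCoordinate keys):
        #
        #     >>> dict.fromkeys(["b", "a", "b", "c"])
        #     {'b': None, 'a': None, 'c': None}
        #
        # Unlike set(), this keeps the first-seen ordering of the keys.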

    def __len__(self) -> int:
        return len(self.dataIds)

    def pop_last(self) -> None:
        if self.dataIds:
            # dict.popitem() removes the most recently inserted entry,
            # i.e. the last row of the eventual table.
            self.dataIds.popitem()

    def getAstropyTable(self, order: bool) -> AstropyTable:
        """Get the table as an astropy table.

        Parameters
        ----------
        order : `bool`
            If `True` then order rows based on DataIds.

        Returns
        -------
        table : `astropy.table.Table`
            The dataIds, sorted by spatial and temporal columns first, and then
            the rest of the columns, with duplicate dataIds removed.
        """
        # This should never happen in practice: a _Table is only created
        # once there is at least one DataId to add to it.
        if not self.dataIds:
            raise RuntimeError("No DataIds were provided.")

        dataId = next(iter(self.dataIds))
        dimensions = [dataId.universe.dimensions[k] for k in dataId.dimensions.data_coordinate_keys]
        columnNames = [str(item) for item in dimensions]

        # Need to hint the column types for numbers since the per-row
        # constructor of Table does not work this out on its own and sorting
        # will not work properly without them.
        typeMap = {float: np.float64, int: np.int64}
        columnTypes = [typeMap.get(type(value)) for value in dataId.full_values]
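        # Illustrative sketch (hypothetical values, not from this module):
        # np.array() coerces a mixed-type row such as ("HSC", 903342) to a
        # common string dtype, so numeric columns would sort
        # lexicographically ("10" < "9") without the hints:
        #
        #     >>> import numpy as np
        #     >>> np.array([("HSC", 903342)]).dtype
        #     dtype('<U6')
        #
        # Supplying np.int64 for the integer column restores numeric sorting;
        # None entries let astropy infer the remaining column types.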

        rows = [dataId.full_values for dataId in self.dataIds]

        table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        if order:
            table = sortAstropyTable(table, dimensions)
        return table


def queryDataIds(
    repo: str,
    dimensions: Iterable[str],
    datasets: tuple[str, ...],
    where: str,
    collections: Iterable[str],
    order_by: tuple[str, ...],
    limit: int,
    offset: int,
) -> tuple[AstropyTable | None, str | None]:
    """Query for data IDs.

    Parameters
    ----------
    repo : `str`
        Butler repository location.
    dimensions : `~collections.abc.Iterable` of `str`
        Dimensions to use for the query.
    datasets : `tuple` of `str`
        Dataset types to restrict the query by.
    where : `str`
        Query string.
    collections : `~collections.abc.Iterable` of `str`
        Collections to search.
    order_by : `tuple` of `str`
        Columns to order results by.
    limit : `int`
        Maximum number of results.
    offset : `int`
        Offset into the results. No longer supported; must be 0.

    Returns
    -------
    table : `astropy.table.Table` or `None`
        A table of the matching data IDs, or `None` if there was a problem
        (described by ``reason``).
    reason : `str` or `None`
        An explanation of why no table was returned, or `None` on success.

    Notes
    -----
    The supported parameters are documented in more detail in
    `~lsst.daf.butler.Registry.queryDataIds`.
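
    Examples
    --------
    A minimal usage sketch (not from the original source; the repository
    path, dimension names, ``where`` clause, and collection name are
    assumptions)::

        table, reason = queryDataIds(
            repo="/repo/main",
            dimensions=("exposure", "detector"),
            datasets=("raw",),
            where="instrument = 'HSC'",
            collections=("HSC/raw/all",),
            order_by=("exposure",),
            limit=100,
            offset=0,
        )
        if table is None:
            print(reason)
        else:
            table.pprint()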
    """
    if offset:
        raise NotImplementedError("--offset is no longer supported. It will be removed after v28.")

    with Butler.from_config(repo, without_datastore=True) as butler:
        dataset_types = []
        if datasets:
            dataset_types = list(butler.registry.queryDatasetTypes(datasets))

        if datasets and collections and not dimensions:
            # Determine the dimensions relevant to all given dataset types.
            # Since we are going to AND together all dimensions, we cannot
            # seed the result with an empty set.
            dataset_type_dimensions: DimensionGroup | None = None
            for dataset_type in dataset_types:
                if dataset_type_dimensions is None:
                    # Seed with dimensions of first dataset type.
                    dataset_type_dimensions = dataset_type.dimensions
                else:
                    # Only retain dimensions that are in the current
                    # set AND the set from this dataset type.
                    dataset_type_dimensions = dataset_type_dimensions.intersection(dataset_type.dimensions)
                    _LOG.debug("Dimensions now %s from %s", set(dataset_type_dimensions.names), dataset_type.name)

                # Break out of the loop early. No additional dimensions
                # can be added to an empty set when using AND.
                if not dataset_type_dimensions:
                    break

            if not dataset_type_dimensions:
                names = [d.name for d in dataset_types]
                return None, f"No dimensions in common for specified dataset types ({names})"
            dimensions = set(dataset_type_dimensions.names)
            _LOG.info("Determined dimensions %s from datasets option %s", dimensions, datasets)
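
        # Illustrative example (dataset type names and dimension sets are
        # hypothetical): for dataset types with dimensions
        #
        #     raw:    {instrument, exposure, detector}
        #     calexp: {instrument, visit, detector}
        #
        # the running intersection leaves {instrument, detector}, which is
        # then used as the queried dimension set. A dataset type with
        # disjoint dimensions (e.g. {skymap, tract}) would empty the set
        # and trigger the early return above.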

        with butler.query() as query:
            if datasets:
                # Need to constrain results based on dataset type and
                # collection.
                query_collections = collections or "*"
                collections_info = butler.collections.query_info(
                    query_collections, include_summary=True, summary_datasets=dataset_types
                )
                expanded_collections = [info.name for info in collections_info]
                dataset_type_collections = butler.collections._group_by_dataset_type(
                    {dt.name for dt in dataset_types}, collections_info
                )
                if not dataset_type_collections:
                    return (
                        None,
                        f"No datasets of type {datasets!r} existed in the specified "
                        f"collections {','.join(expanded_collections)}.",
                    )

                for dt, dt_collections in dataset_type_collections.items():
                    query = query.join_dataset_search(dt, collections=dt_collections)
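
                # For example (hypothetical names): with
                #
                #     dataset_type_collections == {
                #         "raw": ["HSC/raw/all"],
                #         "calexp": ["HSC/runs/RC2"],
                #     }
                #
                # the loop above behaves like
                #
                #     query = query.join_dataset_search("raw", collections=["HSC/raw/all"])
                #     query = query.join_dataset_search("calexp", collections=["HSC/runs/RC2"])
                #
                # so only data IDs backed by a dataset of every requested
                # type survive the query.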

            results = query.data_ids(dimensions)

            if where:
                results = results.where(where)
            if order_by:
                results = results.order_by(*order_by)
            query_limit = abs(limit)
            warn_limit = False
            if limit != 0:
                if limit < 0:
                    query_limit += 1
                    warn_limit = True

                results = results.limit(query_limit)
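
            # A sketch of the limit bookkeeping above (values illustrative):
            # with limit == -5 the query asks for abs(-5) + 1 == 6 rows. If
            # all 6 come back there are more matches than requested, so the
            # extra row is dropped below and a warning is logged; with 5 or
            # fewer rows the result was complete and no warning is needed.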

            if results.any(exact=False):
                if results.dimensions:
                    table = _Table(results)
                    if warn_limit and len(table) == query_limit:
                        table.pop_last()
                        _LOG.warning("More data IDs are available than the request limit of %d", abs(limit))
                    if not table.dataIds:
                        return None, "Post-query region filtering removed all rows, since nothing overlapped."
                    return table.getAstropyTable(not order_by), None
                else:
                    return (
                        None,
                        "Result has one logical row but no columns because no dimensions were requested.",
                    )
            else:
                return None, "\n".join(results.explain_no_results())