Coverage for python/lsst/daf/butler/script/queryDataIds.py: 13%

85 statements  

coverage.py v7.13.5, created at 2026-04-22 08:55 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

import logging
from collections.abc import Iterable
from typing import TYPE_CHECKING

import numpy as np
from astropy.table import Table as AstropyTable

from .._butler import Butler
from ..cli.utils import sortAstropyTable
from ..dimensions import DataCoordinate

if TYPE_CHECKING:
    from lsst.daf.butler import DimensionGroup

_LOG = logging.getLogger(__name__)


class _Table:
    """Aggregate DataIds and create an astropy table with one DataId per
    row, eliminating duplicate rows.

    Parameters
    ----------
    dataIds : `~collections.abc.Iterable` [ ``DataId`` ]
        The DataIds to add to the table.
    """

    def __init__(self, dataIds: Iterable[DataCoordinate]):
        # Store the dataIds as dict keys: this deduplicates them while
        # preserving insertion order.
        self.dataIds = dict.fromkeys(dataIds)

    def __len__(self) -> int:
        return len(self.dataIds)

    def pop_last(self) -> None:
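        """Remove the most recently added DataId, if any.

        `queryDataIds` uses this to drop the extra sentinel row it fetches
        when checking whether a limited query was truncated.
        """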

        if self.dataIds:
            final_key = list(self.dataIds.keys())[-1]
            self.dataIds.pop(final_key)

    def getAstropyTable(self, order: bool) -> AstropyTable:
        """Get the table as an astropy table.

        Parameters
        ----------
        order : `bool`
            If `True`, order the rows based on the DataIds.

        Returns
        -------
        table : `astropy.table.Table`
            The dataIds, sorted by spatial and temporal columns first and
            then by the rest of the columns, with duplicate dataIds removed.
        """
        # Should never happen; adding a dataset should be the action that
        # causes a _Table to be created.
        if not self.dataIds:
            raise RuntimeError("No DataIds were provided.")

        dataId = next(iter(self.dataIds))
        dimensions = [dataId.universe.dimensions[k] for k in dataId.dimensions.data_coordinate_keys]
        columnNames = [str(item) for item in dimensions]

        # Need to hint the column types for numbers, since the per-row
        # constructor of Table does not work this out on its own and sorting
        # will not work properly without it.
        typeMap = {float: np.float64, int: np.int64}
        columnTypes = [typeMap.get(type(value)) for value in dataId.full_values]
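        # ``full_values`` contains the values of both the required and the
        # implied dimensions, matching the column order derived above.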

        rows = [dataId.full_values for dataId in self.dataIds]

        table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        if order:
            table = sortAstropyTable(table, dimensions)
        return table
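
# Usage sketch (illustrative only; the data IDs would come from a real
# query, e.g. inside a ``with butler.query() as query:`` block):
#
#     results = query.data_ids(["exposure", "detector"])
#     table = _Table(results)
#     astropy_table = table.getAstropyTable(order=True)
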

def queryDataIds(
    repo: str,
    dimensions: Iterable[str],
    datasets: tuple[str, ...],
    where: str,
    collections: Iterable[str],
    order_by: tuple[str, ...],
    limit: int,
    offset: int,
) -> tuple[AstropyTable | None, str | None]:
    """Query for data IDs.

    Parameters
    ----------
    repo : `str`
        Butler repository location.
    dimensions : `~collections.abc.Iterable` of `str`
        Dimensions to use for the query.
    datasets : `tuple` of `str`
        Dataset types to restrict the query by.
    where : `str`
        Query string.
    collections : `~collections.abc.Iterable` of `str`
        Collections to search.
    order_by : `tuple` of `str`
        Columns to order the results by.
    limit : `int`
        Maximum number of results.
    offset : `int`
        Offset into the results.

    Notes
    -----
    The docstrings for the supported parameters are the same as for
    `~lsst.daf.butler.Registry.queryDataIds`.
    """
    if offset:
        raise NotImplementedError("--offset is no longer supported. It will be removed after v28.")

    with Butler.from_config(repo, without_datastore=True) as butler:
        dataset_types = []
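        # ``datasets`` may contain wildcard expressions; expand them into
        # concrete dataset types before they are used below.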

        if datasets:
            dataset_types = list(butler.registry.queryDatasetTypes(datasets))

        if datasets and collections and not dimensions:
            # Determine the dimensions relevant to all of the given dataset
            # types. Since we are going to AND all of the dimension sets
            # together, we cannot seed the result with an empty set.
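            # Illustrative example (hypothetical dataset types): if "raw"
            # has dimensions {instrument, exposure, detector} and "calexp"
            # has {instrument, visit, detector}, the intersection, and hence
            # the queried dimensions, is {instrument, detector}.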

            dataset_type_dimensions: DimensionGroup | None = None
            for dataset_type in dataset_types:
                if dataset_type_dimensions is None:
                    # Seed with dimensions of first dataset type.
                    dataset_type_dimensions = dataset_type.dimensions
                else:
                    # Only retain dimensions that are in the current set AND
                    # the set from this dataset type.
                    dataset_type_dimensions = dataset_type_dimensions.intersection(dataset_type.dimensions)
                _LOG.debug("Dimensions now %s from %s", set(dataset_type_dimensions.names), dataset_type.name)

                # Break out of the loop early. No additional dimensions can
                # be added to an empty set when using AND.
                if not dataset_type_dimensions:
                    break

            if not dataset_type_dimensions:
                names = [d.name for d in dataset_types]
                return None, f"No dimensions in common for specified dataset types ({names})"
            dimensions = set(dataset_type_dimensions.names)
            _LOG.info("Determined dimensions %s from datasets option %s", dimensions, datasets)

        with butler.query() as query:
            if datasets:
                # Need to constrain results based on dataset type and
                # collection.
                query_collections = collections or "*"
                collections_info = butler.collections.query_info(
                    query_collections, include_summary=True, summary_datasets=dataset_types
                )
                expanded_collections = [info.name for info in collections_info]
                dataset_type_collections = butler.collections._group_by_dataset_type(
                    {dt.name for dt in dataset_types}, collections_info
                )
                if not dataset_type_collections:
                    return (
                        None,
                        f"No datasets of type {datasets!r} existed in the specified "
                        f"collections {','.join(expanded_collections)}.",
                    )

                for dt, dt_collections in dataset_type_collections.items():
                    query = query.join_dataset_search(dt, collections=dt_collections)

            results = query.data_ids(dimensions)

            if where:
                results = results.where(where)
            if order_by:
                results = results.order_by(*order_by)
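            # A negative limit is a request to warn about truncation below:
            # fetch one extra row as a sentinel so we can tell whether more
            # data IDs exist beyond the requested limit.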

            query_limit = abs(limit)
            warn_limit = False
            if limit != 0:
                if limit < 0:
                    query_limit += 1
                    warn_limit = True

                results = results.limit(query_limit)

            if results.any(exact=False):
                if results.dimensions:
                    table = _Table(results)
                    if warn_limit and len(table) == query_limit:
                        table.pop_last()
                        _LOG.warning("More data IDs are available than the request limit of %d", abs(limit))
                    if not table.dataIds:
                        return None, "Post-query region filtering removed all rows, since nothing overlapped."
                    return table.getAstropyTable(not order_by), None
                else:
                    return (
                        None,
                        "Result has one logical row but no columns because no dimensions were requested.",
                    )
            else:
                return None, "\n".join(results.explain_no_results())
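
# Example (illustrative): this function backs the ``butler query-data-ids``
# command line. Called directly, with hypothetical repository, collection,
# and instrument names, it would look like:
#
#     table, reason = queryDataIds(
#         repo="/path/to/repo",
#         dimensions=["exposure", "detector"],
#         datasets=("raw",),
#         where="instrument = 'LSSTCam'",
#         collections=["LSSTCam/raw/all"],
#         order_by=(),
#         limit=-20000,
#         offset=0,
#     )
#     if table is None:
#         print(reason)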