Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 18%

68 statements  


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

import itertools
from typing import Any

from lsst.daf.relation import ColumnExpression, ColumnTag, Diagnostics, Relation

from ..._column_categorization import ColumnCategorization
from ..._column_tags import DatasetColumnTag, DimensionKeyColumnTag, DimensionRecordColumnTag
from ..._dataset_type import DatasetType
from ..wildcards import CollectionWildcard
from ._query import Query
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._structs import QuerySummary

class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    context : `QueryContext`, optional
        Object that manages relation engines and database-side state (e.g.
        temporary tables) for the query. Must have been created by
        ``backend.context()``, which is used if ``context`` is not provided.
    relation : `~lsst.daf.relation.Relation`, optional
        Initial relation for the query.
    """

    def __init__(
        self,
        summary: QuerySummary,
        backend: QueryBackend,
        context: QueryContext | None = None,
        relation: Relation | None = None,
    ):
        self.summary = summary
        self._backend = backend
        self._context = backend.context() if context is None else context
        self.relation = self._context.make_initial_relation(relation)
        self._governor_constraints = self._backend.resolve_governor_constraints(
            self.summary.dimensions, self.summary.where.governor_constraints
        )
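
    # A minimal construction sketch, assuming a `QuerySummary` (``summary``)
    # and a `QueryBackend` (``backend``) obtained elsewhere; the names are
    # placeholders, not part of this module:
    #
    #     builder = QueryBuilder(summary, backend)
    #     # or, sharing an explicitly created context between builders:
    #     context = backend.context()
    #     builder = QueryBuilder(summary, backend, context=context)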

    def joinDataset(
        self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
    ) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets. However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof. `...` can be used to search all collections. See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type. If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most once
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order. Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order. Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed. If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query
            will return no results.
        """
        assert datasetType in self.summary.datasets
        collections = CollectionWildcard.from_expression(collections)
        if isResult and findFirst:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            datasetType,
            collections,
            governor_constraints=self._governor_constraints,
            rejections=rejections,
            allow_calibration_collections=(not findFirst and not self.summary.dimensions.temporal),
        )
        columns_requested = {"dataset_id", "run", "ingest_date"} if isResult else frozenset()
        if not collection_records:
            relation = self._backend.make_doomed_dataset_relation(
                datasetType, columns_requested, rejections, self._context
            )
        elif isResult and findFirst:
            relation = self._backend.make_dataset_search_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        else:
            relation = self._backend.make_dataset_query_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        self.relation = self.relation.join(relation)
        return not Diagnostics.run(relation).is_doomed
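
    # A usage sketch for `joinDataset`, assuming a ``builder`` and a resolved
    # ``dataset_type`` from elsewhere; the collection names are placeholders:
    #
    #     if not builder.joinDataset(dataset_type, ["run/a", "run/b"], findFirst=True):
    #         # The backend proved no datasets can match (e.g. every collection
    #         # was rejected), so the finished query would return no rows.
    #         ...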

    def _addWhereClause(self, categorized_columns: ColumnCategorization) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.

        Parameters
        ----------
        categorized_columns : `ColumnCategorization`
            Struct that organizes the columns in ``self.relation`` by
            `ColumnTag` type.
        """
        # Append WHERE clause terms from predicates.
        if self.summary.where.expression_predicate is not None:
            self.relation = self.relation.with_rows_satisfying(
                self.summary.where.expression_predicate,
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        if self.summary.where.data_id:
            known_dimensions = self.summary.where.data_id.dimensions.intersection(self.summary.dimensions)
            known_data_id = self.summary.where.data_id.subset(known_dimensions)
            self.relation = self.relation.with_rows_satisfying(
                self._context.make_data_coordinate_predicate(known_data_id),
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        if self.summary.region is not None:
            for skypix_dimension in categorized_columns.filter_skypix(self._backend.universe):
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_skypix_predicate(
                        skypix_dimension,
                        self.summary.region,
                    ),
                    preferred_engine=self._context.preferred_engine,
                    require_preferred_engine=True,
                )
            for element in categorized_columns.filter_spatial_region_dimension_elements():
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_overlap_predicate(
                        ColumnExpression.reference(DimensionRecordColumnTag(element, "region")),
                        ColumnExpression.literal(self.summary.region),
                    ),
                    preferred_engine=self._context.iteration_engine,
                    transfer=True,
                )
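
    # The ``expression_predicate`` consumed above typically originates from a
    # user-facing ``where`` string in the butler dimension-expression
    # language; a sketch of the kind of expression involved (values here are
    # illustrative):
    #
    #     "instrument = 'HSC' AND visit = 1234"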

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish query construction, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with). `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret result
            rows.
        """
        if joinMissing:
            spatial_joins = []
            for family1, family2 in itertools.combinations(self.summary.dimensions.spatial, 2):
                spatial_joins.append(
                    (
                        family1.choose(self.summary.dimensions.elements.names, self.summary.universe).name,
                        family2.choose(self.summary.dimensions.elements.names, self.summary.universe).name,
                    )
                )
            self.relation = self._backend.make_dimension_relation(
                self.summary.dimensions,
                columns=self.summary.columns_required,
                context=self._context,
                spatial_joins=spatial_joins,
                initial_relation=self.relation,
                governor_constraints=self._governor_constraints,
            )
        categorized_columns = ColumnCategorization.from_iterable(self.relation.columns)
        self._addWhereClause(categorized_columns)
        query = Query(
            self.summary.dimensions,
            self._backend,
            context=self._context,
            relation=self.relation,
            governor_constraints=self._governor_constraints,
            is_deferred=True,
            has_record_columns=False,
        )
        if self.summary.order_by is not None:
            query = query.sorted(self.summary.order_by.terms)
        if self.summary.limit is not None:
            query = query.sliced(
                start=self.summary.limit[0],
                stop=(
                    self.summary.limit[0] + self.summary.limit[1]
                    if self.summary.limit[1] is not None
                    else None
                ),
            )
        projected_columns: set[ColumnTag] = set()
        projected_columns.update(DimensionKeyColumnTag.generate(self.summary.requested.names))
        for dataset_type in self.summary.datasets:
            for dataset_column_name in ("dataset_id", "run"):
                tag = DatasetColumnTag(dataset_type.name, dataset_column_name)
                if tag in self.relation.columns:
                    projected_columns.add(tag)
        return query.projected(
            dimensions=self.summary.requested,
            columns=projected_columns,
            drop_postprocessing=False,
            unique=False,
        )
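
# An end-to-end sketch, assuming ``summary``, ``backend``, and
# ``dataset_type`` built elsewhere; collection names are placeholders:
#
#     builder = QueryBuilder(summary, backend)
#     builder.joinDataset(dataset_type, ["run/a", "run/b"], findFirst=True)
#     query = builder.finish()
#     # ``query`` is deferred (``is_deferred=True`` above); it can then be
#     # executed to iterate over result rows.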