Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 17%

66 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

import itertools
from typing import Any

from lsst.daf.relation import ColumnExpression, ColumnTag, Diagnostics, Relation

from ...core import (
    ColumnCategorization,
    DatasetColumnTag,
    DatasetType,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
)
from ..wildcards import CollectionWildcard
from ._query import Query
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._structs import QuerySummary


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    context : `QueryContext`, optional
        Object that manages relation engines and database-side state (e.g.
        temporary tables) for the query. Must have been created by
        ``backend.context()``, which is used if ``context`` is not provided.
    relation : `~lsst.daf.relation.Relation`, optional
        Initial relation for the query.
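
    Examples
    --------
    A minimal usage sketch (illustrative only: ``summary`` and ``backend``
    are assumed to have been built elsewhere, and the dataset type and
    collection name here are hypothetical)::

        builder = QueryBuilder(summary, backend)
        builder.joinDataset(calexp_type, collections=["HSC/runs/example"])
        query = builder.finish()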

    """

    def __init__(
        self,
        summary: QuerySummary,
        backend: QueryBackend,
        context: QueryContext | None = None,
        relation: Relation | None = None,
    ):
        self.summary = summary
        self._backend = backend
        # Fall back to a fresh context from the backend when the caller does
        # not supply one.
        self._context = backend.context() if context is None else context
        self.relation = self._context.make_initial_relation(relation)
        self._governor_constraints = self._backend.resolve_governor_constraints(
            self.summary.dimensions, self.summary.where.governor_constraints, self._context
        )

    def joinDataset(
        self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
    ) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or to constrain
        the query results based on the existence of datasets. However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof. `...` can be used to search all collections. See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type. If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most once
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order. Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order. Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed. If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due to
            the inner join that would normally be present), the full query
            will return no results.
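
        Examples
        --------
        An illustrative sketch of using the return value to short-circuit a
        doomed query (``builder`` and ``raw_type`` are hypothetical)::

            if not builder.joinDataset(raw_type, collections=["HSC/raw/all"]):
                # No datasets can match; the full query will return no rows.
                ...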

        """
        assert datasetType in self.summary.datasets
        collections = CollectionWildcard.from_expression(collections)
        if isResult and findFirst:
            # A find-first search needs an unambiguous search order.
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            datasetType,
            collections,
            governor_constraints=self._governor_constraints,
            rejections=rejections,
            allow_calibration_collections=(not findFirst and not self.summary.dimensions.temporal),
        )
        columns_requested = {"dataset_id", "run", "ingest_date"} if isResult else frozenset()
        if not collection_records:
            # No collections survived the constraints; join in a "doomed"
            # relation that records (via ``rejections``) why the query can
            # return no rows.
            relation = self._backend.make_doomed_dataset_relation(
                datasetType, columns_requested, rejections, self._context
            )
        elif isResult and findFirst:
            relation = self._backend.make_dataset_search_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        else:
            relation = self._backend.make_dataset_query_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        self.relation = self.relation.join(relation)
        return not Diagnostics.run(relation).is_doomed

    def _addWhereClause(self, categorized_columns: ColumnCategorization) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.

        Parameters
        ----------
        categorized_columns : `ColumnCategorization`
            Struct that organizes the columns in ``self.relation`` by
            `ColumnTag` type.
        """
        # Append WHERE clause terms from predicates.
        if self.summary.where.expression_predicate is not None:
            self.relation = self.relation.with_rows_satisfying(
                self.summary.where.expression_predicate,
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        if self.summary.where.data_id:
            # Constrain only on the dimensions of the data ID that the query
            # actually uses.
            known_dimensions = self.summary.where.data_id.graph.intersection(self.summary.dimensions)
            known_data_id = self.summary.where.data_id.subset(known_dimensions)
            self.relation = self.relation.with_rows_satisfying(
                self._context.make_data_coordinate_predicate(known_data_id),
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        if self.summary.region is not None:
            for skypix_dimension in categorized_columns.filter_skypix(self._backend.universe):
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_skypix_predicate(
                        skypix_dimension,
                        self.summary.region,
                    ),
                    preferred_engine=self._context.preferred_engine,
                    require_preferred_engine=True,
                )
            for element in categorized_columns.filter_spatial_region_dimension_elements():
                # Evaluate precise region-overlap predicates in the iteration
                # engine, transferring rows out of the preferred engine as
                # needed.
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_overlap_predicate(
                        ColumnExpression.reference(DimensionRecordColumnTag(element, "region")),
                        ColumnExpression.literal(self.summary.region),
                    ),
                    preferred_engine=self._context.iteration_engine,
                    transfer=True,
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish constructing the query, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with). `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret result
            rows.
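
        Examples
        --------
        Continuing the class-level sketch::

            query = builder.finish()

        The `Query` is created in deferred mode (``is_deferred=True`` in the
        implementation), so `finish` does not itself execute the query.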

        """
        if joinMissing:
            # Build a spatial join between one representative element from
            # each pair of spatial dimension families in the query.
            spatial_joins = []
            for family1, family2 in itertools.combinations(self.summary.dimensions.spatial, 2):
                spatial_joins.append(
                    (
                        family1.choose(self.summary.dimensions.elements).name,
                        family2.choose(self.summary.dimensions.elements).name,
                    )
                )
            self.relation = self._backend.make_dimension_relation(
                self.summary.dimensions,
                columns=self.summary.columns_required,
                context=self._context,
                spatial_joins=spatial_joins,
                initial_relation=self.relation,
                governor_constraints=self._governor_constraints,
            )
        categorized_columns = ColumnCategorization.from_iterable(self.relation.columns)
        self._addWhereClause(categorized_columns)
        query = Query(
            self.summary.dimensions,
            self._backend,
            context=self._context,
            relation=self.relation,
            governor_constraints=self._governor_constraints,
            is_deferred=True,
            has_record_columns=False,
        )
        if self.summary.order_by is not None:
            query = query.sorted(self.summary.order_by.terms)
        if self.summary.limit is not None:
            # summary.limit holds (offset, count or None); convert it to the
            # [start, stop) form that ``sliced`` expects.
            query = query.sliced(
                start=self.summary.limit[0],
                stop=(
                    self.summary.limit[0] + self.summary.limit[1]
                    if self.summary.limit[1] is not None
                    else None
                ),
            )
        # Project down to the requested dimension keys, plus any dataset ID
        # and run columns that are present in the relation.
        projected_columns: set[ColumnTag] = set()
        projected_columns.update(DimensionKeyColumnTag.generate(self.summary.requested.names))
        for dataset_type in self.summary.datasets:
            for dataset_column_name in ("dataset_id", "run"):
                tag = DatasetColumnTag(dataset_type.name, dataset_column_name)
                if tag in self.relation.columns:
                    projected_columns.add(tag)
        return query.projected(
            dimensions=self.summary.requested,
            columns=projected_columns,
            drop_postprocessing=False,
            unique=False,
        )