Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 17%

66 statements  

coverage.py v7.2.5, created at 2023-05-17 02:31 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QueryBuilder",)

import itertools
from typing import Any

from lsst.daf.relation import ColumnExpression, ColumnTag, Diagnostics, Relation

from ...core import (
    ColumnCategorization,
    DatasetColumnTag,
    DatasetType,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
)
from ..wildcards import CollectionWildcard
from ._query import Query
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._structs import QuerySummary


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    context : `QueryContext`, optional
        Object that manages relation engines and database-side state (e.g.
        temporary tables) for the query. Must have been created by
        ``backend.context()``, which is used if ``context`` is not provided.
    relation : `~lsst.daf.relation.Relation`, optional
        Initial relation for the query.
    """

    def __init__(
        self,
        summary: QuerySummary,
        backend: QueryBackend,
        context: QueryContext | None = None,
        relation: Relation | None = None,
    ):
        self.summary = summary
        self._backend = backend
        self._context = backend.context() if context is None else context
        self.relation = self._context.make_initial_relation(relation)
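        # Constraints on governor dimensions (e.g. instrument or skymap
        # values implied by the user's WHERE clause) are resolved once up
        # front and reused when joining dataset and dimension tables in
        # `joinDataset` and `finish`.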

        self._governor_constraints = self._backend.resolve_governor_constraints(
            self.summary.dimensions, self.summary.where.governor_constraints, self._context
        )

    def joinDataset(
        self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
    ) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or constrain the
        query results based on the existence of datasets. However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof. `...` can be used to return all collections. See
            :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type. If `False`, the existence of datasets of this type is used
            only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most once
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order. Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order. Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed. If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due
            to the inner join that would normally be present), the full query
            will return no results.
        """
        assert datasetType in self.summary.datasets
        collections = CollectionWildcard.from_expression(collections)
        if isResult and findFirst:
            collections.require_ordered()
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            datasetType,
            collections,
            governor_constraints=self._governor_constraints,
            rejections=rejections,
            allow_calibration_collections=(not findFirst and not self.summary.dimensions.temporal),
        )
        columns_requested = {"dataset_id", "run", "ingest_date"} if isResult else frozenset()
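        # Build the dataset relation one of three ways: a "doomed" relation
        # carrying the rejection diagnostics when no collections survived
        # resolution; a find-first search relation when an ordered search was
        # requested; otherwise a plain query relation over all matching
        # collections.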

        if not collection_records:
            relation = self._backend.make_doomed_dataset_relation(
                datasetType, columns_requested, rejections, self._context
            )
        elif isResult and findFirst:
            relation = self._backend.make_dataset_search_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        else:
            relation = self._backend.make_dataset_query_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        self.relation = self.relation.join(relation)
        return not Diagnostics.run(relation).is_doomed

    def _addWhereClause(self, categorized_columns: ColumnCategorization) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.

        Parameters
        ----------
        categorized_columns : `ColumnCategorization`
            Struct that organizes the columns in ``self.relation`` by
            `ColumnTag` type.
        """
        # Append WHERE clause terms from predicates.
        if self.summary.where.expression_predicate is not None:
            self.relation = self.relation.with_rows_satisfying(
                self.summary.where.expression_predicate,
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        if self.summary.where.data_id:
            known_dimensions = self.summary.where.data_id.graph.intersection(self.summary.dimensions)
            known_data_id = self.summary.where.data_id.subset(known_dimensions)
            self.relation = self.relation.with_rows_satisfying(
                self._context.make_data_coordinate_predicate(known_data_id),
                preferred_engine=self._context.preferred_engine,
                require_preferred_engine=True,
            )
        if self.summary.region is not None:
            for skypix_dimension in categorized_columns.filter_skypix(self._backend.universe):
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_skypix_predicate(
                        skypix_dimension,
                        self.summary.region,
                    ),
                    preferred_engine=self._context.preferred_engine,
                    require_preferred_engine=True,
                )
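            # Overlap tests against dimension-record regions cannot be
            # evaluated in the database, so these predicates prefer the
            # iteration engine, transferring rows there if needed.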

            for element in categorized_columns.filter_spatial_region_dimension_elements():
                self.relation = self.relation.with_rows_satisfying(
                    self._context.make_spatial_region_overlap_predicate(
                        ColumnExpression.reference(DimensionRecordColumnTag(element, "region")),
                        ColumnExpression.literal(self.summary.region),
                    ),
                    preferred_engine=self._context.iteration_engine,
                    transfer=True,
                )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish query construction, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with). `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret result
            rows.
        """
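        # When joining missing dimension elements, a spatial join is
        # requested for every pair of distinct spatial dimension families in
        # the query, with each family choosing its representative element.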

        if joinMissing:
            spatial_joins = []
            for family1, family2 in itertools.combinations(self.summary.dimensions.spatial, 2):
                spatial_joins.append(
                    (
                        family1.choose(self.summary.dimensions.elements).name,
                        family2.choose(self.summary.dimensions.elements).name,
                    )
                )
            self.relation = self._backend.make_dimension_relation(
                self.summary.dimensions,
                columns=self.summary.columns_required,
                context=self._context,
                spatial_joins=spatial_joins,
                initial_relation=self.relation,
                governor_constraints=self._governor_constraints,
            )
        categorized_columns = ColumnCategorization.from_iterable(self.relation.columns)
        self._addWhereClause(categorized_columns)
        query = Query(
            self.summary.dimensions,
            self._backend,
            context=self._context,
            relation=self.relation,
            governor_constraints=self._governor_constraints,
            is_deferred=True,
            has_record_columns=False,
        )
        if self.summary.order_by is not None:
            query = query.sorted(self.summary.order_by.terms)
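        # ``summary.limit`` holds a (start, count) pair; translate it into
        # the slice bounds expected by `Query.sliced`.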

        if self.summary.limit is not None:
            query = query.sliced(
                start=self.summary.limit[0],
                stop=self.summary.limit[0] + self.summary.limit[1]
                if self.summary.limit[1] is not None
                else None,
            )
        projected_columns: set[ColumnTag] = set()
        projected_columns.update(DimensionKeyColumnTag.generate(self.summary.requested.names))
        for dataset_type in self.summary.datasets:
            for dataset_column_name in ("dataset_id", "run"):
                tag = DatasetColumnTag(dataset_type.name, dataset_column_name)
                if tag in self.relation.columns:
                    projected_columns.add(tag)
        return query.projected(
            dimensions=self.summary.requested,
            columns=projected_columns,
            drop_postprocessing=False,
            unique=False,
        )
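

# The sketch below is illustrative only and is not part of the original
# module: a minimal example, under stated assumptions, of how a caller might
# drive `QueryBuilder` end to end. It assumes a `QuerySummary` and a
# `QueryBackend` obtained from a `Registry` implementation; the function
# name and its parameters are hypothetical.
def _example_build_query(
    summary: QuerySummary,
    backend: QueryBackend,
    dataset_type: DatasetType,
    collections: Any,
) -> Query | None:
    """Build a query that searches ``collections`` for ``dataset_type``.

    ``dataset_type`` must be one of ``summary.datasets``, and ``collections``
    must be an ordered (string-only) expression because ``findFirst=True`` is
    used below.
    """
    builder = QueryBuilder(summary, backend)
    # A False return means the backend proved the search can return no rows.
    if not builder.joinDataset(dataset_type, collections, isResult=True, findFirst=True):
        return None
    return builder.finish()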