Coverage for python/lsst/daf/butler/registry/queries/_builder.py: 13%

76 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("QueryBuilder",)

from typing import Any, cast

from lsst.daf.relation import ColumnExpression, ColumnTag, Diagnostics, Predicate, Relation

from ...core import (
    ColumnCategorization,
    DatasetColumnTag,
    DatasetType,
    DimensionKeyColumnTag,
    DimensionRecordColumnTag,
)
from ..wildcards import CollectionWildcard
from ._query import Query
from ._query_backend import QueryBackend
from ._query_context import QueryContext
from ._structs import QuerySummary


class QueryBuilder:
    """A builder for potentially complex queries that join tables based
    on dimension relationships.

    Parameters
    ----------
    summary : `QuerySummary`
        Struct organizing the dimensions involved in the query.
    backend : `QueryBackend`
        Backend object that represents the `Registry` implementation.
    context : `QueryContext`, optional
        Object that manages relation engines and database-side state (e.g.
        temporary tables) for the query.  Must have been created by
        ``backend.context()``, which is used if ``context`` is not provided.
    relation : `~lsst.daf.relation.Relation`, optional
        Initial relation for the query.
    """

    def __init__(
        self,
        summary: QuerySummary,
        backend: QueryBackend,
        context: QueryContext | None = None,
        relation: Relation | None = None,
    ):
        self.summary = summary
        self._backend = backend
        self._context = backend.context() if context is None else context
        self.relation = self._context.make_initial_relation(relation)
        self._governor_constraints = self._backend.resolve_governor_constraints(
            self.summary.dimensions, self.summary.where.governor_constraints, self._context
        )

    def joinDataset(
        self, datasetType: DatasetType, collections: Any, *, isResult: bool = True, findFirst: bool = False
    ) -> bool:
        """Add a dataset search or constraint to the query.

        Unlike other `QueryBuilder` join methods, this *must* be called
        directly to search for datasets of a particular type or to constrain
        the query results based on the existence of datasets.  However, all
        dimensions used to identify the dataset type must have already been
        included in `QuerySummary.requested` when initializing the
        `QueryBuilder`.

        Parameters
        ----------
        datasetType : `DatasetType`
            The type of datasets to search for.
        collections : `Any`
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or
            iterable thereof.  `...` can be used to search all collections.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        isResult : `bool`, optional
            If `True` (default), include the dataset ID column in the
            result columns of the query, allowing complete `DatasetRef`
            instances to be produced from the query results for this dataset
            type.  If `False`, the existence of datasets of this type is
            used only to constrain the data IDs returned by the query.
            `joinDataset` may be called with ``isResult=True`` at most once
            on a particular `QueryBuilder` instance.
        findFirst : `bool`, optional
            If `True` (`False` is default), only include the first match for
            each data ID, searching the given collections in order.  Requires
            that all entries in ``collections`` be regular strings, so there
            is a clear search order.  Ignored if ``isResult`` is `False`.

        Returns
        -------
        anyRecords : `bool`
            If `True`, joining the dataset table was successful and the query
            should proceed.  If `False`, we were able to determine (from the
            combination of ``datasetType`` and ``collections``) that there
            would be no results joined in from this dataset, and hence (due
            to the inner join that would normally be present) the full query
            will return no results.
        """
        assert datasetType in self.summary.datasets
        collections = CollectionWildcard.from_expression(collections)
        if isResult and findFirst:
            collections.require_ordered()
        # Resolve the collection expression against this dataset type,
        # recording human-readable reasons any collections are skipped.
        rejections: list[str] = []
        collection_records = self._backend.resolve_dataset_collections(
            datasetType,
            collections,
            governor_constraints=self._governor_constraints,
            rejections=rejections,
            allow_calibration_collections=(
                not findFirst and not (self.summary.temporal or self.summary.dimensions.temporal)
            ),
        )
        columns_requested = {"dataset_id", "run", "ingest_date"} if isResult else frozenset()
        if not collection_records:
            # No collection could hold this dataset type; build a relation
            # that yields no rows but carries the rejection explanations.
            relation = self._backend.make_doomed_dataset_relation(
                datasetType, columns_requested, rejections, self._context
            )
        elif isResult and findFirst:
            relation = self._backend.make_dataset_search_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        else:
            relation = self._backend.make_dataset_query_relation(
                datasetType,
                collection_records,
                columns_requested,
                self._context,
            )
        self.relation = self.relation.join(relation)
        return not Diagnostics.run(relation).is_doomed
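    # Example call (hypothetical names, for illustration only): a find-first
    # search of two RUN collections for a dataset type already present in
    # ``self.summary.datasets``:
    #
    #     builder.joinDataset(calexp_type, ["HSC/runs/RC2", "HSC/raw/all"],
    #                         findFirst=True)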

    def _addWhereClause(self, categorized_columns: ColumnCategorization) -> None:
        """Add a WHERE clause to the query under construction, connecting all
        joined dimensions to the expression and data ID dimensions from
        `QuerySummary`.

        For internal use by `QueryBuilder` only; will be called (and should
        only be called) by `finish`.

        Parameters
        ----------
        categorized_columns : `ColumnCategorization`
            Struct that organizes the columns in ``self.relation`` by
            `ColumnTag` type.
        """
        # Append WHERE clause terms from predicates.
        predicate: Predicate = Predicate.literal(True)
        if self.summary.where.expression_predicate is not None:
            predicate = predicate.logical_and(self.summary.where.expression_predicate)
        if self.summary.where.data_id:
            known_dimensions = self.summary.where.data_id.graph.intersection(self.summary.dimensions)
            known_data_id = self.summary.where.data_id.subset(known_dimensions)
            predicate = predicate.logical_and(self._context.make_data_coordinate_predicate(known_data_id))
        if self.summary.where.region is not None:
            # Constrain any skypix and spatial-region columns not already
            # fixed by the data ID to overlap the given region.
            for skypix_dimension in categorized_columns.filter_skypix(self._backend.universe):
                if skypix_dimension not in self.summary.where.data_id.graph:
                    predicate = predicate.logical_and(
                        self._context.make_spatial_region_skypix_predicate(
                            skypix_dimension,
                            self.summary.where.region,
                        )
                    )
            for element in categorized_columns.filter_spatial_region_dimension_elements():
                if element not in self.summary.where.data_id.graph.names:
                    predicate = predicate.logical_and(
                        self._context.make_spatial_region_overlap_predicate(
                            ColumnExpression.reference(DimensionRecordColumnTag(element, "region")),
                            ColumnExpression.literal(self.summary.where.region),
                        )
                    )
        if self.summary.where.timespan is not None:
            # Likewise constrain timespan columns not fixed by the data ID
            # to overlap the given timespan.
            for element in categorized_columns.filter_timespan_dimension_elements():
                if element not in self.summary.where.data_id.graph.names:
                    predicate = predicate.logical_and(
                        self._context.make_timespan_overlap_predicate(
                            DimensionRecordColumnTag(element, "timespan"), self.summary.where.timespan
                        )
                    )
        self.relation = self.relation.with_rows_satisfying(
            predicate, preferred_engine=self._context.preferred_engine, require_preferred_engine=True
        )

    def finish(self, joinMissing: bool = True) -> Query:
        """Finish query construction, returning a new `Query` instance.

        Parameters
        ----------
        joinMissing : `bool`, optional
            If `True` (default), automatically join any missing dimension
            element tables (according to the categorization of the
            `QuerySummary` the builder was constructed with).  `False` should
            only be passed if the caller can independently guarantee that all
            dimension relationships are already captured in non-dimension
            tables that have been manually included in the query.

        Returns
        -------
        query : `Query`
            A `Query` object that can be executed and used to interpret
            result rows.
        """
        columns_required: set[ColumnTag] = set()
        if self.summary.where.expression_predicate is not None:
            columns_required.update(self.summary.where.expression_predicate.columns_required)
        if self.summary.order_by is not None:
            columns_required.update(self.summary.order_by.columns_required)
        columns_required.update(DimensionKeyColumnTag.generate(self.summary.requested.names))
        if joinMissing:
            # Join in dimension element tables needed to provide any columns
            # not already present, including at most one spatial join.
            self.relation = self._backend.make_dimension_relation(
                self.summary.dimensions,
                columns=columns_required,
                context=self._context,
                spatial_joins=(
                    [cast(tuple[str, str], tuple(self.summary.spatial.names))]
                    if len(self.summary.spatial) == 2
                    else []
                ),
                initial_relation=self.relation,
                governor_constraints=self._governor_constraints,
            )
        categorized_columns = ColumnCategorization.from_iterable(columns_required)
        self._addWhereClause(categorized_columns)
        query = Query(
            self.summary.dimensions,
            self._backend,
            context=self._context,
            relation=self.relation,
            governor_constraints=self._governor_constraints,
            is_deferred=True,
            has_record_columns=False,
        )
        if self.summary.order_by is not None:
            query = query.sorted(self.summary.order_by.terms)
        if self.summary.limit is not None:
            query = query.sliced(
                start=self.summary.limit[0],
                stop=self.summary.limit[0] + self.summary.limit[1]
                if self.summary.limit[1] is not None
                else None,
            )
        # Project down to the requested dimension keys plus any dataset_id
        # and run columns present for result dataset types.
        projected_columns: set[ColumnTag] = set()
        projected_columns.update(DimensionKeyColumnTag.generate(self.summary.requested.names))
        for dataset_type in self.summary.datasets:
            for dataset_column_name in ("dataset_id", "run"):
                tag = DatasetColumnTag(dataset_type.name, dataset_column_name)
                if tag in self.relation.columns:
                    projected_columns.add(tag)
        return query.projected(
            dimensions=self.summary.requested,
            columns=projected_columns,
            drop_postprocessing=False,
            unique=False,
        )

277 )