Coverage for python/lsst/daf/butler/queries/tree/_query_tree.py: 48%

83 statements

coverage.py v7.4.4, created at 2024-03-26 02:48 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "QueryTree",
    "make_identity_query_tree",
    "DataCoordinateUploadKey",
    "MaterializationKey",
    "DatasetSearch",
    "SerializedQueryTree",
)

import uuid
from collections.abc import Mapping
from typing import TypeAlias, final

import pydantic

from ...dimensions import DimensionGroup, DimensionUniverse
from ...pydantic_utils import DeferredValidation
from ._base import InvalidQueryError, QueryTreeBase
from ._column_set import ColumnSet
from ._predicate import Predicate

DataCoordinateUploadKey: TypeAlias = uuid.UUID

MaterializationKey: TypeAlias = uuid.UUID


def make_identity_query_tree(universe: DimensionUniverse) -> QueryTree:
    """Make an initial query tree with empty dimensions and a single logical
    row.

    This function should be used by `Butler._query` to construct the initial
    query tree. This tree is a useful initial state because it is the
    identity for joins, in that joining any other query tree to the identity
    yields that query tree.

    Parameters
    ----------
    universe : `..DimensionUniverse`
        Definitions for all dimensions.

    Returns
    -------
    tree : `QueryTree`
        A tree with empty dimensions.
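
    Examples
    --------
    A minimal sketch; ``universe`` stands in for a `DimensionUniverse`
    obtained elsewhere, e.g. from a butler::

        tree = make_identity_query_tree(universe)
        # The identity tree has empty dimensions and a trivially-true
        # predicate, so joining any other tree to it yields that tree.
        assert tree.dimensions == universe.empty.as_group()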

74 """ 

75 return QueryTree(dimensions=universe.empty.as_group()) 

76 

77 

@final
class DatasetSearch(QueryTreeBase):
    """Information about a dataset search joined into a query tree.

    The dataset type name is the key of the dictionary (in `QueryTree`) where
    this type is used as a value.
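
    Examples
    --------
    A minimal sketch; the collection name is hypothetical and ``raw_dims``
    stands in for the `DimensionGroup` of a ``raw`` dataset type, obtained
    from the repository's `DimensionUniverse`::

        search = DatasetSearch(
            # Ordered collection search path; wildcards must already have
            # been resolved to concrete collection names.
            collections=("HSC/raw/all",),
            dimensions=raw_dims,
        )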

84 """ 

85 

86 collections: tuple[str, ...] 

87 """The collections to search. 

88 

89 Order matters if this dataset type is later referenced by a `FindFirst` 

90 operation. Collection wildcards are always resolved before being included 

91 in a dataset search. 

92 """ 

93 

94 dimensions: DimensionGroup 

95 """The dimensions of the dataset type. 

96 

97 This must match the dimensions of the dataset type as already defined in 

98 the butler database, but this cannot generally be verified when a relation 

99 tree is validated (since it requires a database query) and hence must be 

100 checked later. 

101 """ 

102 

103 

@final
class QueryTree(QueryTreeBase):
    """A declarative, serializable description of the row constraints and
    joins in a butler query.

    Notes
    -----
    A `QueryTree` is the struct that represents the serializable form of a
    `Query` object, or one piece (with `ResultSpec` the other) of the
    serializable form of a query results object.

    This class's attributes describe the columns that are "available" to be
    returned or used in ``where`` or ``order_by`` expressions, but it does not
    carry information about the columns that are actually included in result
    rows, or what kind of butler primitive (e.g. `DataCoordinate` or
    `DatasetRef`) those rows might be transformed into.
    """

    dimensions: DimensionGroup
    """The dimensions whose keys are joined into the query."""

    datasets: Mapping[str, DatasetSearch] = pydantic.Field(default_factory=dict)
    """Dataset searches that have been joined into the query."""

    data_coordinate_uploads: Mapping[DataCoordinateUploadKey, DimensionGroup] = pydantic.Field(
        default_factory=dict
    )
    """Uploaded tables of data ID values that have been joined into the query.
    """

    materializations: Mapping[MaterializationKey, DimensionGroup] = pydantic.Field(default_factory=dict)
    """Tables of result rows from other queries that have been stored
    temporarily on the server.
    """

    predicate: Predicate = Predicate.from_bool(True)
    """Boolean expression trees whose logical AND defines a row filter."""

    def get_joined_dimension_groups(self) -> frozenset[DimensionGroup]:
        """Return a set of the dimension groups of all data coordinate
        uploads, dataset searches, and materializations.
        """
        result: set[DimensionGroup] = set(self.data_coordinate_uploads.values())
        result.update(self.materializations.values())
        for dataset_spec in self.datasets.values():
            result.add(dataset_spec.dimensions)
        return frozenset(result)

    def join_dimensions(self, dimensions: DimensionGroup) -> QueryTree:
        """Return a new tree that includes additional dimensions.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Dimensions to include.

        Returns
        -------
        result : `QueryTree`
            A new tree with the additional dimensions.
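
        Examples
        --------
        A minimal sketch; ``detector_dims`` stands in for a `DimensionGroup`
        obtained from the universe::

            expanded = tree.join_dimensions(detector_dims)
            # Trees are immutable: a modified copy is returned and `tree`
            # itself is unchanged.
            assert detector_dims.issubset(expanded.dimensions)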

165 """ 

166 return self.model_copy(update=dict(dimensions=self.dimensions | dimensions)) 

167 

    def join_data_coordinate_upload(
        self, key: DataCoordinateUploadKey, dimensions: DimensionGroup
    ) -> QueryTree:
        """Return a new tree that joins in an uploaded table of data ID
        values.

        Parameters
        ----------
        key : `DataCoordinateUploadKey`
            Unique identifier for this upload, as assigned by a `QueryDriver`.
        dimensions : `DimensionGroup`
            Dimensions of the data IDs.

        Returns
        -------
        result : `QueryTree`
            A new tree that joins in the data ID table.
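
        Examples
        --------
        A minimal sketch; in real use the key is assigned by the
        `QueryDriver` that stores the uploaded rows, and ``upload_dims``
        stands in for the `DimensionGroup` of the uploaded data IDs::

            key: DataCoordinateUploadKey = uuid.uuid4()  # stand-in for a driver-assigned key
            joined = tree.join_data_coordinate_upload(key, upload_dims)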

184 """ 

185 assert key not in self.data_coordinate_uploads, "Query should prevent doing the same upload twice." 

186 data_coordinate_uploads = dict(self.data_coordinate_uploads) 

187 data_coordinate_uploads[key] = dimensions 

188 return self.model_copy( 

189 update=dict( 

190 dimensions=self.dimensions | dimensions, data_coordinate_uploads=data_coordinate_uploads 

191 ) 

192 ) 

193 

    def join_materialization(self, key: MaterializationKey, dimensions: DimensionGroup) -> QueryTree:
        """Return a new tree that joins in temporarily stored results from
        another query.

        Parameters
        ----------
        key : `MaterializationKey`
            Unique identifier for this materialization, as assigned by a
            `QueryDriver`.
        dimensions : `DimensionGroup`
            The dimensions stored in the materialization.

        Returns
        -------
        result : `QueryTree`
            A new tree that joins in the materialization.
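
        Examples
        --------
        A minimal sketch; as with data coordinate uploads, ``mat_key`` is a
        key assigned by the `QueryDriver` that stored the rows, and
        ``mat_dims`` stands in for the materialization's `DimensionGroup`::

            joined = tree.join_materialization(mat_key, mat_dims)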

210 """ 

        assert key not in self.materializations, "Query should prevent duplicate materialization."
        materializations = dict(self.materializations)
        materializations[key] = dimensions
        return self.model_copy(
            update=dict(dimensions=self.dimensions | dimensions, materializations=materializations)
        )

    def join_dataset(self, dataset_type: str, search: DatasetSearch) -> QueryTree:
        """Return a new tree that joins in a search for a dataset.

        Parameters
        ----------
        dataset_type : `str`
            Name of dataset type to join in.
        search : `DatasetSearch`
            Struct containing the collection search path and dataset type
            dimensions.

        Returns
        -------
        result : `QueryTree`
            A new tree that joins in the dataset search.

        Notes
        -----
        If this dataset type was already joined in, the new `DatasetSearch`
        must be identical to the existing one, and the original tree is
        returned unchanged.
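
        Examples
        --------
        A minimal sketch; the dataset type name and collection are
        hypothetical, and ``raw_dims`` stands in for that dataset type's
        `DimensionGroup`::

            joined = tree.join_dataset(
                "raw",
                DatasetSearch(collections=("HSC/raw/all",), dimensions=raw_dims),
            )
            # The dataset's dimensions are unioned into the tree's dimensions.
            assert raw_dims.issubset(joined.dimensions)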

238 """ 

239 if existing := self.datasets.get(dataset_type): 

240 assert existing == search, "Dataset search should be new or the same." 

241 return self 

242 else: 

243 datasets = dict(self.datasets) 

244 datasets[dataset_type] = search 

245 return self.model_copy( 

246 update=dict(dimensions=self.dimensions | search.dimensions, datasets=datasets) 

247 ) 

248 

    def where(self, *terms: Predicate) -> QueryTree:
        """Return a new tree that adds row filtering via a boolean column
        expression.

        Parameters
        ----------
        *terms : `Predicate`
            Boolean column expressions that filter rows. Arguments are
            combined with logical AND.

        Returns
        -------
        result : `QueryTree`
            A new tree with row filtering.

        Raises
        ------
        InvalidQueryError
            Raised if a column expression requires a dataset column that is
            not already present in the query tree.

        Notes
        -----
        If an expression references a dimension or dimension element that is
        not already present in the query tree, it will be joined in, but
        datasets must already be joined into a query tree in order to
        reference their fields in expressions.
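
        Examples
        --------
        A minimal sketch; ``pred`` stands in for a `Predicate` built
        elsewhere, since only the trivially-true predicate can be
        constructed from this module alone::

            filtered = tree.where(Predicate.from_bool(True))  # no-op filter
            filtered = filtered.where(pred)  # terms are ANDed together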

276 """ 

277 predicate = self.predicate 

278 columns = ColumnSet(self.dimensions) 

279 for where_term in terms: 

280 where_term.gather_required_columns(columns) 

281 predicate = predicate.logical_and(where_term) 

282 if not (columns.dataset_fields.keys() <= self.datasets.keys()): 

283 raise InvalidQueryError( 

284 f"Cannot reference dataset type(s) {columns.dataset_fields.keys() - self.datasets.keys()} " 

285 "that have not been joined." 

286 ) 

287 return self.model_copy(update=dict(dimensions=columns.dimensions, predicate=predicate)) 

288 

    @pydantic.model_validator(mode="after")
    def _validate_join_operands(self) -> QueryTree:
        for dimensions in self.get_joined_dimension_groups():
            if not dimensions.issubset(self.dimensions):
                raise InvalidQueryError(
                    f"Dimensions {dimensions} of join operand are not a "
                    f"subset of the query tree's dimensions {self.dimensions}."
                )
        return self

    @pydantic.model_validator(mode="after")
    def _validate_required_columns(self) -> QueryTree:
        columns = ColumnSet(self.dimensions)
        self.predicate.gather_required_columns(columns)
        if not columns.dimensions.issubset(self.dimensions):
            raise InvalidQueryError("Predicate requires dimensions beyond those in the query tree.")
        if not columns.dataset_fields.keys() <= self.datasets.keys():
            raise InvalidQueryError("Predicate requires dataset columns that are not in the query tree.")
        return self


class SerializedQueryTree(DeferredValidation[QueryTree]):
    """A Pydantic-serializable wrapper for `QueryTree` that defers validation
    to the `validated` method, allowing a `.DimensionUniverse` to be provided.
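
    Examples
    --------
    A minimal sketch; how the wrapper is deserialized and the exact keyword
    accepted by `validated` are determined by `DeferredValidation`, so both
    are assumptions here::

        # `serialized` is a SerializedQueryTree received over the wire;
        # validation is deferred until a universe is available.
        tree = serialized.validated(universe=universe)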

313 """