Coverage for python/lsst/daf/butler/queries/tree/_query_tree.py: 51%

82 statements  

« prev     ^ index     » next       coverage.py v7.4.3, created at 2024-03-05 11:36 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ( 

31 "QueryTree", 

32 "make_identity_query_tree", 

33 "DataCoordinateUploadKey", 

34 "MaterializationKey", 

35 "DatasetSearch", 

36 "SerializedQueryTree", 

37) 

38 

39import uuid 

40from collections.abc import Mapping 

41from typing import TypeAlias, final 

42 

43import pydantic 

44 

45from ...dimensions import DimensionGroup, DimensionUniverse 

46from ...pydantic_utils import DeferredValidation 

47from ._base import InvalidQueryError, QueryTreeBase 

48from ._column_set import ColumnSet 

49from ._predicate import Predicate 

50 

# Unique identifier for a table of data ID values uploaded into a query;
# assigned by a `QueryDriver` (see `QueryTree.join_data_coordinate_upload`).
DataCoordinateUploadKey: TypeAlias = uuid.UUID

# Unique identifier for temporarily-stored results from another query that
# are joined back in; assigned by a `QueryDriver` (see
# `QueryTree.join_materialization`).
MaterializationKey: TypeAlias = uuid.UUID

def make_identity_query_tree(universe: DimensionUniverse) -> QueryTree:
    """Construct the initial query tree: empty dimensions and a single
    logical row.

    This is the starting state used by `Butler._query`, and it is useful
    precisely because it is the identity element for joins: joining any
    other query tree to it yields that other tree unchanged.

    Parameters
    ----------
    universe : `..DimensionUniverse`
        Definitions for all dimensions.

    Returns
    -------
    tree : `QueryTree`
        A tree with empty dimensions.
    """
    empty_dimensions = universe.empty.as_group()
    return QueryTree(dimensions=empty_dimensions)

76 

77 

@final
class DatasetSearch(QueryTreeBase):
    """Information about a dataset search joined into a query tree.

    The dataset type name is the key of the dictionary (in `QueryTree`) where
    this type is used as a value, so it is not duplicated here.
    """

    collections: tuple[str, ...]
    """The collections to search.

    Order matters if this dataset type is later referenced by a `FindFirst`
    operation.  Collection wildcards are always resolved before being included
    in a dataset search.
    """

    dimensions: DimensionGroup
    """The dimensions of the dataset type.

    This must match the dimensions of the dataset type as already defined in
    the butler database, but this cannot generally be verified when a relation
    tree is validated (since it requires a database query) and hence must be
    checked later.
    """

    storage_class_name: str | None
    """Name of the storage class to use when returning `DatasetRef` results.

    May be `None` if the dataset is only used as a constraint or to return
    columns that do not include a full dataset type.
    """

109 

110 

@final
class QueryTree(QueryTreeBase):
    """A declarative, serializable description of the row constraints and joins
    in a butler query.

    Notes
    -----
    A `QueryTree` is the struct that represents the serializable form of a
    `Query` object, or one piece (with `ResultSpec` the other) of the
    serializable form of a query results object.

    This class's attributes describe the columns that are "available" to be
    returned or used in ``where`` or ``order_by`` expressions, but it does not
    carry information about the columns that are actually included in result
    rows, or what kind of butler primitive (e.g. `DataCoordinate` or
    `DatasetRef`) those rows might be transformed into.
    """

    dimensions: DimensionGroup
    """The dimensions whose keys are joined into the query.
    """

    datasets: Mapping[str, DatasetSearch] = pydantic.Field(default_factory=dict)
    """Dataset searches that have been joined into the query."""

    data_coordinate_uploads: Mapping[DataCoordinateUploadKey, DimensionGroup] = pydantic.Field(
        default_factory=dict
    )
    """Uploaded tables of data ID values that have been joined into the query.
    """

    materializations: Mapping[MaterializationKey, DimensionGroup] = pydantic.Field(default_factory=dict)
    """Tables of result rows from other queries that have been stored
    temporarily on the server.
    """

    predicate: Predicate = Predicate.from_bool(True)
    """Boolean expression trees whose logical AND defines a row filter."""

    def get_joined_dimension_groups(self) -> frozenset[DimensionGroup]:
        """Return a set of the dimension groups of all data coordinate uploads,
        dataset searches, and materializations.
        """
        result: set[DimensionGroup] = set(self.data_coordinate_uploads.values())
        result.update(self.materializations.values())
        for dataset_spec in self.datasets.values():
            result.add(dataset_spec.dimensions)
        return frozenset(result)

    def join_dimensions(self, dimensions: DimensionGroup) -> QueryTree:
        """Return a new tree that includes additional dimensions.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Dimensions to include.

        Returns
        -------
        result : `QueryTree`
            A new tree with the additional dimensions.
        """
        return self.model_copy(update=dict(dimensions=self.dimensions | dimensions))

    def join_data_coordinate_upload(
        self, key: DataCoordinateUploadKey, dimensions: DimensionGroup
    ) -> QueryTree:
        """Return a new tree that joins in an uploaded table of data ID values.

        Parameters
        ----------
        key : `DataCoordinateUploadKey`
            Unique identifier for this upload, as assigned by a `QueryDriver`.
        dimensions : `DimensionGroup`
            Dimensions of the data IDs.

        Returns
        -------
        result : `QueryTree`
            A new tree that joins in the data ID table.
        """
        assert key not in self.data_coordinate_uploads, "Query should prevent doing the same upload twice."
        data_coordinate_uploads = dict(self.data_coordinate_uploads)
        data_coordinate_uploads[key] = dimensions
        return self.model_copy(
            update=dict(
                dimensions=self.dimensions | dimensions, data_coordinate_uploads=data_coordinate_uploads
            )
        )

    def join_materialization(self, key: MaterializationKey, dimensions: DimensionGroup) -> QueryTree:
        """Return a new tree that joins in temporarily stored results from
        another query.

        Parameters
        ----------
        key : `MaterializationKey`
            Unique identifier for this materialization, as assigned by a
            `QueryDriver`.
        dimensions : `DimensionGroup`
            The dimensions stored in the materialization.

        Returns
        -------
        result : `QueryTree`
            A new tree that joins in the materialization.
        """
        # Fixed: previously this checked `self.data_coordinate_uploads`
        # (a copy-paste slip); the duplicate-materialization invariant must
        # be checked against `self.materializations`.
        assert key not in self.materializations, "Query should prevent duplicate materialization."
        materializations = dict(self.materializations)
        materializations[key] = dimensions
        return self.model_copy(
            update=dict(dimensions=self.dimensions | dimensions, materializations=materializations)
        )

    def join_dataset(self, dataset_type: str, search: DatasetSearch) -> QueryTree:
        """Return a new tree that joins in a search for a dataset.

        Parameters
        ----------
        dataset_type : `str`
            Name of dataset type to join in.
        search : `DatasetSearch`
            Struct containing the collection search path and dataset type
            dimensions.

        Returns
        -------
        result : `QueryTree`
            A new tree that joins in the dataset search.

        Notes
        -----
        If this dataset type was already joined in, the new `DatasetSearch`
        replaces the old one.
        """
        datasets = dict(self.datasets)
        datasets[dataset_type] = search
        return self.model_copy(update=dict(dimensions=self.dimensions | search.dimensions, datasets=datasets))

    def where(self, *terms: Predicate) -> QueryTree:
        """Return a new tree that adds row filtering via a boolean column
        expression.

        Parameters
        ----------
        *terms : `Predicate`
            Boolean column expressions that filter rows.  Arguments are
            combined with logical AND.

        Returns
        -------
        result : `QueryTree`
            A new tree with row filtering.

        Raises
        ------
        InvalidQueryError
            Raised if a column expression requires a dataset column that is not
            already present in the query tree.

        Notes
        -----
        If an expression references a dimension or dimension element that is
        not already present in the query tree, it will be joined in, but
        datasets must already be joined into a query tree in order to reference
        their fields in expressions.
        """
        predicate = self.predicate
        columns = ColumnSet(self.dimensions)
        for where_term in terms:
            where_term.gather_required_columns(columns)
            predicate = predicate.logical_and(where_term)
        if not (columns.dataset_fields.keys() <= self.datasets.keys()):
            raise InvalidQueryError(
                f"Cannot reference dataset type(s) {columns.dataset_fields.keys() - self.datasets.keys()} "
                "that have not been joined."
            )
        return self.model_copy(update=dict(dimensions=columns.dimensions, predicate=predicate))

    @pydantic.model_validator(mode="after")
    def _validate_join_operands(self) -> QueryTree:
        # Every joined operand's dimensions must already be present in the
        # tree's dimensions; the join methods above maintain this invariant.
        for dimensions in self.get_joined_dimension_groups():
            if not dimensions.issubset(self.dimensions):
                raise InvalidQueryError(
                    f"Dimensions {dimensions} of join operand are not a "
                    f"subset of the query tree's dimensions {self.dimensions}."
                )
        return self

    @pydantic.model_validator(mode="after")
    def _validate_required_columns(self) -> QueryTree:
        # The predicate may only reference dimensions and dataset fields that
        # the tree actually provides.
        columns = ColumnSet(self.dimensions)
        self.predicate.gather_required_columns(columns)
        if not columns.dimensions.issubset(self.dimensions):
            raise InvalidQueryError("Predicate requires dimensions beyond those in the query tree.")
        if not columns.dataset_fields.keys() <= self.datasets.keys():
            raise InvalidQueryError("Predicate requires dataset columns that are not in the query tree.")
        return self

309 

310 

class SerializedQueryTree(DeferredValidation[QueryTree]):
    """A Pydantic-serializable wrapper for `QueryTree` that defers validation
    to the `validated` method, allowing a `.DimensionUniverse` to be provided
    at validation time rather than at deserialization time.
    """