Coverage for python/lsst/daf/butler/queries/tree/_column_set.py: 25%

100 statements  

« prev     ^ index     » next       coverage.py v7.4.3, created at 2024-03-12 10:07 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ("ColumnSet",) 

31 

32from collections.abc import Iterable, Iterator, Mapping, Set 

33 

34from ... import column_spec 

35from ...dimensions import DimensionGroup 

36from ...nonempty_mapping import NonemptyMapping 

37 

38 

39class ColumnSet: 

40 """A set-like hierarchical container for the columns in a query. 

41 

42 Parameters 

43 ---------- 

44 dimensions : `DimensionGroup` 

45 The dimensions that bound the set of columns, and by default specify 

46 the set of dimension key columns present. 

47 

48 Notes 

49 ----- 

50 This class does not inherit from `collections.abc.Set` because that brings 

51 in a lot of requirements we don't need (particularly interoperability with 

52 other set-like objects). 

53 

54 This class is iterable over tuples of ``(logical_table, field)``, where 

55 ``logical_table`` is a dimension element name or dataset type name, and 

56 ``field`` is a column associated with one of those, or `None` for dimension 

57 key columns. Iteration order is guaranteed to be deterministic and to 

58 start with all included dimension keys in `DimensionGroup.dimension_ 

59 """ 

60 

61 def __init__(self, dimensions: DimensionGroup) -> None: 

62 self._dimensions = dimensions 

63 self._removed_dimension_keys: set[str] = set() 

64 self._dimension_fields: dict[str, set[str]] = {name: set() for name in dimensions.elements} 

65 self._dataset_fields = NonemptyMapping[str, set[str]](set) 

66 

67 @property 

68 def dimensions(self) -> DimensionGroup: 

69 """The dimensions that bound all columns in the set.""" 

70 return self._dimensions 

71 

72 @property 

73 def dimension_fields(self) -> Mapping[str, set[str]]: 

74 """Dimension record fields included in the set, grouped by dimension 

75 element name. 

76 

77 The keys of this mapping are always ``self.dimensions.elements``, and 

78 nested sets may be empty. 

79 """ 

80 return self._dimension_fields 

81 

82 @property 

83 def dataset_fields(self) -> NonemptyMapping[str, set[str]]: 

84 """Dataset fields included in the set, grouped by dataset type name. 

85 

86 The keys of this mapping are just those that actually have nonempty 

87 nested sets. 

88 """ 

89 return self._dataset_fields 

90 

91 def __bool__(self) -> bool: 

92 return bool(self._dimensions) or any(self._dataset_fields.values()) 

93 

94 def __eq__(self, other: object) -> bool: 

95 if not isinstance(other, ColumnSet): 

96 return False 

97 return ( 

98 self._dimensions == other._dimensions 

99 and self._removed_dimension_keys == other._removed_dimension_keys 

100 and self._dimension_fields == other._dimension_fields 

101 and self._dataset_fields == other._dataset_fields 

102 ) 

103 

104 def __str__(self) -> str: 

105 return f"{{{', '.join(self.get_qualified_name(k, v) for k, v in self)}}}" 

106 

107 def issubset(self, other: ColumnSet) -> bool: 

108 """Test whether all columns in this set are also in another. 

109 

110 Parameters 

111 ---------- 

112 other : `ColumnSet` 

113 Set of columns to compare to. 

114 

115 Returns 

116 ------- 

117 issubset : `bool` 

118 Whether all columns in ``self`` are also in ``other``. 

119 """ 

120 return ( 

121 (self._get_dimension_keys() <= other._get_dimension_keys()) 

122 and all( 

123 fields.issubset(other._dimension_fields.get(element_name, frozenset())) 

124 for element_name, fields in self._dimension_fields.items() 

125 ) 

126 and all( 

127 fields.issubset(other._dataset_fields.get(dataset_type, frozenset())) 

128 for dataset_type, fields in self._dataset_fields.items() 

129 ) 

130 ) 

131 

132 def issuperset(self, other: ColumnSet) -> bool: 

133 """Test whether all columns another set are also in this one. 

134 

135 Parameters 

136 ---------- 

137 other : `ColumnSet` 

138 Set of columns to compare to. 

139 

140 Returns 

141 ------- 

142 issuperset : `bool` 

143 Whether all columns in ``other`` are also in ``self``. 

144 """ 

145 return other.issubset(self) 

146 

147 def isdisjoint(self, other: ColumnSet) -> bool: 

148 """Test whether there are no columns in both this set and another. 

149 

150 Parameters 

151 ---------- 

152 other : `ColumnSet` 

153 Set of columns to compare to. 

154 

155 Returns 

156 ------- 

157 isdisjoint : `bool` 

158 Whether there are any columns in both ``self`` and ``other``. 

159 """ 

160 return ( 

161 self._get_dimension_keys().isdisjoint(other._get_dimension_keys()) 

162 and all( 

163 fields.isdisjoint(other._dimension_fields.get(element, frozenset())) 

164 for element, fields in self._dimension_fields.items() 

165 ) 

166 and all( 

167 fields.isdisjoint(other._dataset_fields.get(dataset_type, frozenset())) 

168 for dataset_type, fields in self._dataset_fields.items() 

169 ) 

170 ) 

171 

172 def copy(self) -> ColumnSet: 

173 """Return a copy of this set. 

174 

175 Returns 

176 ------- 

177 copy : `ColumnSet` 

178 New column set that can be modified without changing the original. 

179 """ 

180 result = ColumnSet(self._dimensions) 

181 for element_name, element_fields in self._dimension_fields.items(): 

182 result._dimension_fields[element_name].update(element_fields) 

183 for dataset_type, dataset_fields in self._dataset_fields.items(): 

184 result._dataset_fields[dataset_type].update(dataset_fields) 

185 return result 

186 

187 def update_dimensions(self, dimensions: DimensionGroup) -> None: 

188 """Add new dimensions to the set. 

189 

190 Parameters 

191 ---------- 

192 dimensions : `DimensionGroup` 

193 Dimensions to be included. 

194 """ 

195 if not dimensions.issubset(self._dimensions): 

196 self._dimensions = dimensions.union(self._dimensions) 

197 self._dimension_fields = { 

198 name: self._dimension_fields.get(name, set()) for name in self._dimensions.elements 

199 } 

200 self._removed_dimension_keys.intersection_update(dimensions.names) 

201 

202 def update(self, other: ColumnSet) -> None: 

203 """Add columns from another set to this one. 

204 

205 Parameters 

206 ---------- 

207 other : `ColumnSet` 

208 Column set whose columns should be included in this one. 

209 """ 

210 self.update_dimensions(other.dimensions) 

211 self._removed_dimension_keys.intersection_update(other._removed_dimension_keys) 

212 for element_name, element_fields in other._dimension_fields.items(): 

213 self._dimension_fields[element_name].update(element_fields) 

214 for dataset_type, dataset_fields in other._dataset_fields.items(): 

215 self._dataset_fields[dataset_type].update(dataset_fields) 

216 

217 def drop_dimension_keys(self, names: Iterable[str]) -> ColumnSet: 

218 """Remove the given dimension key columns from the set. 

219 

220 Parameters 

221 ---------- 

222 names : `~collections.abc.Iterable` [ `str` ] 

223 Names of the dimensions to remove. 

224 

225 Returns 

226 ------- 

227 self : `ColumnSet` 

228 This column set, modified in place. 

229 """ 

230 self._removed_dimension_keys.update(names) 

231 return self 

232 

233 def drop_implied_dimension_keys(self) -> ColumnSet: 

234 """Remove dimension key columns that are implied by others. 

235 

236 Returns 

237 ------- 

238 self : `ColumnSet` 

239 This column set, modified in place. 

240 """ 

241 return self.drop_dimension_keys(self._dimensions.implied) 

242 

243 def restore_dimension_keys(self) -> None: 

244 """Restore all removed dimension key columns.""" 

245 self._removed_dimension_keys.clear() 

246 

247 def __iter__(self) -> Iterator[tuple[str, str | None]]: 

248 for dimension_name in self._dimensions.data_coordinate_keys: 

249 if dimension_name not in self._removed_dimension_keys: 

250 yield dimension_name, None 

251 # We iterate over DimensionElements and their DimensionRecord columns 

252 # in order to make sure that's predictable. We might want to extract 

253 # these query results positionally in some contexts. 

254 for element_name in self._dimensions.elements: 

255 element = self._dimensions.universe[element_name] 

256 fields_for_element = self._dimension_fields[element_name] 

257 for spec in element.schema.remainder: 

258 if spec.name in fields_for_element: 

259 yield element_name, spec.name 

260 # We sort dataset types and their fields lexicographically just to keep 

261 # our queries from having any dependence on set-iteration order. 

262 for dataset_type in sorted(self._dataset_fields): 

263 for field in sorted(self._dataset_fields[dataset_type]): 

264 yield dataset_type, field 

265 

266 def is_timespan(self, logical_table: str, field: str | None) -> bool: 

267 """Test whether the given column is a timespan. 

268 

269 Parameters 

270 ---------- 

271 logical_table : `str` 

272 Name of the dimension element or dataset type the column belongs 

273 to. 

274 field : `str` or `None` 

275 Column within the logical table, or `None` for dimension key 

276 columns. 

277 

278 Returns 

279 ------- 

280 is_timespan : `bool` 

281 Whether this column is a timespan. 

282 """ 

283 return field == "timespan" 

284 

285 @staticmethod 

286 def get_qualified_name(logical_table: str, field: str | None) -> str: 

287 """Return string that should be used to fully identify a column. 

288 

289 Parameters 

290 ---------- 

291 logical_table : `str` 

292 Name of the dimension element or dataset type the column belongs 

293 to. 

294 field : `str` or `None` 

295 Column within the logical table, or `None` for dimension key 

296 columns. 

297 

298 Returns 

299 ------- 

300 name : `str` 

301 Fully-qualified name. 

302 """ 

303 return logical_table if field is None else f"{logical_table}:{field}" 

304 

305 def get_column_spec(self, logical_table: str, field: str | None) -> column_spec.ColumnSpec: 

306 """Return a complete description of a column. 

307 

308 Parameters 

309 ---------- 

310 logical_table : `str` 

311 Name of the dimension element or dataset type the column belongs 

312 to. 

313 field : `str` or `None` 

314 Column within the logical table, or `None` for dimension key 

315 columns. 

316 

317 Returns 

318 ------- 

319 spec : `.column_spec.ColumnSpec` 

320 Description of the column. 

321 """ 

322 qualified_name = self.get_qualified_name(logical_table, field) 

323 if field is None: 

324 return self._dimensions.universe.dimensions[logical_table].primary_key.model_copy( 

325 update=dict(name=qualified_name) 

326 ) 

327 if logical_table in self._dimension_fields: 

328 return ( 

329 self._dimensions.universe[logical_table] 

330 .schema.all[field] 

331 .model_copy(update=dict(name=qualified_name)) 

332 ) 

333 match field: 

334 case "dataset_id": 

335 return column_spec.UUIDColumnSpec.model_construct(name=qualified_name, nullable=False) 

336 case "ingest_date": 

337 return column_spec.DateTimeColumnSpec.model_construct(name=qualified_name) 

338 case "run": 

339 return column_spec.StringColumnSpec.model_construct( 

340 name=qualified_name, nullable=False, length=column_spec.COLLECTION_NAME_MAX_LENGTH 

341 ) 

342 case "collection": 

343 return column_spec.StringColumnSpec.model_construct( 

344 name=qualified_name, nullable=False, length=column_spec.COLLECTION_NAME_MAX_LENGTH 

345 ) 

346 case "timespan": 

347 return column_spec.TimespanColumnSpec.model_construct(name=qualified_name, nullable=False) 

348 raise AssertionError(f"Unrecognized column identifiers: {logical_table}, {field}.") 

349 

350 def _get_dimension_keys(self) -> Set[str]: 

351 if not self._removed_dimension_keys: 

352 return self._dimensions.names 

353 else: 

354 return self._dimensions.names - self._removed_dimension_keys