Coverage for python/lsst/daf/butler/queries/tree/_column_set.py: 25%

100 statements  

« prev     ^ index     » next       coverage.py v7.5.0, created at 2024-05-02 10:24 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ("ColumnSet",) 

31 

32from collections.abc import Iterable, Iterator, Mapping, Set 

33 

34from ... import column_spec 

35from ...dimensions import DimensionGroup 

36from ...nonempty_mapping import NonemptyMapping 

37 

38 

39class ColumnSet: 

40 """A set-like hierarchical container for the columns in a query. 

41 

42 Parameters 

43 ---------- 

44 dimensions : `DimensionGroup` 

45 The dimensions that bound the set of columns, and by default specify 

46 the set of dimension key columns present. 

47 

48 Notes 

49 ----- 

50 This class does not inherit from `collections.abc.Set` because that brings 

51 in a lot of requirements we don't need (particularly interoperability with 

52 other set-like objects). 

53 

54 This class is iterable over tuples of ``(logical_table, field)``, where 

55 ``logical_table`` is a dimension element name or dataset type name, and 

56 ``field`` is a column associated with one of those, or `None` for dimension 

57 key columns. Iteration order is guaranteed to be deterministic and to 

58 start with all included dimension keys in 

59 `DimensionGroup.data_coordinate_keys`. 

60 """ 

61 

62 def __init__(self, dimensions: DimensionGroup) -> None: 

63 self._dimensions = dimensions 

64 self._removed_dimension_keys: set[str] = set() 

65 self._dimension_fields: dict[str, set[str]] = {name: set() for name in dimensions.elements} 

66 self._dataset_fields = NonemptyMapping[str, set[str]](set) 

67 

68 @property 

69 def dimensions(self) -> DimensionGroup: 

70 """The dimensions that bound all columns in the set.""" 

71 return self._dimensions 

72 

73 @property 

74 def dimension_fields(self) -> Mapping[str, set[str]]: 

75 """Dimension record fields included in the set, grouped by dimension 

76 element name. 

77 

78 The keys of this mapping are always ``self.dimensions.elements``, and 

79 nested sets may be empty. 

80 """ 

81 return self._dimension_fields 

82 

83 @property 

84 def dataset_fields(self) -> NonemptyMapping[str, set[str]]: 

85 """Dataset fields included in the set, grouped by dataset type name. 

86 

87 The keys of this mapping are just those that actually have nonempty 

88 nested sets. 

89 """ 

90 return self._dataset_fields 

91 

92 def __bool__(self) -> bool: 

93 return bool(self._dimensions) or any(self._dataset_fields.values()) 

94 

95 def __eq__(self, other: object) -> bool: 

96 if not isinstance(other, ColumnSet): 

97 return False 

98 return ( 

99 self._dimensions == other._dimensions 

100 and self._removed_dimension_keys == other._removed_dimension_keys 

101 and self._dimension_fields == other._dimension_fields 

102 and self._dataset_fields == other._dataset_fields 

103 ) 

104 

105 def __str__(self) -> str: 

106 return f"{{{', '.join(self.get_qualified_name(k, v) for k, v in self)}}}" 

107 

108 def issubset(self, other: ColumnSet) -> bool: 

109 """Test whether all columns in this set are also in another. 

110 

111 Parameters 

112 ---------- 

113 other : `ColumnSet` 

114 Set of columns to compare to. 

115 

116 Returns 

117 ------- 

118 issubset : `bool` 

119 Whether all columns in ``self`` are also in ``other``. 

120 """ 

121 return ( 

122 (self._get_dimension_keys() <= other._get_dimension_keys()) 

123 and all( 

124 fields.issubset(other._dimension_fields.get(element_name, frozenset())) 

125 for element_name, fields in self._dimension_fields.items() 

126 ) 

127 and all( 

128 fields.issubset(other._dataset_fields.get(dataset_type, frozenset())) 

129 for dataset_type, fields in self._dataset_fields.items() 

130 ) 

131 ) 

132 

133 def issuperset(self, other: ColumnSet) -> bool: 

134 """Test whether all columns another set are also in this one. 

135 

136 Parameters 

137 ---------- 

138 other : `ColumnSet` 

139 Set of columns to compare to. 

140 

141 Returns 

142 ------- 

143 issuperset : `bool` 

144 Whether all columns in ``other`` are also in ``self``. 

145 """ 

146 return other.issubset(self) 

147 

148 def isdisjoint(self, other: ColumnSet) -> bool: 

149 """Test whether there are no columns in both this set and another. 

150 

151 Parameters 

152 ---------- 

153 other : `ColumnSet` 

154 Set of columns to compare to. 

155 

156 Returns 

157 ------- 

158 isdisjoint : `bool` 

159 Whether there are any columns in both ``self`` and ``other``. 

160 """ 

161 return ( 

162 self._get_dimension_keys().isdisjoint(other._get_dimension_keys()) 

163 and all( 

164 fields.isdisjoint(other._dimension_fields.get(element, frozenset())) 

165 for element, fields in self._dimension_fields.items() 

166 ) 

167 and all( 

168 fields.isdisjoint(other._dataset_fields.get(dataset_type, frozenset())) 

169 for dataset_type, fields in self._dataset_fields.items() 

170 ) 

171 ) 

172 

173 def copy(self) -> ColumnSet: 

174 """Return a copy of this set. 

175 

176 Returns 

177 ------- 

178 copy : `ColumnSet` 

179 New column set that can be modified without changing the original. 

180 """ 

181 result = ColumnSet(self._dimensions) 

182 for element_name, element_fields in self._dimension_fields.items(): 

183 result._dimension_fields[element_name].update(element_fields) 

184 for dataset_type, dataset_fields in self._dataset_fields.items(): 

185 result._dataset_fields[dataset_type].update(dataset_fields) 

186 return result 

187 

188 def update_dimensions(self, dimensions: DimensionGroup) -> None: 

189 """Add new dimensions to the set. 

190 

191 Parameters 

192 ---------- 

193 dimensions : `DimensionGroup` 

194 Dimensions to be included. 

195 """ 

196 if not dimensions.issubset(self._dimensions): 

197 self._dimensions = dimensions.union(self._dimensions) 

198 self._dimension_fields = { 

199 name: self._dimension_fields.get(name, set()) for name in self._dimensions.elements 

200 } 

201 self._removed_dimension_keys.intersection_update(dimensions.names) 

202 

203 def update(self, other: ColumnSet) -> None: 

204 """Add columns from another set to this one. 

205 

206 Parameters 

207 ---------- 

208 other : `ColumnSet` 

209 Column set whose columns should be included in this one. 

210 """ 

211 self.update_dimensions(other.dimensions) 

212 self._removed_dimension_keys.intersection_update(other._removed_dimension_keys) 

213 for element_name, element_fields in other._dimension_fields.items(): 

214 self._dimension_fields[element_name].update(element_fields) 

215 for dataset_type, dataset_fields in other._dataset_fields.items(): 

216 self._dataset_fields[dataset_type].update(dataset_fields) 

217 

218 def drop_dimension_keys(self, names: Iterable[str]) -> ColumnSet: 

219 """Remove the given dimension key columns from the set. 

220 

221 Parameters 

222 ---------- 

223 names : `~collections.abc.Iterable` [ `str` ] 

224 Names of the dimensions to remove. 

225 

226 Returns 

227 ------- 

228 self : `ColumnSet` 

229 This column set, modified in place. 

230 """ 

231 self._removed_dimension_keys.update(names) 

232 return self 

233 

234 def drop_implied_dimension_keys(self) -> ColumnSet: 

235 """Remove dimension key columns that are implied by others. 

236 

237 Returns 

238 ------- 

239 self : `ColumnSet` 

240 This column set, modified in place. 

241 """ 

242 return self.drop_dimension_keys(self._dimensions.implied) 

243 

244 def restore_dimension_keys(self) -> None: 

245 """Restore all removed dimension key columns.""" 

246 self._removed_dimension_keys.clear() 

247 

248 def __iter__(self) -> Iterator[tuple[str, str | None]]: 

249 for dimension_name in self._dimensions.data_coordinate_keys: 

250 if dimension_name not in self._removed_dimension_keys: 

251 yield dimension_name, None 

252 # We iterate over DimensionElements and their DimensionRecord columns 

253 # in order to make sure that's predictable. We might want to extract 

254 # these query results positionally in some contexts. 

255 for element_name in self._dimensions.elements: 

256 element = self._dimensions.universe[element_name] 

257 fields_for_element = self._dimension_fields[element_name] 

258 for spec in element.schema.remainder: 

259 if spec.name in fields_for_element: 

260 yield element_name, spec.name 

261 # We sort dataset types and their fields lexicographically just to keep 

262 # our queries from having any dependence on set-iteration order. 

263 for dataset_type in sorted(self._dataset_fields): 

264 for field in sorted(self._dataset_fields[dataset_type]): 

265 yield dataset_type, field 

266 

267 def is_timespan(self, logical_table: str, field: str | None) -> bool: 

268 """Test whether the given column is a timespan. 

269 

270 Parameters 

271 ---------- 

272 logical_table : `str` 

273 Name of the dimension element or dataset type the column belongs 

274 to. 

275 field : `str` or `None` 

276 Column within the logical table, or `None` for dimension key 

277 columns. 

278 

279 Returns 

280 ------- 

281 is_timespan : `bool` 

282 Whether this column is a timespan. 

283 """ 

284 return field == "timespan" 

285 

286 @staticmethod 

287 def get_qualified_name(logical_table: str, field: str | None) -> str: 

288 """Return string that should be used to fully identify a column. 

289 

290 Parameters 

291 ---------- 

292 logical_table : `str` 

293 Name of the dimension element or dataset type the column belongs 

294 to. 

295 field : `str` or `None` 

296 Column within the logical table, or `None` for dimension key 

297 columns. 

298 

299 Returns 

300 ------- 

301 name : `str` 

302 Fully-qualified name. 

303 """ 

304 return logical_table if field is None else f"{logical_table}:{field}" 

305 

306 def get_column_spec(self, logical_table: str, field: str | None) -> column_spec.ColumnSpec: 

307 """Return a complete description of a column. 

308 

309 Parameters 

310 ---------- 

311 logical_table : `str` 

312 Name of the dimension element or dataset type the column belongs 

313 to. 

314 field : `str` or `None` 

315 Column within the logical table, or `None` for dimension key 

316 columns. 

317 

318 Returns 

319 ------- 

320 spec : `.column_spec.ColumnSpec` 

321 Description of the column. 

322 """ 

323 qualified_name = self.get_qualified_name(logical_table, field) 

324 if field is None: 

325 return self._dimensions.universe.dimensions[logical_table].primary_key.model_copy( 

326 update=dict(name=qualified_name) 

327 ) 

328 if logical_table in self._dimension_fields: 

329 return ( 

330 self._dimensions.universe[logical_table] 

331 .schema.all[field] 

332 .model_copy(update=dict(name=qualified_name)) 

333 ) 

334 match field: 

335 case "dataset_id": 

336 return column_spec.UUIDColumnSpec.model_construct(name=qualified_name, nullable=False) 

337 case "ingest_date": 

338 return column_spec.DateTimeColumnSpec.model_construct(name=qualified_name) 

339 case "run": 

340 return column_spec.StringColumnSpec.model_construct( 

341 name=qualified_name, nullable=False, length=column_spec.COLLECTION_NAME_MAX_LENGTH 

342 ) 

343 case "collection": 

344 return column_spec.StringColumnSpec.model_construct( 

345 name=qualified_name, nullable=False, length=column_spec.COLLECTION_NAME_MAX_LENGTH 

346 ) 

347 case "timespan": 

348 return column_spec.TimespanColumnSpec.model_construct(name=qualified_name, nullable=False) 

349 raise AssertionError(f"Unrecognized column identifiers: {logical_table}, {field}.") 

350 

351 def _get_dimension_keys(self) -> Set[str]: 

352 if not self._removed_dimension_keys: 

353 return self._dimensions.names 

354 else: 

355 return self._dimensions.names - self._removed_dimension_keys