Coverage for python/lsst/daf/butler/dimensions/_schema.py: 25%

176 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-10 10:14 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = ("addDimensionForeignKey", "DimensionRecordSchema") 

30 

31import copy 

32from collections.abc import Mapping, Set 

33from typing import TYPE_CHECKING 

34 

35from lsst.utils.classes import cached_getter, immutable 

36 

37from .. import arrow_utils, ddl 

38from .._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag 

39from .._named import NamedValueAbstractSet, NamedValueSet 

40from ..column_spec import RegionColumnSpec, TimespanColumnSpec 

41from ..timespan_database_representation import TimespanDatabaseRepresentation 

42 

43if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

44 from lsst.daf.relation import ColumnTag 

45 

46 from ._elements import Dimension, DimensionElement, KeyColumnSpec, MetadataColumnSpec 

47 from ._group import DimensionGroup 

48 

49 

@immutable
class DimensionRecordSchema:
    """Categorized column specifications for one dimension element's records.

    Instances of this class should be obtained via `DimensionElement.schema`,
    where they are cached on first use.

    Parameters
    ----------
    element : `DimensionElement`
        Element this object describes.
    """

    def __init__(self, element: DimensionElement):
        self.element = element
        # Each set is assembled and frozen as a local first, then bound to
        # its attribute exactly once at the end of construction.
        required_keys = NamedValueSet()
        implied_keys = NamedValueSet()
        dimension_keys = NamedValueSet()
        for required_dim in element.required:
            if required_dim != element:
                # Key columns for other dimensions are named after the
                # dimension they refer to.
                spec = required_dim.primary_key.model_copy(update={"name": required_dim.name})
            else:
                # A Dimension instance is in its own required dependency graph
                # (always at the end, because of topological ordering). In
                # this case we don't want to rename the field.
                spec = element.primary_key  # type: ignore
            required_keys.add(spec)
            dimension_keys.add(spec)
        for implied_dim in element.implied:
            spec = implied_dim.primary_key.model_copy(update={"name": implied_dim.name})
            implied_keys.add(spec)
            dimension_keys.add(spec)
        non_dimension = NamedValueSet()
        # Non-primary unique keys come first, then other metadata columns.
        non_dimension.update(element.alternate_keys)
        non_dimension.update(element.metadata_columns)
        if element.spatial:
            non_dimension.add(RegionColumnSpec(nullable=True))
        if element.temporal:
            non_dimension.add(TimespanColumnSpec(nullable=True))
        every_column = NamedValueSet()
        every_column.update(dimension_keys)
        every_column.update(non_dimension)
        for column_set in (required_keys, implied_keys, dimension_keys, non_dimension, every_column):
            column_set.freeze()
        self.required = required_keys
        self.implied = implied_keys
        self.dimensions = dimension_keys
        self.remainder = non_dimension
        self.all = every_column

    element: DimensionElement
    """The dimension element whose records these columns describe.

    (`DimensionElement`)
    """

    required: NamedValueAbstractSet[KeyColumnSpec]
    """Key columns for the element's required dimensions.

    One entry per member of `DimensionElement.required`, in the same order.
    """

    implied: NamedValueAbstractSet[KeyColumnSpec]
    """Key columns for the element's implied dimensions.

    One entry per member of `DimensionElement.implied`, in the same order.
    """

    dimensions: NamedValueAbstractSet[KeyColumnSpec]
    """Key columns for required dimensions followed by implied dimensions.

    The elements of this set correspond to `DimensionElement.dimensions`, in
    the same order.
    """

    remainder: NamedValueAbstractSet[MetadataColumnSpec | RegionColumnSpec | TimespanColumnSpec]
    """Columns that are not dimension keys.

    Holds alternate keys, then metadata columns, then a region column and/or
    a timespan column when the element is spatial and/or temporal.
    """

    all: NamedValueAbstractSet[MetadataColumnSpec | RegionColumnSpec | TimespanColumnSpec]
    """Every column: the contents of `dimensions` followed by `remainder`."""

    @property
    def names(self) -> Set[str]:
        """The names of all columns in `all`, in order."""
        return self.all.names

    def __str__(self) -> str:
        header = f"{self.element.name}: "
        body = [line for column_spec in self.all for line in column_spec.display(level=1)]
        return "\n".join([header, *body])

    def to_arrow(
        self, remainder_only: bool = False, dimensions: DimensionGroup | None = None
    ) -> list[arrow_utils.ToArrow]:
        """Build Arrow converters for the columns of this schema.

        Parameters
        ----------
        remainder_only : `bool`, optional
            If `True`, skip the fields in `dimensions` and convert only those
            in `remainder`.
        dimensions : `DimensionGroup`, optional
            Full set of dimensions over which the rows of the table are unique
            or close to unique.  This is used to determine whether to use
            Arrow's dictionary encoding to compress duplicate values.  Defaults
            to this element's `~DimensionElement.minimal_group`, which is
            appropriate for tables of just the records of this element.

        Returns
        -------
        converters : `list` [ `arrow_utils.ToArrow` ]
            List of objects that can convert `DimensionRecord` attribute values
            to Arrow records, corresponding exactly to either `all` or
            `remainder`, depending on ``remainder_only``.
        """
        group = self.element.minimal_group if dimensions is None else dimensions
        converters: list[arrow_utils.ToArrow] = []
        if not remainder_only:
            converters.extend(
                dimension.to_arrow(group, key_spec)
                for dimension, key_spec in zip(self.element.dimensions, self.dimensions)
            )
        for remainder_spec in self.remainder:
            converter = remainder_spec.to_arrow()
            # String columns are dictionary-encoded when they are metadata
            # columns, or when the table spans more than this element's own
            # minimal group of dimensions.
            if remainder_spec.type == "string" and (
                remainder_spec.name in self.element.metadata_columns.names
                or group != self.element.minimal_group
            ):
                converter = converter.dictionary_encoded()
            converters.append(converter)
        return converters

186 

187 

def _makeForeignKeySpec(dimension: Dimension) -> ddl.ForeignKeySpec:
    """Build the `ddl.ForeignKeySpec` that points at a dimension's table.

    Most callers should use the higher-level `addDimensionForeignKey` function
    instead.

    Parameters
    ----------
    dimension : `Dimension`
        The dimension to be referenced.  Caller guarantees that it is actually
        associated with a table.

    Returns
    -------
    spec : `ddl.ForeignKeySpec`
        A database-agnostic foreign key specification.
    """
    # One (source, target) pair per required dimension.  The source column is
    # always named after the required dimension; the target column uses the
    # primary key's own name when the required dimension is ``dimension``
    # itself, and the required dimension's name otherwise.
    column_pairs = [
        (
            other.name,
            dimension.primaryKey.name if other == dimension else other.name,
        )
        for other in dimension.required
    ]
    return ddl.ForeignKeySpec(
        table=dimension.name,
        source=tuple(source_name for source_name, _ in column_pairs),
        target=tuple(target_name for _, target_name in column_pairs),
    )

216 

217 

def addDimensionForeignKey(
    tableSpec: ddl.TableSpec,
    dimension: Dimension,
    *,
    primaryKey: bool,
    nullable: bool = False,
    constraint: bool = True,
) -> ddl.FieldSpec:
    """Add a key field for a dimension, and optionally a foreign key, to a
    table specification.

    The field will reference the table for the given `Dimension`.

    Parameters
    ----------
    tableSpec : `ddl.TableSpec`
        Specification the field and foreign key are to be added to.
    dimension : `Dimension`
        Dimension to be referenced.  If this dimension has required
        dependencies, those must have already been added to the table.  A
        field will be added that corresponds to this dimension's primary key,
        and a foreign key constraint will be added only if the dimension is
        associated with a table of its own.
    primaryKey : `bool`
        If `True`, the new field will be added as part of a compound primary
        key for the table.
    nullable : `bool`, optional
        If `False` (default) the new field will be added with a NOT NULL
        constraint.
    constraint : `bool`
        If `False` (`True` is default), just add the field, not the foreign
        key constraint.

    Returns
    -------
    fieldSpec : `ddl.FieldSpec`
        Specification for the field just added.
    """
    # Work on a shallow copy of the dimension's primary key field, named
    # after the dimension so it is unique and meaningful in this table.
    key_field = copy.copy(dimension.primaryKey)
    key_field.name = dimension.name
    key_field.primaryKey = primaryKey
    key_field.nullable = nullable
    tableSpec.fields.add(key_field)
    # Nothing more to do unless the dimension has its own table to point at
    # and the caller asked for the constraint.
    if not (dimension.has_own_table and constraint):
        return key_field
    tableSpec.foreignKeys.append(_makeForeignKeySpec(dimension))
    return key_field

267 

268 

class DimensionElementFields:
    """Builder for, and categorized view of, a `DimensionElement` table schema.

    Parameters
    ----------
    element : `DimensionElement`
        Element for which to make a table specification.

    Notes
    -----
    This combines the foreign key fields from dependencies, unique keys
    for true `Dimension` instances, metadata fields, and region/timespan
    fields for spatial/temporal elements.

    Callers should use `DimensionUniverse.makeSchemaSpec` if they want to
    account for elements that have no table or reference another table; this
    class simply creates a specification for the table an element _would_ have
    without checking whether it does have one.  That can be useful in contexts
    (e.g. `DimensionRecord`) where we want to simulate the existence of such a
    table.
    """

    def __init__(self, element: DimensionElement):
        self.element = element
        self._tableSpec = ddl.TableSpec(fields=())
        self.required = NamedValueSet()
        self.dimensions = NamedValueSet()
        self.facts = NamedValueSet()
        self.standard = NamedValueSet()
        # Primary key fields of required dimensions.  These continue to be
        # primary keys in the table for this element.
        dependency_names = []
        for dimension in element.required:
            if dimension != element:
                key_field = addDimensionForeignKey(self._tableSpec, dimension, primaryKey=True)
                dependency_names.append(key_field.name)
            else:
                # A Dimension instance is in its own required dependency graph
                # (always at the end, because of topological ordering).  In
                # this case we don't want to rename the field.
                key_field = element.primaryKey  # type: ignore
                self._tableSpec.fields.add(key_field)
            for category in (self.required, self.dimensions, self.standard):
                category.add(key_field)
        # Fields and foreign keys for implied dimensions.  These are primary
        # keys in their own table, but should not be here.  As with required
        # dependencies, the fields are renamed with the dimension name.
        # element.implied is used instead of element.graph.implied because
        # *recursive* implied dependencies are not wanted.
        self.implied = NamedValueSet()
        for dimension in element.implied:
            key_field = addDimensionForeignKey(self._tableSpec, dimension, primaryKey=False, nullable=False)
            for category in (self.implied, self.dimensions, self.standard):
                category.add(key_field)
        # Non-primary unique keys, each with a unique constraint that also
        # covers the required dependencies.
        for alternate in getattr(element, "alternateKeys", ()):
            self._tableSpec.fields.add(alternate)
            self._tableSpec.unique.add((*dependency_names, alternate.name))
            self.standard.add(alternate)
            self.facts.add(alternate)
        # Other metadata fields.
        for metadata_field in element.metadata:
            self._tableSpec.fields.add(metadata_field)
            self.standard.add(metadata_field)
            self.facts.add(metadata_field)
        # Region and/or timespan are appended to the names only; no field
        # specifications are added for them here.
        special_names = []
        if element.spatial is not None:
            special_names.append("region")
        if element.temporal is not None:
            special_names.append(TimespanDatabaseRepresentation.NAME)
        self.names = (*self.standard.names, *special_names)

    def makeTableSpec(
        self,
        TimespanReprClass: type[TimespanDatabaseRepresentation],
    ) -> ddl.TableSpec:
        """Construct a complete specification for a table.

        The table could hold the records of this element.

        Parameters
        ----------
        TimespanReprClass : `type` [ `TimespanDatabaseRepresentation` ]
            Class object that specifies how timespans are represented in the
            database.

        Returns
        -------
        spec : `ddl.TableSpec`
            Specification for a table.
        """
        has_timespan = self.element.temporal is not None
        has_region = self.element.spatial is not None
        if not (has_timespan or has_region):
            # Nothing to append, so the internally held specification is
            # returned as-is.
            return self._tableSpec
        # Only the set of fields is copied; the unique, indexes and
        # foreignKeys objects are passed through from the internal spec.
        spec = ddl.TableSpec(
            fields=NamedValueSet(self._tableSpec.fields),
            unique=self._tableSpec.unique,
            indexes=self._tableSpec.indexes,
            foreignKeys=self._tableSpec.foreignKeys,
        )
        if has_region:
            spec.fields.add(ddl.FieldSpec.for_region())
        if has_timespan:
            spec.fields.update(TimespanReprClass.makeFieldSpecs(nullable=True))
        return spec

    def __str__(self) -> str:
        # NOTE(review): the leading whitespace inside these literals is
        # reproduced as it appears in the extracted listing - confirm against
        # the original file.
        lines = [f"{self.element.name}: "]
        for field in self.standard:
            lines.append(f" {field.name}: {field.getPythonType().__name__}")
        if self.element.spatial is not None:
            lines.append(" region: lsst.sphgeom.Region")
        if self.element.temporal is not None:
            lines.append(" timespan: lsst.daf.butler.Timespan")
        return "\n".join(lines)

    @property
    @cached_getter
    def columns(self) -> Mapping[ColumnTag, str]:
        """A mapping from `ColumnTag` to field name for all fields in this
        element's records (`~collections.abc.Mapping`).
        """
        element_name = self.element.name
        # Dimension key columns: pair each dimension name with the name of
        # the field that holds it; the two sequences must be equally long.
        result: dict[ColumnTag, str] = {
            DimensionKeyColumnTag(dimension_name): field_name
            for dimension_name, field_name in zip(
                self.element.dimensions.names, self.dimensions.names, strict=True
            )
        }
        result.update(
            (DimensionRecordColumnTag(element_name, field_name), field_name)
            for field_name in self.facts.names
        )
        if self.element.spatial:
            result[DimensionRecordColumnTag(element_name, "region")] = "region"
        if self.element.temporal:
            result[DimensionRecordColumnTag(element_name, "timespan")] = "timespan"
        return result

    element: DimensionElement
    """The dimension element this schema was built for.

    (`DimensionElement`)
    """

    required: NamedValueSet[ddl.FieldSpec]
    """Fields for the element's required dimensions.

    They correspond to the element's required
    dimensions, in that order, i.e. `DimensionElement.required`
    (`NamedValueSet` [ `ddl.FieldSpec` ]).
    """

    implied: NamedValueSet[ddl.FieldSpec]
    """Fields for the element's implied dimensions.

    They correspond to the element's implied
    dimensions, in that order, i.e. `DimensionElement.implied`
    (`NamedValueSet` [ `ddl.FieldSpec` ]).
    """

    dimensions: NamedValueSet[ddl.FieldSpec]
    """Fields for the required dimensions followed by the implied dimensions.

    They correspond to the element's direct
    required and implied dimensions, in that order, i.e.
    `DimensionElement.dimensions` (`NamedValueSet` [ `ddl.FieldSpec` ]).
    """

    facts: NamedValueSet[ddl.FieldSpec]
    """Standard fields that are not dimension fields.

    (`NamedValueSet` [ `ddl.FieldSpec` ]).

    This is equivalent to ``standard - dimensions`` (but possibly in a
    different order).
    """

    standard: NamedValueSet[ddl.FieldSpec]
    """All fields other than those that represent a region and/or timespan.

    They are expected to have the same form in all
    databases (`NamedValueSet` [ `ddl.FieldSpec` ]).
    """

    names: tuple[str, ...]
    """The names of all fields in the specification (`tuple` [ `str` ]).

    This includes "region" and/or "timespan" if `element` is spatial and/or
    temporal (respectively).  The actual database representation of these
    quantities may involve multiple fields (or even fields only on a different
    table), but the Python representation of those rows (i.e. `DimensionRecord`
    instances) will always contain exactly these fields.
    """

464 """