Coverage for python / lsst / daf / butler / dimensions / _schema.py: 23%

162 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-26 08:49 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = ("DimensionRecordSchema", "addDimensionForeignKey") 

30 

31import copy 

32from collections.abc import Set 

33from typing import TYPE_CHECKING 

34 

35from lsst.utils.classes import immutable 

36 

37from .. import arrow_utils, ddl 

38from .._named import NamedValueAbstractSet, NamedValueSet 

39from ..column_spec import RegionColumnSpec, TimespanColumnSpec 

40from ..timespan_database_representation import TimespanDatabaseRepresentation 

41 

42if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

43 from ._elements import Dimension, DimensionElement, KeyColumnSpec, MetadataColumnSpec 

44 from ._group import DimensionGroup 

45 

46 

@immutable
class DimensionRecordSchema:
    """Describe the columns that make up a dimension element's records.

    Rather than constructing this class directly, obtain instances from
    `DimensionElement.schema`, where they are cached on first use.

    Parameters
    ----------
    element : `DimensionElement`
        Element this object describes.
    """

    element: DimensionElement
    """The dimension element whose records this schema describes
    (`DimensionElement`).
    """

    required: NamedValueAbstractSet[KeyColumnSpec]
    """Key columns for the required dimensions of this element's records.

    Entries correspond to `DimensionElement.required`, in the same order.
    """

    implied: NamedValueAbstractSet[KeyColumnSpec]
    """Key columns for the implied dimensions of this element's records.

    Entries correspond to `DimensionElement.implied`, in the same order.
    """

    dimensions: NamedValueAbstractSet[KeyColumnSpec]
    """Key columns for both the required and the implied dimensions.

    Entries correspond to `DimensionElement.dimensions`, in the same order.
    """

    remainder: NamedValueAbstractSet[MetadataColumnSpec | RegionColumnSpec | TimespanColumnSpec]
    """Columns that are not dimension keys.

    Holds the alternate keys, the metadata columns, and the region and/or
    timespan column when the element has one.
    """

    all: NamedValueAbstractSet[MetadataColumnSpec | RegionColumnSpec | TimespanColumnSpec]
    """Every column of this element's records, in order: the contents of
    ``dimensions`` followed by the contents of ``remainder``.
    """

    def __init__(self, element: DimensionElement):
        self.element = element

        # Build every set locally and publish each attribute exactly once
        # after it has been frozen.
        required_keys: NamedValueSet[KeyColumnSpec] = NamedValueSet()
        implied_keys: NamedValueSet[KeyColumnSpec] = NamedValueSet()
        dimension_keys: NamedValueSet[KeyColumnSpec] = NamedValueSet()
        extra_columns: NamedValueSet[MetadataColumnSpec | RegionColumnSpec | TimespanColumnSpec] = (
            NamedValueSet()
        )
        every_column: NamedValueSet[MetadataColumnSpec | RegionColumnSpec | TimespanColumnSpec] = (
            NamedValueSet()
        )

        for dependency in element.required:
            if dependency != element:
                # Columns that refer to another dimension are named after
                # that dimension rather than after its primary key.
                key_column = dependency.primary_key.model_copy(update={"name": dependency.name})
            else:
                # A Dimension instance is in its own required dependency
                # graph (always at the end, because of topological
                # ordering); its own key keeps its original name.
                key_column = element.primary_key  # type: ignore
            required_keys.add(key_column)
            dimension_keys.add(key_column)

        for dependency in element.implied:
            key_column = dependency.primary_key.model_copy(update={"name": dependency.name})
            implied_keys.add(key_column)
            dimension_keys.add(key_column)

        every_column.update(dimension_keys)

        # Non-dimension columns, in order: alternate (non-primary unique)
        # keys, other metadata columns, then region and timespan.
        extra_columns.update(element.alternate_keys)
        extra_columns.update(element.metadata_columns)
        if element.spatial:
            extra_columns.add(RegionColumnSpec(nullable=True))
        if element.temporal:
            extra_columns.add(TimespanColumnSpec(nullable=True))
        every_column.update(extra_columns)

        for column_set in (required_keys, implied_keys, dimension_keys, extra_columns, every_column):
            column_set.freeze()

        self.required = required_keys
        self.implied = implied_keys
        self.dimensions = dimension_keys
        self.remainder = extra_columns
        self.all = every_column

    @property
    def names(self) -> Set[str]:
        """The names of every column in ``all``, in order."""
        return self.all.names

    def __str__(self) -> str:
        # One header line naming the element, followed by whatever lines each
        # column spec produces for itself at indent level 1.
        header = f"{self.element.name}: "
        body = [line for column_spec in self.all for line in column_spec.display(level=1)]
        return "\n".join([header, *body])

    def to_arrow(
        self, remainder_only: bool = False, dimensions: DimensionGroup | None = None
    ) -> list[arrow_utils.ToArrow]:
        """Convert this schema to Arrow form.

        Parameters
        ----------
        remainder_only : `bool`, optional
            If `True`, skip the fields in `dimensions` and convert only those
            in ``remainder``.
        dimensions : `DimensionGroup`, optional
            Full set of dimensions over which the rows of the table are unique
            or close to unique.  This is used to determine whether to use
            Arrow's dictionary encoding to compress duplicate values.  Defaults
            to this element's `~DimensionElement.minimal_group`, which is
            appropriate for tables of just the records of this element.

        Returns
        -------
        converters : `list` [ `~lsst.daf.butler.arrow_utils.ToArrow` ]
            List of objects that can convert `DimensionRecord` attribute values
            to Arrow records, corresponding exactly to either ``all`` or
            ``remainder``, depending on ``remainder_only``.
        """
        if dimensions is None:
            dimensions = self.element.minimal_group

        converters: list[arrow_utils.ToArrow] = []
        if not remainder_only:
            # Each dimension builds the converter for its own key column.
            converters.extend(
                dimension.to_arrow(dimensions, key_spec)
                for dimension, key_spec in zip(self.element.dimensions, self.dimensions)
            )

        for column in self.remainder:
            converter = column.to_arrow()
            # String columns are dictionary-encoded when they are metadata
            # columns, or when the table spans dimensions other than this
            # element's minimal group.
            if column.type == "string" and (
                column.name in self.element.metadata_columns.names
                or dimensions != self.element.minimal_group
            ):
                converter = converter.dictionary_encoded()
            converters.append(converter)
        return converters

183 

184 

def _makeForeignKeySpec(dimension: Dimension) -> ddl.ForeignKeySpec:
    """Build the `ddl.ForeignKeySpec` that points at a dimension's table.

    Most callers should use the higher-level `addDimensionForeignKey` function
    instead.

    Parameters
    ----------
    dimension : `Dimension`
        The dimension to be referenced.  Caller guarantees that it is actually
        associated with a table.

    Returns
    -------
    spec : `ddl.ForeignKeySpec`
        A database-agnostic foreign key specification.
    """
    # Walk the required dimensions once so both column tuples are derived from
    # the same sequence and therefore have the same length and order.
    required = tuple(dimension.required)
    # Columns in the referring table are always named after the dimension.
    source = tuple(other.name for other in required)
    # In the referenced table the dimension's own key keeps its primary-key
    # name; every other required dimension uses the dimension name.
    target = tuple(
        dimension.primaryKey.name if other == dimension else other.name for other in required
    )
    return ddl.ForeignKeySpec(table=dimension.name, source=source, target=target)

213 

214 

def addDimensionForeignKey(
    tableSpec: ddl.TableSpec,
    dimension: Dimension,
    *,
    primaryKey: bool,
    nullable: bool = False,
    constraint: bool = True,
) -> ddl.FieldSpec:
    """Add a dimension-referencing field to a table specification.

    The new field refers to the given `Dimension`; a foreign key constraint
    on that dimension's table is added as well when applicable.

    Parameters
    ----------
    tableSpec : `ddl.TableSpec`
        Specification the field and foreign key are to be added to.
    dimension : `Dimension`
        Dimension to be referenced.  If this dimension has required
        dependencies, those must have already been added to the table.  A
        field will be added that corresponds to this dimension's primary key,
        and a foreign key constraint will be added only if the dimension is
        associated with a table of its own.
    primaryKey : `bool`
        If `True`, the new field will be added as part of a compound primary
        key for the table.
    nullable : `bool`, optional
        If `False` (default) the new field will be added with a NOT NULL
        constraint.
    constraint : `bool`
        If `False` (`True` is default), just add the field, not the foreign
        key constraint.

    Returns
    -------
    fieldSpec : `ddl.FieldSpec`
        Specification for the field just added.
    """
    # Work on a copy of the dimension's primary key field so the dimension's
    # own spec is left alone, and name the copy after the dimension so it is
    # unique and meaningful inside the referring table.
    new_field = copy.copy(dimension.primaryKey)
    new_field.name = dimension.name
    new_field.primaryKey = primaryKey
    new_field.nullable = nullable
    tableSpec.fields.add(new_field)

    # Nothing more to do unless the dimension has a table to point at and the
    # caller asked for a constraint.
    if not (dimension.has_own_table and constraint):
        return new_field

    tableSpec.foreignKeys.append(_makeForeignKeySpec(dimension))
    return new_field

264 

265 

class DimensionElementFields:
    """Build the table schema for a `DimensionElement`.

    Instances construct the table specification for a `DimensionElement` and
    expose a categorized view of its fields.

    Parameters
    ----------
    element : `DimensionElement`
        Element for which to make a table specification.

    Notes
    -----
    The specification combines the foreign key fields from dependencies,
    unique keys for true `Dimension` instances, metadata fields, and
    region/timespan fields for spatial/temporal elements.

    Callers should use `DimensionUniverse.makeSchemaSpec` if they want to
    account for elements that have no table or reference another table; this
    class simply creates a specification for the table an element _would_ have
    without checking whether it does have one.  That can be useful in contexts
    (e.g. `DimensionRecord`) where we want to simulate the existence of such a
    table.
    """

    element: DimensionElement
    """The dimension element these fields belong to (`DimensionElement`)."""

    required: NamedValueSet[ddl.FieldSpec]
    """Fields for the element's required dimensions
    (`NamedValueSet` [ `ddl.FieldSpec` ]).

    Ordered like `DimensionElement.required`.
    """

    implied: NamedValueSet[ddl.FieldSpec]
    """Fields for the element's implied dimensions
    (`NamedValueSet` [ `ddl.FieldSpec` ]).

    Ordered like `DimensionElement.implied`.
    """

    dimensions: NamedValueSet[ddl.FieldSpec]
    """Fields for the element's direct required and implied dimensions
    (`NamedValueSet` [ `ddl.FieldSpec` ]).

    Required fields come first, then implied ones, i.e. the order of
    `DimensionElement.dimensions`.
    """

    facts: NamedValueSet[ddl.FieldSpec]
    """Standard fields that do not correspond to dimensions
    (`NamedValueSet` [ `ddl.FieldSpec` ]).

    This is equivalent to ``standard - dimensions`` (but possibly in a
    different order).
    """

    standard: NamedValueSet[ddl.FieldSpec]
    """Fields expected to have the same form in all databases
    (`NamedValueSet` [ `ddl.FieldSpec` ]).

    That is every field except those representing a region and/or timespan.
    """

    names: tuple[str, ...]
    """The names of all fields in the specification (`tuple` [ `str` ]).

    This includes "region" and/or "timespan" if `element` is spatial and/or
    temporal (respectively).  The actual database representation of these
    quantities may involve multiple fields (or even fields only on a different
    table), but the Python representation of those rows (i.e. `DimensionRecord`
    instances) will always contain exactly these fields.
    """

    def __init__(self, element: DimensionElement):
        self.element = element
        self._tableSpec = ddl.TableSpec(fields=())
        self.required = NamedValueSet()
        self.implied = NamedValueSet()
        self.dimensions = NamedValueSet()
        self.facts = NamedValueSet()
        self.standard = NamedValueSet()

        # Primary key fields of required dimensions; these stay primary keys
        # in this element's table.
        parent_key_names = []
        for dimension in element.required:
            if dimension != element:
                # addDimensionForeignKey adds the field to the table spec
                # itself, renamed after the dimension it refers to.
                required_field = addDimensionForeignKey(self._tableSpec, dimension, primaryKey=True)
                parent_key_names.append(required_field.name)
            else:
                # A Dimension instance is in its own required dependency
                # graph (always at the end, because of topological
                # ordering).  Its own key is added directly, not renamed.
                required_field = element.primaryKey  # type: ignore
                self._tableSpec.fields.add(required_field)
            for category in (self.required, self.dimensions, self.standard):
                category.add(required_field)

        # Fields and foreign keys for implied dimensions.  These are primary
        # keys in their own tables but not in this one; like required
        # dependencies they are renamed after the dimension.  element.implied
        # is used rather than element.graph.implied because *recursive*
        # implied dependencies are not wanted here.
        for dimension in element.implied:
            implied_field = addDimensionForeignKey(
                self._tableSpec, dimension, primaryKey=False, nullable=False
            )
            for category in (self.implied, self.dimensions, self.standard):
                category.add(implied_field)

        # Non-primary unique keys, each with a unique constraint spanning the
        # names of the other required dimensions plus the key itself.
        unique_prefix = tuple(parent_key_names)
        for alternate_key in getattr(element, "alternateKeys", ()):
            self._tableSpec.fields.add(alternate_key)
            self._tableSpec.unique.add(unique_prefix + (alternate_key.name,))
            self.standard.add(alternate_key)
            self.facts.add(alternate_key)

        # Remaining metadata fields.
        for metadata_field in element.metadata:
            self._tableSpec.fields.add(metadata_field)
            self.standard.add(metadata_field)
            self.facts.add(metadata_field)

        # Region and timespan contribute names here; their field specs are
        # only added in makeTableSpec.
        topology_names = []
        if element.spatial is not None:
            topology_names.append("region")
        if element.temporal is not None:
            topology_names.append(TimespanDatabaseRepresentation.NAME)
        self.names = (*self.standard.names, *topology_names)

    def makeTableSpec(
        self,
        TimespanReprClass: type[TimespanDatabaseRepresentation],
    ) -> ddl.TableSpec:
        """Construct a complete specification for a table.

        The table could hold the records of this element.

        Parameters
        ----------
        TimespanReprClass : `type` [ `TimespanDatabaseRepresentation` ]
            Class object that specifies how timespans are represented in the
            database.

        Returns
        -------
        spec : `ddl.TableSpec`
            Specification for a table.
        """
        # Without a region or timespan the specification built in __init__ is
        # already complete and is returned as-is.
        if self.element.temporal is None and self.element.spatial is None:
            return self._tableSpec

        # Otherwise start from a fresh field set so the region/timespan
        # fields are not added to the stored specification's fields; the
        # unique, index and foreign key collections are passed through.
        full_spec = ddl.TableSpec(
            fields=NamedValueSet(self._tableSpec.fields),
            unique=self._tableSpec.unique,
            indexes=self._tableSpec.indexes,
            foreignKeys=self._tableSpec.foreignKeys,
        )
        if self.element.spatial is not None:
            full_spec.fields.add(ddl.FieldSpec.for_region())
        if self.element.temporal is not None:
            full_spec.fields.update(TimespanReprClass.makeFieldSpecs(nullable=True))
        return full_spec

    def __str__(self) -> str:
        # NOTE(review): the leading whitespace inside the per-field literals
        # below is reproduced as seen; confirm it matches the intended indent.
        report = [f"{self.element.name}: "]
        for field in self.standard:
            report.append(f" {field.name}: {field.getPythonType().__name__}")
        if self.element.spatial is not None:
            report.append(" region: lsst.sphgeom.Region")
        if self.element.temporal is not None:
            report.append(" timespan: lsst.daf.butler.Timespan")
        return "\n".join(report)