Coverage for python/lsst/daf/butler/dimensions/_elements.py: 72%

176 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-02-01 11:20 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ( 

31 "Dimension", 

32 "DimensionCombination", 

33 "DimensionElement", 

34) 

35 

36from abc import abstractmethod 

37from typing import TYPE_CHECKING, Annotated, Any, ClassVar, TypeAlias, Union, cast 

38 

39import pydantic 

40from lsst.utils.classes import cached_getter 

41 

42from .. import arrow_utils, column_spec, ddl 

43from .._named import NamedValueAbstractSet, NamedValueSet 

44from .._topology import TopologicalRelationshipEndpoint 

45from ..json import from_json_generic, to_json_generic 

46 

47if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

48 from ..registry import Registry 

49 from ._governor import GovernorDimension 

50 from ._graph import DimensionGraph 

51 from ._group import DimensionGroup 

52 from ._records import DimensionRecord 

53 from ._schema import DimensionRecordSchema 

54 from ._universe import DimensionUniverse 

55 

# Column specifications allowed for dimension key columns.  The union is
# discriminated on each member's ``type`` field when handled by pydantic.
KeyColumnSpec: TypeAlias = Annotated[
    Union[column_spec.IntColumnSpec, column_spec.StringColumnSpec, column_spec.HashColumnSpec],
    pydantic.Field(discriminator="type"),
]

64 

# Column specifications allowed for dimension metadata columns: everything
# permitted for key columns plus float and bool columns.  The union is
# discriminated on each member's ``type`` field when handled by pydantic.
MetadataColumnSpec: TypeAlias = Annotated[
    Union[
        column_spec.IntColumnSpec,
        column_spec.StringColumnSpec,
        column_spec.FloatColumnSpec,
        column_spec.HashColumnSpec,
        column_spec.BoolColumnSpec,
    ],
    pydantic.Field(discriminator="type"),
]

75 

76 

class DimensionElement(TopologicalRelationshipEndpoint):
    """A label and/or metadata in the dimensions system.

    A named data-organization concept that defines a label and/or metadata
    in the dimensions system.

    A `DimensionElement` instance typically corresponds to a _logical_ table in
    the `Registry`: either an actual database table or a way of generating rows
    on-the-fly that can similarly participate in queries.  The rows in that
    table are represented by instances of a `DimensionRecord` subclass.  Most
    `DimensionElement` instances are instances of its `Dimension` subclass,
    which is used for elements that can be used as data ID keys.

    Notes
    -----
    `DimensionElement` instances should always be constructed by and retrieved
    from a `DimensionUniverse`.  They are immutable after they are fully
    constructed, and should never be copied.

    Pickling a `DimensionElement` just records its name and universe;
    unpickling one actually just looks up the element via the singleton
    dictionary of all universes.  This allows pickle to be used to transfer
    elements between processes, but only when each process initializes its own
    instance of the same `DimensionUniverse`.
    """

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.name})"

    def __eq__(self, other: Any) -> bool:
        try:
            return self.name == other.name
        except AttributeError:
            # TODO: try removing this fallback; it's not really consistent with
            # base class intent, and it could be confusing
            return self.name == other

    def __hash__(self) -> int:
        # Hash on the name only, consistent with __eq__.
        return hash(self.name)

    # TODO: try removing comparison operators; DimensionUniverse.sorted should
    # be adequate.

    def _sort_indices(self, other: Any) -> tuple[Any, Any] | None:
        """Return the universe indices used to order ``self`` and ``other``.

        Parameters
        ----------
        other : `object`
            Object being compared with ``self``.

        Returns
        -------
        indices : `tuple` or `None`
            ``(index of self, index of other)`` as reported by
            ``self.universe.getElementIndex``, or `None` if ``other`` has no
            ``name`` attribute or the universe raises `KeyError` for either
            name.
        """
        try:
            other_name = other.name
        except AttributeError:
            # Not element-like at all; the caller returns NotImplemented so
            # Python can try the reflected operation or raise TypeError.
            return None
        get_index = self.universe.getElementIndex
        try:
            return get_index(self.name), get_index(other_name)
        except KeyError:
            return None

    def __lt__(self, other: DimensionElement) -> bool:
        indices = self._sort_indices(other)
        if indices is None:
            return NotImplemented
        return indices[0] < indices[1]

    def __le__(self, other: DimensionElement) -> bool:
        indices = self._sort_indices(other)
        if indices is None:
            return NotImplemented
        return indices[0] <= indices[1]

    def __gt__(self, other: DimensionElement) -> bool:
        indices = self._sort_indices(other)
        if indices is None:
            return NotImplemented
        return indices[0] > indices[1]

    def __ge__(self, other: DimensionElement) -> bool:
        indices = self._sort_indices(other)
        if indices is None:
            return NotImplemented
        return indices[0] >= indices[1]

    @classmethod
    def _unpickle(cls, universe: DimensionUniverse, name: str) -> DimensionElement:
        """Callable used for unpickling.

        For internal use only.
        """
        return universe[name]

    def __reduce__(self) -> tuple:
        # Only the universe and the name are pickled; unpickling looks the
        # element up again instead of reconstructing it.
        return (self._unpickle, (self.universe, self.name))

    def __deepcopy__(self, memo: dict) -> DimensionElement:
        # DimensionElement is recursively immutable; see note in @immutable
        # decorator.
        return self

    def to_simple(self, minimal: bool = False) -> str:
        """Convert this class to a simple python type.

        This is suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Has no effect for this class.

        Returns
        -------
        simple : `str`
            The object converted to a single string.
        """
        return self.name

    @classmethod
    def from_simple(
        cls, simple: str, universe: DimensionUniverse | None = None, registry: Registry | None = None
    ) -> DimensionElement:
        """Construct a new object from the simplified form.

        Usually the data is returned from the `to_simple` method.

        Parameters
        ----------
        simple : `str`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        registry : `lsst.daf.butler.Registry`, optional
            Registry from which a universe can be extracted.  Can be `None`
            if universe is provided explicitly.

        Returns
        -------
        element : `DimensionElement`
            The element named ``simple``, looked up in the universe.

        Raises
        ------
        ValueError
            Raised if neither ``universe`` nor ``registry`` is provided, or
            if no universe can be obtained from ``registry``.
        """
        if universe is None:
            if registry is None:
                raise ValueError(
                    "One of universe or registry is required to convert a str to a DimensionElement"
                )
            universe = registry.dimensions
            if universe is None:
                # this is for mypy
                raise ValueError("Unable to determine a usable universe")

        return universe[simple]

    to_json = to_json_generic
    from_json: ClassVar = classmethod(from_json_generic)

    def hasTable(self) -> bool:
        """Indicate if this element is associated with a table.

        Return `True` if this element is associated with a table
        (even if that table "belongs" to another element).
        """
        return self.has_own_table or self.implied_union_target is not None

    universe: DimensionUniverse
    """The universe of all compatible dimensions with which this element is
    associated (`DimensionUniverse`).
    """

    @property
    @cached_getter
    def governor(self) -> GovernorDimension | None:
        """Return the governor dimension.

        This is the `GovernorDimension` that is a required dependency of this
        element, or `None` if there is no such dimension (`GovernorDimension`
        or `None`).

        Raises
        ------
        RuntimeError
            Raised if the element's minimal group has more than one governor.
        """
        governors = self.minimal_group.governors
        n_governors = len(governors)
        if n_governors == 1:
            (result,) = governors
            return cast("GovernorDimension", self.universe[result])
        elif n_governors > 1:
            raise RuntimeError(f"Dimension element {self.name} has multiple governors: {governors}.")
        else:
            return None

    @property
    @abstractmethod
    def required(self) -> NamedValueAbstractSet[Dimension]:
        """Return the required dimensions.

        Dimensions that are necessary to uniquely identify a record of this
        dimension element.

        For elements with a database representation, these dimensions are
        exactly those used to form the (possibly compound) primary key, and all
        dimensions here that are not ``self`` are also used to form foreign
        keys.

        For `Dimension` instances, this should be exactly the same as
        ``graph.required``, but that may not be true for `DimensionElement`
        instances in general.  When they differ, there are multiple
        combinations of dimensions that uniquely identify this element, but
        this one is more direct.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def implied(self) -> NamedValueAbstractSet[Dimension]:
        """Return the implied dimensions.

        Other dimensions that are uniquely identified directly by a record
        of this dimension element.

        For elements with a database representation, these are exactly the
        dimensions used to form foreign key constraints whose fields are not
        (wholly) also part of the primary key.

        Unlike ``self.graph.implied``, this set is not expanded recursively.
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def dimensions(self) -> NamedValueAbstractSet[Dimension]:
        """Return all dimensions.

        The union of `required` and `implied`, with all elements in
        `required` before any elements in `implied`.

        This differs from ``self.graph.dimensions`` both in order and in
        content:

        - as in ``self.implied``, implied dimensions are not expanded
          recursively here;
        - implied dimensions appear after required dimensions here, instead of
          being topologically ordered.

        As a result, this set is ordered consistently with
        ``self.RecordClass.fields``.
        """
        return NamedValueSet(list(self.required) + list(self.implied)).freeze()

    # Deprecated via a warning from its implementation.
    # TODO: remove on DM-41326.
    @property
    def graph(self) -> DimensionGraph:
        """Return minimal graph that includes this element (`DimensionGraph`).

        ``self.graph.required`` includes all dimensions whose primary key
        values are sufficient (often necessary) to uniquely identify ``self``
        (including ``self`` if ``isinstance(self, Dimension)``).
        ``self.graph.implied`` includes all dimensions also identified
        (possibly recursively) by this set.
        """
        return self.minimal_group._as_graph()

    @property
    @cached_getter
    def minimal_group(self) -> DimensionGroup:
        """Return minimal dimension group that includes this element.

        ``self.minimal_group.required`` includes all dimensions whose primary
        key values are sufficient (often necessary) to uniquely identify
        ``self`` (including ``self`` if ``isinstance(self, Dimension)``).
        ``self.minimal_group.implied`` includes all dimensions also identified
        (possibly recursively) by this set.
        """
        return self.universe.conform(self.dimensions.names)

    @property
    @cached_getter
    def RecordClass(self) -> type[DimensionRecord]:
        """Return the record subclass for this element.

        The `DimensionRecord` subclass used to hold records for this element
        (`type`).

        Because `DimensionRecord` subclasses are generated dynamically, this
        type cannot be imported directly and hence can only be obtained from
        this attribute.
        """
        # Imported here rather than at module scope, like the other
        # TYPE_CHECKING-only imports that may be circular.
        from ._records import _subclassDimensionRecord

        return _subclassDimensionRecord(self)

    @property
    def alternate_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        """Additional unique key fields for this dimension element that are not
        the primary key (`NamedValueAbstractSet` of `KeyColumnSpec`).

        This is always empty for elements that are not dimensions.

        If this dimension has required dependencies, the keys of those
        dimensions are also included in the unique constraints defined for
        these alternate keys.
        """
        return NamedValueSet().freeze()

    @property
    @abstractmethod
    def metadata_columns(self) -> NamedValueAbstractSet[MetadataColumnSpec]:
        """Additional metadata fields included in this element's table.

        (`NamedValueSet` of `MetadataColumnSpec`).
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def metadata(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Additional metadata fields included in this element's table.

        (`NamedValueSet` of `FieldSpec`).
        """
        # The loop variable is deliberately not called ``column_spec`` so it
        # does not shadow the imported module of that name.
        return NamedValueSet([spec.to_sql_spec() for spec in self.metadata_columns]).freeze()

    @property
    def viewOf(self) -> str | None:
        """Name of another table this element's records are drawn from.

        (`str` or `None`).
        """
        return self.implied_union_target.name if self.implied_union_target is not None else None

    @property
    def alwaysJoin(self) -> bool:
        """Indicate if the element should always be included.

        If `True`, always include this element in any query or data ID in
        which its ``required`` dimensions appear, because it defines a
        relationship between those dimensions that must always be satisfied.
        """
        return False

    @property
    def has_own_table(self) -> bool:
        """Whether this element should have its own table in the database."""
        return self.implied_union_target is None

    @property
    def implied_union_target(self) -> DimensionElement | None:
        """If not `None`, another element whose implied values for this element
        form the set of allowable values.

        For example, in the default dimension universe, the allowed values for
        ``band`` is the union of all ``band`` values in the ``physical_filter``
        table, so the `implied_union_target` for ``band`` is
        ``physical_filter``.
        """
        return None

    @property
    def defines_relationships(self) -> bool:
        """Whether this element's records define one or more relationships that
        must be satisfied in rows over dimensions that include it.
        """
        return bool(self.implied)

    @property
    def is_cached(self) -> bool:
        """Whether this element's records should be aggressively cached,
        because they are small in number and rarely inserted.
        """
        return False

    @property
    @abstractmethod
    def populated_by(self) -> Dimension | None:
        """The dimension that this element's records are always inserted,
        exported, and imported alongside.

        Notes
        -----
        When this is `None` (as it will be, at least at first, for any data
        repositories created before this attribute was added), records for
        this element will often need to be exported manually when datasets
        associated with some other related dimension are exported, in order for
        the post-import data repository to function as expected.
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def schema(self) -> DimensionRecordSchema:
        """A description of the columns in this element's records and (at least
        conceptual) table.
        """
        # Imported here rather than at module scope, like the other
        # TYPE_CHECKING-only imports that may be circular.
        from ._schema import DimensionRecordSchema

        return DimensionRecordSchema(self)

    @property
    @abstractmethod
    def documentation(self) -> str:
        """Extended description of this dimension element."""
        raise NotImplementedError()

460 

461 

class Dimension(DimensionElement):
    """A dimension.

    A named data-organization concept that can be used as a key in a data
    ID.
    """

    @property
    @abstractmethod
    def unique_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        """Descriptions of unique identifiers for this dimension.

        All fields that can individually be used to identify records of this
        element, given the primary keys of all required dependencies
        (`NamedValueAbstractSet` of `KeyColumnSpec`).
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def primary_key(self) -> KeyColumnSpec:
        """The primary key field for this dimension (`KeyColumnSpec`).

        Note that the database primary keys for dimension tables are in general
        compound; this field is the only field in the database primary key that
        is not also a foreign key (to a required dependency dimension table).
        """
        # The first unique key is the primary key; the rest are alternates.
        primary_key, *_ = self.unique_keys
        return primary_key

    @property
    @cached_getter
    def alternate_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        # Docstring inherited.
        _, *alternate_keys = self.unique_keys
        return NamedValueSet(alternate_keys).freeze()

    @property
    @cached_getter
    def uniqueKeys(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Return the unique fields.

        All fields that can individually be used to identify records of this
        element, given the primary keys of all required dependencies
        (`NamedValueAbstractSet` of `FieldSpec`).
        """
        # Only the first unique key is flagged as the primary key.  The result
        # is frozen, like the other cached sets on this class, so callers
        # cannot mutate the cached value.
        return NamedValueSet(
            [spec.to_sql_spec(primaryKey=(n == 0)) for n, spec in enumerate(self.unique_keys)]
        ).freeze()

    @property
    @cached_getter
    def primaryKey(self) -> ddl.FieldSpec:
        """Return primary key field for this dimension (`FieldSpec`).

        Note that the database primary keys for dimension tables are in general
        compound; this field is the only field in the database primary key that
        is not also a foreign key (to a required dependency dimension table).
        """
        primaryKey, *_ = self.uniqueKeys
        return primaryKey

    @property
    @cached_getter
    def alternateKeys(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Return alternate keys.

        Additional unique key fields for this dimension that are not the
        primary key (`NamedValueAbstractSet` of `FieldSpec`).

        If this dimension has required dependencies, the keys of those
        dimensions are also included in the unique constraints defined for
        these alternate keys.
        """
        _, *alternateKeys = self.uniqueKeys
        return NamedValueSet(alternateKeys).freeze()

    @property
    def populated_by(self) -> Dimension:
        # Docstring inherited.
        return self

    def to_arrow(self, dimensions: DimensionGroup, spec: KeyColumnSpec | None = None) -> arrow_utils.ToArrow:
        """Return an object that converts the primary key value for this
        dimension to a column in an Arrow table.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Full set of dimensions over which the rows of the table are unique
            or close to unique.  This is used to determine whether to use
            Arrow's dictionary encoding to compress duplicate values.
        spec : `KeyColumnSpec`, optional
            Column specification for this dimension.  If not provided, a copy
            of `primary_key` with the field name replaced with the dimension
            name will be used, which is appropriate for when this dimension
            appears in data ID or the dimension record tables of other
            dimension elements.

        Returns
        -------
        converter : `arrow_utils.ToArrow`
            Converter for this dimension's primary key.
        """
        if spec is None:
            spec = self.primary_key.model_copy(update={"name": self.name})
        if dimensions != self.minimal_group and spec.type != "int":
            # Values are large and will be duplicated in rows that are unique
            # over these dimensions, so dictionary encoding may help a lot.
            return spec.to_arrow().dictionary_encoded()
        else:
            return spec.to_arrow()

574 

575 

class DimensionCombination(DimensionElement):
    """Element with extra information about a combination of dimensions.

    This `DimensionElement` subclass supplies extra metadata and/or
    relationship endpoint information for a combination of dimensions.
    """