Coverage for python/lsst/daf/butler/dimensions/_elements.py: 72%

188 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-03-30 09:59 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

# Names exported by ``from <this module> import *``: the three element
# classes defined in this module.
__all__ = (
    "Dimension",
    "DimensionCombination",
    "DimensionElement",
)

35 

36from abc import abstractmethod 

37from typing import TYPE_CHECKING, Annotated, Any, ClassVar, TypeAlias, Union, cast 

38 

39import pydantic 

40from lsst.utils.classes import cached_getter 

41from pydantic_core import core_schema 

42 

43from .. import arrow_utils, column_spec, ddl, pydantic_utils 

44from .._named import NamedValueAbstractSet, NamedValueSet 

45from .._topology import TopologicalRelationshipEndpoint 

46from ..json import from_json_generic, to_json_generic 

47 

48if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

49 from ..registry import Registry 

50 from ._governor import GovernorDimension 

51 from ._graph import DimensionGraph 

52 from ._group import DimensionGroup 

53 from ._records import DimensionRecord 

54 from ._schema import DimensionRecordSchema 

55 from ._universe import DimensionUniverse 

56 

# Column specification types accepted for dimension key columns (see the
# ``unique_keys``, ``primary_key`` and ``alternate_keys`` annotations in this
# module).  The union is tagged for pydantic with the ``type`` field as the
# discriminator between the member classes.
KeyColumnSpec: TypeAlias = Annotated[
    Union[
        column_spec.IntColumnSpec,
        column_spec.StringColumnSpec,
        column_spec.HashColumnSpec,
    ],
    pydantic.Field(discriminator="type"),
]

65 

# Column specification types accepted for metadata columns (see the
# ``metadata_columns`` annotation in this module).  This is the key-column
# union extended with float and bool columns; pydantic again uses the ``type``
# field as the discriminator between the member classes.
MetadataColumnSpec: TypeAlias = Annotated[
    Union[
        column_spec.IntColumnSpec,
        column_spec.StringColumnSpec,
        column_spec.FloatColumnSpec,
        column_spec.HashColumnSpec,
        column_spec.BoolColumnSpec,
    ],
    pydantic.Field(discriminator="type"),
]

76 

77 

class DimensionElement(TopologicalRelationshipEndpoint):
    """A label and/or metadata in the dimensions system.

    A named data-organization concept that defines a label and/or metadata
    in the dimensions system.

    A `DimensionElement` instance typically corresponds to a _logical_ table in
    the `Registry`: either an actual database table or a way of generating rows
    on-the-fly that can similarly participate in queries.  The rows in that
    table are represented by instances of a `DimensionRecord` subclass.  Most
    `DimensionElement` instances are instances of its `Dimension` subclass,
    which is used for elements that can be used as data ID keys.

    Notes
    -----
    `DimensionElement` instances should always be constructed by and retrieved
    from a `DimensionUniverse`.  They are immutable after they are fully
    constructed, and should never be copied.

    Pickling a `DimensionElement` just records its name and universe;
    unpickling one actually just looks up the element via the singleton
    dictionary of all universes.  This allows pickle to be used to transfer
    elements between processes, but only when each process initializes its own
    instance of the same `DimensionUniverse`.
    """

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self.name})"

    def __eq__(self, other: Any) -> bool:
        try:
            return self.name == other.name
        except AttributeError:
            # TODO: try removing this fallback; it's not really consistent with
            # base class intent, and it could be confusing
            return self.name == other

    def __hash__(self) -> int:
        # Hash on the name only, so the hash agrees with `__eq__` (including
        # its fallback comparison against plain name values).
        return hash(self.name)

    # TODO: try removing comparison operators; DimensionUniverse.sorted should
    # be adequate.
    #
    # Each operator orders elements by their index in ``self.universe`` and
    # returns `NotImplemented` when the index lookup raises `KeyError`.

    def __lt__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) < self.universe.getElementIndex(other.name)
        except KeyError:
            return NotImplemented

    def __le__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) <= self.universe.getElementIndex(other.name)
        except KeyError:
            return NotImplemented

    def __gt__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) > self.universe.getElementIndex(other.name)
        except KeyError:
            return NotImplemented

    def __ge__(self, other: DimensionElement) -> bool:
        try:
            return self.universe.getElementIndex(self.name) >= self.universe.getElementIndex(other.name)
        except KeyError:
            return NotImplemented

    @classmethod
    def _unpickle(cls, universe: DimensionUniverse, name: str) -> DimensionElement:
        """Callable used for unpickling.

        For internal use only.
        """
        return universe[name]

    def __reduce__(self) -> tuple:
        # Pickle as (universe, name); unpickling looks the element up again
        # instead of reconstructing it.
        return (self._unpickle, (self.universe, self.name))

    def __deepcopy__(self, memo: dict) -> DimensionElement:
        # DimensionElement is recursively immutable; see note in @immutable
        # decorator.
        return self

    def to_simple(self, minimal: bool = False) -> str:
        """Convert this class to a simple python type.

        This is suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization.  Has no effect for this class.

        Returns
        -------
        simple : `str`
            The object converted to a single string.
        """
        return self.name

    @classmethod
    def from_simple(
        cls, simple: str, universe: DimensionUniverse | None = None, registry: Registry | None = None
    ) -> DimensionElement:
        """Construct a new object from the simplified form.

        Usually the data is returned from the `to_simple` method.

        Parameters
        ----------
        simple : `str`
            The value returned by `to_simple()`.
        universe : `DimensionUniverse`, optional
            The special graph of all known dimensions.  Can be `None` if
            ``registry`` is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry from which a universe can be extracted.  Can be `None`
            if universe is provided explicitly.

        Returns
        -------
        element : `DimensionElement`
            The element named ``simple``, looked up in the universe.

        Raises
        ------
        ValueError
            Raised if neither ``universe`` nor ``registry`` is provided, or
            if no usable universe could be determined from them.
        """
        if universe is None:
            if registry is None:
                raise ValueError(
                    "One of universe or registry is required to convert a str to a DimensionElement"
                )
            universe = registry.dimensions
        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")

        return universe[simple]

    to_json = to_json_generic
    from_json: ClassVar = classmethod(from_json_generic)

    def hasTable(self) -> bool:
        """Indicate if this element is associated with a table.

        Return `True` if this element is associated with a table
        (even if that table "belongs" to another element).
        """
        return self.has_own_table or self.implied_union_target is not None

    universe: DimensionUniverse
    """The universe of all compatible dimensions with which this element is
    associated (`DimensionUniverse`).
    """

    @property
    @cached_getter
    def governor(self) -> GovernorDimension | None:
        """Return the governor dimension.

        This is the `GovernorDimension` that is a required dependency of this
        element, or `None` if there is no such dimension (`GovernorDimension`
        or `None`).

        Raises
        ------
        RuntimeError
            Raised if the element's minimal group has more than one governor.
        """
        governors = self.minimal_group.governors
        if len(governors) == 1:
            (result,) = governors
            return cast("GovernorDimension", self.universe[result])
        elif len(governors) > 1:
            raise RuntimeError(f"Dimension element {self.name} has multiple governors: {governors}.")
        else:
            return None

    @property
    @abstractmethod
    def required(self) -> NamedValueAbstractSet[Dimension]:
        """Return the required dimensions.

        Dimensions that are necessary to uniquely identify a record of this
        dimension element.

        For elements with a database representation, these dimensions are
        exactly those used to form the (possibly compound) primary key, and all
        dimensions here that are not ``self`` are also used to form foreign
        keys.

        For `Dimension` instances, this should be exactly the same as
        ``graph.required``, but that may not be true for `DimensionElement`
        instances in general.  When they differ, there are multiple
        combinations of dimensions that uniquely identify this element, but
        this one is more direct.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def implied(self) -> NamedValueAbstractSet[Dimension]:
        """Return the implied dimensions.

        Other dimensions that are uniquely identified directly by a record
        of this dimension element.

        For elements with a database representation, these are exactly the
        dimensions used to form foreign key constraints whose fields are not
        (wholly) also part of the primary key.

        Unlike ``self.graph.implied``, this set is not expanded recursively.
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def dimensions(self) -> NamedValueAbstractSet[Dimension]:
        """Return all dimensions.

        The union of `required` and `implied`, with all elements in
        `required` before any elements in `implied`.

        This differs from ``self.graph.dimensions`` both in order and in
        content:

        - as in ``self.implied``, implied dimensions are not expanded
          recursively here;
        - implied dimensions appear after required dimensions here, instead of
          being topologically ordered.

        As a result, this set is ordered consistently with
        ``self.RecordClass.fields``.
        """
        return NamedValueSet(list(self.required) + list(self.implied)).freeze()

    # Deprecated via a warning from its implementation.
    # TODO: remove on DM-41326.
    @property
    def graph(self) -> DimensionGraph:
        """Return minimal graph that includes this element (`DimensionGraph`).

        ``self.graph.required`` includes all dimensions whose primary key
        values are sufficient (often necessary) to uniquely identify ``self``
        (including ``self`` if ``isinstance(self, Dimension)``).
        ``self.graph.implied`` includes all dimensions also identified
        (possibly recursively) by this set.
        """
        return self.minimal_group._as_graph()

    @property
    @cached_getter
    def minimal_group(self) -> DimensionGroup:
        """Return minimal dimension group that includes this element.

        ``self.minimal_group.required`` includes all dimensions whose primary
        key values are sufficient (often necessary) to uniquely identify
        ``self`` (including ``self`` if ``isinstance(self, Dimension)``).
        ``self.minimal_group.implied`` includes all dimensions also identified
        (possibly recursively) by this set.
        """
        return self.universe.conform(self.dimensions.names)

    @property
    @cached_getter
    def RecordClass(self) -> type[DimensionRecord]:
        """Return the record subclass for this element.

        The `DimensionRecord` subclass used to hold records for this element
        (`type`).

        Because `DimensionRecord` subclasses are generated dynamically, this
        type cannot be imported directly and hence can only be obtained from
        this attribute.
        """
        # Imported here rather than at module scope; the module-level import
        # of ._records is only done under TYPE_CHECKING.
        from ._records import _subclassDimensionRecord

        return _subclassDimensionRecord(self)

    @property
    def alternate_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        """Additional unique key fields for this dimension element that are not
        the primary key (`NamedValueAbstractSet` of `KeyColumnSpec`).

        This is always empty for elements that are not dimensions.

        If this dimension has required dependencies, the keys of those
        dimensions are also included in the unique constraints defined for
        these alternate keys.
        """
        return NamedValueSet().freeze()

    @property
    @abstractmethod
    def metadata_columns(self) -> NamedValueAbstractSet[MetadataColumnSpec]:
        """Additional metadata fields included in this element's table.

        (`NamedValueSet` of `MetadataColumnSpec`).
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def metadata(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Additional metadata fields included in this element's table.

        (`NamedValueSet` of `FieldSpec`).
        """
        # The loop variable is deliberately not called ``column_spec`` so that
        # it does not shadow the module of that name imported by this file.
        return NamedValueSet([spec.to_sql_spec() for spec in self.metadata_columns]).freeze()

    @property
    def viewOf(self) -> str | None:
        """Name of another table this element's records are drawn from.

        (`str` or `None`).
        """
        return self.implied_union_target.name if self.implied_union_target is not None else None

    @property
    def alwaysJoin(self) -> bool:
        """Indicate if the element should always be included.

        If `True`, always include this element in any query or data ID in
        which its ``required`` dimensions appear, because it defines a
        relationship between those dimensions that must always be satisfied.
        """
        return False

    @property
    def has_own_table(self) -> bool:
        """Whether this element should have its own table in the database."""
        return self.implied_union_target is None

    @property
    def implied_union_target(self) -> DimensionElement | None:
        """If not `None`, another element whose implied values for this element
        form the set of allowable values.

        For example, in the default dimension universe, the allowed values for
        ``band`` is the union of all ``band`` values in the ``physical_filter``
        table, so the `implied_union_target` for ``band`` is
        ``physical_filter``.
        """
        return None

    @property
    def defines_relationships(self) -> bool:
        """Whether this element's records define one or more relationships that
        must be satisfied in rows over dimensions that include it.
        """
        return bool(self.implied)

    @property
    def is_cached(self) -> bool:
        """Whether this element's records should be aggressively cached,
        because they are small in number and rarely inserted.
        """
        return False

    @property
    @abstractmethod
    def populated_by(self) -> Dimension | None:
        """The dimension that this element's records are always inserted,
        exported, and imported alongside.

        Notes
        -----
        When this is `None` (as it will be, at least at first, for any data
        repositories created before this attribute was added), records for
        this element will often need to be exported manually when datasets
        associated with some other related dimension are exported, in order for
        the post-import data repository to function as expected.
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def schema(self) -> DimensionRecordSchema:
        """A description of the columns in this element's records and (at least
        conceptual) table.
        """
        # Imported here rather than at module scope; the module-level import
        # of ._schema is only done under TYPE_CHECKING.
        from ._schema import DimensionRecordSchema

        return DimensionRecordSchema(self)

    @property
    @abstractmethod
    def documentation(self) -> str:
        """Extended description of this dimension element."""
        raise NotImplementedError()

    @classmethod
    def _validate(cls, data: Any, info: pydantic.ValidationInfo) -> DimensionElement:
        """Pydantic validator (deserializer) for `DimensionElement`.

        This satisfies the `pydantic.WithInfoPlainValidatorFunction` signature.
        """
        universe = pydantic_utils.get_universe_from_context(info.context)
        return universe[data]

    def _serialize(self) -> str:
        """Pydantic serializer for `DimensionElement`.

        This satisfies the `pydantic.PlainSerializerFunction` signature.
        """
        return self.name

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source_type: Any, handler: pydantic.GetCoreSchemaHandler
    ) -> core_schema.CoreSchema:
        # This is the Pydantic hook for overriding serialization, validation,
        # and JSON schema generation.
        str_schema = core_schema.str_schema()
        from_str_schema = core_schema.chain_schema(
            [str_schema, core_schema.with_info_plain_validator_function(cls._validate)]
        )
        return core_schema.json_or_python_schema(
            # When deserializing from JSON, expect it to be a `str`
            json_schema=from_str_schema,
            # When deserializing from Python, first see if it's already a
            # DimensionElement and then try conversion from `str`.
            python_schema=core_schema.union_schema(
                [core_schema.is_instance_schema(DimensionElement), from_str_schema]
            ),
            # When serializing convert it to a `str`.
            serialization=core_schema.plain_serializer_function_ser_schema(
                cls._serialize, return_schema=str_schema
            ),
        )

501 

502 

class Dimension(DimensionElement):
    """A dimension.

    A named data-organization concept that can be used as a key in a data
    ID.
    """

    @property
    @abstractmethod
    def unique_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        """Descriptions of unique identifiers for this dimension.

        All fields that can individually be used to identify records of this
        element, given the primary keys of all required dependencies
        (`NamedValueAbstractSet` of `KeyColumnSpec`).
        """
        raise NotImplementedError()

    @property
    @cached_getter
    def primary_key(self) -> KeyColumnSpec:
        """The primary key field for this dimension (`KeyColumnSpec`).

        Note that the database primary keys for dimension tables are in general
        compound; this field is the only field in the database primary key that
        is not also a foreign key (to a required dependency dimension table).
        """
        # The first entry of `unique_keys` is the primary key.
        primary_key, *_ = self.unique_keys
        return primary_key

    @property
    @cached_getter
    def alternate_keys(self) -> NamedValueAbstractSet[KeyColumnSpec]:
        # Docstring inherited.
        # Everything in `unique_keys` after the first entry is an alternate
        # key.
        _, *alternate_keys = self.unique_keys
        return NamedValueSet(alternate_keys).freeze()

    @property
    @cached_getter
    def uniqueKeys(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Return the unique fields.

        All fields that can individually be used to identify records of this
        element, given the primary keys of all required dependencies
        (`NamedValueAbstractSet` of `FieldSpec`).
        """
        # Only the first unique key is flagged as the primary key.  The loop
        # variable is not called ``column_spec`` so that it does not shadow
        # the module of that name imported by this file.  The result is
        # frozen, like the other set-valued properties here, because the
        # getter is cached and the same object is handed to every caller.
        return NamedValueSet(
            [spec.to_sql_spec(primaryKey=(n == 0)) for n, spec in enumerate(self.unique_keys)]
        ).freeze()

    @property
    @cached_getter
    def primaryKey(self) -> ddl.FieldSpec:
        """Return primary key field for this dimension (`FieldSpec`).

        Note that the database primary keys for dimension tables are in general
        compound; this field is the only field in the database primary key that
        is not also a foreign key (to a required dependency dimension table).
        """
        primaryKey, *_ = self.uniqueKeys
        return primaryKey

    @property
    @cached_getter
    def alternateKeys(self) -> NamedValueAbstractSet[ddl.FieldSpec]:
        """Return alternate keys.

        Additional unique key fields for this dimension that are not the
        primary key (`NamedValueAbstractSet` of `FieldSpec`).

        If this dimension has required dependencies, the keys of those
        dimensions are also included in the unique constraints defined for
        these alternate keys.
        """
        _, *alternateKeys = self.uniqueKeys
        return NamedValueSet(alternateKeys).freeze()

    @property
    def populated_by(self) -> Dimension:
        # Docstring inherited.
        return self

    def to_arrow(self, dimensions: DimensionGroup, spec: KeyColumnSpec | None = None) -> arrow_utils.ToArrow:
        """Return an object that converts the primary key value for this
        dimension to a column in an Arrow table.

        Parameters
        ----------
        dimensions : `DimensionGroup`
            Full set of dimensions over which the rows of the table are unique
            or close to unique.  This is used to determine whether to use
            Arrow's dictionary encoding to compress duplicate values.
        spec : `KeyColumnSpec`, optional
            Column specification for this dimension.  If not provided, a copy
            of `primary_key` with the field name replaced with the dimension
            name will be used, which is appropriate for when this dimension
            appears in data ID or the dimension record tables of other
            dimension elements.

        Returns
        -------
        converter : `arrow_utils.ToArrow`
            Converter for this dimension's primary key.
        """
        if spec is None:
            spec = self.primary_key.model_copy(update={"name": self.name})
        if dimensions != self.minimal_group and spec.type != "int":
            # Values are large and will be duplicated in rows that are unique
            # over these dimensions, so dictionary encoding may help a lot.
            return spec.to_arrow().dictionary_encoded()
        else:
            return spec.to_arrow()

615 

616 

class DimensionCombination(DimensionElement):
    """A dimension element describing a combination of dimensions.

    This `DimensionElement` subclass carries additional metadata and/or
    relationship endpoint information that applies to a combination of
    dimensions.
    """