# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DimensionRecord", "SerializedDimensionRecord")

from collections.abc import Hashable
from typing import TYPE_CHECKING, Any, ClassVar

import lsst.sphgeom
from lsst.utils.classes import immutable
from pydantic import BaseModel, Field, StrictBool, StrictFloat, StrictInt, StrictStr, create_model

from .._timespan import Timespan
from ..json import from_json_pydantic, to_json_pydantic
from ..persistence_context import PersistenceContextVars
from ._elements import Dimension, DimensionElement

if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.
    from ..registry import Registry
    from ._coordinate import DataCoordinate
    from ._schema import DimensionElementFields
    from ._universe import DimensionUniverse


def _reconstructDimensionRecord(definition: DimensionElement, mapping: dict[str, Any]) -> DimensionRecord:
    """Unpickle implementation for `DimensionRecord` subclasses.

    For internal use by `DimensionRecord`.
    """
    return definition.RecordClass(**mapping)


def _subclassDimensionRecord(definition: DimensionElement) -> type[DimensionRecord]:
    """Create a dynamic subclass of `DimensionRecord` for the given element.

    For internal use by `DimensionRecord`.
    """
    from ._schema import DimensionElementFields

    fields = DimensionElementFields(definition)
    slots = list(fields.standard.names)
    if definition.spatial:
        slots.append("region")
    if definition.temporal:
        slots.append("timespan")
    d = {"definition": definition, "__slots__": tuple(slots), "fields": fields}
    return type(definition.name + ".RecordClass", (DimensionRecord,), d)
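
# Illustrative sketch, not part of the original module: the factory above
# produces a per-element record class whose slots mirror the element's
# schema.  "visit" is an assumed spatial+temporal element name:
#
#     cls = _subclassDimensionRecord(universe["visit"])
#     cls.__name__       # -> "visit.RecordClass"
#     cls.__slots__      # -> standard field names plus "region", "timespan"
#     issubclass(cls, DimensionRecord)  # -> True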


class SpecificSerializedDimensionRecord(BaseModel, extra="forbid"):
    """Base model for a specific serialized record content."""


_SIMPLE_RECORD_CLASS_CACHE: dict[
    tuple[DimensionElement, DimensionUniverse], type[SpecificSerializedDimensionRecord]
] = {}


def _createSimpleRecordSubclass(definition: DimensionElement) -> type[SpecificSerializedDimensionRecord]:
    from ._schema import DimensionElementFields

    # Cache on the definition (which hashes as the name) and the
    # associated universe.
    cache_key = (definition, definition.universe)
    if cache_key in _SIMPLE_RECORD_CLASS_CACHE:
        return _SIMPLE_RECORD_CLASS_CACHE[cache_key]

    fields = DimensionElementFields(definition)
    members = {}
    # Prefer strict typing for external data.
    type_map = {
        str: StrictStr,
        float: StrictFloat,
        bool: StrictBool,
        int: StrictInt,
    }

    for field in fields.standard:
        field_type = field.getPythonType()
        field_type = type_map.get(field_type, field_type)
        if field.nullable:
            field_type = field_type | None  # type: ignore
        members[field.name] = (field_type, ...)
    if definition.temporal:
        members["timespan"] = (Timespan | None, ...)  # type: ignore
    if definition.spatial:
        members["region"] = (str | None, ...)  # type: ignore

    # For the new derived class name we need to convert to camel case,
    # so "day_obs" -> "DayObs".
    derived_name = "".join([part.capitalize() for part in definition.name.split("_")])

    model = create_model(
        f"SpecificSerializedDimensionRecord{derived_name}",
        __base__=SpecificSerializedDimensionRecord,
        **members,  # type: ignore
    )

    _SIMPLE_RECORD_CLASS_CACHE[cache_key] = model
    return model
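
# Illustrative sketch, not part of the original module: the strict pydantic
# types validate external data rather than coercing it.  The "detector"
# fields here follow the schema example below:
#
#     model_cls = _createSimpleRecordSubclass(universe["detector"])
#     model_cls(instrument="HSC", id=72, ...)    # validates
#     model_cls(instrument="HSC", id="72", ...)  # raises ValidationError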


# While supporting pydantic v1 and v2 keep this outside the model.
_serialized_dimension_record_schema_extra = {
    "examples": [
        {
            "definition": "detector",
            "record": {
                "instrument": "HSC",
                "id": 72,
                "full_name": "0_01",
                "name_in_raft": "01",
                "raft": "0",
                "purpose": "SCIENCE",
            },
        }
    ]
}


class SerializedDimensionRecord(BaseModel):
    """Simplified model for serializing a `DimensionRecord`."""

    definition: str = Field(
        ...,
        title="Name of dimension associated with this record.",
        examples=["exposure"],
    )

    # Use strict types to prevent casting.
    record: dict[str, None | StrictBool | StrictInt | StrictFloat | StrictStr | Timespan] = Field(
        ...,
        title="Dimension record keys and values.",
        examples=[
            {
                "definition": "exposure",
                "record": {
                    "instrument": "LATISS",
                    "exposure": 2021050300044,
                    "obs_id": "AT_O_20210503_00044",
                },
            }
        ],
    )

    model_config = {
        "json_schema_extra": _serialized_dimension_record_schema_extra,  # type: ignore[typeddict-item]
    }
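
    # Validation sketch (illustrative payload): the strict value types mean
    # a string where an int is expected is rejected rather than coerced:
    #
    #     SerializedDimensionRecord.model_validate_json(
    #         '{"definition": "detector", "record": {"id": 72}}'
    #     )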

    @classmethod
    def direct(
        cls,
        *,
        definition: str,
        record: dict[str, Any],
    ) -> SerializedDimensionRecord:
        """Construct a `SerializedDimensionRecord` directly without validators.

        Parameters
        ----------
        definition : `str`
            The name of the record.
        record : `dict`
            A dictionary representation of the record content.

        Returns
        -------
        rec : `SerializedDimensionRecord`
            A model representing the dimension record.

        Notes
        -----
        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        # This method requires tuples as values of the mapping, but JSON
        # readers will read things in as lists. Be kind and transparently
        # transform to tuples.
        _recItems = {
            k: (
                v if type(v) is not list else Timespan(begin=None, end=None, _nsec=tuple(v))  # noqa: E721
            )  # type: ignore
            for k, v in record.items()
        }

        # Type ignore because the ternary statement seems to confuse mypy
        # based on conflicting inferred types of v.
        key = (
            definition,
            frozenset(_recItems.items()),
        )
        cache = PersistenceContextVars.serializedDimensionRecordMapping.get()
        if cache is not None and (result := cache.get(key)) is not None:
            return result

        node = cls.model_construct(definition=definition, record=_recItems)  # type: ignore

        if cache is not None:
            cache[key] = node
        return node
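
    # Usage sketch (illustrative values): `direct` skips validation and so
    # is only for trusted inputs; list values are reassembled as Timespans:
    #
    #     rec = SerializedDimensionRecord.direct(
    #         definition="exposure",
    #         record={"instrument": "LATISS", "exposure": 2021050300044},
    #     )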


@immutable
class DimensionRecord:
    """Base class for the Python representation of database records.

    Parameters
    ----------
    **kwargs
        Field values for this record. Unrecognized keys result in a
        `TypeError`. If this is the record for a `Dimension`, its primary
        key value may be provided with the actual name of the field (e.g.
        "id" or "name"), the name of the `Dimension`, or both. If this
        record class has a "timespan" attribute, "datetime_begin" and
        "datetime_end" keyword arguments may be provided instead of a single
        "timespan" keyword argument (but are ignored if a "timespan"
        argument is provided).

    Notes
    -----
    `DimensionRecord` subclasses are created dynamically for each
    `DimensionElement` in a `DimensionUniverse`, and are accessible via the
    `DimensionElement.RecordClass` attribute. The `DimensionRecord` base
    class itself is purely abstract, but does not use the `abc` module to
    indicate this because it does not have overridable methods.

    Record classes have attributes that correspond exactly to the
    `~DimensionElementFields.standard` fields in the related database table,
    plus "region" and "timespan" attributes for spatial and/or temporal
    elements (respectively).

    Instances are usually obtained from a `Registry`, but can be constructed
    directly from Python as well.

    `DimensionRecord` instances are immutable.
    """

    # Derived classes are required to define __slots__ as well, and it's
    # those derived-class slots that other methods on the base class expect
    # to see when they access self.__slots__.
    __slots__ = ("dataId",)

    _serializedType: ClassVar[type[BaseModel]] = SerializedDimensionRecord

    def __init__(self, **kwargs: Any):
        # Accept either the dimension name or the actual name of its primary
        # key field; ensure both are present in the dict for convenience
        # below.
        if isinstance(self.definition, Dimension):
            v = kwargs.get(self.definition.primaryKey.name)
            if v is None:
                v = kwargs.get(self.definition.name)
                if v is None:
                    raise ValueError(
                        f"No value provided for {self.definition.name}.{self.definition.primaryKey.name}."
                    )
                kwargs[self.definition.primaryKey.name] = v
            else:
                v2 = kwargs.setdefault(self.definition.name, v)
                if v != v2:
                    raise ValueError(
                        "Multiple inconsistent values for "
                        f"{self.definition.name}.{self.definition.primaryKey.name}: {v!r} != {v2!r}."
                    )

        from ._coordinate import DataCoordinate

        object.__setattr__(
            self,
            "dataId",
            DataCoordinate.from_required_values(
                self.definition.minimal_group,
                tuple(kwargs[dimension] for dimension in self.definition.required.names),
            ),
        )
        # Don't need the primary key value aliased to the dimension name
        # anymore.
        kwargs.pop(self.definition.name, None)

        for name in self.__slots__:
            # Note that we remove from kwargs as we go, to make sure there's
            # nothing left at the end.
            object.__setattr__(self, name, kwargs.pop(name, None))
        # Support 'datetime_begin' and 'datetime_end' instead of 'timespan'
        # for backwards compatibility, but if one is provided both must be.
        if self.definition.temporal is not None and self.timespan is None and "datetime_begin" in kwargs:
            object.__setattr__(
                self,
                "timespan",
                Timespan(
                    kwargs.pop("datetime_begin"),
                    kwargs.pop("datetime_end"),
                ),
            )

        if kwargs:
            raise TypeError(f"Invalid fields for {self.definition} dimension record: {set(kwargs.keys())}.")
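
    # Construction sketch (illustrative element and values): the primary key
    # may be given under its field name, the dimension name, or both:
    #
    #     cls = universe["detector"].RecordClass
    #     cls(instrument="HSC", id=72, full_name="0_01")
    #     cls(instrument="HSC", detector=72, full_name="0_01")  # equivalent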

    def __eq__(self, other: Any) -> bool:
        if type(other) is not type(self):
            return False
        return self.dataId == other.dataId

    def __hash__(self) -> int:
        return hash(self.dataId.required_values)

    def __str__(self) -> str:
        lines = [f"{self.definition.name}:"]
        lines.extend(f"  {name}: {getattr(self, name)!r}" for name in self.__slots__)
        return "\n".join(lines)

    def __repr__(self) -> str:
        return "{}.RecordClass({})".format(
            self.definition.name, ", ".join(f"{name}={getattr(self, name)!r}" for name in self.__slots__)
        )

    def __reduce__(self) -> tuple:
        mapping = {name: getattr(self, name) for name in self.__slots__}
        return (_reconstructDimensionRecord, (self.definition, mapping))
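
    # Pickling sketch: __reduce__ returns the element definition plus a
    # plain field mapping, so the dynamically-created subclass survives a
    # round trip without being importable by name:
    #
    #     import pickle
    #     assert pickle.loads(pickle.dumps(record)) == record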

    def _repr_html_(self) -> str:
        """Override the default representation in IPython/Jupyter notebooks.

        This gives a more readable output that understands embedded newlines.
        """
        return f"<pre>{self}</pre>"

    def to_simple(self, minimal: bool = False) -> SerializedDimensionRecord:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Has no effect for this class.

        Returns
        -------
        simple : `SerializedDimensionRecord`
            The serialized form of this record.
        """
        # The DataId is sufficient if you are willing to do a deferred
        # query. This may not be overly useful since to reconstruct
        # a collection of records will require repeated registry queries.
        # For now do not implement minimal form.
        key = (id(self.definition), self.dataId)
        cache = PersistenceContextVars.serializedDimensionRecordMapping.get()
        if cache is not None and (result := cache.get(key)) is not None:
            return result

        mapping = {name: getattr(self, name) for name in self.__slots__}
        for k, v in mapping.items():
            if isinstance(v, lsst.sphgeom.Region):
                # YAML serialization specifies the class when it
                # doesn't have to. This is partly for explicitness
                # and also history. Here use a different approach.
                # This code needs to be migrated to sphgeom.
                mapping[k] = v.encode().hex()
            if isinstance(v, bytes):
                # We can't serialize raw bytes (e.g. hash fields),
                # so encode them here as a hex string.
                mapping[k] = v.hex()
        definition = self.definition.to_simple(minimal=minimal)
        dimRec = SerializedDimensionRecord(definition=definition, record=mapping)
        if cache is not None:
            cache[key] = dimRec
        return dimRec
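
    # Serialization sketch (illustrative): region and bytes values are
    # hex-encoded, so the result is JSON-friendly:
    #
    #     simple = record.to_simple()
    #     simple.definition  # e.g. "detector"
    #     simple.record      # plain dict of field values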

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDimensionRecord,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
        cacheKey: Hashable | None = None,
    ) -> DimensionRecord:
        """Construct a new object from the simplified form.

        This is generally data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDimensionRecord`
            Value returned from `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions of which this graph
            will be a subset. Can be `None` if `Registry` is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry from which a universe can be extracted. Can be `None`
            if universe is provided explicitly.
        cacheKey : `Hashable` or `None`
            If this is not `None`, it will be used as a key for any cached
            reconstruction instead of calculating a value from the
            serialized format.

        Returns
        -------
        record : `DimensionRecord`
            Newly-constructed object.
        """
        if universe is None and registry is None:
            raise ValueError("One of universe or registry is required to convert names to a DimensionGraph")
        if universe is None and registry is not None:
            universe = registry.dimensions
        if universe is None:
            # This is for mypy.
            raise ValueError("Unable to determine a usable universe")
        # Type ignore because the ternary statement seems to confuse mypy
        # based on conflicting inferred types of v.
        key = cacheKey or (
            simple.definition,
            frozenset(simple.record.items()),  # type: ignore
        )
        cache = PersistenceContextVars.dimensionRecords.get()
        if cache is not None and (result := cache.get(key)) is not None:
            return result

        definition = DimensionElement.from_simple(simple.definition, universe=universe)

        # Create a specialist subclass model with type validation.
        # This allows us to do simple checks of external data (possibly
        # sent as JSON) since for now _reconstructDimensionRecord does not
        # do any validation.
        record_model_cls = _createSimpleRecordSubclass(definition)
        record_model = record_model_cls(**simple.record)

        # Region and hash have to be converted to native form; for now
        # assume that the keys are special. We make the mapping we need to
        # pass to the DimensionRecord constructor via getattr, because we
        # don't want model_dump re-disassembling things like Timespans that
        # we've already assembled.
        mapping = {k: getattr(record_model, k) for k in definition.schema.names}

        if mapping.get("region") is not None:
            mapping["region"] = lsst.sphgeom.Region.decode(bytes.fromhex(mapping["region"]))
        if "hash" in mapping:
            mapping["hash"] = bytes.fromhex(mapping["hash"].decode())

        dimRec = _reconstructDimensionRecord(definition, mapping)
        if cache is not None:
            cache[key] = dimRec
        return dimRec
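
    # Round-trip sketch (assuming `universe` is the originating universe):
    #
    #     again = DimensionRecord.from_simple(record.to_simple(), universe=universe)
    #     assert again == record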

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    def toDict(self, splitTimespan: bool = False) -> dict[str, Any]:
        """Return a vanilla `dict` representation of this record.

        Parameters
        ----------
        splitTimespan : `bool`, optional
            If `True` (`False` is default) transform any "timespan" key
            value from a `Timespan` instance into a pair of regular
            ("datetime_begin", "datetime_end") fields.
        """
        results = {name: getattr(self, name) for name in self.__slots__}
        if splitTimespan:
            timespan = results.pop("timespan", None)
            if timespan is not None:
                results["datetime_begin"] = timespan.begin
                results["datetime_end"] = timespan.end
        return results
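
    # Sketch of the splitTimespan option (assuming a temporal record):
    #
    #     record.toDict()                    # ..., "timespan": Timespan(...)
    #     record.toDict(splitTimespan=True)  # ..., "datetime_begin": ...,
    #                                        #      "datetime_end": ...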

    # DimensionRecord subclasses are dynamically created, so static type
    # checkers can't know about them or their attributes. To avoid having
    # to put "type: ignore" everywhere, add a dummy __getattr__ that tells
    # type checkers not to worry about missing attributes.
    def __getattr__(self, name: str) -> Any:
        raise AttributeError(name)

    # Class attributes below are shadowed by instance attributes, and are
    # present just to hold the docstrings for those instance attributes.

    dataId: DataCoordinate
    """A dict-like identifier for this record's primary keys
    (`DataCoordinate`).
    """

    definition: ClassVar[DimensionElement]
    """The `DimensionElement` whose records this class represents
    (`DimensionElement`).
    """

    fields: ClassVar[DimensionElementFields]
    """A categorized view of the fields in this class
    (`DimensionElementFields`).
    """