Coverage for python/lsst/daf/butler/dimensions/_records.py: 21%

167 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-25 10:50 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ("DimensionRecord", "SerializedDimensionRecord") 

31 

32from collections.abc import Hashable 

33from typing import TYPE_CHECKING, Any, ClassVar, Optional, Tuple 

34 

35import lsst.sphgeom 

36from lsst.utils.classes import immutable 

37from pydantic import BaseModel, Field, StrictBool, StrictFloat, StrictInt, StrictStr, create_model 

38 

39from .._timespan import Timespan 

40from ..json import from_json_pydantic, to_json_pydantic 

41from ..persistence_context import PersistenceContextVars 

42from ._elements import Dimension, DimensionElement 

43 

44if TYPE_CHECKING: # Imports needed only for type annotations; may be circular. 

45 from ..registry import Registry 

46 from ._coordinate import DataCoordinate 

47 from ._schema import DimensionElementFields 

48 from ._universe import DimensionUniverse 

49 

50 

51def _reconstructDimensionRecord(definition: DimensionElement, mapping: dict[str, Any]) -> DimensionRecord: 

52 """Unpickle implementation for `DimensionRecord` subclasses. 

53 

54 For internal use by `DimensionRecord`. 

55 """ 

56 return definition.RecordClass(**mapping) 

57 

58 

def _subclassDimensionRecord(definition: DimensionElement) -> type[DimensionRecord]:
    """Create a dynamic subclass of `DimensionRecord` for the given element.

    For internal use by `DimensionRecord`.
    """
    from ._schema import DimensionElementFields

    fields = DimensionElementFields(definition)
    # Slots are the standard table fields, plus the special spatial/temporal
    # attributes when the element defines them.
    member_names = list(fields.standard.names)
    if definition.spatial:
        member_names.append("region")
    if definition.temporal:
        member_names.append("timespan")
    namespace = {
        "definition": definition,
        "__slots__": tuple(member_names),
        "fields": fields,
    }
    return type(f"{definition.name}.RecordClass", (DimensionRecord,), namespace)

74 

75 

class SpecificSerializedDimensionRecord(BaseModel, extra="forbid"):
    """Base model for a specific serialized record content.

    Concrete subclasses are created dynamically, one per dimension element,
    by `_createSimpleRecordSubclass`; ``extra="forbid"`` makes validation
    reject any record keys not declared for that element.
    """

78 

79 

# Cache of the dynamically-created SpecificSerializedDimensionRecord
# subclasses. Keyed on (element, universe) — not just the element, which
# hashes as its name — so same-named elements from different universes
# do not collide.
_SIMPLE_RECORD_CLASS_CACHE: dict[
    tuple[DimensionElement, DimensionUniverse], type[SpecificSerializedDimensionRecord]
] = {}

83 

84 

def _createSimpleRecordSubclass(definition: DimensionElement) -> type[SpecificSerializedDimensionRecord]:
    """Return a pydantic model class that validates records of this element.

    Results are memoized in `_SIMPLE_RECORD_CLASS_CACHE`, keyed on the
    definition (which hashes as its name) and the associated universe.
    """
    from ._schema import DimensionElementFields

    cache_key = (definition, definition.universe)
    if (cached := _SIMPLE_RECORD_CLASS_CACHE.get(cache_key)) is not None:
        return cached

    # Prefer strict typing for external data: stop pydantic from silently
    # coercing values to the declared type.
    strict_types = {
        str: StrictStr,
        float: StrictFloat,
        bool: StrictBool,
        int: StrictInt,
    }

    fields = DimensionElementFields(definition)
    members = {}
    for field in fields.standard:
        python_type = field.getPythonType()
        python_type = strict_types.get(python_type, python_type)
        if field.nullable:
            python_type = Optional[python_type]  # type: ignore
        members[field.name] = (python_type, ...)
    # Timespans serialize as a (begin, end) nanosecond pair; regions as a
    # hex-encoded sphgeom blob.
    if definition.temporal:
        members["timespan"] = (Optional[Tuple[int, int]], ...)  # type: ignore
    if definition.spatial:
        members["region"] = (str, ...)

    # mypy does not seem to like create_model
    model = create_model(
        f"SpecificSerializedDimensionRecord{definition.name.capitalize()}",
        __base__=SpecificSerializedDimensionRecord,
        **members,  # type: ignore
    )

    _SIMPLE_RECORD_CLASS_CACHE[cache_key] = model
    return model

124 

125 

126# While supporting pydantic v1 and v2 keep this outside the model. 

127_serialized_dimension_record_schema_extra = { 

128 "examples": [ 

129 { 

130 "definition": "detector", 

131 "record": { 

132 "instrument": "HSC", 

133 "id": 72, 

134 "full_name": "0_01", 

135 "name_in_raft": "01", 

136 "raft": "0", 

137 "purpose": "SCIENCE", 

138 }, 

139 } 

140 ] 

141} 

142 

143 

class SerializedDimensionRecord(BaseModel):
    """Simplified model for serializing a `DimensionRecord`."""

    definition: str = Field(
        ...,
        title="Name of dimension associated with this record.",
        examples=["exposure"],
    )

    # Use strict types to prevent casting
    record: dict[str, None | StrictInt | StrictFloat | StrictStr | StrictBool | tuple[int, int]] = Field(
        ...,
        title="Dimension record keys and values.",
        # An example *value of this field*, i.e. the record mapping itself
        # (previously this wrongly showed a whole-model payload).
        examples=[
            {
                "instrument": "LATISS",
                "exposure": 2021050300044,
                "obs_id": "AT_O_20210503_00044",
            }
        ],
    )

    model_config = {
        "json_schema_extra": _serialized_dimension_record_schema_extra,  # type: ignore[typeddict-item]
    }

    @classmethod
    def direct(
        cls,
        *,
        definition: str,
        record: dict[str, None | StrictFloat | StrictStr | StrictBool | StrictInt | tuple[int, int]],
    ) -> SerializedDimensionRecord:
        """Construct a `SerializedDimensionRecord` directly without validators.

        Parameters
        ----------
        definition : `str`
            The name of the record.
        record : `dict`
            A dictionary representation of the record content.

        Returns
        -------
        rec : `SerializedDimensionRecord`
            A model representing the dimension records.

        Notes
        -----
        This differs from the pydantic "construct" method in that the arguments
        are explicitly what the model requires, and it will recurse through
        members, constructing them from their corresponding `direct` methods.

        This method should only be called when the inputs are trusted.
        """
        # This method requires tuples as values of the mapping, but JSON
        # readers will read things in as lists. Be kind and transparently
        # transform to tuples.
        _recItems = {k: tuple(v) if isinstance(v, list) else v for k, v in record.items()}  # type: ignore

        # Key on the definition name plus the (hashable) record content so
        # identical records deserialized repeatedly share one model instance.
        key = (
            definition,
            frozenset(_recItems.items()),
        )
        cache = PersistenceContextVars.serializedDimensionRecordMapping.get()
        if cache is not None and (result := cache.get(key)) is not None:
            return result

        node = cls.model_construct(definition=definition, record=_recItems)  # type: ignore

        if cache is not None:
            cache[key] = node
        return node

224 

225 

@immutable
class DimensionRecord:
    """Base class for the Python representation of database records.

    Parameters
    ----------
    **kwargs
        Field values for this record. Unrecognized keys are ignored. If this
        is the record for a `Dimension`, its primary key value may be provided
        with the actual name of the field (e.g. "id" or "name"), the name of
        the `Dimension`, or both. If this record class has a "timespan"
        attribute, "datetime_begin" and "datetime_end" keyword arguments may
        be provided instead of a single "timespan" keyword argument (but are
        ignored if a "timespan" argument is provided).

    Notes
    -----
    `DimensionRecord` subclasses are created dynamically for each
    `DimensionElement` in a `DimensionUniverse`, and are accessible via the
    `DimensionElement.RecordClass` attribute. The `DimensionRecord` base class
    itself is pure abstract, but does not use the `abc` module to indicate this
    because it does not have overridable methods.

    Record classes have attributes that correspond exactly to the
    `~DimensionElementFields.standard` fields in the related database table,
    plus "region" and "timespan" attributes for spatial and/or temporal
    elements (respectively).

    Instances are usually obtained from a `Registry`, but can be constructed
    directly from Python as well.

    `DimensionRecord` instances are immutable.
    """

    # Derived classes are required to define __slots__ as well, and it's those
    # derived-class slots that other methods on the base class expect to see
    # when they access self.__slots__.
    __slots__ = ("dataId",)

    _serializedType = SerializedDimensionRecord

    def __init__(self, **kwargs: Any):
        # Accept either the dimension name or the actual name of its primary
        # key field; ensure both are present in the dict for convenience below.
        if isinstance(self.definition, Dimension):
            v = kwargs.get(self.definition.primaryKey.name)
            if v is None:
                v = kwargs.get(self.definition.name)
                if v is None:
                    raise ValueError(
                        f"No value provided for {self.definition.name}.{self.definition.primaryKey.name}."
                    )
                kwargs[self.definition.primaryKey.name] = v
            else:
                v2 = kwargs.setdefault(self.definition.name, v)
                if v != v2:
                    raise ValueError(
                        "Multiple inconsistent values for "
                        f"{self.definition.name}.{self.definition.primaryKey.name}: {v!r} != {v2!r}."
                    )

        from ._coordinate import DataCoordinate

        object.__setattr__(
            self,
            "dataId",
            DataCoordinate.from_required_values(
                self.definition.minimal_group,
                tuple(kwargs[dimension] for dimension in self.definition.required.names),
            ),
        )
        # Don't need the primary key value aliased to the dimension name
        # anymore.
        kwargs.pop(self.definition.name, None)

        for name in self.__slots__:
            # Note that we remove from kwargs as we go, to make sure there's
            # nothing left at the end.
            object.__setattr__(self, name, kwargs.pop(name, None))
        # Support 'datetime_begin' and 'datetime_end' instead of 'timespan' for
        # backwards compatibility, but if one is provided both must be.
        if self.definition.temporal is not None and self.timespan is None and "datetime_begin" in kwargs:
            object.__setattr__(
                self,
                "timespan",
                Timespan(
                    kwargs.pop("datetime_begin"),
                    kwargs.pop("datetime_end"),
                ),
            )

        if kwargs:
            raise TypeError(f"Invalid fields for {self.definition} dimension record: {set(kwargs.keys())}.")

    def __eq__(self, other: Any) -> bool:
        # Records of different elements never compare equal, even if their
        # data IDs happen to coincide.
        if type(other) is not type(self):
            return False
        return self.dataId == other.dataId

    def __hash__(self) -> int:
        return hash(self.dataId.required_values)

    def __str__(self) -> str:
        lines = [f"{self.definition.name}:"]
        lines.extend(f"  {name}: {getattr(self, name)!r}" for name in self.__slots__)
        return "\n".join(lines)

    def __repr__(self) -> str:
        return "{}.RecordClass({})".format(
            self.definition.name, ", ".join(f"{name}={getattr(self, name)!r}" for name in self.__slots__)
        )

    def __reduce__(self) -> tuple:
        # Pickle via the module-level reconstructor, because dynamically
        # created subclasses cannot be found by name when unpickling.
        mapping = {name: getattr(self, name) for name in self.__slots__}
        return (_reconstructDimensionRecord, (self.definition, mapping))

    def _repr_html_(self) -> str:
        """Override the default representation in IPython/Jupyter notebooks.

        This gives a more readable output that understands embedded newlines.
        """
        # Bug fix: the closing tag was previously "<pre>" (unclosed HTML).
        return f"<pre>{self}</pre>"

    def to_simple(self, minimal: bool = False) -> SerializedDimensionRecord:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Has no effect on for this class.

        Returns
        -------
        simple : `SerializedDimensionRecord`
            The serialized form of this record.
        """
        # The DataId is sufficient if you are willing to do a deferred
        # query. This may not be overly useful since to reconstruct
        # a collection of records will require repeated registry queries.
        # For now do not implement minimal form.
        key = (id(self.definition), self.dataId)
        cache = PersistenceContextVars.serializedDimensionRecordMapping.get()
        if cache is not None and (result := cache.get(key)) is not None:
            return result

        mapping = {name: getattr(self, name) for name in self.__slots__}
        # If the item in mapping supports simplification update it
        for k, v in mapping.items():
            try:
                mapping[k] = v.to_simple(minimal=minimal)
            except AttributeError:
                if isinstance(v, lsst.sphgeom.Region):
                    # YAML serialization specifies the class when it
                    # doesn't have to. This is partly for explicitness
                    # and also history. Here use a different approach.
                    # This code needs to be migrated to sphgeom
                    mapping[k] = v.encode().hex()
                if isinstance(v, bytes):
                    # We actually can't handle serializing out to bytes for
                    # hash objects, encode it here to a hex string
                    mapping[k] = v.hex()
        definition = self.definition.to_simple(minimal=minimal)
        dimRec = SerializedDimensionRecord(definition=definition, record=mapping)
        if cache is not None:
            cache[key] = dimRec
        return dimRec

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDimensionRecord,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
        cacheKey: Hashable | None = None,
    ) -> DimensionRecord:
        """Construct a new object from the simplified form.

        This is generally data returned from the `to_simple`
        method.

        Parameters
        ----------
        simple : `SerializedDimensionRecord`
            Value return from `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions of which this graph will
            be a subset. Can be `None` if `Registry` is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry from which a universe can be extracted. Can be `None`
            if universe is provided explicitly.
        cacheKey : `Hashable` or `None`
            If this is not None, it will be used as a key for any cached
            reconstruction instead of calculating a value from the serialized
            format.

        Returns
        -------
        record : `DimensionRecord`
            Newly-constructed object.
        """
        if universe is None and registry is None:
            raise ValueError("One of universe or registry is required to convert names to a DimensionGraph")
        if universe is None and registry is not None:
            universe = registry.dimensions
        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")
        # Key on the definition name plus the (hashable) record content,
        # unless the caller supplied an explicit cache key.
        key = cacheKey or (
            simple.definition,
            frozenset(simple.record.items()),  # type: ignore
        )
        cache = PersistenceContextVars.dimensionRecords.get()
        if cache is not None and (result := cache.get(key)) is not None:
            return result

        definition = DimensionElement.from_simple(simple.definition, universe=universe)

        # Create a specialist subclass model with type validation.
        # This allows us to do simple checks of external data (possibly
        # sent as JSON) since for now _reconstructDimensionRecord does not
        # do any validation.
        record_model_cls = _createSimpleRecordSubclass(definition)
        record_model = record_model_cls(**simple.record)

        # Timespan and region have to be converted to native form
        # for now assume that those keys are special
        rec = record_model.model_dump()

        if (ts := "timespan") in rec:
            rec[ts] = Timespan.from_simple(rec[ts], universe=universe, registry=registry)
        if (reg := "region") in rec:
            encoded = bytes.fromhex(rec[reg])
            rec[reg] = lsst.sphgeom.Region.decode(encoded)
        if (hsh := "hash") in rec:
            rec[hsh] = bytes.fromhex(rec[hsh].decode())

        dimRec = _reconstructDimensionRecord(definition, rec)
        if cache is not None:
            cache[key] = dimRec
        return dimRec

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    def toDict(self, splitTimespan: bool = False) -> dict[str, Any]:
        """Return a vanilla `dict` representation of this record.

        Parameters
        ----------
        splitTimespan : `bool`, optional
            If `True` (`False` is default) transform any "timespan" key value
            from a `Timespan` instance into a pair of regular
            ("datetime_begin", "datetime_end") fields.
        """
        results = {name: getattr(self, name) for name in self.__slots__}
        if splitTimespan:
            timespan = results.pop("timespan", None)
            if timespan is not None:
                results["datetime_begin"] = timespan.begin
                results["datetime_end"] = timespan.end
        return results

    # DimensionRecord subclasses are dynamically created, so static type
    # checkers can't know about them or their attributes. To avoid having to
    # put "type: ignore", everywhere, add a dummy __getattr__ that tells type
    # checkers not to worry about missing attributes.
    def __getattr__(self, name: str) -> Any:
        raise AttributeError(name)

    # Class attributes below are shadowed by instance attributes, and are
    # present just to hold the docstrings for those instance attributes.

    dataId: DataCoordinate
    """A dict-like identifier for this record's primary keys
    (`DataCoordinate`).
    """

    definition: ClassVar[DimensionElement]
    """The `DimensionElement` whose records this class represents
    (`DimensionElement`).
    """

    fields: ClassVar[DimensionElementFields]
    """A categorized view of the fields in this class
    (`DimensionElementFields`).
    """