Coverage for python/lsst/daf/butler/dimensions/_records.py: 21% (168 statements)

coverage.py v7.4.3, created at 2024-03-07 11:04 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DimensionRecord", "SerializedDimensionRecord")

from collections.abc import Hashable
from typing import TYPE_CHECKING, Any, ClassVar, Optional, Tuple

import lsst.sphgeom
from lsst.utils.classes import immutable
from pydantic import BaseModel, Field, StrictBool, StrictFloat, StrictInt, StrictStr, create_model

from .._timespan import Timespan
from ..json import from_json_pydantic, to_json_pydantic
from ..persistence_context import PersistenceContextVars
from ._elements import Dimension, DimensionElement

if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.
    from ..registry import Registry
    from ._coordinate import DataCoordinate
    from ._schema import DimensionElementFields
    from ._universe import DimensionUniverse


def _reconstructDimensionRecord(definition: DimensionElement, mapping: dict[str, Any]) -> DimensionRecord:
    """Unpickle implementation for `DimensionRecord` subclasses.

    For internal use by `DimensionRecord`.
    """
    return definition.RecordClass(**mapping)


def _subclassDimensionRecord(definition: DimensionElement) -> type[DimensionRecord]:
    """Create a dynamic subclass of `DimensionRecord` for the given element.

    For internal use by `DimensionRecord`.
    """
    from ._schema import DimensionElementFields

    fields = DimensionElementFields(definition)
    slots = list(fields.standard.names)
    if definition.spatial:
        slots.append("region")
    if definition.temporal:
        slots.append("timespan")
    d = {"definition": definition, "__slots__": tuple(slots), "fields": fields}
    return type(definition.name + ".RecordClass", (DimensionRecord,), d)
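
# Editorial sketch (hypothetical, not part of the original module): the
# dynamic subclasses built above are normally reached through the element
# itself via `DimensionElement.RecordClass`. The element name "detector" and
# the field values are illustrative assumptions.
def _example_record_class(universe: DimensionUniverse) -> DimensionRecord:
    record_cls = universe["detector"].RecordClass
    # Constructing a record is plain keyword construction; see
    # DimensionRecord.__init__ below for the accepted spellings.
    return record_cls(instrument="HSC", id=72, full_name="0_01")
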

class SpecificSerializedDimensionRecord(BaseModel, extra="forbid"):
    """Base model for a specific serialized record content."""


_SIMPLE_RECORD_CLASS_CACHE: dict[
    tuple[DimensionElement, DimensionUniverse], type[SpecificSerializedDimensionRecord]
] = {}


def _createSimpleRecordSubclass(definition: DimensionElement) -> type[SpecificSerializedDimensionRecord]:
    from ._schema import DimensionElementFields

    # Cache on the definition (which hashes as the name) and the
    # associated universe.
    cache_key = (definition, definition.universe)
    if cache_key in _SIMPLE_RECORD_CLASS_CACHE:
        return _SIMPLE_RECORD_CLASS_CACHE[cache_key]

    fields = DimensionElementFields(definition)
    members = {}
    # Prefer strict typing for external data.
    type_map = {
        str: StrictStr,
        float: StrictFloat,
        bool: StrictBool,
        int: StrictInt,
    }

    for field in fields.standard:
        field_type = field.getPythonType()
        field_type = type_map.get(field_type, field_type)
        if field.nullable:
            field_type = Optional[field_type]  # type: ignore
        members[field.name] = (field_type, ...)
    if definition.temporal:
        members["timespan"] = (Optional[Tuple[int, int]], ...)  # type: ignore
    if definition.spatial:
        members["region"] = (str, ...)

    # The name of the new derived class needs to be converted to camel case,
    # e.g. "day_obs" -> "DayObs".

    derived_name = "".join([part.capitalize() for part in definition.name.split("_")])

    model = create_model(
        f"SpecificSerializedDimensionRecord{derived_name}",
        __base__=SpecificSerializedDimensionRecord,
        **members,  # type: ignore
    )

    _SIMPLE_RECORD_CLASS_CACHE[cache_key] = model
    return model
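
# Editorial sketch (hypothetical, not part of the original module): what the
# strict model above provides. Because the generated model uses Strict* types
# and inherits extra="forbid", wrongly-typed or unknown fields raise
# pydantic.ValidationError instead of being silently cast. The detector-like
# payload is an illustrative assumption.
def _example_strict_validation(definition: DimensionElement) -> SpecificSerializedDimensionRecord:
    model_cls = _createSimpleRecordSubclass(definition)
    # This parses for a detector-like element; passing id="72" (a str) would
    # be rejected rather than coerced to an int.
    return model_cls(instrument="HSC", id=72, full_name="0_01")
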

# While supporting pydantic v1 and v2 keep this outside the model.
_serialized_dimension_record_schema_extra = {
    "examples": [
        {
            "definition": "detector",
            "record": {
                "instrument": "HSC",
                "id": 72,
                "full_name": "0_01",
                "name_in_raft": "01",
                "raft": "0",
                "purpose": "SCIENCE",
            },
        }
    ]
}


class SerializedDimensionRecord(BaseModel):
    """Simplified model for serializing a `DimensionRecord`."""

    definition: str = Field(
        ...,
        title="Name of dimension associated with this record.",
        examples=["exposure"],
    )

    # Use strict types to prevent casting.
    record: dict[str, None | StrictBool | StrictInt | StrictFloat | StrictStr | tuple[int, int]] = Field(
        ...,
        title="Dimension record keys and values.",
        examples=[
            {
                "instrument": "LATISS",
                "exposure": 2021050300044,
                "obs_id": "AT_O_20210503_00044",
            }
        ],
    )

    model_config = {
        "json_schema_extra": _serialized_dimension_record_schema_extra,  # type: ignore[typeddict-item]
    }

    @classmethod
    def direct(
        cls,
        *,
        definition: str,
        record: dict[str, None | StrictFloat | StrictStr | StrictBool | StrictInt | tuple[int, int]],
    ) -> SerializedDimensionRecord:
        """Construct a `SerializedDimensionRecord` directly without validators.

        Parameters
        ----------
        definition : `str`
            The name of the record.
        record : `dict`
            A dictionary representation of the record content.

        Returns
        -------
        rec : `SerializedDimensionRecord`
            A model representing the dimension records.

        Notes
        -----
        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        # This method requires tuples as values of the mapping, but JSON
        # readers will read things in as lists. Be kind and transparently
        # transform to tuples.
        _recItems = {
            k: v if type(v) != list else tuple(v) for k, v in record.items()  # type: ignore # noqa: E721
        }

        # Type ignore because the ternary statement seems to confuse mypy
        # based on conflicting inferred types of v.
        key = (
            definition,
            frozenset(_recItems.items()),
        )
        cache = PersistenceContextVars.serializedDimensionRecordMapping.get()
        if cache is not None and (result := cache.get(key)) is not None:
            return result

        node = cls.model_construct(definition=definition, record=_recItems)  # type: ignore

        if cache is not None:
            cache[key] = node
        return node
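
# Editorial sketch (hypothetical, not part of the original module): `direct`
# versus validated construction. `direct` bypasses pydantic validation and
# consults the persistence-context cache, so it is only appropriate for
# trusted, already-normalized data. The payload is an illustrative assumption.
def _example_direct_vs_validated() -> SerializedDimensionRecord:
    payload = {"instrument": "HSC", "id": 72, "full_name": "0_01"}
    validated = SerializedDimensionRecord(definition="detector", record=payload)
    trusted = SerializedDimensionRecord.direct(definition="detector", record=payload)
    assert validated.record == trusted.record
    return trusted
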

@immutable
class DimensionRecord:
    """Base class for the Python representation of database records.

    Parameters
    ----------

    **kwargs
        Field values for this record. Unrecognized keys raise `TypeError`.
        If this is the record for a `Dimension`, its primary key value may be
        provided with the actual name of the field (e.g. "id" or "name"), the
        name of the `Dimension`, or both. If this record class has a
        "timespan" attribute, "datetime_begin" and "datetime_end" keyword
        arguments may be provided instead of a single "timespan" keyword
        argument (but are ignored if a "timespan" argument is provided).

    Notes
    -----
    `DimensionRecord` subclasses are created dynamically for each
    `DimensionElement` in a `DimensionUniverse`, and are accessible via the
    `DimensionElement.RecordClass` attribute. The `DimensionRecord` base class
    itself is pure abstract, but does not use the `abc` module to indicate
    this because it does not have overridable methods.

    Record classes have attributes that correspond exactly to the
    `~DimensionElementFields.standard` fields in the related database table,
    plus "region" and "timespan" attributes for spatial and/or temporal
    elements (respectively).

    Instances are usually obtained from a `Registry`, but can be constructed
    directly from Python as well.

    `DimensionRecord` instances are immutable.
    """

    # Derived classes are required to define __slots__ as well, and it's those
    # derived-class slots that other methods on the base class expect to see
    # when they access self.__slots__.
    __slots__ = ("dataId",)

    _serializedType = SerializedDimensionRecord

    def __init__(self, **kwargs: Any):
        # Accept either the dimension name or the actual name of its primary
        # key field; ensure both are present in the dict for convenience below.
        if isinstance(self.definition, Dimension):
            v = kwargs.get(self.definition.primaryKey.name)
            if v is None:
                v = kwargs.get(self.definition.name)
                if v is None:
                    raise ValueError(
                        f"No value provided for {self.definition.name}.{self.definition.primaryKey.name}."
                    )
                kwargs[self.definition.primaryKey.name] = v
            else:
                v2 = kwargs.setdefault(self.definition.name, v)
                if v != v2:
                    raise ValueError(
                        "Multiple inconsistent values for "
                        f"{self.definition.name}.{self.definition.primaryKey.name}: {v!r} != {v2!r}."
                    )

        from ._coordinate import DataCoordinate

        object.__setattr__(
            self,
            "dataId",
            DataCoordinate.from_required_values(
                self.definition.minimal_group,
                tuple(kwargs[dimension] for dimension in self.definition.required.names),
            ),
        )
        # Don't need the primary key value aliased to the dimension name
        # anymore.
        kwargs.pop(self.definition.name, None)

        for name in self.__slots__:
            # Note that we remove from kwargs as we go, to make sure there's
            # nothing left at the end.
            object.__setattr__(self, name, kwargs.pop(name, None))
        # Support 'datetime_begin' and 'datetime_end' instead of 'timespan' for
        # backwards compatibility, but if one is provided both must be.
        if self.definition.temporal is not None and self.timespan is None and "datetime_begin" in kwargs:
            object.__setattr__(
                self,
                "timespan",
                Timespan(
                    kwargs.pop("datetime_begin"),
                    kwargs.pop("datetime_end"),
                ),
            )

        if kwargs:
            raise TypeError(f"Invalid fields for {self.definition} dimension record: {set(kwargs.keys())}.")
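
    @staticmethod
    def _example_constructor_aliases(universe: DimensionUniverse) -> None:
        # Editorial sketch (hypothetical, not part of the original API): the
        # keyword flexibility implemented above. "detector" and its field
        # values are illustrative assumptions.
        record_cls = universe["detector"].RecordClass
        # "id" is detector's primary key field; the dimension name itself is
        # accepted as an alias, and the two spellings produce equal records.
        a = record_cls(instrument="HSC", id=72, full_name="0_01")
        b = record_cls(instrument="HSC", detector=72, full_name="0_01")
        assert a == b
        # Passing both spellings with different values raises ValueError, and
        # leftover unrecognized keyword arguments raise TypeError.
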

    def __eq__(self, other: Any) -> bool:
        if type(other) != type(self):
            return False
        return self.dataId == other.dataId

    def __hash__(self) -> int:
        return hash(self.dataId.required_values)

    def __str__(self) -> str:
        lines = [f"{self.definition.name}:"]
        lines.extend(f"  {name}: {getattr(self, name)!r}" for name in self.__slots__)
        return "\n".join(lines)

    def __repr__(self) -> str:
        return "{}.RecordClass({})".format(
            self.definition.name, ", ".join(f"{name}={getattr(self, name)!r}" for name in self.__slots__)
        )

    def __reduce__(self) -> tuple:
        mapping = {name: getattr(self, name) for name in self.__slots__}
        return (_reconstructDimensionRecord, (self.definition, mapping))
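
    def _example_pickle_round_trip(self) -> DimensionRecord:
        # Editorial sketch (hypothetical, not part of the original API):
        # __reduce__ above makes records picklable even though their classes
        # are created dynamically; unpickling calls
        # _reconstructDimensionRecord with the saved field mapping.
        import pickle

        clone = pickle.loads(pickle.dumps(self))
        assert clone == self
        return clone
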

    def _repr_html_(self) -> str:
        """Override the default representation in IPython/Jupyter notebooks.

        This gives a more readable output that understands embedded newlines.
        """
        return f"<pre>{self}</pre>"

    def to_simple(self, minimal: bool = False) -> SerializedDimensionRecord:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Has no effect for this class.

        Returns
        -------
        simple : `SerializedDimensionRecord`
            This record in simplified form.
        """
        # The DataId is sufficient if you are willing to do a deferred
        # query. This may not be overly useful since to reconstruct
        # a collection of records will require repeated registry queries.
        # For now do not implement minimal form.
        key = (id(self.definition), self.dataId)
        cache = PersistenceContextVars.serializedDimensionRecordMapping.get()
        if cache is not None and (result := cache.get(key)) is not None:
            return result

        mapping = {name: getattr(self, name) for name in self.__slots__}
        # If the item in mapping supports simplification, update it.
        for k, v in mapping.items():
            try:
                mapping[k] = v.to_simple(minimal=minimal)
            except AttributeError:
                if isinstance(v, lsst.sphgeom.Region):
                    # YAML serialization specifies the class when it
                    # doesn't have to. This is partly for explicitness
                    # and also history. Here use a different approach.
                    # This code needs to be migrated to sphgeom.
                    mapping[k] = v.encode().hex()
                if isinstance(v, bytes):
                    # We can't serialize raw bytes directly, so encode them
                    # here as a hex string.
                    mapping[k] = v.hex()
        definition = self.definition.to_simple(minimal=minimal)
        dimRec = SerializedDimensionRecord(definition=definition, record=mapping)
        if cache is not None:
            cache[key] = dimRec
        return dimRec
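
    def _example_to_simple(self) -> SerializedDimensionRecord:
        # Editorial sketch (hypothetical, not part of the original API):
        # standard fields pass through to_simple unchanged, a Timespan is
        # simplified via its own to_simple, and regions and raw bytes become
        # hex strings as handled above.
        simple = self.to_simple()
        assert simple.definition == self.definition.name
        return simple
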

    @classmethod
    def from_simple(
        cls,
        simple: SerializedDimensionRecord,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
        cacheKey: Hashable | None = None,
    ) -> DimensionRecord:
        """Construct a new object from the simplified form.

        This is generally data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedDimensionRecord`
            Value returned from `to_simple`.
        universe : `DimensionUniverse`
            The universe of all known dimensions, of which the dimensions in
            this record will be a subset. Can be `None` if a `Registry` is
            provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry from which a universe can be extracted. Can be `None`
            if universe is provided explicitly.
        cacheKey : `Hashable` or `None`
            If this is not `None`, it will be used as a key for any cached
            reconstruction instead of calculating a value from the serialized
            format.

        Returns
        -------
        record : `DimensionRecord`
            Newly-constructed object.
        """
        if universe is None and registry is None:
            raise ValueError("One of universe or registry is required to convert names to a DimensionGraph")
        if universe is None and registry is not None:
            universe = registry.dimensions
        if universe is None:
            # this is for mypy
            raise ValueError("Unable to determine a usable universe")
        # Type ignore because the ternary statement seems to confuse mypy
        # based on conflicting inferred types of v.
        key = cacheKey or (
            simple.definition,
            frozenset(simple.record.items()),  # type: ignore
        )
        cache = PersistenceContextVars.dimensionRecords.get()
        if cache is not None and (result := cache.get(key)) is not None:
            return result

        definition = DimensionElement.from_simple(simple.definition, universe=universe)

        # Create a specialist subclass model with type validation.
        # This allows us to do simple checks of external data (possibly
        # sent as JSON) since for now _reconstructDimensionRecord does not
        # do any validation.
        record_model_cls = _createSimpleRecordSubclass(definition)
        record_model = record_model_cls(**simple.record)

        # Timespan and region have to be converted to native form;
        # for now assume that those keys are special.
        rec = record_model.model_dump()

        if (ts := "timespan") in rec:
            rec[ts] = Timespan.from_simple(rec[ts], universe=universe, registry=registry)
        if (reg := "region") in rec:
            encoded = bytes.fromhex(rec[reg])
            rec[reg] = lsst.sphgeom.Region.decode(encoded)
        if (hsh := "hash") in rec:
            rec[hsh] = bytes.fromhex(rec[hsh].decode())

        dimRec = _reconstructDimensionRecord(definition, rec)
        if cache is not None:
            cache[key] = dimRec
        return dimRec

    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)
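
    def _example_json_round_trip(self, universe: DimensionUniverse) -> DimensionRecord:
        # Editorial sketch (hypothetical, not part of the original API): a
        # full JSON round trip through the to_json/from_json aliases above,
        # assuming `universe` is the universe this record came from.
        clone = DimensionRecord.from_json(self.to_json(), universe=universe)
        assert clone == self
        return clone
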

    def toDict(self, splitTimespan: bool = False) -> dict[str, Any]:
        """Return a vanilla `dict` representation of this record.

        Parameters
        ----------
        splitTimespan : `bool`, optional
            If `True` (`False` is default) transform any "timespan" key value
            from a `Timespan` instance into a pair of regular
            ("datetime_begin", "datetime_end") fields.
        """
        results = {name: getattr(self, name) for name in self.__slots__}
        if splitTimespan:
            timespan = results.pop("timespan", None)
            if timespan is not None:
                results["datetime_begin"] = timespan.begin
                results["datetime_end"] = timespan.end
        return results
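
    def _example_toDict_split(self) -> dict[str, Any]:
        # Editorial sketch (hypothetical, not part of the original API): with
        # splitTimespan=True the "timespan" entry is replaced by its two
        # endpoints, assuming this record's element is temporal.
        d = self.toDict(splitTimespan=True)
        assert "timespan" not in d and "datetime_begin" in d
        return d
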

    # DimensionRecord subclasses are dynamically created, so static type
    # checkers can't know about them or their attributes. To avoid having to
    # put "type: ignore" everywhere, add a dummy __getattr__ that tells type
    # checkers not to worry about missing attributes.
    def __getattr__(self, name: str) -> Any:
        raise AttributeError(name)

    # Class attributes below are shadowed by instance attributes, and are
    # present just to hold the docstrings for those instance attributes.

    dataId: DataCoordinate
    """A dict-like identifier for this record's primary keys
    (`DataCoordinate`).
    """

    definition: ClassVar[DimensionElement]
    """The `DimensionElement` whose records this class represents
    (`DimensionElement`).
    """

    fields: ClassVar[DimensionElementFields]
    """A categorized view of the fields in this class
    (`DimensionElementFields`).
    """