Coverage for python/lsst/daf/butler/dimensions/_records.py: 22%

171 statements  

coverage.py v7.3.2, created at 2023-12-01 11:00 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DimensionRecord", "SerializedDimensionRecord")

from collections.abc import Hashable
from typing import TYPE_CHECKING, Any, ClassVar, Optional, Tuple

import lsst.sphgeom
from lsst.daf.butler._compat import PYDANTIC_V2, _BaseModelCompat
from lsst.utils.classes import immutable
from pydantic import Field, StrictBool, StrictFloat, StrictInt, StrictStr, create_model

from .._timespan import Timespan, TimespanDatabaseRepresentation
from ..json import from_json_pydantic, to_json_pydantic
from ..persistence_context import PersistenceContextVars
from ._elements import Dimension, DimensionElement

if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.
    from ..registry import Registry
    from ._coordinate import DataCoordinate
    from ._schema import DimensionElementFields
    from ._universe import DimensionUniverse


def _reconstructDimensionRecord(definition: DimensionElement, mapping: dict[str, Any]) -> DimensionRecord:
    """Unpickle implementation for `DimensionRecord` subclasses.

    For internal use by `DimensionRecord`.
    """
    return definition.RecordClass(**mapping)


def _subclassDimensionRecord(definition: DimensionElement) -> type[DimensionRecord]:
    """Create a dynamic subclass of `DimensionRecord` for the given element.

    For internal use by `DimensionRecord`.
    """
    from ._schema import DimensionElementFields

    fields = DimensionElementFields(definition)
    slots = list(fields.standard.names)
    if definition.spatial:
        slots.append("region")
    if definition.temporal:
        slots.append(TimespanDatabaseRepresentation.NAME)
    d = {"definition": definition, "__slots__": tuple(slots), "fields": fields}
    return type(definition.name + ".RecordClass", (DimensionRecord,), d)
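

# Illustrative only: for a hypothetical purely spatial element, the factory
# above builds a class roughly equivalent to the hand-written sketch below
# (the attribute values here are assumptions, not real universe contents):
#
#     class RecordClass(DimensionRecord):  # named "<element>.RecordClass"
#         definition = definition
#         fields = DimensionElementFields(definition)
#         __slots__ = (*fields.standard.names, "region")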


class SpecificSerializedDimensionRecord(_BaseModelCompat, extra="forbid"):
    """Base model for a specific serialized record content."""


_SIMPLE_RECORD_CLASS_CACHE: dict[
    tuple[DimensionElement, DimensionUniverse], type[SpecificSerializedDimensionRecord]
] = {}


def _createSimpleRecordSubclass(definition: DimensionElement) -> type[SpecificSerializedDimensionRecord]:
    """Create a specialized subclass of `SpecificSerializedDimensionRecord`
    with strictly-typed fields matching the given element.

    For internal use by `DimensionRecord.from_simple`.
    """
    from ._schema import DimensionElementFields

    # Cache on the definition (which hashes as the name) and the
    # associated universe.
    cache_key = (definition, definition.universe)
    if cache_key in _SIMPLE_RECORD_CLASS_CACHE:
        return _SIMPLE_RECORD_CLASS_CACHE[cache_key]

    fields = DimensionElementFields(definition)
    members = {}
    # Prefer strict typing for external data
    type_map = {
        str: StrictStr,
        float: StrictFloat,
        bool: StrictBool,
        int: StrictInt,
    }

    for field in fields.standard:
        field_type = field.getPythonType()
        field_type = type_map.get(field_type, field_type)
        if field.nullable:
            field_type = Optional[field_type]  # type: ignore
        members[field.name] = (field_type, ...)
    if definition.temporal:
        members["timespan"] = (Optional[Tuple[int, int]], ...)  # type: ignore
    if definition.spatial:
        members["region"] = (str, ...)

    # mypy does not seem to like create_model
    model = create_model(
        f"SpecificSerializedDimensionRecord{definition.name.capitalize()}",
        __base__=SpecificSerializedDimensionRecord,
        **members,  # type: ignore
    )

    _SIMPLE_RECORD_CLASS_CACHE[cache_key] = model
    return model
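

# Illustrative only: for a hypothetical element with a required integer key
# "id" and a nullable string field "purpose", the factory above generates a
# model roughly equivalent to
#
#     class SpecificSerializedDimensionRecordExample(SpecificSerializedDimensionRecord):
#         id: StrictInt
#         purpose: Optional[StrictStr]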


# While supporting pydantic v1 and v2 keep this outside the model.
_serialized_dimension_record_schema_extra = {
    "examples": [
        {
            "definition": "detector",
            "record": {
                "instrument": "HSC",
                "id": 72,
                "full_name": "0_01",
                "name_in_raft": "01",
                "raft": "0",
                "purpose": "SCIENCE",
            },
        }
    ]
}


class SerializedDimensionRecord(_BaseModelCompat):
    """Simplified model for serializing a `DimensionRecord`."""

    definition: str = Field(
        ...,
        title="Name of dimension associated with this record.",
        examples=["exposure"],
    )

    # Use strict types to prevent casting
    record: dict[str, None | StrictInt | StrictFloat | StrictStr | StrictBool | tuple[int, int]] = Field(
        ...,
        title="Dimension record keys and values.",
        examples=[
            {
                "definition": "exposure",
                "record": {
                    "instrument": "LATISS",
                    "exposure": 2021050300044,
                    "obs_id": "AT_O_20210503_00044",
                },
            }
        ],
    )

    if PYDANTIC_V2:
        model_config = {
            "json_schema_extra": _serialized_dimension_record_schema_extra,  # type: ignore[typeddict-item]
        }
    else:

        class Config:
            """Local configuration overrides for model."""

            schema_extra = _serialized_dimension_record_schema_extra

    @classmethod
    def direct(
        cls,
        *,
        definition: str,
        record: dict[str, None | StrictFloat | StrictStr | StrictBool | StrictInt | tuple[int, int]],
    ) -> SerializedDimensionRecord:
        """Construct a `SerializedDimensionRecord` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
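
        Examples
        --------
        A construction sketch; the element and field values here are
        illustrative, not taken from a real universe::

            simple = SerializedDimensionRecord.direct(
                definition="detector",
                record={"instrument": "HSC", "id": 72, "full_name": "0_01"},
            )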

195 """ 

196 # This method requires tuples as values of the mapping, but JSON 

197 # readers will read things in as lists. Be kind and transparently 

198 # transform to tuples 

199 _recItems = { 

200 k: v if type(v) != list else tuple(v) for k, v in record.items() # type: ignore # noqa: E721 

201 } 

202 

203 # Type ignore because the ternary statement seems to confuse mypy 

204 # based on conflicting inferred types of v. 

205 key = ( 

206 definition, 

207 frozenset(_recItems.items()), 

208 ) 

209 cache = PersistenceContextVars.serializedDimensionRecordMapping.get() 

210 if cache is not None and (result := cache.get(key)) is not None: 

211 return result 

212 

213 node = cls.model_construct(definition=definition, record=_recItems) # type: ignore 

214 

215 if cache is not None: 

216 cache[key] = node 

217 return node 


@immutable
class DimensionRecord:
    """Base class for the Python representation of database records.

    Parameters
    ----------
    **kwargs
        Field values for this record. Unrecognized keys are ignored. If this
        is the record for a `Dimension`, its primary key value may be provided
        with the actual name of the field (e.g. "id" or "name"), the name of
        the `Dimension`, or both. If this record class has a "timespan"
        attribute, "datetime_begin" and "datetime_end" keyword arguments may
        be provided instead of a single "timespan" keyword argument (but are
        ignored if a "timespan" argument is provided).

    Notes
    -----
    `DimensionRecord` subclasses are created dynamically for each
    `DimensionElement` in a `DimensionUniverse`, and are accessible via the
    `DimensionElement.RecordClass` attribute. The `DimensionRecord` base class
    itself is purely abstract, but does not use the `abc` module to indicate
    this because it does not have overridable methods.

    Record classes have attributes that correspond exactly to the
    `~DimensionElementFields.standard` fields in the related database table,
    plus "region" and "timespan" attributes for spatial and/or temporal
    elements (respectively).

    Instances are usually obtained from a `Registry`, but can be constructed
    directly from Python as well, as shown in the example below.

    `DimensionRecord` instances are immutable.
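
    Examples
    --------
    A construction sketch (assumes a universe containing a "detector"
    dimension; the instrument, id, and full_name values are illustrative)::

        RecordClass = universe["detector"].RecordClass
        record = RecordClass(instrument="HSC", id=72, full_name="0_01")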

252 """ 

253 

254 # Derived classes are required to define __slots__ as well, and it's those 

255 # derived-class slots that other methods on the base class expect to see 

256 # when they access self.__slots__. 

257 __slots__ = ("dataId",) 

258 

259 _serializedType = SerializedDimensionRecord 

260 

261 def __init__(self, **kwargs: Any): 

262 # Accept either the dimension name or the actual name of its primary 

263 # key field; ensure both are present in the dict for convenience below. 

264 if isinstance(self.definition, Dimension): 

265 v = kwargs.get(self.definition.primaryKey.name) 

266 if v is None: 

267 v = kwargs.get(self.definition.name) 

268 if v is None: 

269 raise ValueError( 

270 f"No value provided for {self.definition.name}.{self.definition.primaryKey.name}." 

271 ) 

272 kwargs[self.definition.primaryKey.name] = v 

273 else: 

274 v2 = kwargs.setdefault(self.definition.name, v) 

275 if v != v2: 

276 raise ValueError( 

277 "Multiple inconsistent values for " 

278 f"{self.definition.name}.{self.definition.primaryKey.name}: {v!r} != {v2!r}." 

279 ) 

280 

281 from ._coordinate import DataCoordinate 

282 

283 object.__setattr__( 

284 self, 

285 "dataId", 

286 DataCoordinate.from_required_values( 

287 self.definition.minimal_group, 

288 tuple(kwargs[dimension] for dimension in self.definition.required.names), 

289 ), 

290 ) 

291 # Don't need the primary key value aliased to the dimension name 

292 # anymore. 

293 kwargs.pop(self.definition.name, None) 

294 

295 for name in self.__slots__: 

296 # Note that we remove from kwargs as we go, to make sure there's 

297 # nothing left at the end. 

298 object.__setattr__(self, name, kwargs.pop(name, None)) 

299 # Support 'datetime_begin' and 'datetime_end' instead of 'timespan' for 

300 # backwards compatibility, but if one is provided both must be. 

301 if self.definition.temporal is not None and self.timespan is None and "datetime_begin" in kwargs: 

302 object.__setattr__( 

303 self, 

304 "timespan", 

305 Timespan( 

306 kwargs.pop("datetime_begin"), 

307 kwargs.pop("datetime_end"), 

308 ), 

309 ) 

310 

311 if kwargs: 

312 raise TypeError(f"Invalid fields for {self.definition} dimension record: {set(kwargs.keys())}.") 

313 

    def __eq__(self, other: Any) -> bool:
        if type(other) != type(self):
            return False
        return self.dataId == other.dataId

    def __hash__(self) -> int:
        return hash(self.dataId.required_values)

    def __str__(self) -> str:
        lines = [f"{self.definition.name}:"]
        lines.extend(f"  {name}: {getattr(self, name)!r}" for name in self.__slots__)
        return "\n".join(lines)

    def __repr__(self) -> str:
        return "{}.RecordClass({})".format(
            self.definition.name, ", ".join(f"{name}={getattr(self, name)!r}" for name in self.__slots__)
        )

    def __reduce__(self) -> tuple:
        mapping = {name: getattr(self, name) for name in self.__slots__}
        return (_reconstructDimensionRecord, (self.definition, mapping))

    def _repr_html_(self) -> str:
        """Override the default representation in IPython/Jupyter notebooks.

        This gives a more readable output that understands embedded newlines.
        """
        return f"<pre>{self}</pre>"

    def to_simple(self, minimal: bool = False) -> SerializedDimensionRecord:
        """Convert this record to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        minimal : `bool`, optional
            Use minimal serialization. Has no effect for this class.

        Returns
        -------
        record : `SerializedDimensionRecord`
            This record converted to a simple serializable form.
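
        Examples
        --------
        A serialization sketch (``record`` stands for any `DimensionRecord`
        instance)::

            simple = record.to_simple()
            json_str = record.to_json()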

357 """ 

358 # The DataId is sufficient if you are willing to do a deferred 

359 # query. This may not be overly useful since to reconstruct 

360 # a collection of records will require repeated registry queries. 

361 # For now do not implement minimal form. 

362 key = (id(self.definition), self.dataId) 

363 cache = PersistenceContextVars.serializedDimensionRecordMapping.get() 

364 if cache is not None and (result := cache.get(key)) is not None: 

365 return result 

366 

367 mapping = {name: getattr(self, name) for name in self.__slots__} 

368 # If the item in mapping supports simplification update it 

369 for k, v in mapping.items(): 

370 try: 

371 mapping[k] = v.to_simple(minimal=minimal) 

372 except AttributeError: 

373 if isinstance(v, lsst.sphgeom.Region): 

374 # YAML serialization specifies the class when it 

375 # doesn't have to. This is partly for explicitness 

376 # and also history. Here use a different approach. 

377 # This code needs to be migrated to sphgeom 

378 mapping[k] = v.encode().hex() 

379 if isinstance(v, bytes): 

380 # We actually can't handle serializing out to bytes for 

381 # hash objects, encode it here to a hex string 

382 mapping[k] = v.hex() 

383 definition = self.definition.to_simple(minimal=minimal) 

384 dimRec = SerializedDimensionRecord(definition=definition, record=mapping) 

385 if cache is not None: 

386 cache[key] = dimRec 

387 return dimRec 


    @classmethod
    def from_simple(
        cls,
        simple: SerializedDimensionRecord,
        universe: DimensionUniverse | None = None,
        registry: Registry | None = None,
        cacheKey: Hashable | None = None,
    ) -> DimensionRecord:
        """Construct a new object from the simplified form.

        This is generally data returned from the `to_simple`
        method.

        Parameters
        ----------
        simple : `SerializedDimensionRecord`
            Value returned from `to_simple`.
        universe : `DimensionUniverse`
            The universe of all known dimensions, used to identify the element
            this record belongs to. Can be `None` if ``registry`` is provided.
        registry : `lsst.daf.butler.Registry`, optional
            Registry from which a universe can be extracted. Can be `None`
            if ``universe`` is provided explicitly.
        cacheKey : `Hashable` or `None`
            If this is not `None`, it will be used as a key for any cached
            reconstruction instead of calculating a value from the serialized
            format.

        Returns
        -------
        record : `DimensionRecord`
            Newly-constructed object.
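
        Examples
        --------
        A reconstruction sketch, assuming ``simple`` came from `to_simple`
        and ``universe`` is the matching `DimensionUniverse`::

            record = DimensionRecord.from_simple(simple, universe=universe)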

421 """ 

422 if universe is None and registry is None: 

423 raise ValueError("One of universe or registry is required to convert names to a DimensionGraph") 

424 if universe is None and registry is not None: 

425 universe = registry.dimensions 

426 if universe is None: 

427 # this is for mypy 

428 raise ValueError("Unable to determine a usable universe") 

429 # Type ignore because the ternary statement seems to confuse mypy 

430 # based on conflicting inferred types of v. 

431 key = cacheKey or ( 

432 simple.definition, 

433 frozenset(simple.record.items()), # type: ignore 

434 ) 

435 cache = PersistenceContextVars.dimensionRecords.get() 

436 if cache is not None and (result := cache.get(key)) is not None: 

437 return result 

438 

439 definition = DimensionElement.from_simple(simple.definition, universe=universe) 

440 

441 # Create a specialist subclass model with type validation. 

442 # This allows us to do simple checks of external data (possibly 

443 # sent as JSON) since for now _reconstructDimensionRecord does not 

444 # do any validation. 

445 record_model_cls = _createSimpleRecordSubclass(definition) 

446 record_model = record_model_cls(**simple.record) 

447 

448 # Timespan and region have to be converted to native form 

449 # for now assume that those keys are special 

450 rec = record_model.model_dump() 

451 

452 if (ts := "timespan") in rec: 

453 rec[ts] = Timespan.from_simple(rec[ts], universe=universe, registry=registry) 

454 if (reg := "region") in rec: 

455 encoded = bytes.fromhex(rec[reg]) 

456 rec[reg] = lsst.sphgeom.Region.decode(encoded) 

457 if (hsh := "hash") in rec: 

458 rec[hsh] = bytes.fromhex(rec[hsh].decode()) 

459 

460 dimRec = _reconstructDimensionRecord(definition, rec) 

461 if cache is not None: 

462 cache[key] = dimRec 

463 return dimRec 


    to_json = to_json_pydantic
    from_json: ClassVar = classmethod(from_json_pydantic)

    def toDict(self, splitTimespan: bool = False) -> dict[str, Any]:
        """Return a vanilla `dict` representation of this record.

        Parameters
        ----------
        splitTimespan : `bool`, optional
            If `True` (default is `False`), transform any "timespan" key value
            from a `Timespan` instance into a pair of regular
            ("datetime_begin", "datetime_end") fields.

477 """ 

478 results = {name: getattr(self, name) for name in self.__slots__} 

479 if splitTimespan: 

480 timespan = results.pop("timespan", None) 

481 if timespan is not None: 

482 results["datetime_begin"] = timespan.begin 

483 results["datetime_end"] = timespan.end 

484 return results 

485 

    # DimensionRecord subclasses are dynamically created, so static type
    # checkers can't know about them or their attributes. To avoid having to
    # put "type: ignore" everywhere, add a dummy __getattr__ that tells type
    # checkers not to worry about missing attributes.
    def __getattr__(self, name: str) -> Any:
        raise AttributeError(name)


    # Class attributes below are shadowed by instance attributes, and are
    # present just to hold the docstrings for those instance attributes.

    dataId: DataCoordinate
    """A dict-like identifier for this record's primary keys
    (`DataCoordinate`).
    """

    definition: ClassVar[DimensionElement]
    """The `DimensionElement` whose records this class represents
    (`DimensionElement`).
    """

    fields: ClassVar[DimensionElementFields]
    """A categorized view of the fields in this class
    (`DimensionElementFields`).
    """