Coverage for python/lsst/daf/butler/core/quantum.py: 18%

204 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator") 

25 

26from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Type, Union 

27 

28from lsst.utils import doImportType 

29from pydantic import BaseModel 

30 

31from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType 

32from .datastore import DatastoreRecordData 

33from .dimensions import ( 

34 DataCoordinate, 

35 DimensionRecord, 

36 DimensionUniverse, 

37 SerializedDataCoordinate, 

38 SerializedDimensionRecord, 

39) 

40from .named import NamedKeyDict, NamedKeyMapping 

41 

42 

43def _reconstructDatasetRef( 

44 simple: SerializedDatasetRef, 

45 type_: Optional[DatasetType], 

46 ids: Iterable[int], 

47 dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]], 

48 reconstitutedDimensions: Dict[int, Tuple[str, DimensionRecord]], 

49 universe: DimensionUniverse, 

50) -> DatasetRef: 

51 """Reconstruct a DatasetRef stored in a Serialized Quantum""" 

52 # Reconstruct the dimension records 

53 records = {} 

54 for dId in ids: 

55 # If the dimension record has been loaded previously, use that;

56 # otherwise load it from the dict of SerializedDimensionRecords.

57 if (recId := reconstitutedDimensions.get(dId)) is None: 

58 if dimensionRecords is None: 

59 raise ValueError( 

60 "Cannot construct from a SerializedQuantum with no dimension records. " 

61 "Reconstituted Dimensions must be supplied and populated in method call." 

62 ) 

63 tmpSerialized = dimensionRecords[dId] 

64 reconstructedDim = DimensionRecord.from_simple(tmpSerialized, universe=universe) 

65 definition = tmpSerialized.definition 

66 reconstitutedDimensions[dId] = (definition, reconstructedDim) 

67 else: 

68 definition, reconstructedDim = recId 

69 records[definition] = reconstructedDim 

70 # turn the serialized form into an object and attach the dimension records 

71 rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_) 

72 if records: 

73 object.__setattr__(rebuiltDatasetRef, "dataId", rebuiltDatasetRef.dataId.expanded(records)) 

74 return rebuiltDatasetRef 

75 

76 
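The helper above deserializes each dimension record at most once: records are keyed by an integer id, looked up first in the `reconstitutedDimensions` cache, and only rebuilt from the serialized mapping on a cache miss. Below is a minimal, self-contained sketch of that cache-or-deserialize pattern; it is illustrative only, with plain dicts standing in for `SerializedDimensionRecord`/`DimensionRecord` and a made-up name `resolve_records`.

# Illustrative sketch -- not part of daf_butler.
def resolve_records(ids, serialized, cache):
    """Return {definition: record}, deserializing each id at most once."""
    records = {}
    for rec_id in ids:
        # Reuse a previously reconstituted record when possible.
        if (hit := cache.get(rec_id)) is None:
            if serialized is None:
                raise ValueError(f"no serialized records supplied for id {rec_id}")
            payload = serialized[rec_id]
            # Stand-in for DimensionRecord.from_simple(payload, universe=universe).
            hit = cache.setdefault(rec_id, (payload["definition"], payload))
        definition, record = hit
        records[definition] = record
    return records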

77class SerializedQuantum(BaseModel): 

78 """Simplified model of a `Quantum` suitable for serialization.""" 

79 

80 taskName: str 

81 dataId: Optional[SerializedDataCoordinate] 

82 datasetTypeMapping: Mapping[str, SerializedDatasetType] 

83 initInputs: Mapping[str, Tuple[SerializedDatasetRef, List[int]]] 

84 inputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]] 

85 outputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]] 

86 dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]] = None 

87 

88 @classmethod 

89 def direct( 

90 cls, 

91 *, 

92 taskName: str, 

93 dataId: Optional[Dict], 

94 datasetTypeMapping: Mapping[str, Dict], 

95 initInputs: Mapping[str, Tuple[Dict, List[int]]], 

96 inputs: Mapping[str, List[Tuple[Dict, List[int]]]], 

97 outputs: Mapping[str, List[Tuple[Dict, List[int]]]], 

98 dimensionRecords: Optional[Dict[int, Dict]], 

99 ) -> SerializedQuantum: 

100 """Construct a `SerializedQuantum` directly without validators. 

101 

102 This differs from the pydantic "construct" method in that the arguments 

103 are explicitly what the model requires, and it will recurse through 

104 members, constructing them from their corresponding `direct` methods. 

105 

106 This method should only be called when the inputs are trusted. 

107 """ 

108 node = SerializedQuantum.__new__(cls) 

109 setter = object.__setattr__ 

110 setter(node, "taskName", taskName) 

111 setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId)) 

112 setter( 

113 node, 

114 "datasetTypeMapping", 

115 {k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()}, 

116 ) 

117 setter( 

118 node, 

119 "initInputs", 

120 {k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()}, 

121 ) 

122 setter( 

123 node, 

124 "inputs", 

125 {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()}, 

126 ) 

127 setter( 

128 node, 

129 "outputs", 

130 {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()}, 

131 ) 

132 setter( 

133 node, 

134 "dimensionRecords", 

135 dimensionRecords 

136 if dimensionRecords is None 

137 else {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()}, 

138 ) 

139 setter( 

140 node, 

141 "__fields_set__", 

142 { 

143 "taskName", 

144 "dataId", 

145 "datasetTypeMapping", 

146 "initInputs", 

147 "inputs", 

148 "outputs", 

149 "dimensionRecords", 

150 }, 

151 ) 

152 return node 

153 

154 
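A minimal sketch of calling `direct` with trusted, already-simplified input, assuming the module is importable under the path shown in the coverage header; the task name is a hypothetical placeholder and all nested payloads are left empty.

# Sketch only: build an empty SerializedQuantum from trusted input,
# bypassing pydantic validation.
from lsst.daf.butler.core.quantum import SerializedQuantum

node = SerializedQuantum.direct(
    taskName="lsst.example.ExampleTask",  # hypothetical fully-qualified name
    dataId=None,
    datasetTypeMapping={},
    initInputs={},
    inputs={},
    outputs={},
    dimensionRecords=None,
)
assert node.taskName == "lsst.example.ExampleTask"
assert node.dimensionRecords is None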

155class Quantum: 

156 """Class representing a discrete unit of work. 

157 

158 A Quantum may depend on one or more datasets and produce one or more 

159 datasets. 

160 

161 Most Quanta will be executions of a particular ``PipelineTask``’s 

162 ``runQuantum`` method, but they can also be used to represent discrete 

163 units of work performed manually by human operators or other software 

164 agents. 

165 

166 Parameters 

167 ---------- 

168 taskName : `str`, optional 

169 Fully-qualified name of the Task class that executed or will execute 

170 this Quantum. If not provided, ``taskClass`` must be. 

171 taskClass : `type`, optional 

172 The Task class that executed or will execute this Quantum. If not 

173 provided, ``taskName`` must be. Overrides ``taskName`` if both are 

174 provided. 

175 dataId : `DataId`, optional 

176 The dimension values that identify this `Quantum`. 

177 initInputs : collection of `DatasetRef`, optional 

178 Datasets that are needed to construct an instance of the Task. May 

179 be a flat iterable of `DatasetRef` instances or a mapping from 

180 `DatasetType` to `DatasetRef`. 

181 inputs : `~collections.abc.Mapping`, optional 

182 Inputs identified prior to execution, organized as a mapping from 

183 `DatasetType` to a list of `DatasetRef`. 

184 outputs : `~collections.abc.Mapping`, optional 

185 Outputs from executing this quantum of work, organized as a mapping 

186 from `DatasetType` to a list of `DatasetRef`. 

187 datastore_records : `DatastoreRecordData`, optional 

188 Datastore record data for input or initInput datasets that already 

189 exist. 

190 """ 

191 

192 __slots__ = ( 

193 "_taskName", 

194 "_taskClass", 

195 "_dataId", 

196 "_initInputs", 

197 "_inputs", 

198 "_outputs", 

199 "_hash", 

200 "_datastore_records", 

201 ) 

202 

203 def __init__( 

204 self, 

205 *, 

206 taskName: Optional[str] = None, 

207 taskClass: Optional[Type] = None, 

208 dataId: Optional[DataCoordinate] = None, 

209 initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]] = None, 

210 inputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None, 

211 outputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None, 

212 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

213 ): 

214 if taskClass is not None: 

215 taskName = f"{taskClass.__module__}.{taskClass.__name__}" 

216 self._taskName = taskName 

217 self._taskClass = taskClass 

218 self._dataId = dataId 

219 if initInputs is None: 

220 initInputs = {} 

221 elif not isinstance(initInputs, Mapping): 

222 initInputs = {ref.datasetType: ref for ref in initInputs} 

223 if inputs is None: 

224 inputs = {} 

225 if outputs is None: 

226 outputs = {} 

227 self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze() 

228 self._inputs = NamedKeyDict[DatasetType, List[DatasetRef]](inputs).freeze() 

229 self._outputs = NamedKeyDict[DatasetType, List[DatasetRef]](outputs).freeze() 

230 if datastore_records is None: 

231 datastore_records = {} 

232 self._datastore_records = datastore_records 

233 
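A construction sketch, assuming the class is importable as shown in the coverage header; only a (hypothetical) task name is given, so all dataset mappings fall back to empty, frozen `NamedKeyDict`s.

# Sketch only: the smallest useful Quantum.
from lsst.daf.butler.core.quantum import Quantum

q = Quantum(taskName="lsst.example.ExampleTask")  # hypothetical task name
assert q.taskName == "lsst.example.ExampleTask"
assert len(q.initInputs) == 0 and len(q.inputs) == 0 and len(q.outputs) == 0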

234 def to_simple(self, accumulator: Optional[DimensionRecordsAccumulator] = None) -> SerializedQuantum: 

235 """Convert this class to a simple python type. 

236 

237 This makes it suitable for serialization. 

238 

239 Parameters 

240 ---------- 

241 accumulator : `DimensionRecordsAccumulator`, optional 

242 This accumulator can be used to aggregate dimension records across

243 multiple Quanta. If this is None, the default, dimension records 

244 are serialized with this Quantum. If an accumulator is supplied it 

245 is assumed something else is responsible for serializing the 

246 records, and they will not be stored with the SerializedQuantum. 

247 

248 Returns 

249 ------- 

250 simple : `SerializedQuantum` 

251 This object converted to a serializable representation. 

252 """ 

253 typeMapping = {} 

254 initInputs = {} 

255 

256 if accumulator is None: 

257 accumulator = DimensionRecordsAccumulator() 

258 writeDimensionRecords = True 

259 else: 

260 writeDimensionRecords = False 

261 

262 # collect the init inputs for serialization, recording the types into 

263 # their own mapping, used throughout to minimize saving the same object 

264 # multiple times. The string name of the type is used to index the mappings.

265 for key, value in self._initInputs.items(): 

266 # add the type to the typeMapping 

267 typeMapping[key.name] = key.to_simple() 

268 # convert to a simple DatasetRef representation 

269 simple = value.to_simple() 

270 # extract the dimension records 

271 recIds = [] 

272 if simple.dataId is not None and simple.dataId.records is not None: 

273 # for each dimension record get an id by adding it to the

274 # record accumulator. 

275 for rec in value.dataId.records.values(): 

276 if rec is not None: 

277 recordId = accumulator.addRecord(rec) 

278 recIds.append(recordId) 

279 # Set properties to None to save space 

280 simple.dataId.records = None 

281 simple.datasetType = None 

282 initInputs[key.name] = (simple, recIds) 

283 

284 # container for all the SerializedDatasetRefs, keyed on the 

285 # DatasetType name. 

286 inputs = {} 

287 

288 # collect the inputs 

289 for key, values in self._inputs.items(): 

290 # collect type if it is not already in the mapping 

291 if key.name not in typeMapping: 

292 typeMapping[key.name] = key.to_simple() 

293 # for each input type there is a list of inputs; collect them

294 tmp = [] 

295 for e in values: 

296 simp = e.to_simple() 

297 # This container will hold ids (hashes) that point to all the

298 # dimension records within the SerializedDatasetRef dataId.

299 # These dimension records repeat in almost every DatasetRef,

300 # so it is hugely wasteful in terms of disk space and CPU time to

301 # store them over and over again.

302 recIds = [] 

303 if simp.dataId is not None and simp.dataId.records is not None: 

304 for rec in e.dataId.records.values(): 

305 # for each dimension record get an id by adding it to

306 # the record accumulator. 

307 if rec is not None: 

308 recordId = accumulator.addRecord(rec) 

309 recIds.append(recordId) 

310 # Set the records to None to avoid serializing them 

311 simp.dataId.records = None 

312 # Dataset type is the same as the key in _inputs, no need 

313 # to serialize it out multiple times, set it to None 

314 simp.datasetType = None 

315 # append a tuple of the simplified SerializedDatasetRef, along 

316 # with the list of all the keys for the dimension records 

317 # needed for reconstruction. 

318 tmp.append((simp, recIds)) 

319 inputs[key.name] = tmp 

320 

321 # container for all the SerializedDatasetRefs, keyed on the 

322 # DatasetType name. 

323 outputs = {} 

324 for key, values in self._outputs.items(): 

325 # collect type if it is not already in the mapping 

326 if key.name not in typeMapping: 

327 typeMapping[key.name] = key.to_simple() 

328 # for each output type there is a list of outputs; collect them

329 tmp = [] 

330 for e in values: 

331 simp = e.to_simple() 

332 # This container will hold ids (hashes) that point to all the

333 # dimension records within the SerializedDatasetRef dataId.

334 # These dimension records repeat in almost every DatasetRef,

335 # so it is hugely wasteful in terms of disk space and CPU time to

336 # store them over and over again.

337 recIds = [] 

338 if simp.dataId is not None and simp.dataId.records is not None: 

339 for rec in e.dataId.records.values(): 

340 # for each dimension record get an id by adding it to

341 # the record accumulator. 

342 if rec is not None: 

343 recordId = accumulator.addRecord(rec) 

344 recIds.append(recordId) 

345 # Set the records to None to avoid serializing them 

346 simp.dataId.records = None 

347 # Dataset type is the same as the key in _outputs, no need 

348 # to serialize it out multiple times, set it to None 

349 simp.datasetType = None 

350 # append a tuple of the simplified SerializedDatasetRef, along 

351 # with the list of all the keys for the dimension records 

352 # needed for reconstruction. 

353 tmp.append((simp, recIds)) 

354 outputs[key.name] = tmp 

355 

356 dimensionRecords: Optional[Mapping[int, SerializedDimensionRecord]] 

357 if writeDimensionRecords: 

358 dimensionRecords = accumulator.makeSerializedDimensionRecordMapping() 

359 else: 

360 dimensionRecords = None 

361 

362 return SerializedQuantum( 

363 taskName=self._taskName, 

364 dataId=self.dataId.to_simple() if self.dataId is not None else None, 

365 datasetTypeMapping=typeMapping, 

366 initInputs=initInputs, 

367 inputs=inputs, 

368 outputs=outputs, 

369 dimensionRecords=dimensionRecords, 

370 ) 

371 

372 @classmethod 

373 def from_simple( 

374 cls, 

375 simple: SerializedQuantum, 

376 universe: DimensionUniverse, 

377 reconstitutedDimensions: Optional[Dict[int, Tuple[str, DimensionRecord]]] = None, 

378 ) -> Quantum: 

379 """Construct a new object from a simplified form. 

380 

381 Generally this is data returned from the `to_simple` method. 

382 

383 Parameters 

384 ---------- 

385 simple : SerializedQuantum 

386 The value returned by a call to `to_simple`.

387 universe : `DimensionUniverse` 

388 The special graph of all known dimensions. 

389 reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None 

390 A mapping of ids to dimension records to be used when populating 

391 dimensions for this Quantum. If supplied it will be used in place 

392 of the dimension Records stored with the SerializedQuantum, if a 

393 required dimension has already been loaded. Otherwise the record 

394 will be unpersisted from the SerializedQuantum and added to the

395 reconstitutedDimensions dict (if not None). Defaults to None. 

396 """ 

397 loadedTypes: MutableMapping[str, DatasetType] = {} 

398 initInputs: MutableMapping[DatasetType, DatasetRef] = {} 

399 if reconstitutedDimensions is None: 

400 reconstitutedDimensions = {} 

401 

402 # Unpersist all the init inputs 

403 for key, (value, dimensionIds) in simple.initInputs.items(): 

404 # If a datasetType has already been created use that instead of 

405 # unpersisting. 

406 if (type_ := loadedTypes.get(key)) is None: 

407 type_ = loadedTypes.setdefault( 

408 key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe) 

409 ) 

410 # reconstruct the dimension records 

411 rebuiltDatasetRef = _reconstructDatasetRef( 

412 value, type_, dimensionIds, simple.dimensionRecords, reconstitutedDimensions, universe 

413 ) 

414 initInputs[type_] = rebuiltDatasetRef 

415 

416 # containers for the dataset refs 

417 inputs: MutableMapping[DatasetType, List[DatasetRef]] = {} 

418 outputs: MutableMapping[DatasetType, List[DatasetRef]] = {} 

419 

420 for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)): 

421 for key, values in simpleRefs.items(): 

422 # If a datasetType has already been created use that instead of 

423 # unpersisting. 

424 if (type_ := loadedTypes.get(key)) is None: 

425 type_ = loadedTypes.setdefault( 

426 key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe) 

427 ) 

428 # reconstruct the list of DatasetRefs for this DatasetType 

429 tmp: List[DatasetRef] = [] 

430 for v, recIds in values: 

431 rebuiltDatasetRef = _reconstructDatasetRef( 

432 v, type_, recIds, simple.dimensionRecords, reconstitutedDimensions, universe 

433 ) 

434 tmp.append(rebuiltDatasetRef) 

435 container[type_] = tmp 

436 

437 dataId = ( 

438 DataCoordinate.from_simple(simple.dataId, universe=universe) 

439 if simple.dataId is not None 

440 else None 

441 ) 

442 return Quantum( 

443 taskName=simple.taskName, dataId=dataId, initInputs=initInputs, inputs=inputs, outputs=outputs 

444 ) 

445 
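A round-trip sketch for `to_simple`/`from_simple`, assuming a populated `quantum` and a `universe` (`DimensionUniverse`) already exist in scope; `.json()` and `.parse_raw()` are the usual pydantic v1 model methods.

# Sketch only: serialize to JSON and rebuild. ``quantum`` and ``universe``
# are assumed to exist; this is not a complete program.
serialized = quantum.to_simple()          # dimension records embedded
payload = serialized.json()               # pydantic JSON export
rebuilt = Quantum.from_simple(SerializedQuantum.parse_raw(payload), universe=universe)
assert rebuilt.taskName == quantum.taskName
assert rebuilt.dataId == quantum.dataId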

446 @property 

447 def taskClass(self) -> Optional[Type]: 

448 """Task class associated with this `Quantum` (`type`).""" 

449 if self._taskClass is None: 

450 if self._taskName is None: 

451 raise ValueError("No task class defined and task name is None") 

452 task_class = doImportType(self._taskName) 

453 self._taskClass = task_class 

454 return self._taskClass 

455 

456 @property 

457 def taskName(self) -> Optional[str]: 

458 """Return Fully-qualified name of the task associated with `Quantum`. 

459 

460 (`str`). 

461 """ 

462 return self._taskName 

463 

464 @property 

465 def dataId(self) -> Optional[DataCoordinate]: 

466 """Return dimension values of the unit of processing (`DataId`).""" 

467 return self._dataId 

468 

469 @property 

470 def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]: 

471 """Return mapping of datasets used to construct the Task. 

472 

473 Has `DatasetType` instances as keys (names can also be used for 

474 lookups) and `DatasetRef` instances as values. 

475 """ 

476 return self._initInputs 

477 

478 @property 

479 def inputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]: 

480 """Return mapping of input datasets that were expected to be used. 

481 

482 Has `DatasetType` instances as keys (names can also be used for 

483 lookups) and a list of `DatasetRef` instances as values. 

484 

485 Notes 

486 ----- 

487 We cannot use `set` instead of `list` for the nested container because 

488 `DatasetRef` instances cannot be compared reliably when some have 

489 integer IDs and others do not.

490 """ 

491 return self._inputs 

492 

493 @property 

494 def outputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]: 

495 """Return mapping of output datasets (to be) generated by this quantum. 

496 

497 Has the same form as `inputs`.

498 

499 Notes 

500 ----- 

501 We cannot use `set` instead of `list` for the nested container because 

502 `DatasetRef` instances cannot be compared reliably when some have 

503 integer IDs and others do not.

504 """ 

505 return self._outputs 

506 

507 @property 

508 def datastore_records(self) -> Mapping[str, DatastoreRecordData]: 

509 """Tabular data stored with this quantum (`dict`). 

510 

511 This attribute may be modified in place, but not assigned to. 

512 """ 

513 return self._datastore_records 

514 

515 def __eq__(self, other: object) -> bool: 

516 if not isinstance(other, Quantum): 

517 return False 

518 for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"): 

519 if getattr(self, item) != getattr(other, item): 

520 return False 

521 return True 

522 

523 def __hash__(self) -> int: 

524 return hash((self.taskClass, self.dataId)) 

525 

526 def __reduce__(self) -> Union[str, Tuple[Any, ...]]: 

527 return ( 

528 self._reduceFactory, 

529 ( 

530 self.taskName, 

531 self.taskClass, 

532 self.dataId, 

533 dict(self.initInputs.items()), 

534 dict(self.inputs), 

535 dict(self.outputs), 

536 ), 

537 ) 

538 

539 def __str__(self) -> str: 

540 return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})" 

541 

542 @staticmethod 

543 def _reduceFactory( 

544 taskName: Optional[str], 

545 taskClass: Optional[Type], 

546 dataId: Optional[DataCoordinate], 

547 initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]], 

548 inputs: Optional[Mapping[DatasetType, List[DatasetRef]]], 

549 outputs: Optional[Mapping[DatasetType, List[DatasetRef]]], 

550 datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None, 

551 ) -> Quantum: 

552 return Quantum( 

553 taskName=taskName, 

554 taskClass=taskClass, 

555 dataId=dataId, 

556 initInputs=initInputs, 

557 inputs=inputs, 

558 outputs=outputs, 

559 datastore_records=datastore_records, 

560 ) 

561 

562 

563class DimensionRecordsAccumulator: 

564 """Class used to accumulate dimension records for serialization. 

565 

566 This class generates an auto-increment key for each unique dimension record

567 added to it. This allows serialization of dimension records to occur once

568 for each record while being referred to multiple times.

569 """ 

570 

571 def __init__(self) -> None: 

572 self._counter = 0 

573 self.mapping: MutableMapping[DimensionRecord, Tuple[int, SerializedDimensionRecord]] = {} 

574 

575 def addRecord(self, record: DimensionRecord) -> int: 

576 """Add a dimension record to the accumulator if it has not already been 

577 added. When a record is inserted for the first time it is assigned 

578 a unique integer key. 

579 

580 This function returns the key associated with the record (either the 

581 newly allocated key or the existing one).

582 

583 Parameters 

584 ---------- 

585 record : `DimensionRecord` 

586 The record to add to the accumulator.

587 

588 Returns 

589 ------- 

590 accumulatorKey : int 

591 The key that is associated with the supplied record.

592 """ 

593 if (mappingValue := self.mapping.get(record)) is None: 

594 simple = record.to_simple() 

595 mappingValue = (self._counter, simple) 

596 self._counter += 1 

597 self.mapping[record] = mappingValue 

598 return mappingValue[0] 

599 

600 def makeSerializedDimensionRecordMapping(self) -> Mapping[int, SerializedDimensionRecord]: 

601 return {id_: serializeRef for id_, serializeRef in self.mapping.values()}
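A sketch of sharing one accumulator across several quanta, assuming an iterable `quanta` of populated `Quantum` instances; with an external accumulator each `SerializedQuantum` carries `dimensionRecords=None`, so the caller must persist the shared mapping itself and supply the records again when reconstituting.

# Sketch only: deduplicate dimension records across many quanta.
accumulator = DimensionRecordsAccumulator()
serialized_quanta = [q.to_simple(accumulator=accumulator) for q in quanta]

# Each unique DimensionRecord received one integer key from addRecord();
# this mapping must be stored alongside the serialized quanta and used when
# rebuilding them with Quantum.from_simple.
shared_records = accumulator.makeSerializedDimensionRecordMapping()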