Coverage for python/lsst/daf/butler/core/quantum.py: 18%

197 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Type, Union

from lsst.utils import doImportType
from pydantic import BaseModel

from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)
from .named import NamedKeyDict, NamedKeyMapping


def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: Optional[DatasetType],
    ids: Iterable[int],
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]],
    reconstitutedDimensions: Dict[int, Tuple[str, DimensionRecord]],
    universe: DimensionUniverse,
) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a SerializedQuantum."""
    # Reconstruct the dimension records.
    records = {}
    for dId in ids:
        # If the dimension record has been loaded previously, use that;
        # otherwise load it from the dict of SerializedDimensionRecords.
        if (recId := reconstitutedDimensions.get(dId)) is None:
            if dimensionRecords is None:
                raise ValueError(
                    "Cannot construct from a SerializedQuantum with no dimension records. "
                    "Reconstituted Dimensions must be supplied and populated in method call."
                )
            tmpSerialized = dimensionRecords[dId]
            reconstructedDim = DimensionRecord.from_simple(tmpSerialized, universe=universe)
            definition = tmpSerialized.definition
            reconstitutedDimensions[dId] = (definition, reconstructedDim)
        else:
            definition, reconstructedDim = recId
        records[definition] = reconstructedDim
    # Turn the serialized form into an object and attach the dimension records.
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    if records:
        object.__setattr__(rebuiltDatasetRef, "dataId", rebuiltDatasetRef.dataId.expanded(records))
    return rebuiltDatasetRef


class SerializedQuantum(BaseModel):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str
    dataId: Optional[SerializedDataCoordinate]
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, Tuple[SerializedDatasetRef, List[int]]]
    inputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    outputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]] = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str,
        dataId: Optional[Dict],
        datasetTypeMapping: Mapping[str, Dict],
        initInputs: Mapping[str, Tuple[Dict, List[int]]],
        inputs: Mapping[str, List[Tuple[Dict, List[int]]]],
        outputs: Mapping[str, List[Tuple[Dict, List[int]]]],
        dimensionRecords: Optional[Dict[int, Dict]],
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the arguments
        are explicitly what the model requires, and it will recurse through
        members, constructing them from their corresponding `direct` methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedQuantum.__new__(cls)
        setter = object.__setattr__
        setter(node, "taskName", taskName)
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(
            node,
            "datasetTypeMapping",
            {k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()},
        )
        setter(
            node,
            "initInputs",
            {k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()},
        )
        setter(
            node,
            "inputs",
            {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()},
        )
        setter(
            node,
            "outputs",
            {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()},
        )
        setter(
            node,
            "dimensionRecords",
            dimensionRecords
            if dimensionRecords is None
            else {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()},
        )
        setter(
            node,
            "__fields_set__",
            {
                "taskName",
                "dataId",
                "datasetTypeMapping",
                "initInputs",
                "inputs",
                "outputs",
                "dimensionRecords",
            },
        )
        return node

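# A minimal round-trip sketch (illustrative only; it assumes an existing
# ``Quantum`` instance named ``quantum`` and a ``DimensionUniverse`` named
# ``universe``). ``SerializedQuantum`` is a pydantic model, so the standard
# pydantic v1 JSON helpers apply; ``direct`` is only a fast path for trusted,
# already-simplified data.
#
#     simple = quantum.to_simple()              # Quantum -> SerializedQuantum
#     payload = simple.json()                   # pydantic JSON serialization
#     restored = SerializedQuantum.parse_raw(payload)
#     quantum2 = Quantum.from_simple(restored, universe)
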

class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``’s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    """

    __slots__ = ("_taskName", "_taskClass", "_dataId", "_initInputs", "_inputs", "_outputs", "_hash")

    def __init__(
        self,
        *,
        taskName: Optional[str] = None,
        taskClass: Optional[Type] = None,
        dataId: Optional[DataCoordinate] = None,
        initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]] = None,
        inputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
        outputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, List[DatasetRef]](inputs).freeze()
        self._outputs = NamedKeyDict[DatasetType, List[DatasetRef]](outputs).freeze()
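    # Construction sketch for the parameters documented above (``MyTask``,
    # ``init_ref``, ``input_ref``, and ``data_id`` are hypothetical
    # placeholders, not names defined in this module):
    #
    #     q = Quantum(
    #         taskClass=MyTask,                 # or taskName="mypkg.mymod.MyTask"
    #         dataId=data_id,                   # a DataCoordinate
    #         initInputs=[init_ref],            # flat iterable; keyed by ref.datasetType
    #         inputs={input_ref.datasetType: [input_ref]},
    #     )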

    def to_simple(self, accumulator: Optional[DimensionRecordsAccumulator] = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is None, the default, dimension records
            are serialized with this Quantum. If an accumulator is supplied it
            is assumed something else is responsible for serializing the
            records, and they will not be stored with the SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same object
        # multiple times. The string name of the type is used to index mappings.
        for key, value in self._initInputs.items():
            # add the type to the typeMapping
            typeMapping[key.name] = key.to_simple()
            # convert to a simple DatasetRef representation
            simple = value.to_simple()
            # extract the dimension records
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # for each dimension record get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set properties to None to save space
                simple.dataId.records = None
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # collect the inputs
        for key, values in self._inputs.items():
            # collect type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each input type there is a list of inputs; collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # Dataset type is the same as the key in _inputs; no need
                # to serialize it out multiple times, so set it to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # collect type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each output type there is a list of outputs; collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # Dataset type is the same as the key in _outputs; no need
                # to serialize it out multiple times, so set it to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Optional[Mapping[int, SerializedDimensionRecord]]
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
        )
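    # Sketch of the two serialization modes described above (``quantum`` and
    # ``quanta`` are hypothetical objects assumed to exist in the caller's
    # context):
    #
    #     # Standalone: dimension records are embedded in the result.
    #     simple = quantum.to_simple()
    #     assert simple.dimensionRecords is not None
    #
    #     # Shared accumulator: the caller serializes the records itself.
    #     accumulator = DimensionRecordsAccumulator()
    #     simples = [q.to_simple(accumulator=accumulator) for q in quanta]
    #     shared_records = accumulator.makeSerializedDimensionRecordMapping()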

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: Optional[Dict[int, Tuple[str, DimensionRecord]]] = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.
        """
        loadedTypes: MutableMapping[str, DatasetType] = {}
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is None:
            reconstitutedDimensions = {}

        # Unpersist all the init inputs
        for key, (value, dimensionIds) in simple.initInputs.items():
            # If a datasetType has already been created use that instead of
            # unpersisting.
            if (type_ := loadedTypes.get(key)) is None:
                type_ = loadedTypes.setdefault(
                    key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                )
            # reconstruct the dimension records
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, reconstitutedDimensions, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # containers for the dataset refs
        inputs: MutableMapping[DatasetType, List[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, List[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                # If a datasetType has already been created use that instead of
                # unpersisting.
                if (type_ := loadedTypes.get(key)) is None:
                    type_ = loadedTypes.setdefault(
                        key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                    )
                # reconstruct the list of DatasetRefs for this DatasetType
                tmp: List[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, reconstitutedDimensions, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )
        return Quantum(
            taskName=simple.taskName, dataId=dataId, initInputs=initInputs, inputs=inputs, outputs=outputs
        )
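    # Reuse sketch for the ``reconstitutedDimensions`` cache described above
    # (``serialized_quanta`` and ``universe`` are assumed to exist in the
    # caller's context). The cache fills as records are unpersisted, so later
    # quanta reuse records already built for earlier ones:
    #
    #     cache: Dict[int, Tuple[str, DimensionRecord]] = {}
    #     quanta = [
    #         Quantum.from_simple(sq, universe, reconstitutedDimensions=cache)
    #         for sq in serialized_quanta
    #     ]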

    @property
    def taskClass(self) -> Optional[Type]:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> Optional[str]:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> Optional[DataCoordinate]:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this quantum.

        Has the same form as `inputs`.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> Union[str, Tuple[Any, ...]]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
            ),
        )

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: Optional[str],
        taskClass: Optional[Type],
        dataId: Optional[DataCoordinate],
        initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]],
        inputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
        outputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
        )


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record while being referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, Tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added. When a record is inserted for the first time it is
        assigned a unique integer key.

        This function returns the key associated with the record (either the
        newly allocated key or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> Mapping[int, SerializedDimensionRecord]:
        """Return a mapping from accumulator key to serialized record."""
        return {id_: serializeRef for id_, serializeRef in self.mapping.values()}
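
# Usage sketch for the accumulator (``record_a`` and ``record_b`` are
# hypothetical DimensionRecord instances): a repeated record keeps the key it
# was assigned on first insertion, so each unique record is serialized once.
#
#     accumulator = DimensionRecordsAccumulator()
#     key_a = accumulator.addRecord(record_a)        # 0 on first insertion
#     key_b = accumulator.addRecord(record_b)        # 1
#     assert accumulator.addRecord(record_a) == key_a
#     mapping = accumulator.makeSerializedDimensionRecordMapping()
#     # {0: <SerializedDimensionRecord>, 1: <SerializedDimensionRecord>}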