Coverage for python/lsst/daf/butler/core/quantum.py: 18%

199 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Tuple,
    Type,
    Union,
)

from pydantic import BaseModel

from lsst.utils import doImportType

from .datasets import DatasetRef, DatasetType
from .dimensions import DataCoordinate
from .named import NamedKeyDict, NamedKeyMapping
from .dimensions import (SerializedDataCoordinate, DimensionUniverse, SerializedDimensionRecord,
                         DimensionRecord)
from .datasets import SerializedDatasetRef, SerializedDatasetType


def _reconstructDatasetRef(simple: SerializedDatasetRef, type_: Optional[DatasetType],
                           ids: Iterable[int],
                           dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]],
                           reconstitutedDimensions: Dict[int, Tuple[str, DimensionRecord]],
                           universe: DimensionUniverse) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a SerializedQuantum."""
    # Reconstruct the dimension records.
    records = {}
    for dId in ids:
        # If the dimension record has been loaded previously, use that;
        # otherwise load it from the dict of serialized DimensionRecords.
        if (recId := reconstitutedDimensions.get(dId)) is None:
            if dimensionRecords is None:
                raise ValueError("Cannot construct from a SerializedQuantum with no dimension records. "
                                 "Reconstituted Dimensions must be supplied and populated in method call.")
            tmpSerialized = dimensionRecords[dId]
            reconstructedDim = DimensionRecord.from_simple(tmpSerialized, universe=universe)
            definition = tmpSerialized.definition
            reconstitutedDimensions[dId] = (definition, reconstructedDim)
        else:
            definition, reconstructedDim = recId
        records[definition] = reconstructedDim
    # Turn the serialized form into an object and attach the dimension records.
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    if records:
        object.__setattr__(rebuiltDatasetRef, 'dataId',
                           rebuiltDatasetRef.dataId.expanded(records))
    return rebuiltDatasetRef


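# Illustrative sketch (not part of the original module): the
# ``reconstitutedDimensions`` argument above acts as a cache shared across
# calls, so a dimension record referenced by many serialized refs is
# deserialized only once.  The ``refs``, ``records``, and ``universe``
# arguments here are hypothetical inputs a caller such as
# ``Quantum.from_simple`` would already have in hand.
def _example_shared_record_cache(refs: List[Tuple[SerializedDatasetRef, List[int]]],
                                 datasetType: DatasetType,
                                 records: Dict[int, SerializedDimensionRecord],
                                 universe: DimensionUniverse) -> List[DatasetRef]:
    cache: Dict[int, Tuple[str, DimensionRecord]] = {}
    rebuilt = []
    for simple, recordIds in refs:
        # Every call shares ``cache``, so only record ids not seen before are
        # reconstructed from ``records``.
        rebuilt.append(_reconstructDatasetRef(simple, datasetType, recordIds,
                                              records, cache, universe))
    return rebuilt
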

class SerializedQuantum(BaseModel):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str
    dataId: Optional[SerializedDataCoordinate]
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, Tuple[SerializedDatasetRef, List[int]]]
    inputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    outputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]] = None

    @classmethod
    def direct(cls, *,
               taskName: str,
               dataId: Optional[Dict],
               datasetTypeMapping: Mapping[str, Dict],
               initInputs: Mapping[str, Tuple[Dict, List[int]]],
               inputs: Mapping[str, List[Tuple[Dict, List[int]]]],
               outputs: Mapping[str, List[Tuple[Dict, List[int]]]],
               dimensionRecords: Optional[Dict[int, Dict]]
               ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic ``construct`` method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedQuantum.__new__(cls)
        setter = object.__setattr__
        setter(node, 'taskName', taskName)
        setter(node, 'dataId',
               dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, "datasetTypeMapping",
               {k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()})
        setter(node, "initInputs",
               {k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()})
        setter(node, "inputs",
               {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()})
        setter(node, "outputs",
               {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()})
        setter(node, "dimensionRecords", dimensionRecords if dimensionRecords is None else
               {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()})
        setter(node, '__fields_set__', {'taskName', 'dataId', 'datasetTypeMapping', 'initInputs',
                                        'inputs', 'outputs', 'dimensionRecords'})
        return node


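# A minimal sketch (not part of the original API) of how ``direct`` can be
# used to rebuild a model from previously validated JSON without re-running
# pydantic validation.  The ``json_blob`` argument is hypothetical and assumed
# to have been produced by ``SerializedQuantum.json()``.
def _example_rebuild_from_trusted_json(json_blob: str) -> SerializedQuantum:
    import json

    data = json.loads(json_blob)
    return SerializedQuantum.direct(
        taskName=data["taskName"],
        dataId=data["dataId"],
        datasetTypeMapping=data["datasetTypeMapping"],
        initInputs=data["initInputs"],
        inputs=data["inputs"],
        outputs=data["outputs"],
        dimensionRecords=data["dimensionRecords"],
    )
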

class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``'s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    """

    __slots__ = ("_taskName", "_taskClass", "_dataId", "_initInputs", "_inputs", "_outputs", "_hash")

    def __init__(self, *, taskName: Optional[str] = None,
                 taskClass: Optional[Type] = None,
                 dataId: Optional[DataCoordinate] = None,
                 initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]] = None,
                 inputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
                 outputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
                 ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, List[DatasetRef]](inputs).freeze()
        self._outputs = NamedKeyDict[DatasetType, List[DatasetRef]](outputs).freeze()

    def to_simple(self, accumulator: Optional[DimensionRecordsAccumulator] = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is `None` (the default), dimension
            records are serialized with this Quantum. If an accumulator is
            supplied it is assumed something else is responsible for
            serializing the records, and they will not be stored with the
            SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same
        # object multiple times. The string name of the type is used to index
        # the mappings.
        for key, value in self._initInputs.items():
            # add the type to the typeMapping
            typeMapping[key.name] = key.to_simple()
            # convert to a simple DatasetRef representation
            simple = value.to_simple()
            # extract the dimension records
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # for each dimension record get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set properties to None to save space
                simple.dataId.records = None
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # collect the inputs
        for key, values in self._inputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each input type there is a list of inputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # Dataset type is the same as the key in _inputs, no need
                # to serialize it out multiple times, set it to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each output type there is a list of outputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # Dataset type is the same as the key in _outputs, no need
                # to serialize it out multiple times, set it to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Optional[Mapping[int, SerializedDimensionRecord]]
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        return SerializedQuantum(taskName=self._taskName,
                                 dataId=self.dataId.to_simple() if self.dataId is not None else None,
                                 datasetTypeMapping=typeMapping,
                                 initInputs=initInputs,
                                 inputs=inputs,
                                 outputs=outputs,
                                 dimensionRecords=dimensionRecords)

    @classmethod
    def from_simple(cls, simple: SerializedQuantum, universe: DimensionUniverse,
                    reconstitutedDimensions: Optional[Dict[int, Tuple[str, DimensionRecord]]] = None
                    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.
        """
        loadedTypes: MutableMapping[str, DatasetType] = {}
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is None:
            reconstitutedDimensions = {}

        # Unpersist all the init inputs
        for key, (value, dimensionIds) in simple.initInputs.items():
            # If a datasetType has already been created use that instead of
            # unpersisting.
            if (type_ := loadedTypes.get(key)) is None:
                type_ = loadedTypes.setdefault(key,
                                               DatasetType.from_simple(simple.datasetTypeMapping[key],
                                                                       universe=universe))
            # reconstruct the dimension records
            rebuiltDatasetRef = _reconstructDatasetRef(value, type_, dimensionIds, simple.dimensionRecords,
                                                       reconstitutedDimensions, universe)
            initInputs[type_] = rebuiltDatasetRef

        # containers for the dataset refs
        inputs: MutableMapping[DatasetType, List[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, List[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                # If a datasetType has already been created use that instead
                # of unpersisting.
                if (type_ := loadedTypes.get(key)) is None:
                    type_ = loadedTypes.setdefault(key,
                                                   DatasetType.from_simple(simple.datasetTypeMapping[key],
                                                                           universe=universe))
                # reconstruct the list of DatasetRefs for this DatasetType
                tmp: List[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(v, type_, recIds, simple.dimensionRecords,
                                                               reconstitutedDimensions, universe)
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = DataCoordinate.from_simple(simple.dataId,
                                            universe=universe) if simple.dataId is not None else None
        return Quantum(taskName=simple.taskName, dataId=dataId, initInputs=initInputs, inputs=inputs,
                       outputs=outputs)

    @property
    def taskClass(self) -> Optional[Type]:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> Optional[str]:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> Optional[DataCoordinate]:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this quantum.

        Has the same form as `inputs`.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> Union[str, Tuple[Any, ...]]:
        return (self._reduceFactory,
                (self.taskName, self.taskClass, self.dataId, dict(self.initInputs.items()),
                 dict(self.inputs), dict(self.outputs)))

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(taskName: Optional[str],
                       taskClass: Optional[Type],
                       dataId: Optional[DataCoordinate],
                       initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]],
                       inputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
                       outputs: Optional[Mapping[DatasetType, List[DatasetRef]]]
                       ) -> Quantum:
        return Quantum(taskName=taskName, taskClass=taskClass, dataId=dataId, initInputs=initInputs,
                       inputs=inputs, outputs=outputs)


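# Illustrative sketch only: building a ``Quantum`` from already-resolved refs
# and showing that ``__reduce__`` above lets it round-trip through pickle.
# The ``dataId``, ``inputRefs``, and ``outputRefs`` arguments are hypothetical
# objects a caller would obtain from a registry; the task name is an arbitrary
# placeholder, not a real task.
def _example_pickle_round_trip(dataId: DataCoordinate,
                               inputRefs: Mapping[DatasetType, List[DatasetRef]],
                               outputRefs: Mapping[DatasetType, List[DatasetRef]]) -> Quantum:
    import pickle

    quantum = Quantum(taskName="lsst.example.ExampleTask",  # placeholder name
                      dataId=dataId,
                      inputs=inputRefs,
                      outputs=outputRefs)
    # ``__reduce__`` defers to ``_reduceFactory``, so pickling rebuilds an
    # equivalent Quantum.
    restored = pickle.loads(pickle.dumps(quantum))
    # Full ``==`` comparison imports the task class from ``taskName``; with a
    # real, importable task name ``restored == quantum`` would hold.  Here we
    # only check the cheap attributes.
    assert restored.taskName == quantum.taskName
    assert restored.dataId == quantum.dataId
    return restored
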

class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record but be referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, Tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added. When a record is inserted for the first time it is
        assigned a unique integer key.

        This function returns the key associated with the record (either the
        newly allocated key, or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> Mapping[int, SerializedDimensionRecord]:
        """Return a mapping of assigned integer key to serialized record."""
        return {id_: serializeRef for id_, serializeRef in self.mapping.values()}
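

# Illustrative sketch only (not part of the original module): serializing a
# batch of quanta with a single shared ``DimensionRecordsAccumulator`` so each
# dimension record is written once, then reconstituting them with a shared
# record cache.  The ``quanta`` and ``universe`` arguments are hypothetical
# inputs a caller (for example a quantum-graph writer) would already have.
def _example_batch_round_trip(quanta: List[Quantum],
                              universe: DimensionUniverse) -> List[Quantum]:
    accumulator = DimensionRecordsAccumulator()
    # With an accumulator supplied, ``to_simple`` leaves ``dimensionRecords``
    # unset; the shared records are serialized once via the accumulator.
    serialized = [q.to_simple(accumulator=accumulator) for q in quanta]
    sharedRecords = accumulator.makeSerializedDimensionRecordMapping()

    # Reattach the shared record mapping before deserializing.  A shared
    # ``reconstitutedDimensions`` dict means each record is rebuilt only once
    # across the whole batch.
    cache: Dict[int, Tuple[str, DimensionRecord]] = {}
    rebuilt = []
    for sq in serialized:
        sq.dimensionRecords = dict(sharedRecords)
        rebuilt.append(Quantum.from_simple(sq, universe, reconstitutedDimensions=cache))
    return rebuilt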