Coverage for python/lsst/daf/butler/core/quantum.py: 23% (207 statements)


1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator") 

25 

26import sys 

27import warnings 

28from collections.abc import Iterable, Mapping, MutableMapping, Sequence 

29from typing import Any 

30 

31from lsst.daf.butler._compat import _BaseModelCompat 

32from lsst.utils import doImportType 

33from lsst.utils.introspection import find_outside_stacklevel 

34 

35from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType 

36from .datastoreRecordData import DatastoreRecordData, SerializedDatastoreRecordData 

37from .dimensions import ( 

38 DataCoordinate, 

39 DimensionRecord, 

40 DimensionUniverse, 

41 SerializedDataCoordinate, 

42 SerializedDimensionRecord, 

43) 

44from .named import NamedKeyDict, NamedKeyMapping 

45 

46 

47def _reconstructDatasetRef( 

48 simple: SerializedDatasetRef, 

49 type_: DatasetType | None, 

50 ids: Iterable[int], 

51 dimensionRecords: dict[int, SerializedDimensionRecord] | None, 

52 universe: DimensionUniverse, 

53) -> DatasetRef: 

54    """Reconstruct a DatasetRef stored in a SerializedQuantum."""

55 # Reconstruct the dimension records 

56 records = {} 

57 for dId in ids: 

58        # load the dimension record from the mapping of serialized

59        # dimension records supplied with the SerializedQuantum

60 if dimensionRecords is None: 

61            raise ValueError("Cannot construct from a SerializedQuantum with no dimension records.")

62 tmpSerialized = dimensionRecords[dId] 

63 reconstructedDim = DimensionRecord.from_simple(tmpSerialized, universe=universe) 

64 records[sys.intern(reconstructedDim.definition.name)] = reconstructedDim 

65 # turn the serialized form into an object and attach the dimension records 

66 rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_) 

67 if records: 

68 object.__setattr__(rebuiltDatasetRef, "dataId", rebuiltDatasetRef.dataId.expanded(records)) 

69 return rebuiltDatasetRef 

70 

71 

72class SerializedQuantum(_BaseModelCompat): 

73 """Simplified model of a `Quantum` suitable for serialization.""" 

74 

75 taskName: str | None = None 

76 dataId: SerializedDataCoordinate | None = None 

77 datasetTypeMapping: Mapping[str, SerializedDatasetType] 

78 initInputs: Mapping[str, tuple[SerializedDatasetRef, list[int]]] 

79 inputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]] 

80 outputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]] 

81 dimensionRecords: dict[int, SerializedDimensionRecord] | None = None 

82 datastoreRecords: dict[str, SerializedDatastoreRecordData] | None = None 

83 

84 @classmethod 

85 def direct( 

86 cls, 

87 *, 

88 taskName: str | None, 

89 dataId: dict | None, 

90 datasetTypeMapping: Mapping[str, dict], 

91 initInputs: Mapping[str, tuple[dict, list[int]]], 

92 inputs: Mapping[str, list[tuple[dict, list[int]]]], 

93 outputs: Mapping[str, list[tuple[dict, list[int]]]], 

94 dimensionRecords: dict[int, dict] | None, 

95 datastoreRecords: dict[str, dict] | None, 

96 ) -> SerializedQuantum: 

97 """Construct a `SerializedQuantum` directly without validators. 

98 

99 This differs from the pydantic "construct" method in that the arguments 

100 are explicitly what the model requires, and it will recurse through 

101 members, constructing them from their corresponding `direct` methods. 

102 

103 This method should only be called when the inputs are trusted. 

104 """ 

105 serialized_dataId = SerializedDataCoordinate.direct(**dataId) if dataId is not None else None 

106 serialized_datasetTypeMapping = { 

107 k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items() 

108 } 

109 serialized_initInputs = { 

110 k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items() 

111 } 

112 serialized_inputs = { 

113 k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items() 

114 } 

115 serialized_outputs = { 

116 k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items() 

117 } 

118 serialized_records = ( 

119 {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()} 

120 if dimensionRecords is not None 

121 else None 

122 ) 

123 serialized_datastore_records = ( 

124 {k: SerializedDatastoreRecordData.direct(**v) for k, v in datastoreRecords.items()} 

125 if datastoreRecords is not None 

126 else None 

127 ) 

128 

129 node = cls.model_construct( 

130 taskName=sys.intern(taskName or ""), 

131 dataId=serialized_dataId, 

132 datasetTypeMapping=serialized_datasetTypeMapping, 

133 initInputs=serialized_initInputs, 

134 inputs=serialized_inputs, 

135 outputs=serialized_outputs, 

136 dimensionRecords=serialized_records, 

137 datastoreRecords=serialized_datastore_records, 

138 ) 

139 

140 return node 

141 

142 

143class Quantum: 

144 """Class representing a discrete unit of work. 

145 

146 A Quantum may depend on one or more datasets and produce one or more 

147 datasets. 

148 

149 Most Quanta will be executions of a particular ``PipelineTask``’s 

150 ``runQuantum`` method, but they can also be used to represent discrete 

151 units of work performed manually by human operators or other software 

152 agents. 

153 

154 Parameters 

155 ---------- 

156 taskName : `str`, optional 

157 Fully-qualified name of the Task class that executed or will execute 

158 this Quantum. If not provided, ``taskClass`` must be. 

159 taskClass : `type`, optional 

160 The Task class that executed or will execute this Quantum. If not 

161 provided, ``taskName`` must be. Overrides ``taskName`` if both are 

162 provided. 

163 dataId : `DataId`, optional 

164 The dimension values that identify this `Quantum`. 

165 initInputs : collection of `DatasetRef`, optional 

166 Datasets that are needed to construct an instance of the Task. May 

167 be a flat iterable of `DatasetRef` instances or a mapping from 

168 `DatasetType` to `DatasetRef`. 

169 inputs : `~collections.abc.Mapping`, optional 

170 Inputs identified prior to execution, organized as a mapping from 

171 `DatasetType` to a list of `DatasetRef`. 

172 outputs : `~collections.abc.Mapping`, optional 

173 Outputs from executing this quantum of work, organized as a mapping 

174 from `DatasetType` to a list of `DatasetRef`. 

175 datastore_records : `DatastoreRecordData`, optional 

176 Datastore record data for input or initInput datasets that already 

177 exist. 
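
    Examples
    --------
    A minimal, illustrative construction; every argument is optional and
    the task name used here is only an example::

        quantum = Quantum(taskName="lsst.pipe.tasks.calibrate.CalibrateTask")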

178 """ 

179 

180 __slots__ = ( 

181 "_taskName", 

182 "_taskClass", 

183 "_dataId", 

184 "_initInputs", 

185 "_inputs", 

186 "_outputs", 

187 "_datastore_records", 

188 ) 

189 

190 def __init__( 

191 self, 

192 *, 

193 taskName: str | None = None, 

194 taskClass: type | None = None, 

195 dataId: DataCoordinate | None = None, 

196 initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None = None, 

197 inputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None, 

198 outputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None, 

199 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

200 ): 

201 if taskClass is not None: 

202 taskName = f"{taskClass.__module__}.{taskClass.__name__}" 

203 self._taskName = taskName 

204 self._taskClass = taskClass 

205 self._dataId = dataId 

206 if initInputs is None: 

207 initInputs = {} 

208 elif not isinstance(initInputs, Mapping): 

209 initInputs = {ref.datasetType: ref for ref in initInputs} 

210 if inputs is None: 

211 inputs = {} 

212 if outputs is None: 

213 outputs = {} 

214 self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze() 

215 self._inputs = NamedKeyDict[DatasetType, tuple[DatasetRef]]( 

216 (k, tuple(v)) for k, v in inputs.items() 

217 ).freeze() 

218 self._outputs = NamedKeyDict[DatasetType, tuple[DatasetRef]]( 

219 (k, tuple(v)) for k, v in outputs.items() 

220 ).freeze() 

221 if datastore_records is None: 

222 datastore_records = {} 

223 self._datastore_records = datastore_records 

224 

225 def to_simple(self, accumulator: DimensionRecordsAccumulator | None = None) -> SerializedQuantum: 

226 """Convert this class to a simple python type. 

227 

228 This makes it suitable for serialization. 

229 

230 Parameters 

231 ---------- 

232 accumulator : `DimensionRecordsAccumulator`, optional 

233            This accumulator can be used to aggregate dimension records across

234 multiple Quanta. If this is None, the default, dimension records 

235 are serialized with this Quantum. If an accumulator is supplied it 

236 is assumed something else is responsible for serializing the 

237 records, and they will not be stored with the SerializedQuantum. 

238 

239 Returns 

240 ------- 

241 simple : `SerializedQuantum` 

242 This object converted to a serializable representation. 
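
        Examples
        --------
        A minimal sketch of serializing several quanta with a shared
        accumulator (``quanta`` is assumed to be an iterable of `Quantum`;
        the caller is then responsible for serializing the accumulated
        records itself)::

            accumulator = DimensionRecordsAccumulator()
            serialized = [q.to_simple(accumulator=accumulator) for q in quanta]
            records = accumulator.makeSerializedDimensionRecordMapping()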

243 """ 

244 typeMapping = {} 

245 initInputs = {} 

246 

247 if accumulator is None: 

248 accumulator = DimensionRecordsAccumulator() 

249 writeDimensionRecords = True 

250 else: 

251 writeDimensionRecords = False 

252 

253 # collect the init inputs for serialization, recording the types into 

254 # their own mapping, used throughout to minimize saving the same object 

255        # multiple times. The string name of the type is used to index the mappings.

256 for key, value in self._initInputs.items(): 

257 # add the type to the typeMapping 

258 typeMapping[key.name] = key.to_simple() 

259 # convert to a simple DatasetRef representation 

260 simple = value.to_simple() 

261 # extract the dimension records 

262 recIds = [] 

263 if simple.dataId is not None and simple.dataId.records is not None: 

264                # for each dimension record get an id by adding it to the

265 # record accumulator. 

266 for rec in value.dataId.records.values(): 

267 if rec is not None: 

268 recordId = accumulator.addRecord(rec) 

269 recIds.append(recordId) 

270 # Set properties to None to save space 

271 simple.dataId.records = None 

272 simple.datasetType = None 

273 initInputs[key.name] = (simple, recIds) 

274 

275 # container for all the SerializedDatasetRefs, keyed on the 

276 # DatasetType name. 

277 inputs = {} 

278 

279 # collect the inputs 

280 for key, values in self._inputs.items(): 

281 # collect type if it is not already in the mapping 

282 if key.name not in typeMapping: 

283 typeMapping[key.name] = key.to_simple() 

284            # for each input type there is a list of inputs; collect them

285 tmp = [] 

286 for e in values: 

287 simp = e.to_simple() 

288 # This container will hold ids (hashes) that point to all the 

289                # dimension records within the SerializedDatasetRef dataId.

290                # These dimension records repeat in almost every DatasetRef,

291                # so it is hugely wasteful in terms of disk and CPU time to

292 # store them over and over again. 

293 recIds = [] 

294 if simp.dataId is not None and simp.dataId.records is not None: 

295 for rec in e.dataId.records.values(): 

296                        # for each dimension record get an id by adding it to

297 # the record accumulator. 

298 if rec is not None: 

299 recordId = accumulator.addRecord(rec) 

300 recIds.append(recordId) 

301 # Set the records to None to avoid serializing them 

302 simp.dataId.records = None 

303 # Dataset type is the same as the key in _inputs, no need 

304 # to serialize it out multiple times, set it to None 

305 simp.datasetType = None 

306 # append a tuple of the simplified SerializedDatasetRef, along 

307 # with the list of all the keys for the dimension records 

308 # needed for reconstruction. 

309 tmp.append((simp, recIds)) 

310 inputs[key.name] = tmp 

311 

312 # container for all the SerializedDatasetRefs, keyed on the 

313 # DatasetType name. 

314 outputs = {} 

315 for key, values in self._outputs.items(): 

316 # collect type if it is not already in the mapping 

317 if key.name not in typeMapping: 

318 typeMapping[key.name] = key.to_simple() 

319            # for each output type there is a list of outputs; collect them

320 tmp = [] 

321 for e in values: 

322 simp = e.to_simple() 

323 # This container will hold ids (hashes) that point to all the 

324                # dimension records within the SerializedDatasetRef dataId.

325                # These dimension records repeat in almost every DatasetRef,

326                # so it is hugely wasteful in terms of disk and CPU time to

327 # store them over and over again. 

328 recIds = [] 

329 if simp.dataId is not None and simp.dataId.records is not None: 

330 for rec in e.dataId.records.values(): 

331 # for each dimension record get a id by adding it to 

332 # the record accumulator. 

333 if rec is not None: 

334 recordId = accumulator.addRecord(rec) 

335 recIds.append(recordId) 

336 # Set the records to None to avoid serializing them 

337 simp.dataId.records = None 

338 # Dataset type is the same as the key in _outputs, no need 

339 # to serialize it out multiple times, set it to None 

340 simp.datasetType = None 

341 # append a tuple of the simplified SerializedDatasetRef, along 

342 # with the list of all the keys for the dimension records 

343 # needed for reconstruction. 

344 tmp.append((simp, recIds)) 

345 outputs[key.name] = tmp 

346 

347 dimensionRecords: Mapping[int, SerializedDimensionRecord] | None 

348 if writeDimensionRecords: 

349 dimensionRecords = accumulator.makeSerializedDimensionRecordMapping() 

350 else: 

351 dimensionRecords = None 

352 

353 datastore_records: dict[str, SerializedDatastoreRecordData] | None = None 

354 if self.datastore_records is not None: 

355 datastore_records = { 

356 datastore_name: record_data.to_simple() 

357 for datastore_name, record_data in self.datastore_records.items() 

358 } 

359 

360 return SerializedQuantum( 

361 taskName=self._taskName, 

362 dataId=self.dataId.to_simple() if self.dataId is not None else None, 

363 datasetTypeMapping=typeMapping, 

364 initInputs=initInputs, 

365 inputs=inputs, 

366 outputs=outputs, 

367 dimensionRecords=dimensionRecords, 

368 datastoreRecords=datastore_records, 

369 ) 

370 

371 @classmethod 

372 def from_simple( 

373 cls, 

374 simple: SerializedQuantum, 

375 universe: DimensionUniverse, 

376 reconstitutedDimensions: dict[int, tuple[str, DimensionRecord]] | None = None, 

377 ) -> Quantum: 

378 """Construct a new object from a simplified form. 

379 

380 Generally this is data returned from the `to_simple` method. 

381 

382 Parameters 

383 ---------- 

384        simple : `SerializedQuantum`

385            The value returned by a call to `to_simple`.

386 universe : `DimensionUniverse` 

387 The special graph of all known dimensions. 

388 reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None 

389 A mapping of ids to dimension records to be used when populating 

390 dimensions for this Quantum. If supplied it will be used in place 

391 of the dimension Records stored with the SerializedQuantum, if a 

392 required dimension has already been loaded. Otherwise the record 

393            will be unpersisted from the SerializedQuantum and added to the

394 reconstitutedDimensions dict (if not None). Defaults to None. 

395 Deprecated, any argument will be ignored. 
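
        Examples
        --------
        Illustrative round trip, assuming ``quantum`` is an existing
        `Quantum` and ``universe`` is the repository's `DimensionUniverse`::

            simple = quantum.to_simple()
            restored = Quantum.from_simple(simple, universe)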

396 """ 

397 initInputs: MutableMapping[DatasetType, DatasetRef] = {} 

398 if reconstitutedDimensions is not None: 

399 warnings.warn( 

400                "The reconstitutedDimensions argument is now ignored and may be removed after v27",

401 category=FutureWarning, 

402 stacklevel=find_outside_stacklevel("lsst.daf.butler"), 

403 ) 

404 

405 # Unpersist all the init inputs 

406 for key, (value, dimensionIds) in simple.initInputs.items(): 

407 type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe) 

408 # reconstruct the dimension records 

409 rebuiltDatasetRef = _reconstructDatasetRef( 

410 value, type_, dimensionIds, simple.dimensionRecords, universe 

411 ) 

412 initInputs[type_] = rebuiltDatasetRef 

413 

414 # containers for the dataset refs 

415 inputs: MutableMapping[DatasetType, list[DatasetRef]] = {} 

416 outputs: MutableMapping[DatasetType, list[DatasetRef]] = {} 

417 

418 for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)): 

419 for key, values in simpleRefs.items(): 

420 type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe) 

421 # reconstruct the list of DatasetRefs for this DatasetType 

422 tmp: list[DatasetRef] = [] 

423 for v, recIds in values: 

424 rebuiltDatasetRef = _reconstructDatasetRef( 

425 v, type_, recIds, simple.dimensionRecords, universe 

426 ) 

427 tmp.append(rebuiltDatasetRef) 

428 container[type_] = tmp 

429 

430 dataId = ( 

431 DataCoordinate.from_simple(simple.dataId, universe=universe) 

432 if simple.dataId is not None 

433 else None 

434 ) 

435 

436 datastore_records: dict[str, DatastoreRecordData] | None = None 

437 if simple.datastoreRecords is not None: 

438 datastore_records = { 

439 datastore_name: DatastoreRecordData.from_simple(record_data) 

440 for datastore_name, record_data in simple.datastoreRecords.items() 

441 } 

442 

443 quant = Quantum( 

444 taskName=simple.taskName, 

445 dataId=dataId, 

446 initInputs=initInputs, 

447 inputs=inputs, 

448 outputs=outputs, 

449 datastore_records=datastore_records, 

450 ) 

451 return quant 

452 

453 @property 

454 def taskClass(self) -> type | None: 

455 """Task class associated with this `Quantum` (`type`).""" 

456 if self._taskClass is None: 

457 if self._taskName is None: 

458 raise ValueError("No task class defined and task name is None") 

459 task_class = doImportType(self._taskName) 

460 self._taskClass = task_class 

461 return self._taskClass 

462 

463 @property 

464 def taskName(self) -> str | None: 

465        """Return fully-qualified name of the task associated with `Quantum`.

466 

467 (`str`). 

468 """ 

469 return self._taskName 

470 

471 @property 

472 def dataId(self) -> DataCoordinate | None: 

473 """Return dimension values of the unit of processing (`DataId`).""" 

474 return self._dataId 

475 

476 @property 

477 def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]: 

478 """Return mapping of datasets used to construct the Task. 

479 

480 Has `DatasetType` instances as keys (names can also be used for 

481 lookups) and `DatasetRef` instances as values. 

482 """ 

483 return self._initInputs 

484 

485 @property 

486 def inputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]: 

487 """Return mapping of input datasets that were expected to be used. 

488 

489 Has `DatasetType` instances as keys (names can also be used for 

490 lookups) and a list of `DatasetRef` instances as values. 

491 

492 Notes 

493 ----- 

494 We cannot use `set` instead of `list` for the nested container because 

495 `DatasetRef` instances cannot be compared reliably when some have 

496        integer IDs and others do not.

497 """ 

498 return self._inputs 

499 

500 @property 

501 def outputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]: 

502 """Return mapping of output datasets (to be) generated by this quantum. 

503 

504        Has the same form as ``inputs``.

505 

506 Notes 

507 ----- 

508 We cannot use `set` instead of `list` for the nested container because 

509 `DatasetRef` instances cannot be compared reliably when some have 

510        integer IDs and others do not.

511 """ 

512 return self._outputs 

513 

514 @property 

515 def datastore_records(self) -> Mapping[str, DatastoreRecordData]: 

516 """Tabular data stored with this quantum (`dict`). 

517 

518 This attribute may be modified in place, but not assigned to. 

519 """ 

520 return self._datastore_records 

521 

522 def __eq__(self, other: object) -> bool: 

523 if not isinstance(other, Quantum): 

524 return False 

525 for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"): 

526 if getattr(self, item) != getattr(other, item): 

527 return False 

528 return True 

529 

530 def __hash__(self) -> int: 

531 return hash((self.taskClass, self.dataId)) 

532 

533 def __reduce__(self) -> str | tuple[Any, ...]: 

534 return ( 

535 self._reduceFactory, 

536 ( 

537 self.taskName, 

538 self.taskClass, 

539 self.dataId, 

540 dict(self.initInputs.items()), 

541 dict(self.inputs), 

542 dict(self.outputs), 

543 self.datastore_records, 

544 ), 

545 ) 

546 

547 def __str__(self) -> str: 

548 return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})" 

549 

550 @staticmethod 

551 def _reduceFactory( 

552 taskName: str | None, 

553 taskClass: type | None, 

554 dataId: DataCoordinate | None, 

555 initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None, 

556 inputs: Mapping[DatasetType, list[DatasetRef]] | None, 

557 outputs: Mapping[DatasetType, list[DatasetRef]] | None, 

558 datastore_records: Mapping[str, DatastoreRecordData], 

559 ) -> Quantum: 

560 return Quantum( 

561 taskName=taskName, 

562 taskClass=taskClass, 

563 dataId=dataId, 

564 initInputs=initInputs, 

565 inputs=inputs, 

566 outputs=outputs, 

567 datastore_records=datastore_records, 

568 ) 

569 

570 

571class DimensionRecordsAccumulator: 

572 """Class used to accumulate dimension records for serialization. 

573 

574    This class generates an auto-increment key for each unique dimension record

575    added to it. This allows serialization of dimension records to occur once

576    for each record but be referred to multiple times.

577 """ 

578 

579 def __init__(self) -> None: 

580 self._counter = 0 

581 self.mapping: MutableMapping[DimensionRecord, tuple[int, SerializedDimensionRecord]] = {} 

582 

583 def addRecord(self, record: DimensionRecord) -> int: 

584 """Add a dimension record to the accumulator if it has not already been 

585 added. When a record is inserted for the first time it is assigned 

586 a unique integer key. 

587 

588 This function returns the key associated with the record (either the 

589        newly allocated key, or the existing one).

590 

591 Parameters 

592 ---------- 

593 record : `DimensionRecord` 

594            The record to add to the accumulator.

595 

596 Returns 

597 ------- 

598        accumulatorKey : `int`

599            The key associated with the supplied record.
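
        Examples
        --------
        A sketch showing that repeated additions of the same record return
        the same key (``record`` is assumed to be a `DimensionRecord`)::

            accumulator = DimensionRecordsAccumulator()
            key = accumulator.addRecord(record)
            assert accumulator.addRecord(record) == key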

600 """ 

601 if (mappingValue := self.mapping.get(record)) is None: 

602 simple = record.to_simple() 

603 mappingValue = (self._counter, simple) 

604 self._counter += 1 

605 self.mapping[record] = mappingValue 

606 return mappingValue[0] 

607 

608 def makeSerializedDimensionRecordMapping(self) -> dict[int, SerializedDimensionRecord]: 
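        """Return the accumulated dimension records in serialized form.

        Returns
        -------
        records : `dict` [`int`, `SerializedDimensionRecord`]
            Mapping from the integer key assigned by `addRecord` to the
            corresponding serialized dimension record.
        """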

609 return {id_: serializeRef for id_, serializeRef in self.mapping.values()}