Coverage for python/lsst/daf/butler/_quantum_backed.py: 25% (179 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union

from pydantic import BaseModel

from ._butlerConfig import ButlerConfig
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    SerializedDatastoreRecordData,
    StorageClass,
    StorageClassFactory,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)

class _DatasetRecordStorageManagerDatastoreConstructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec

class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be read from this butler.
    predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be stored in this butler.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.
    dataset_types : `Mapping` [`str`, `DatasetType`], optional
        Mapping of the dataset type name to its registry definition.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is). These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
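
    Examples
    --------
    A minimal sketch of the intended batch-execution pattern; the repository
    path and the ``quantum`` and ``universe`` objects below are illustrative
    placeholders, not working values:

    >>> qbb = QuantumBackedButler.initialize(  # doctest: +SKIP
    ...     config="/path/to/repo/butler.yaml",
    ...     quantum=quantum,
    ...     dimensions=universe,
    ... )
    >>> # Run the task, reading inputs and writing outputs through ``qbb``,
    >>> # then capture provenance and datastore records as JSON.
    >>> provenance = qbb.extract_provenance_data()  # doctest: +SKIP
    >>> with open("provenance.json", "w") as stream:  # doctest: +SKIP
    ...     stream.write(provenance.json())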

148 """ 

149 

150 def __init__( 

151 self, 

152 predicted_inputs: Iterable[DatasetId], 

153 predicted_outputs: Iterable[DatasetId], 

154 dimensions: DimensionUniverse, 

155 datastore: Datastore, 

156 storageClasses: StorageClassFactory, 

157 dataset_types: Mapping[str, DatasetType] | None = None, 

158 ): 

159 self._dimensions = dimensions 

160 self._predicted_inputs = set(predicted_inputs) 

161 self._predicted_outputs = set(predicted_outputs) 

162 self._available_inputs: Set[DatasetId] = set() 

163 self._unavailable_inputs: Set[DatasetId] = set() 

164 self._actual_inputs: Set[DatasetId] = set() 

165 self._actual_output_refs: Set[DatasetRef] = set() 

166 self.datastore = datastore 

167 self.storageClasses = storageClasses 

168 self._dataset_types: Mapping[str, DatasetType] = {} 

169 if dataset_types is not None: 

170 self._dataset_types = dataset_types 

171 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

172 

    @classmethod
    def initialize(
        cls,
        config: Union[Config, str],
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        predicted_inputs = [
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.inputs.values())
        ]
        predicted_inputs += [ref.getCheckedId() for ref in quantum.initInputs.values()]
        predicted_outputs = [
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.outputs.values())
        ]
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=quantum.datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def from_predicted(
        cls,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore_records: Mapping[str, DatastoreRecordData],
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from sets of input and output
        dataset IDs.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler; must
            be fully resolved.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def _initialize(
        cls,
        *,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Internal method with common implementation used by `initialize` and
        `from_predicted`.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        if "root" in butler_config:
            butler_root = butler_config["root"]
        else:
            butler_root = butler_config.configDir
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        if datastore_records is not None:
            datastore.import_records(datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(
            predicted_inputs,
            predicted_outputs,
            dimensions,
            datastore,
            storageClasses=storageClasses,
            dataset_types=dataset_types,
        )

    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        return self._dataset_types.get(name)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    def getDirect(
        self,
        ref: DatasetRef,
        *,
        parameters: Optional[Dict[str, Any]] = None,
        storageClass: str | StorageClass | None = None,
    ) -> Any:
        # Docstring inherited.
        try:
            obj = super().getDirect(ref, parameters=parameters, storageClass=storageClass)
        except (LookupError, FileNotFoundError, IOError):
            self._unavailable_inputs.add(ref.getCheckedId())
            raise
        if ref.id in self._predicted_inputs:
            # Do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    def getDirectDeferred(
        self,
        ref: DatasetRef,
        *,
        parameters: Union[dict, None] = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDirectDeferred(ref, parameters=parameters, storageClass=storageClass)

    def datasetExistsDirect(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        exists = super().datasetExistsDirect(ref)
        if ref.id in self._predicted_inputs:
            if exists:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return exists

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.getCheckedId())

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self.datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def pruneDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        disassociate: bool = True,
        unstore: bool = False,
        tags: Iterable[str] = (),
        purge: bool = False,
    ) -> None:
        # Docstring inherited from LimitedButler.

        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            # No tagged collections for this butler.
            raise TypeError("Cannot pass disassociate=True without purge=True.")

        refs = list(refs)

        # Pruning a component of a DatasetRef makes no sense.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")

        if unstore:
            self.datastore.trash(refs)
        if purge:
            for ref in refs:
                # We only care about removing them from actual output refs.
                self._actual_output_refs.discard(ref)

        if unstore:
            # Point of no return for removing artifacts.
            self.datastore.emptyTrash()

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
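
        Examples
        --------
        A rough sketch of the harness behavior assumed above; ``qbb`` and
        ``quantum`` are illustrative placeholders for a `QuantumBackedButler`
        and the `Quantum` it was initialized from:

        >>> # Check every predicted input up front so the "available inputs"
        >>> # set recorded in the provenance is complete.
        >>> for ref in itertools.chain.from_iterable(quantum.inputs.values()):  # doctest: +SKIP
        ...     qbb.datasetExistsDirect(ref)
        >>> # ... execute the task ...
        >>> provenance = qbb.extract_provenance_data()  # doctest: +SKIP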

511 """ 

512 if not self._actual_inputs.isdisjoint(self._unavailable_inputs): 

513 _LOG.warning( 

514 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) " 

515 "was obtained, but did not actually exist. This task should be be using markInputUnused " 

516 "directly to clarify its provenance.", 

517 self._actual_inputs & self._unavailable_inputs, 

518 ) 

519 self._actual_inputs -= self._unavailable_inputs 

520 checked_inputs = self._available_inputs | self._unavailable_inputs 

521 if not self._predicted_inputs == checked_inputs: 

522 _LOG.warning( 

523 "Execution harness did not check predicted inputs %s for existence; available inputs " 

524 "recorded in provenance may be incomplete.", 

525 self._predicted_inputs - checked_inputs, 

526 ) 

527 datastore_records = self.datastore.export_records(self._actual_output_refs) 

528 provenance_records = { 

529 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items() 

530 } 

531 

532 return QuantumProvenanceData( 

533 predicted_inputs=self._predicted_inputs, 

534 available_inputs=self._available_inputs, 

535 actual_inputs=self._actual_inputs, 

536 predicted_outputs=self._predicted_outputs, 

537 actual_outputs={ref.getCheckedId() for ref in self._actual_output_refs}, 

538 datastore_records=provenance_records, 

539 ) 

540 

class QuantumProvenanceData(BaseModel):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the `predicted_inputs` and `predicted_outputs` sets should have the same
    IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` method ``parse_raw()`` is not going to work
    correctly for this class; use the `direct` method instead.
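
    Examples
    --------
    A minimal sketch of reading back a provenance file written from
    `QuantumBackedButler.extract_provenance_data`; the file name is an
    illustrative placeholder:

    >>> import json  # doctest: +SKIP
    >>> with open("provenance.json") as stream:  # doctest: +SKIP
    ...     provenance = QuantumProvenanceData.direct(**json.load(stream))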

557 """ 

558 

559 # This class probably should have information about its execution 

560 # environment (anything not controlled and recorded at the 

561 # `~CollectionType.RUN` level, such as the compute node ID). but adding it 

562 # now is out of scope for this prototype. 

563 

564 predicted_inputs: Set[uuid.UUID] 

565 """Unique IDs of datasets that were predicted as inputs to this quantum 

566 when the QuantumGraph was built. 

567 """ 

568 

569 available_inputs: Set[uuid.UUID] 

570 """Unique IDs of input datasets that were actually present in the datastore 

571 when this quantum was executed. 

572 

573 This is a subset of `predicted_inputs`, with the difference generally being 

574 datasets were `predicted_outputs` but not `actual_outputs` of some upstream 

575 task. 

576 """ 

577 

578 actual_inputs: Set[uuid.UUID] 

579 """Unique IDs of datasets that were actually used as inputs by this task. 

580 

581 This is a subset of `available_inputs`. 

582 

583 Notes 

584 ----- 

585 The criteria for marking an input as used is that rerunning the quantum 

586 with only these `actual_inputs` available must yield identical outputs. 

587 This means that (for example) even just using an input to help determine 

588 an output rejection criteria and then rejecting it as an outlier qualifies 

589 that input as actually used. 

590 """ 

591 

592 predicted_outputs: Set[uuid.UUID] 

593 """Unique IDs of datasets that were predicted as outputs of this quantum 

594 when the QuantumGraph was built. 

595 """ 

596 

597 actual_outputs: Set[uuid.UUID] 

598 """Unique IDs of datasets that were actually written when this quantum 

599 was executed. 

600 """ 

601 

602 datastore_records: Dict[str, SerializedDatastoreRecordData] 

603 """Datastore records indexed by datastore name.""" 

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
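
        Examples
        --------
        A rough sketch, assuming ``butler`` is a full `Butler`, ``quanta`` is
        a sequence of executed `Quantum` objects, and ``provenance`` holds the
        matching `QuantumProvenanceData` instances (e.g. reconstructed with
        `direct` as shown in the class docstring):

        >>> QuantumProvenanceData.collect_and_transfer(  # doctest: +SKIP
        ...     butler, quanta, provenance
        ... )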

639 """ 

640 grouped_refs = defaultdict(list) 

641 summary_records: Dict[str, DatastoreRecordData] = {} 

642 for quantum, provenance_for_quantum in zip(quanta, provenance): 

643 quantum_refs_by_id = { 

644 ref.getCheckedId(): ref 

645 for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

646 if ref.getCheckedId() in provenance_for_quantum.actual_outputs 

647 } 

648 for ref in quantum_refs_by_id.values(): 

649 grouped_refs[ref.datasetType, ref.run].append(ref) 

650 

651 # merge datastore records into a summary structure 

652 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items(): 

653 quantum_records = DatastoreRecordData.from_simple(serialized_records) 

654 if (records := summary_records.get(datastore_name)) is not None: 

655 records.update(quantum_records) 

656 else: 

657 summary_records[datastore_name] = quantum_records 

658 

659 for refs in grouped_refs.values(): 

660 butler.registry._importDatasets(refs) 

661 butler.datastore.import_records(summary_records) 

662 

663 @classmethod 

664 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData: 

665 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.") 

666 

667 @classmethod 

668 def direct( 

669 cls, 

670 *, 

671 predicted_inputs: Iterable[Union[str, uuid.UUID]], 

672 available_inputs: Iterable[Union[str, uuid.UUID]], 

673 actual_inputs: Iterable[Union[str, uuid.UUID]], 

674 predicted_outputs: Iterable[Union[str, uuid.UUID]], 

675 actual_outputs: Iterable[Union[str, uuid.UUID]], 

676 datastore_records: Mapping[str, Mapping], 

677 ) -> QuantumProvenanceData: 

678 """Construct an instance directly without validators. 

679 

680 This differs from the pydantic "construct" method in that the 

681 arguments are explicitly what the model requires, and it will recurse 

682 through members, constructing them from their corresponding `direct` 

683 methods. 

684 

685 This method should only be called when the inputs are trusted. 

686 """ 

687 

688 def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]: 

689 """Convert input UUIDs, which could be in string representation to 

690 a set of `UUID` instances. 

691 """ 

692 return set(uuid.UUID(id) if isinstance(id, str) else id for id in uuids) 

693 

694 data = QuantumProvenanceData.__new__(cls) 

695 setter = object.__setattr__ 

696 setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs)) 

697 setter(data, "available_inputs", _to_uuid_set(available_inputs)) 

698 setter(data, "actual_inputs", _to_uuid_set(actual_inputs)) 

699 setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs)) 

700 setter(data, "actual_outputs", _to_uuid_set(actual_outputs)) 

701 setter( 

702 data, 

703 "datastore_records", 

704 { 

705 key: SerializedDatastoreRecordData.direct(**records) 

706 for key, records in datastore_records.items() 

707 }, 

708 ) 

709 return data