Coverage for python/lsst/daf/butler/_quantum_backed.py: 26%

183 statements  


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union

from deprecated.sphinx import deprecated
from pydantic import BaseModel

from ._butlerConfig import ButlerConfig
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    SerializedDatastoreRecordData,
    StorageClass,
    StorageClassFactory,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)


class _DatasetRecordStorageManagerDatastoreConstructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec

class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be read from this butler.
    predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be stored in this butler.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.
    dataset_types : `Mapping` [`str`, `DatasetType`], optional
        Mapping of the dataset type name to its registry definition.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is).
    These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
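
    Examples
    --------
    A rough sketch of the intended batch-execution flow; the configuration
    path and the ``quantum``, ``dimensions``, and task-execution steps are
    placeholders supplied by the execution harness:

    .. code-block:: python

        butler = QuantumBackedButler.initialize(
            config="/path/to/butler.yaml",
            quantum=quantum,  # a Quantum with fully resolved DatasetRefs
            dimensions=dimensions,
        )
        # ... run the task, reading inputs with butler.get() and writing
        # outputs with butler.put() ...
        provenance = butler.extract_provenance_data()
        with open("provenance.json", "w") as stream:
            stream.write(provenance.json())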

149 """ 

150 

151 def __init__( 

152 self, 

153 predicted_inputs: Iterable[DatasetId], 

154 predicted_outputs: Iterable[DatasetId], 

155 dimensions: DimensionUniverse, 

156 datastore: Datastore, 

157 storageClasses: StorageClassFactory, 

158 dataset_types: Mapping[str, DatasetType] | None = None, 

159 ): 

160 self._dimensions = dimensions 

161 self._predicted_inputs = set(predicted_inputs) 

162 self._predicted_outputs = set(predicted_outputs) 

163 self._available_inputs: Set[DatasetId] = set() 

164 self._unavailable_inputs: Set[DatasetId] = set() 

165 self._actual_inputs: Set[DatasetId] = set() 

166 self._actual_output_refs: Set[DatasetRef] = set() 

167 self.datastore = datastore 

168 self.storageClasses = storageClasses 

169 self._dataset_types: Mapping[str, DatasetType] = {} 

170 if dataset_types is not None: 

171 self._dataset_types = dataset_types 

172 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

173 

    @classmethod
    def initialize(
        cls,
        config: Union[Config, str],
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        predicted_inputs = [
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.inputs.values())
        ]
        predicted_inputs += [ref.getCheckedId() for ref in quantum.initInputs.values()]
        predicted_outputs = [
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.outputs.values())
        ]
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=quantum.datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def from_predicted(
        cls,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore_records: Mapping[str, DatastoreRecordData],
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from sets of input and output
        dataset IDs.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler; must be
            fully resolved.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def _initialize(
        cls,
        *,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Internal method with common implementation used by `initialize` and
        `from_predicted`.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        if "root" in butler_config:
            butler_root = butler_config["root"]
        else:
            butler_root = butler_config.configDir
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        if datastore_records is not None:
            datastore.import_records(datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(
            predicted_inputs,
            predicted_outputs,
            dimensions,
            datastore,
            storageClasses=storageClasses,
            dataset_types=dataset_types,
        )

    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        return self._dataset_types.get(name)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    @deprecated(
        reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
        " Please use Butler.get(). Will be removed after v27.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirect(
        self,
        ref: DatasetRef,
        *,
        parameters: Optional[Dict[str, Any]] = None,
        storageClass: str | StorageClass | None = None,
    ) -> Any:
        # Docstring inherited.
        return self.get(ref, parameters=parameters, storageClass=storageClass)

    def get(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        try:
            obj = super().get(
                ref,
                parameters=parameters,
                storageClass=storageClass,
            )
        except (LookupError, FileNotFoundError, IOError):
            self._unavailable_inputs.add(ref.getCheckedId())
            raise
        if ref.id in self._predicted_inputs:
            # do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    @deprecated(
        reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
        "Please use Butler.getDeferred(). Will be removed after v27.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirectDeferred(
        self,
        ref: DatasetRef,
        *,
        parameters: Union[dict, None] = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        return self.getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def getDeferred(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def datasetExistsDirect(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        exists = super().datasetExistsDirect(ref)
        if ref.id in self._predicted_inputs:
            if exists:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return exists

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.getCheckedId())

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self.datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def pruneDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        disassociate: bool = True,
        unstore: bool = False,
        tags: Iterable[str] = (),
        purge: bool = False,
    ) -> None:
        # Docstring inherited from LimitedButler.

        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            # No tagged collections for this butler.
            raise TypeError("Cannot pass disassociate=True without purge=True.")

        refs = list(refs)

        # Pruning a component of a DatasetRef makes no sense.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")

        if unstore:
            self.datastore.trash(refs)
        if purge:
            for ref in refs:
                # We only care about removing them from actual output refs.
                self._actual_output_refs.discard(ref)

        if unstore:
            # Point of no return for removing artifacts.
            self.datastore.emptyTrash()

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense. A sketch of
          that pattern follows.
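
        A minimal sketch of that executor-side pattern; the ``quantum`` object
        and the task-invocation step are placeholders supplied by the harness:

        .. code-block:: python

            for ref in itertools.chain.from_iterable(quantum.inputs.values()):
                butler.datasetExistsDirect(ref)
            # ... run the task, reading and writing through ``butler`` ...
            provenance = butler.extract_provenance_data()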

548 """ 

549 if not self._actual_inputs.isdisjoint(self._unavailable_inputs): 

550 _LOG.warning( 

551 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) " 

552 "was obtained, but did not actually exist. This task should be be using markInputUnused " 

553 "directly to clarify its provenance.", 

554 self._actual_inputs & self._unavailable_inputs, 

555 ) 

556 self._actual_inputs -= self._unavailable_inputs 

557 checked_inputs = self._available_inputs | self._unavailable_inputs 

558 if not self._predicted_inputs == checked_inputs: 

559 _LOG.warning( 

560 "Execution harness did not check predicted inputs %s for existence; available inputs " 

561 "recorded in provenance may be incomplete.", 

562 self._predicted_inputs - checked_inputs, 

563 ) 

564 datastore_records = self.datastore.export_records(self._actual_output_refs) 

565 provenance_records = { 

566 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items() 

567 } 

568 

569 return QuantumProvenanceData( 

570 predicted_inputs=self._predicted_inputs, 

571 available_inputs=self._available_inputs, 

572 actual_inputs=self._actual_inputs, 

573 predicted_outputs=self._predicted_outputs, 

574 actual_outputs={ref.getCheckedId() for ref in self._actual_output_refs}, 

575 datastore_records=provenance_records, 

576 ) 

577 

578 

class QuantumProvenanceData(BaseModel):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the `predicted_inputs` and `predicted_outputs` sets should have the same
    IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` ``parse_raw()`` method is not going to work
    correctly for this class; use the `direct` method instead.
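
    A minimal round-trip sketch under that constraint, assuming ``provenance``
    was returned by `QuantumBackedButler.extract_provenance_data` and the file
    name is a placeholder:

    .. code-block:: python

        import json

        with open("provenance.json", "w") as stream:
            stream.write(provenance.json())
        with open("provenance.json") as stream:
            restored = QuantumProvenanceData.direct(**json.load(stream))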

594 """ 

595 

596 # This class probably should have information about its execution 

597 # environment (anything not controlled and recorded at the 

598 # `~CollectionType.RUN` level, such as the compute node ID). but adding it 

599 # now is out of scope for this prototype. 

600 

601 predicted_inputs: Set[uuid.UUID] 

602 """Unique IDs of datasets that were predicted as inputs to this quantum 

603 when the QuantumGraph was built. 

604 """ 

605 

606 available_inputs: Set[uuid.UUID] 

607 """Unique IDs of input datasets that were actually present in the datastore 

608 when this quantum was executed. 

609 

610 This is a subset of `predicted_inputs`, with the difference generally being 

611 datasets were `predicted_outputs` but not `actual_outputs` of some upstream 

612 task. 

613 """ 

614 

615 actual_inputs: Set[uuid.UUID] 

616 """Unique IDs of datasets that were actually used as inputs by this task. 

617 

618 This is a subset of `available_inputs`. 

619 

620 Notes 

621 ----- 

622 The criteria for marking an input as used is that rerunning the quantum 

623 with only these `actual_inputs` available must yield identical outputs. 

624 This means that (for example) even just using an input to help determine 

625 an output rejection criteria and then rejecting it as an outlier qualifies 

626 that input as actually used. 

627 """ 

628 

629 predicted_outputs: Set[uuid.UUID] 

630 """Unique IDs of datasets that were predicted as outputs of this quantum 

631 when the QuantumGraph was built. 

632 """ 

633 

634 actual_outputs: Set[uuid.UUID] 

635 """Unique IDs of datasets that were actually written when this quantum 

636 was executed. 

637 """ 

638 

639 datastore_records: Dict[str, SerializedDatastoreRecordData] 

640 """Datastore records indexed by datastore name.""" 

641 

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
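
        A sketch of the intended calling pattern, assuming per-quantum
        provenance was written to JSON files; ``provenance_paths``, ``quanta``,
        and ``butler`` are placeholders supplied by the caller:

        .. code-block:: python

            import json

            provenance = []
            for path in provenance_paths:
                with open(path) as stream:
                    provenance.append(QuantumProvenanceData.direct(**json.load(stream)))
            QuantumProvenanceData.collect_and_transfer(butler, quanta, provenance)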

676 """ 

677 grouped_refs = defaultdict(list) 

678 summary_records: Dict[str, DatastoreRecordData] = {} 

679 for quantum, provenance_for_quantum in zip(quanta, provenance): 

680 quantum_refs_by_id = { 

681 ref.getCheckedId(): ref 

682 for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

683 if ref.getCheckedId() in provenance_for_quantum.actual_outputs 

684 } 

685 for ref in quantum_refs_by_id.values(): 

686 grouped_refs[ref.datasetType, ref.run].append(ref) 

687 

688 # merge datastore records into a summary structure 

689 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items(): 

690 quantum_records = DatastoreRecordData.from_simple(serialized_records) 

691 if (records := summary_records.get(datastore_name)) is not None: 

692 records.update(quantum_records) 

693 else: 

694 summary_records[datastore_name] = quantum_records 

695 

696 for refs in grouped_refs.values(): 

697 butler.registry._importDatasets(refs) 

698 butler.datastore.import_records(summary_records) 

699 

    @classmethod
    def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
        raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")

    @classmethod
    def direct(
        cls,
        *,
        predicted_inputs: Iterable[Union[str, uuid.UUID]],
        available_inputs: Iterable[Union[str, uuid.UUID]],
        actual_inputs: Iterable[Union[str, uuid.UUID]],
        predicted_outputs: Iterable[Union[str, uuid.UUID]],
        actual_outputs: Iterable[Union[str, uuid.UUID]],
        datastore_records: Mapping[str, Mapping],
    ) -> QuantumProvenanceData:
        """Construct an instance directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """

        def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]:
            """Convert input UUIDs, which could be in string representation,
            to a set of `UUID` instances.
            """
            return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}

        data = QuantumProvenanceData.__new__(cls)
        setter = object.__setattr__
        setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
        setter(data, "available_inputs", _to_uuid_set(available_inputs))
        setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
        setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
        setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
        setter(
            data,
            "datastore_records",
            {
                key: SerializedDatastoreRecordData.direct(**records)
                for key, records in datastore_records.items()
            },
        )
        return data