Coverage for python/lsst/daf/butler/_quantum_backed.py: 25%

193 statements  

coverage.py v7.2.7, created at 2023-06-28 10:10 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("QuantumBackedButler", "QuantumProvenanceData") 

25 

26import itertools 

27import logging 

28import uuid 

29from collections import defaultdict 

30from collections.abc import Iterable, Mapping 

31from typing import TYPE_CHECKING, Any 

32 

33from deprecated.sphinx import deprecated 

34from lsst.resources import ResourcePathExpression 

35from pydantic import BaseModel 

36 

37from ._butlerConfig import ButlerConfig 

38from ._deferredDatasetHandle import DeferredDatasetHandle 

39from ._limited_butler import LimitedButler 

40from .core import ( 

41 Config, 

42 DatasetId, 

43 DatasetRef, 

44 DatasetType, 

45 Datastore, 

46 DatastoreRecordData, 

47 DimensionUniverse, 

48 Quantum, 

49 SerializedDatastoreRecordData, 

50 StorageClass, 

51 StorageClassFactory, 

52 ddl, 

53) 

54from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager 

55from .registry.databases.sqlite import SqliteDatabase 

56from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager 

57from .registry.opaque import ByNameOpaqueTableStorageManager 

58 

59if TYPE_CHECKING: 

60 from ._butler import Butler 

61 

62_LOG = logging.getLogger(__name__) 

63 

64 

65class _DatasetRecordStorageManagerDatastoreConstructionMimic: 

66 """A partial implementation of `DatasetRecordStorageManager` that exists 

67 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`) 

68 to be constructed without a full `Registry`. 

69 

70 Notes 

71 ----- 

72 The interface implemented by this class should probably be its own ABC, 

73 and that ABC should probably be used in the definition of 

74 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep 

75 changes minimal. 

76 """ 

77 

78 @classmethod 

79 def getIdColumnType(cls) -> type: 

80 # Docstring inherited. 

81 return ddl.GUID 

82 

83 @classmethod 

84 def addDatasetForeignKey( 

85 cls, 

86 tableSpec: ddl.TableSpec, 

87 *, 

88 name: str = "dataset", 

89 constraint: bool = True, 

90 onDelete: str | None = None, 

91 **kwargs: Any, 

92 ) -> ddl.FieldSpec: 

93 # Docstring inherited. 

94 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs) 

95 tableSpec.fields.add(idFieldSpec) 

96 return idFieldSpec 

97 

98 

99class QuantumBackedButler(LimitedButler): 

100 """An implementation of `LimitedButler` intended to back execution of a 

101 single `Quantum`. 

102 

103 Parameters 

104 ---------- 

105 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

106 Dataset IDs for datasets that can be read from this butler. 

107 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

108 Dataset IDs for datasets that can be stored in this butler. 

109 dimensions : `DimensionUniverse` 

110 Object managing all dimension definitions. 

111 datastore : `Datastore` 

112 Datastore to use for all dataset I/O and existence checks. 

113 storageClasses : `StorageClassFactory` 

114 Object managing all storage class definitions. 

115 

116 Notes 

117 ----- 

118 Most callers should use the `initialize` `classmethod` to construct new 

119 instances instead of calling the constructor directly. 

120 

121 `QuantumBackedButler` uses a SQLite database internally, in order to reuse 

122 existing `DatastoreRegistryBridge` and `OpaqueTableStorage` 

123 implementations that rely on SQLAlchemy. If implementations are added in the 

124 future that don't rely on SQLAlchemy, it should be possible to swap them 

125 in by overriding the type arguments to `initialize` (though at present, 

126 `QuantumBackedButler` would still create at least an in-memory SQLite 

127 database that would then go unused). 

128 

129 We imagine `QuantumBackedButler` being used during (at least) batch 

130 execution to capture `Datastore` records and save them to per-quantum 

131 files, which are also a convenient place to store provenance for eventual 

132 upload to a SQL-backed `Registry` (once `Registry` has tables to store 

133 provenance, that is). 

134 These per-quantum files can be written in two ways: 

135 

136 - The SQLite file used internally by `QuantumBackedButler` can be used 

137 directly by customizing the ``filename`` argument to ``initialize``, and 

138 then transferring that file to the object store after execution completes 

139 (or fails; a ``try/finally`` pattern probably makes sense here). 

140 

141 - A JSON or YAML file can be written by calling `extract_provenance_data`, 

142 and using ``pydantic`` methods to write the returned 

143 `QuantumProvenanceData` to a file. 

144 

145 Note that at present, the SQLite file only contains datastore records, not 

146 provenance, but that should be easy to address (if desired) after we 

147 actually design a `Registry` schema for provenance. I also suspect that 

148 we'll want to explicitly close the SQLite file somehow before trying to 

149 transfer it. But I'm guessing we'd prefer to write the per-quantum files 

150 as JSON anyway. 

151 """ 

152 

153 def __init__( 

154 self, 

155 predicted_inputs: Iterable[DatasetId], 

156 predicted_outputs: Iterable[DatasetId], 

157 dimensions: DimensionUniverse, 

158 datastore: Datastore, 

159 storageClasses: StorageClassFactory, 

160 dataset_types: Mapping[str, DatasetType] | None = None, 

161 ): 

162 self._dimensions = dimensions 

163 self._predicted_inputs = set(predicted_inputs) 

164 self._predicted_outputs = set(predicted_outputs) 

165 self._available_inputs: set[DatasetId] = set() 

166 self._unavailable_inputs: set[DatasetId] = set() 

167 self._actual_inputs: set[DatasetId] = set() 

168 self._actual_output_refs: set[DatasetRef] = set() 

169 self.datastore = datastore 

170 self.storageClasses = storageClasses 

171 self._dataset_types: Mapping[str, DatasetType] = {} 

172 if dataset_types is not None: 

173 self._dataset_types = dataset_types 

174 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

175 

176 @classmethod 

177 def initialize( 

178 cls, 

179 config: Config | ResourcePathExpression, 

180 quantum: Quantum, 

181 dimensions: DimensionUniverse, 

182 filename: str = ":memory:", 

183 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

184 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

185 search_paths: list[str] | None = None, 

186 dataset_types: Mapping[str, DatasetType] | None = None, 

187 ) -> QuantumBackedButler: 

188 """Construct a new `QuantumBackedButler` from repository configuration 

189 and helper types. 

190 

191 Parameters 

192 ---------- 

193 config : `Config` or `~lsst.resources.ResourcePathExpression` 

194 A butler repository root, configuration filename, or configuration 

195 instance. 

196 quantum : `Quantum` 

197 Object describing the predicted input and output datasets relevant 

198 to this butler. This must have resolved `DatasetRef` instances for 

199 all inputs and outputs. 

200 dimensions : `DimensionUniverse` 

201 Object managing all dimension definitions. 

202 filename : `str`, optional 

203 Name for the SQLite database that will back this butler; defaults 

204 to an in-memory database. 

205 OpaqueManagerClass : `type`, optional 

206 A subclass of `OpaqueTableStorageManager` to use for datastore 

207 opaque records. Default is a SQL-backed implementation. 

208 BridgeManagerClass : `type`, optional 

209 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

210 location records. Default is a SQL-backed implementation. 

211 search_paths : `list` of `str`, optional 

212 Additional search paths for butler configuration. 

213 dataset_types: `~collections.abc.Mapping` [`str`, `DatasetType`], \ 

214 optional 

215 Mapping of the dataset type name to its registry definition. 

216 """ 

217 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())] 

218 predicted_inputs += [ref.id for ref in quantum.initInputs.values()] 

219 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())] 

220 return cls._initialize( 

221 config=config, 

222 predicted_inputs=predicted_inputs, 

223 predicted_outputs=predicted_outputs, 

224 dimensions=dimensions, 

225 filename=filename, 

226 datastore_records=quantum.datastore_records, 

227 OpaqueManagerClass=OpaqueManagerClass, 

228 BridgeManagerClass=BridgeManagerClass, 

229 search_paths=search_paths, 

230 dataset_types=dataset_types, 

231 ) 

232 

233 @classmethod 

234 def from_predicted( 

235 cls, 

236 config: Config | ResourcePathExpression, 

237 predicted_inputs: Iterable[DatasetId], 

238 predicted_outputs: Iterable[DatasetId], 

239 dimensions: DimensionUniverse, 

240 datastore_records: Mapping[str, DatastoreRecordData], 

241 filename: str = ":memory:", 

242 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

243 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

244 search_paths: list[str] | None = None, 

245 dataset_types: Mapping[str, DatasetType] | None = None, 

246 ) -> QuantumBackedButler: 

247 """Construct a new `QuantumBackedButler` from sets of input and output 

248 dataset IDs. 

249 

250 Parameters 

251 ---------- 

252 config : `Config` or `~lsst.resources.ResourcePathExpression` 

253 A butler repository root, configuration filename, or configuration 

254 instance. 

255 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

256 Dataset IDs for datasets that can be read from this butler. 

257 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

258 Dataset IDs for datasets that can be stored in this butler; must be 

259 fully resolved. 

260 dimensions : `DimensionUniverse` 

261 Object managing all dimension definitions. 

262 filename : `str`, optional 

263 Name for the SQLite database that will back this butler; defaults 

264 to an in-memory database. 

265 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None` 

266 Datastore records to import into a datastore. 

267 OpaqueManagerClass : `type`, optional 

268 A subclass of `OpaqueTableStorageManager` to use for datastore 

269 opaque records. Default is a SQL-backed implementation. 

270 BridgeManagerClass : `type`, optional 

271 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

272 location records. Default is a SQL-backed implementation. 

273 search_paths : `list` of `str`, optional 

274 Additional search paths for butler configuration. 

275 dataset_types: `~collections.abc.Mapping` [`str`, `DatasetType`], \ 

276 optional 

277 Mapping of the dataset type name to its registry definition. 

278 """ 

279 return cls._initialize( 

280 config=config, 

281 predicted_inputs=predicted_inputs, 

282 predicted_outputs=predicted_outputs, 

283 dimensions=dimensions, 

284 filename=filename, 

285 datastore_records=datastore_records, 

286 OpaqueManagerClass=OpaqueManagerClass, 

287 BridgeManagerClass=BridgeManagerClass, 

288 search_paths=search_paths, 

289 dataset_types=dataset_types, 

290 ) 
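For callers that already have flat ID collections instead of a `Quantum`, `from_predicted` is the entry point; a minimal sketch, where `input_ids`, `output_ids`, `records`, and `universe` are assumed to have been extracted elsewhere (for example, from a quantum graph):

    # Sketch; `input_ids`/`output_ids` are iterables of DatasetId (UUIDs) and
    # `records` maps datastore name to DatastoreRecordData (assumed available).
    qbb = QuantumBackedButler.from_predicted(
        config="/path/to/repo/butler.yaml",  # hypothetical repository config
        predicted_inputs=input_ids,
        predicted_outputs=output_ids,
        dimensions=universe,
        datastore_records=records,
    )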

291 

292 @classmethod 

293 def _initialize( 

294 cls, 

295 *, 

296 config: Config | ResourcePathExpression, 

297 predicted_inputs: Iterable[DatasetId], 

298 predicted_outputs: Iterable[DatasetId], 

299 dimensions: DimensionUniverse, 

300 filename: str = ":memory:", 

301 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

302 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

303 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

304 search_paths: list[str] | None = None, 

305 dataset_types: Mapping[str, DatasetType] | None = None, 

306 ) -> QuantumBackedButler: 

307 """Initialize quantum-backed butler. 

308 

309 Internal method with common implementation used by `initialize` and 

310 `from_predicted`. 

311 

312 Parameters 

313 ---------- 

314 config : `Config` or `~lsst.resources.ResourcePathExpression` 

315 A butler repository root, configuration filename, or configuration 

316 instance. 

317 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

318 Dataset IDs for datasets that can be read from this butler. 

319 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

320 Dataset IDs for datasets that can be stored in this butler. 

321 dimensions : `DimensionUniverse` 

322 Object managing all dimension definitions. 

323 filename : `str`, optional 

324 Name for the SQLite database that will back this butler; defaults 

325 to an in-memory database. 

326 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None` 

327 Datastore records to import into a datastore. 

328 OpaqueManagerClass : `type`, optional 

329 A subclass of `OpaqueTableStorageManager` to use for datastore 

330 opaque records. Default is a SQL-backed implementation. 

331 BridgeManagerClass : `type`, optional 

332 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

333 location records. Default is a SQL-backed implementation. 

334 search_paths : `list` of `str`, optional 

335 Additional search paths for butler configuration. 

336 dataset_types: `~collections.abc.Mapping` [`str`, `DatasetType`] 

337 Mapping of the dataset type name to its registry definition. 

338 """ 

339 butler_config = ButlerConfig(config, searchPaths=search_paths) 

340 if "root" in butler_config: 

341 butler_root = butler_config["root"] 

342 else: 

343 butler_root = butler_config.configDir 

344 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0) 

345 with db.declareStaticTables(create=True) as context: 

346 opaque_manager = OpaqueManagerClass.initialize(db, context) 

347 bridge_manager = BridgeManagerClass.initialize( 

348 db, 

349 context, 

350 opaque=opaque_manager, 

351 # MyPy can tell it's a fake, but we know it shouldn't care. 

352 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore 

353 universe=dimensions, 

354 ) 

355 # TODO: We need to inform `Datastore` here that it needs to support 

356 # predictive reads; right now that's a configuration option, but after 

357 # execution butler is retired it could just be a kwarg we pass here. 

358 # For now just force this option as we cannot work without it. 

359 butler_config["datastore", "trust_get_request"] = True 

360 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root) 

361 if datastore_records is not None: 

362 datastore.import_records(datastore_records) 

363 storageClasses = StorageClassFactory() 

364 storageClasses.addFromConfig(butler_config) 

365 return cls( 

366 predicted_inputs, 

367 predicted_outputs, 

368 dimensions, 

369 datastore, 

370 storageClasses=storageClasses, 

371 dataset_types=dataset_types, 

372 ) 

373 

374 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

375 """Return DatasetType defined in registry given dataset type name.""" 

376 return self._dataset_types.get(name) 

377 

378 def isWriteable(self) -> bool: 

379 # Docstring inherited. 

380 return True 

381 

382 @deprecated( 

383 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

384 " Please use Butler.get(). Will be removed after v27.0.", 

385 version="v26.0", 

386 category=FutureWarning, 

387 ) 

388 def getDirect( 

389 self, 

390 ref: DatasetRef, 

391 *, 

392 parameters: dict[str, Any] | None = None, 

393 storageClass: str | StorageClass | None = None, 

394 ) -> Any: 

395 # Docstring inherited. 

396 return self.get(ref, parameters=parameters, storageClass=storageClass) 

397 

398 def get( 

399 self, 

400 ref: DatasetRef, 

401 /, 

402 *, 

403 parameters: dict[str, Any] | None = None, 

404 storageClass: StorageClass | str | None = None, 

405 ) -> Any: 

406 try: 

407 obj = super().get( 

408 ref, 

409 parameters=parameters, 

410 storageClass=storageClass, 

411 ) 

412 except (LookupError, FileNotFoundError, OSError): 

413 self._unavailable_inputs.add(ref.id) 

414 raise 

415 if ref.id in self._predicted_inputs: 

416 # do this after delegating to super in case that raises. 

417 self._actual_inputs.add(ref.id) 

418 self._available_inputs.add(ref.id) 

419 return obj 

420 

421 @deprecated( 

422 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

423 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

424 version="v26.0", 

425 category=FutureWarning, 

426 ) 

427 def getDirectDeferred( 

428 self, 

429 ref: DatasetRef, 

430 *, 

431 parameters: dict[str, Any] | None = None, 

432 storageClass: str | StorageClass | None = None, 

433 ) -> DeferredDatasetHandle: 

434 # Docstring inherited. 

435 return self.getDeferred(ref, parameters=parameters, storageClass=storageClass) 

436 

437 def getDeferred( 

438 self, 

439 ref: DatasetRef, 

440 /, 

441 *, 

442 parameters: dict[str, Any] | None = None, 

443 storageClass: str | StorageClass | None = None, 

444 ) -> DeferredDatasetHandle: 

445 if ref.id in self._predicted_inputs: 

446 # Unfortunately, we can't do this after the handle succeeds in 

447 # loading, so it's conceivable here that we're marking an input 

448 # as "actual" even when it's not even available. 

449 self._actual_inputs.add(ref.id) 

450 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass) 

451 

452 def stored(self, ref: DatasetRef) -> bool: 

453 # Docstring inherited. 

454 stored = super().stored(ref) 

455 if ref.id in self._predicted_inputs: 

456 if stored: 

457 self._available_inputs.add(ref.id) 

458 else: 

459 self._unavailable_inputs.add(ref.id) 

460 return stored 

461 

462 def stored_many( 

463 self, 

464 refs: Iterable[DatasetRef], 

465 ) -> dict[DatasetRef, bool]: 

466 # Docstring inherited. 

467 existence = super().stored_many(refs) 

468 

469 for ref, stored in existence.items(): 

470 if ref.id in self._predicted_inputs: 

471 if stored: 

472 self._available_inputs.add(ref.id) 

473 else: 

474 self._unavailable_inputs.add(ref.id) 

475 return existence 
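The existence tracking above is what lets an execution harness populate the "available inputs" set before running the task; a sketch of that pre-flight check, assuming `qbb` is a `QuantumBackedButler` and `predicted_input_refs` holds the quantum's resolved input `DatasetRef` objects:

    # Pre-flight check: record which predicted inputs actually exist.
    existence = qbb.stored_many(predicted_input_refs)
    missing = [ref for ref, stored in existence.items() if not stored]
    if missing:
        # The harness can skip or fail the quantum; either way the
        # available/unavailable input sets are now populated for provenance.
        raise FileNotFoundError(f"{len(missing)} predicted inputs are missing")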

476 

477 def markInputUnused(self, ref: DatasetRef) -> None: 

478 # Docstring inherited. 

479 self._actual_inputs.discard(ref.id) 

480 

481 @property 

482 def dimensions(self) -> DimensionUniverse: 

483 # Docstring inherited. 

484 return self._dimensions 

485 

486 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

487 # Docstring inherited. 

488 if ref.id not in self._predicted_outputs: 

489 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.") 

490 self.datastore.put(obj, ref) 

491 self._actual_output_refs.add(ref) 

492 return ref 

493 

494 def pruneDatasets( 

495 self, 

496 refs: Iterable[DatasetRef], 

497 *, 

498 disassociate: bool = True, 

499 unstore: bool = False, 

500 tags: Iterable[str] = (), 

501 purge: bool = False, 

502 ) -> None: 

503 # docstring inherited from LimitedButler 

504 

505 if purge: 

506 if not disassociate: 

507 raise TypeError("Cannot pass purge=True without disassociate=True.") 

508 if not unstore: 

509 raise TypeError("Cannot pass purge=True without unstore=True.") 

510 elif disassociate: 

511 # No tagged collections for this butler. 

512 raise TypeError("Cannot pass disassociate=True without purge=True.") 

513 

514 refs = list(refs) 

515 

516 # Pruning a component of a DatasetRef makes no sense. 

517 for ref in refs: 

518 if ref.datasetType.component(): 

519 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

520 

521 if unstore: 

522 self.datastore.trash(refs) 

523 if purge: 

524 for ref in refs: 

525 # We only care about removing them from the actual output refs. 

526 self._actual_output_refs.discard(ref) 

527 

528 if unstore: 

529 # Point of no return for removing artifacts 

530 self.datastore.emptyTrash() 

531 

532 def extract_provenance_data(self) -> QuantumProvenanceData: 

533 """Extract provenance information and datastore records from this 

534 butler. 

535 

536 Returns 

537 ------- 

538 provenance : `QuantumProvenanceData` 

539 A serializable struct containing input/output dataset IDs and 

540 datastore records. This assumes all dataset IDs are UUIDs (just to 

541 make it easier for `pydantic` to reason about the struct's types); 

542 the rest of this class makes no such assumption, but the approach 

543 to processing in which it's useful effectively requires UUIDs 

544 anyway. 

545 

546 Notes 

547 ----- 

548 `QuantumBackedButler` records this provenance information when its 

549 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask` 

550 authors from having to worry about it, while still recording very 

551 detailed information. But it has two small weaknesses: 

552 

553 - Calling `getDirectDeferred` or `getDirect` is enough to mark a 

554 dataset as an "actual input", which may mark some datasets that 

555 aren't actually used. We rely on task authors to use 

556 `markInputUnused` to address this. 

557 

558 - We assume that the execution system will call ``stored`` or ``stored_many`` 

559 on all predicted inputs prior to execution, in order to populate the 

560 "available inputs" set. This is what I envision 

561 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it 

562 to use this class, but it feels fragile for this class to make such 

563 a strong assumption about how it will be used, even if I can't think 

564 of any other executor behavior that would make sense. 

565 """ 

566 if not self._actual_inputs.isdisjoint(self._unavailable_inputs): 

567 _LOG.warning( 

568 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) " 

569 "was obtained, but did not actually exist. This task should be be using markInputUnused " 

570 "directly to clarify its provenance.", 

571 self._actual_inputs & self._unavailable_inputs, 

572 ) 

573 self._actual_inputs -= self._unavailable_inputs 

574 checked_inputs = self._available_inputs | self._unavailable_inputs 

575 if self._predicted_inputs != checked_inputs: 

576 _LOG.warning( 

577 "Execution harness did not check predicted inputs %s for existence; available inputs " 

578 "recorded in provenance may be incomplete.", 

579 self._predicted_inputs - checked_inputs, 

580 ) 

581 datastore_records = self.datastore.export_records(self._actual_output_refs) 

582 provenance_records = { 

583 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items() 

584 } 

585 

586 return QuantumProvenanceData( 

587 predicted_inputs=self._predicted_inputs, 

588 available_inputs=self._available_inputs, 

589 actual_inputs=self._actual_inputs, 

590 predicted_outputs=self._predicted_outputs, 

591 actual_outputs={ref.id for ref in self._actual_output_refs}, 

592 datastore_records=provenance_records, 

593 ) 
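After execution, the harness can persist the returned struct as the per-quantum JSON file mentioned in the class notes. A minimal sketch, using pydantic v1-style serialization and a hypothetical output filename:

    # Write per-quantum provenance to JSON after the task has run.
    provenance = qbb.extract_provenance_data()
    with open("quantum_provenance.json", "w") as stream:  # hypothetical filename
        stream.write(provenance.json())                   # pydantic v1-style export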

594 

595 

596class QuantumProvenanceData(BaseModel): 

597 """A serializable struct for per-quantum provenance information and 

598 datastore records. 

599 

600 Notes 

601 ----- 

602 This class slightly duplicates information from the `Quantum` class itself 

603 (the `predicted_inputs` and `predicted_outputs` sets should have the same 

604 IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it 

605 assumes the original `Quantum` is also available to reconstruct the 

606 complete provenance (e.g. by associating dataset IDs with data IDs, 

607 dataset types, and `~CollectionType.RUN` names). 

608 

609 Note that the ``pydantic`` method ``parse_raw()`` will not work 

610 correctly for this class; use the `direct` method instead. 

611 """ 

612 

613 # This class probably should have information about its execution 

614 # environment (anything not controlled and recorded at the 

615 `~CollectionType.RUN` level, such as the compute node ID), but adding it 

616 # now is out of scope for this prototype. 

617 

618 predicted_inputs: set[uuid.UUID] 

619 """Unique IDs of datasets that were predicted as inputs to this quantum 

620 when the QuantumGraph was built. 

621 """ 

622 

623 available_inputs: set[uuid.UUID] 

624 """Unique IDs of input datasets that were actually present in the datastore 

625 when this quantum was executed. 

626 

627 This is a subset of `predicted_inputs`, with the difference generally being 

628 datasets that were `predicted_outputs` but not `actual_outputs` of some upstream 

629 task. 

630 """ 

631 

632 actual_inputs: set[uuid.UUID] 

633 """Unique IDs of datasets that were actually used as inputs by this task. 

634 

635 This is a subset of `available_inputs`. 

636 

637 Notes 

638 ----- 

639 The criterion for marking an input as used is that rerunning the quantum 

640 with only these `actual_inputs` available must yield identical outputs. 

641 This means that (for example) even just using an input to help determine 

642 an output rejection criterion and then rejecting it as an outlier qualifies 

643 that input as actually used. 

644 """ 

645 

646 predicted_outputs: set[uuid.UUID] 

647 """Unique IDs of datasets that were predicted as outputs of this quantum 

648 when the QuantumGraph was built. 

649 """ 

650 

651 actual_outputs: set[uuid.UUID] 

652 """Unique IDs of datasets that were actually written when this quantum 

653 was executed. 

654 """ 

655 

656 datastore_records: dict[str, SerializedDatastoreRecordData] 

657 """Datastore records indexed by datastore name.""" 

658 

659 @staticmethod 

660 def collect_and_transfer( 

661 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData] 

662 ) -> None: 

663 """Transfer output datasets from multiple quanta to a more permantent 

664 `Butler` repository. 

665 

666 Parameters 

667 ---------- 

668 butler : `Butler` 

669 Full butler representing the data repository to transfer datasets 

670 to. 

671 quanta : `~collections.abc.Iterable` [ `Quantum` ] 

672 Iterable of `Quantum` objects that carry information about 

673 predicted outputs. May be a single-pass iterator. 

674 provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ] 

675 Provenance and datastore data for each of the given quanta, in the 

676 same order. May be a single-pass iterator. 

677 

678 Notes 

679 ----- 

680 Input-output provenance data is not actually transferred yet, because 

681 `Registry` has no place to store it. 

682 

683 This method probably works most efficiently if run on all quanta for a 

684 single task label at once, because this will gather all datasets of 

685 a particular type together into a single vectorized `Registry` import. 

686 It should still behave correctly if run on smaller groups of quanta 

687 or even quanta from multiple tasks. 

688 

689 Currently this method transfers datastore record data unchanged, with 

690 no possibility of actually moving (e.g.) files. Datastores that are 

691 present only in execution or only in the more permanent butler are 

692 ignored. 

693 """ 

694 grouped_refs = defaultdict(list) 

695 summary_records: dict[str, DatastoreRecordData] = {} 

696 for quantum, provenance_for_quantum in zip(quanta, provenance): 

697 quantum_refs_by_id = { 

698 ref.id: ref 

699 for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

700 if ref.id in provenance_for_quantum.actual_outputs 

701 } 

702 for ref in quantum_refs_by_id.values(): 

703 grouped_refs[ref.datasetType, ref.run].append(ref) 

704 

705 # merge datastore records into a summary structure 

706 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items(): 

707 quantum_records = DatastoreRecordData.from_simple(serialized_records) 

708 if (records := summary_records.get(datastore_name)) is not None: 

709 records.update(quantum_records) 

710 else: 

711 summary_records[datastore_name] = quantum_records 

712 

713 for refs in grouped_refs.values(): 

714 butler.registry._importDatasets(refs) 

715 butler.datastore.import_records(summary_records) 
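A sketch of how a workflow driver might use this method once all quanta for a task label have run; `full_butler`, `quanta`, and `provenance_structs` are assumed to be built up by the driver (they are not defined in this module):

    # Merge outputs of many quanta into the central repository in one call.
    QuantumProvenanceData.collect_and_transfer(
        full_butler,         # writeable lsst.daf.butler.Butler
        quanta,              # Quantum objects, in the same order as provenance
        provenance_structs,  # QuantumProvenanceData instances, one per quantum
    )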

716 

717 @classmethod 

718 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData: 

719 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.") 

720 

721 @classmethod 

722 def direct( 

723 cls, 

724 *, 

725 predicted_inputs: Iterable[str | uuid.UUID], 

726 available_inputs: Iterable[str | uuid.UUID], 

727 actual_inputs: Iterable[str | uuid.UUID], 

728 predicted_outputs: Iterable[str | uuid.UUID], 

729 actual_outputs: Iterable[str | uuid.UUID], 

730 datastore_records: Mapping[str, Mapping], 

731 ) -> QuantumProvenanceData: 

732 """Construct an instance directly without validators. 

733 

734 This differs from the pydantic "construct" method in that the 

735 arguments are explicitly what the model requires, and it will recurse 

736 through members, constructing them from their corresponding `direct` 

737 methods. 

738 

739 This method should only be called when the inputs are trusted. 

740 """ 

741 

742 def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]: 

743 """Convert input UUIDs, which could be in string representation to 

744 a set of `UUID` instances. 

745 """ 

746 return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids} 

747 

748 data = QuantumProvenanceData.__new__(cls) 

749 setter = object.__setattr__ 

750 setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs)) 

751 setter(data, "available_inputs", _to_uuid_set(available_inputs)) 

752 setter(data, "actual_inputs", _to_uuid_set(actual_inputs)) 

753 setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs)) 

754 setter(data, "actual_outputs", _to_uuid_set(actual_outputs)) 

755 setter( 

756 data, 

757 "datastore_records", 

758 { 

759 key: SerializedDatastoreRecordData.direct(**records) 

760 for key, records in datastore_records.items() 

761 }, 

762 ) 

763 return data
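Because `parse_raw()` is disabled for this class, reading a provenance file back means parsing the JSON yourself and handing the fields to `direct()`. A minimal sketch under that assumption, with a hypothetical filename matching the earlier export example:

    import json

    from lsst.daf.butler import QuantumProvenanceData

    # Rebuild a QuantumProvenanceData from a JSON file written earlier.
    with open("quantum_provenance.json") as stream:  # hypothetical filename
        raw = json.load(stream)
    provenance = QuantumProvenanceData.direct(
        predicted_inputs=raw["predicted_inputs"],
        available_inputs=raw["available_inputs"],
        actual_inputs=raw["actual_inputs"],
        predicted_outputs=raw["predicted_outputs"],
        actual_outputs=raw["actual_outputs"],
        datastore_records=raw["datastore_records"],
    )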