# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any

from deprecated.sphinx import deprecated
from lsst.resources import ResourcePathExpression

try:
    from pydantic.v1 import BaseModel
except ModuleNotFoundError:
    from pydantic import BaseModel  # type: ignore

from ._butlerConfig import ButlerConfig
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    SerializedDatastoreRecordData,
    StorageClass,
    StorageClassFactory,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)


class _DatasetRecordStorageManagerDatastoreConstructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec


class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be read from this butler.
    predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be stored in this butler.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.
    dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], optional
        Mapping of the dataset type name to its registry definition.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is).
    These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
    """
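
    # A minimal usage sketch of the JSON-provenance pattern described above
    # (illustrative only; ``quantum``, ``universe``, the config path, and the
    # output filename are hypothetical stand-ins):
    #
    #     qbb = QuantumBackedButler.initialize(
    #         config="/path/to/butler.yaml",
    #         quantum=quantum,
    #         dimensions=universe,
    #     )
    #     ... run the task, calling qbb.get() / qbb.put() on its refs ...
    #     provenance = qbb.extract_provenance_data()
    #     with open("provenance.json", "w") as stream:
    #         stream.write(provenance.json())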

    def __init__(
        self,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore: Datastore,
        storageClasses: StorageClassFactory,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ):
        self._dimensions = dimensions
        self._predicted_inputs = set(predicted_inputs)
        self._predicted_outputs = set(predicted_outputs)
        self._available_inputs: set[DatasetId] = set()
        self._unavailable_inputs: set[DatasetId] = set()
        self._actual_inputs: set[DatasetId] = set()
        self._actual_output_refs: set[DatasetRef] = set()
        self._datastore = datastore
        self.storageClasses = storageClasses
        self._dataset_types: Mapping[str, DatasetType] = {}
        if dataset_types is not None:
            self._dataset_types = dataset_types
        self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)

    @classmethod
    def initialize(
        cls,
        config: Config | ResourcePathExpression,
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: list[str] | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
                optional
            Mapping of the dataset type name to its registry definition.
        """
        predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
        predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
        predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=quantum.datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def from_predicted(
        cls,
        config: Config | ResourcePathExpression,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore_records: Mapping[str, DatastoreRecordData],
        filename: str = ":memory:",
        OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: list[str] | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from sets of input and output
        dataset IDs.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler; these
            must be fully resolved.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
                optional
            Mapping of the dataset type name to its registry definition.
        """
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def _initialize(
        cls,
        *,
        config: Config | ResourcePathExpression,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
        OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: list[str] | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Initialize the quantum-backed butler.

        Internal method with common implementation used by `initialize` and
        `from_predicted`.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
                optional
            Mapping of the dataset type name to its registry definition.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        if "root" in butler_config:
            butler_root = butler_config["root"]
        else:
            butler_root = butler_config.configDir
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        if datastore_records is not None:
            datastore.import_records(datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(
            predicted_inputs,
            predicted_outputs,
            dimensions,
            datastore,
            storageClasses=storageClasses,
            dataset_types=dataset_types,
        )

    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        return self._dataset_types.get(name)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    @deprecated(
        reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
        " Please use Butler.get(). Will be removed after v27.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirect(
        self,
        ref: DatasetRef,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> Any:
        # Docstring inherited.
        return self.get(ref, parameters=parameters, storageClass=storageClass)

    def get(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        try:
            obj = super().get(
                ref,
                parameters=parameters,
                storageClass=storageClass,
            )
        except (LookupError, FileNotFoundError, OSError):
            self._unavailable_inputs.add(ref.id)
            raise
        if ref.id in self._predicted_inputs:
            # Do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    @deprecated(
        reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
        "Please use Butler.getDeferred(). Will be removed after v27.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirectDeferred(
        self,
        ref: DatasetRef,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        return self.getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def getDeferred(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def stored(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        stored = super().stored(ref)
        if ref.id in self._predicted_inputs:
            if stored:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return stored

    def stored_many(
        self,
        refs: Iterable[DatasetRef],
    ) -> dict[DatasetRef, bool]:
        # Docstring inherited.
        existence = super().stored_many(refs)

        for ref, stored in existence.items():
            if ref.id in self._predicted_inputs:
                if stored:
                    self._available_inputs.add(ref.id)
                else:
                    self._unavailable_inputs.add(ref.id)
        return existence

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.id)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self._datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def pruneDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        disassociate: bool = True,
        unstore: bool = False,
        tags: Iterable[str] = (),
        purge: bool = False,
    ) -> None:
        # Docstring inherited from LimitedButler.

        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            # No tagged collections for this butler.
            raise TypeError("Cannot pass disassociate=True without purge=True.")

        refs = list(refs)

        # Pruning a component of a DatasetRef makes no sense.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Cannot prune a component of a dataset (ref={ref}).")

        if unstore:
            self._datastore.trash(refs)
        if purge:
            for ref in refs:
                # We only care about removing them from actual output refs.
                self._actual_output_refs.discard(ref)

        if unstore:
            # Point of no return for removing artifacts.
            self._datastore.emptyTrash()
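
    # A quick illustration of the flag constraints above (``qbb`` and ``ref``
    # are hypothetical): fully removing a predicted output needs all three
    # flags, while unstoring alone must disable disassociation:
    #
    #     qbb.pruneDatasets([ref], disassociate=True, unstore=True, purge=True)
    #     qbb.pruneDatasets([ref], disassociate=False, unstore=True)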

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
        """
        if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
            _LOG.warning(
                "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle "
                "was obtained) but did not actually exist. This task should be using markInputUnused "
                "directly to clarify its provenance.",
                self._actual_inputs & self._unavailable_inputs,
            )
            self._actual_inputs -= self._unavailable_inputs
        checked_inputs = self._available_inputs | self._unavailable_inputs
        if self._predicted_inputs != checked_inputs:
            _LOG.warning(
                "Execution harness did not check predicted inputs %s for existence; available inputs "
                "recorded in provenance may be incomplete.",
                self._predicted_inputs - checked_inputs,
            )
        datastore_records = self._datastore.export_records(self._actual_output_refs)
        provenance_records = {
            datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
        }

        return QuantumProvenanceData(
            predicted_inputs=self._predicted_inputs,
            available_inputs=self._available_inputs,
            actual_inputs=self._actual_inputs,
            predicted_outputs=self._predicted_outputs,
            actual_outputs={ref.id for ref in self._actual_output_refs},
            datastore_records=provenance_records,
        )
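
    # A sketch of the execution-harness pattern the Notes above assume
    # (illustrative only; ``qbb``, ``predicted_input_refs``, and ``task`` are
    # hypothetical):
    #
    #     qbb.stored_many(predicted_input_refs)  # populates "available inputs"
    #     task.run(...)                          # calls qbb.get() / qbb.put()
    #     qbb.markInputUnused(unused_ref)        # if the task did not use it
    #     provenance = qbb.extract_provenance_data()
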

class QuantumProvenanceData(BaseModel):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the
    same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` method ``parse_raw()`` will not work correctly
    for this class; use the `direct` method instead.
    """

    # This class probably should have information about its execution
    # environment (anything not controlled and recorded at the
    # `~CollectionType.RUN` level, such as the compute node ID), but adding it
    # now is out of scope for this prototype.

    predicted_inputs: set[uuid.UUID]
    """Unique IDs of datasets that were predicted as inputs to this quantum
    when the QuantumGraph was built.
    """

    available_inputs: set[uuid.UUID]
    """Unique IDs of input datasets that were actually present in the
    datastore when this quantum was executed.

    This is a subset of ``predicted_inputs``, with the difference generally
    being datasets that were ``predicted_outputs`` but not ``actual_outputs``
    of some upstream task.
    """

    actual_inputs: set[uuid.UUID]
    """Unique IDs of datasets that were actually used as inputs by this task.

    This is a subset of ``available_inputs``.

    Notes
    -----
    The criterion for marking an input as used is that rerunning the quantum
    with only these ``actual_inputs`` available must yield identical outputs.
    This means that (for example) even just using an input to help determine
    an output rejection criterion and then rejecting it as an outlier
    qualifies that input as actually used.
    """

    predicted_outputs: set[uuid.UUID]
    """Unique IDs of datasets that were predicted as outputs of this quantum
    when the QuantumGraph was built.
    """

    actual_outputs: set[uuid.UUID]
    """Unique IDs of datasets that were actually written when this quantum
    was executed.
    """

    datastore_records: dict[str, SerializedDatastoreRecordData]
    """Datastore records indexed by datastore name."""

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `~collections.abc.Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
        """
        grouped_refs = defaultdict(list)
        summary_records: dict[str, DatastoreRecordData] = {}
        for quantum, provenance_for_quantum in zip(quanta, provenance):
            quantum_refs_by_id = {
                ref.id: ref
                for ref in itertools.chain.from_iterable(quantum.outputs.values())
                if ref.id in provenance_for_quantum.actual_outputs
            }
            for ref in quantum_refs_by_id.values():
                grouped_refs[ref.datasetType, ref.run].append(ref)

            # Merge datastore records into a summary structure.
            for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
                quantum_records = DatastoreRecordData.from_simple(serialized_records)
                if (records := summary_records.get(datastore_name)) is not None:
                    records.update(quantum_records)
                else:
                    summary_records[datastore_name] = quantum_records

        for refs in grouped_refs.values():
            butler.registry._importDatasets(refs)
        butler._datastore.import_records(summary_records)
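
    # A hedged usage sketch (``butler``, ``quanta``, and the provenance file
    # paths are hypothetical): after a batch run, per-quantum JSON files could
    # be read back and ingested into the permanent repository in one call:
    #
    #     provenance = [
    #         QuantumProvenanceData.direct(**json.loads(path.read_text()))
    #         for path in provenance_paths
    #     ]
    #     QuantumProvenanceData.collect_and_transfer(butler, quanta, provenance)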

    @classmethod
    def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
        raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")

    @classmethod
    def direct(
        cls,
        *,
        predicted_inputs: Iterable[str | uuid.UUID],
        available_inputs: Iterable[str | uuid.UUID],
        actual_inputs: Iterable[str | uuid.UUID],
        predicted_outputs: Iterable[str | uuid.UUID],
        actual_outputs: Iterable[str | uuid.UUID],
        datastore_records: Mapping[str, Mapping],
    ) -> QuantumProvenanceData:
        """Construct an instance directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """

        def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]:
            """Convert input UUIDs, which could be in string representation,
            to a set of `UUID` instances.
            """
            return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}

        data = QuantumProvenanceData.__new__(cls)
        setter = object.__setattr__
        setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
        setter(data, "available_inputs", _to_uuid_set(available_inputs))
        setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
        setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
        setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
        setter(
            data,
            "datastore_records",
            {
                key: SerializedDatastoreRecordData.direct(**records)
                for key, records in datastore_records.items()
            },
        )
        return data
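
    # Since ``parse_raw()`` is disabled above, a saved provenance file could be
    # reconstructed via ``direct()`` instead (illustrative; the filename is
    # hypothetical):
    #
    #     with open("provenance.json") as stream:
    #         data = QuantumProvenanceData.direct(**json.load(stream))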