Coverage for python/lsst/daf/butler/_quantum_backed.py: 34%

192 statements  

coverage.py v7.3.2, created at 2023-12-05 11:07 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30from . import ddl 

31 

32__all__ = ("QuantumBackedButler", "QuantumProvenanceData") 

33 

34import itertools 

35import logging 

36import uuid 

37from collections import defaultdict 

38from collections.abc import Iterable, Mapping 

39from typing import TYPE_CHECKING, Any 

40 

41from deprecated.sphinx import deprecated 

42from lsst.resources import ResourcePathExpression 

43 

44from ._butler_config import ButlerConfig 

45from ._compat import _BaseModelCompat 

46from ._config import Config 

47from ._dataset_ref import DatasetId, DatasetRef 

48from ._dataset_type import DatasetType 

49from ._deferredDatasetHandle import DeferredDatasetHandle 

50from ._limited_butler import LimitedButler 

51from ._quantum import Quantum 

52from ._storage_class import StorageClass, StorageClassFactory 

53from .datastore import Datastore 

54from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData 

55from .dimensions import DimensionUniverse 

56from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager 

57from .registry.databases.sqlite import SqliteDatabase 

58from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager 

59from .registry.opaque import ByNameOpaqueTableStorageManager 

60 

61if TYPE_CHECKING: 

62 from ._butler import Butler 

63 

64_LOG = logging.getLogger(__name__) 

65 

66 

67class _DatasetRecordStorageManagerDatastoreConstructionMimic: 

68 """A partial implementation of `DatasetRecordStorageManager` that exists 

69 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`) 

70 to be constructed without a full `Registry`. 

71 

72 Notes 

73 ----- 

74 The interface implemented by this class should probably be its own ABC, 

75 and that ABC should probably be used in the definition of 

76 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep 

77 changes minimal. 

78 """ 

79 

80 @classmethod 

81 def getIdColumnType(cls) -> type: 

82 # Docstring inherited. 

83 return ddl.GUID 

84 

85 @classmethod 

86 def addDatasetForeignKey( 

87 cls, 

88 tableSpec: ddl.TableSpec, 

89 *, 

90 name: str = "dataset", 

91 constraint: bool = True, 

92 onDelete: str | None = None, 

93 **kwargs: Any, 

94 ) -> ddl.FieldSpec: 

95 # Docstring inherited. 

96 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs) 

97 tableSpec.fields.add(idFieldSpec) 

98 return idFieldSpec 

99 

100 

101class QuantumBackedButler(LimitedButler): 

102 """An implementation of `LimitedButler` intended to back execution of a 

103 single `Quantum`. 

104 

105 Parameters 

106 ---------- 

107 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

108 Dataset IDs for datasets that can be read from this butler.

109 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

110 Dataset IDs for datasets that can be stored in this butler. 

111 dimensions : `DimensionUniverse` 

112 Object managing all dimension definitions. 

113 datastore : `Datastore` 

114 Datastore to use for all dataset I/O and existence checks. 

115 storageClasses : `StorageClassFactory` 

116 Object managing all storage class definitions.

dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], optional

Mapping of the dataset type name to its registry definition.

117 

118 Notes 

119 ----- 

120 Most callers should use the `initialize` `classmethod` to construct new 

121 instances instead of calling the constructor directly. 

122 

123 `QuantumBackedButler` uses a SQLite database internally, in order to reuse 

124 existing `DatastoreRegistryBridge` and `OpaqueTableStorage` 

125 implementations that rely on SQLAlchemy. If implementations are added in the

126 future that don't rely on SQLAlchemy, it should be possible to swap them 

127 in by overriding the type arguments to `initialize` (though at present, 

128 `QuantumBackedButler` would still create at least an in-memory SQLite 

129 database that would then go unused).

130 

131 We imagine `QuantumBackedButler` being used during (at least) batch 

132 execution to capture `Datastore` records and save them to per-quantum 

133 files, which are also a convenient place to store provenance for eventual 

134 upload to a SQL-backed `Registry` (once `Registry` has tables to store 

135 provenance, that is). 

136 These per-quantum files can be written in two ways: 

137 

138 - The SQLite file used internally by `QuantumBackedButler` can be used 

139 directly by customizing the ``filename`` argument to ``initialize``, and

140 then transferring that file to the object store after execution completes 

141 (or fails; a ``try/finally`` pattern probably makes sense here). 

142 

143 - A JSON or YAML file can be written by calling `extract_provenance_data`, 

144 and using ``pydantic`` methods to write the returned 

145 `QuantumProvenanceData` to a file. 

146 

147 Note that at present, the SQLite file only contains datastore records, not 

148 provenance, but that should be easy to address (if desired) after we 

149 actually design a `Registry` schema for provenance. I also suspect that 

150 we'll want to explicitly close the SQLite file somehow before trying to 

151 transfer it. But I'm guessing we'd prefer to write the per-quantum files 

152 as JSON anyway. 
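
Examples
--------
A minimal, illustrative sketch only; ``butler.yaml``, ``quantum``,
``universe``, the refs, and the in-memory objects below are placeholders
rather than real repository content:

>>> qbb = QuantumBackedButler.initialize(
...     config="butler.yaml", quantum=quantum, dimensions=universe
... )
>>> data = qbb.get(input_ref)
>>> qbb.put(result, output_ref)
>>> provenance = qbb.extract_provenance_data()
>>> with open("quantum_provenance.json", "w") as stream:
...     stream.write(provenance.json())

Depending on the ``pydantic`` version in use, ``provenance.json()`` may
instead be spelled ``provenance.model_dump_json()``.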

153 """ 

154 

155 def __init__( 

156 self, 

157 predicted_inputs: Iterable[DatasetId], 

158 predicted_outputs: Iterable[DatasetId], 

159 dimensions: DimensionUniverse, 

160 datastore: Datastore, 

161 storageClasses: StorageClassFactory, 

162 dataset_types: Mapping[str, DatasetType] | None = None, 

163 ): 

164 self._dimensions = dimensions 

165 self._predicted_inputs = set(predicted_inputs) 

166 self._predicted_outputs = set(predicted_outputs) 

167 self._available_inputs: set[DatasetId] = set() 

168 self._unavailable_inputs: set[DatasetId] = set() 

169 self._actual_inputs: set[DatasetId] = set() 

170 self._actual_output_refs: set[DatasetRef] = set() 

171 self._datastore = datastore 

172 self.storageClasses = storageClasses 

173 self._dataset_types: Mapping[str, DatasetType] = {} 

174 if dataset_types is not None: 

175 self._dataset_types = dataset_types 

176 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

177 

178 @classmethod 

179 def initialize( 

180 cls, 

181 config: Config | ResourcePathExpression, 

182 quantum: Quantum, 

183 dimensions: DimensionUniverse, 

184 filename: str = ":memory:", 

185 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

186 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

187 search_paths: list[str] | None = None, 

188 dataset_types: Mapping[str, DatasetType] | None = None, 

189 ) -> QuantumBackedButler: 

190 """Construct a new `QuantumBackedButler` from repository configuration 

191 and helper types. 

192 

193 Parameters 

194 ---------- 

195 config : `Config` or `~lsst.resources.ResourcePathExpression` 

196 A butler repository root, configuration filename, or configuration 

197 instance. 

198 quantum : `Quantum` 

199 Object describing the predicted input and output datasets relevant

200 to this butler. This must have resolved `DatasetRef` instances for 

201 all inputs and outputs. 

202 dimensions : `DimensionUniverse` 

203 Object managing all dimension definitions. 

204 filename : `str`, optional 

205 Name for the SQLite database that will back this butler; defaults 

206 to an in-memory database. 

207 OpaqueManagerClass : `type`, optional 

208 A subclass of `OpaqueTableStorageManager` to use for datastore 

209 opaque records. Default is a SQL-backed implementation. 

210 BridgeManagerClass : `type`, optional 

211 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

212 location records. Default is a SQL-backed implementation. 

213 search_paths : `list` of `str`, optional 

214 Additional search paths for butler configuration. 

215 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \

216 optional 

217 Mapping of the dataset type name to its registry definition. 

218 """ 

219 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())] 

220 predicted_inputs += [ref.id for ref in quantum.initInputs.values()] 

221 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())] 

222 return cls._initialize( 

223 config=config, 

224 predicted_inputs=predicted_inputs, 

225 predicted_outputs=predicted_outputs, 

226 dimensions=dimensions, 

227 filename=filename, 

228 datastore_records=quantum.datastore_records, 

229 OpaqueManagerClass=OpaqueManagerClass, 

230 BridgeManagerClass=BridgeManagerClass, 

231 search_paths=search_paths, 

232 dataset_types=dataset_types, 

233 ) 

234 

235 @classmethod 

236 def from_predicted( 

237 cls, 

238 config: Config | ResourcePathExpression, 

239 predicted_inputs: Iterable[DatasetId], 

240 predicted_outputs: Iterable[DatasetId], 

241 dimensions: DimensionUniverse, 

242 datastore_records: Mapping[str, DatastoreRecordData], 

243 filename: str = ":memory:", 

244 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

245 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

246 search_paths: list[str] | None = None, 

247 dataset_types: Mapping[str, DatasetType] | None = None, 

248 ) -> QuantumBackedButler: 

249 """Construct a new `QuantumBackedButler` from sets of input and output 

250 dataset IDs. 

251 

252 Parameters 

253 ---------- 

254 config : `Config` or `~lsst.resources.ResourcePathExpression` 

255 A butler repository root, configuration filename, or configuration 

256 instance. 

257 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

258 Dataset IDs for datasets that can be read from this butler.

259 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

260 Dataset IDs for datasets that can be stored in this butler; these must be

261 fully resolved. 

262 dimensions : `DimensionUniverse` 

263 Object managing all dimension definitions. 

264 filename : `str`, optional 

265 Name for the SQLite database that will back this butler; defaults 

266 to an in-memory database. 

267 datastore_records : `~collections.abc.Mapping` [`str`, `DatastoreRecordData`]

268 Datastore records to import into a datastore. 

269 OpaqueManagerClass : `type`, optional 

270 A subclass of `OpaqueTableStorageManager` to use for datastore 

271 opaque records. Default is a SQL-backed implementation. 

272 BridgeManagerClass : `type`, optional 

273 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

274 location records. Default is a SQL-backed implementation. 

275 search_paths : `list` of `str`, optional 

276 Additional search paths for butler configuration. 

277 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \

278 optional 

279 Mapping of the dataset type name to its registry definition. 
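
Examples
--------
Illustrative only; ``input_ids``, ``output_ids``, ``records``, and
``universe`` are placeholders for dataset IDs and datastore records
extracted elsewhere (e.g. from a quantum graph):

>>> qbb = QuantumBackedButler.from_predicted(
...     config="butler.yaml",
...     predicted_inputs=input_ids,
...     predicted_outputs=output_ids,
...     dimensions=universe,
...     datastore_records=records,
... )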

280 """ 

281 return cls._initialize( 

282 config=config, 

283 predicted_inputs=predicted_inputs, 

284 predicted_outputs=predicted_outputs, 

285 dimensions=dimensions, 

286 filename=filename, 

287 datastore_records=datastore_records, 

288 OpaqueManagerClass=OpaqueManagerClass, 

289 BridgeManagerClass=BridgeManagerClass, 

290 search_paths=search_paths, 

291 dataset_types=dataset_types, 

292 ) 

293 

294 @classmethod 

295 def _initialize( 

296 cls, 

297 *, 

298 config: Config | ResourcePathExpression, 

299 predicted_inputs: Iterable[DatasetId], 

300 predicted_outputs: Iterable[DatasetId], 

301 dimensions: DimensionUniverse, 

302 filename: str = ":memory:", 

303 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

304 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

305 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

306 search_paths: list[str] | None = None, 

307 dataset_types: Mapping[str, DatasetType] | None = None, 

308 ) -> QuantumBackedButler: 

309 """Initialize quantum-backed butler. 

310 

311 Internal method with common implementation used by `initialize` and 

312 `from_predicted`.

313 

314 Parameters 

315 ---------- 

316 config : `Config` or `~lsst.resources.ResourcePathExpression` 

317 A butler repository root, configuration filename, or configuration 

318 instance. 

319 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

320 Dataset IDs for datasets that can be read from this butler.

321 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

322 Dataset IDs for datasets that can be stored in this butler. 

323 dimensions : `DimensionUniverse` 

324 Object managing all dimension definitions. 

325 filename : `str`, optional 

326 Name for the SQLite database that will back this butler; defaults 

327 to an in-memory database. 

328 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None` 

329 Datastore records to import into a datastore. 

330 OpaqueManagerClass : `type`, optional 

331 A subclass of `OpaqueTableStorageManager` to use for datastore 

332 opaque records. Default is a SQL-backed implementation. 

333 BridgeManagerClass : `type`, optional 

334 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

335 location records. Default is a SQL-backed implementation. 

336 search_paths : `list` of `str`, optional 

337 Additional search paths for butler configuration. 

338 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]

339 Mapping of the dataset type name to its registry definition. 

340 """ 

341 butler_config = ButlerConfig(config, searchPaths=search_paths) 

342 butler_root = butler_config.get("root", butler_config.configDir) 

343 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0) 

344 with db.declareStaticTables(create=True) as context: 

345 opaque_manager = OpaqueManagerClass.initialize(db, context) 

346 bridge_manager = BridgeManagerClass.initialize( 

347 db, 

348 context, 

349 opaque=opaque_manager, 

350 # MyPy can tell it's a fake, but we know it shouldn't care. 

351 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore 

352 universe=dimensions, 

353 ) 

354 # TODO: We need to inform `Datastore` here that it needs to support 

355 # predictive reads; right now that's a configuration option, but after 

356 # execution butler is retired it could just be a kwarg we pass here. 

357 # For now just force this option as we cannot work without it. 

358 butler_config["datastore", "trust_get_request"] = True 

359 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root) 

360 if datastore_records is not None: 

361 datastore.import_records(datastore_records) 

362 storageClasses = StorageClassFactory() 

363 storageClasses.addFromConfig(butler_config) 

364 return cls( 

365 predicted_inputs, 

366 predicted_outputs, 

367 dimensions, 

368 datastore, 

369 storageClasses=storageClasses, 

370 dataset_types=dataset_types, 

371 ) 

372 

373 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

374 """Return DatasetType defined in registry given dataset type name.""" 

375 return self._dataset_types.get(name) 

376 

377 def isWriteable(self) -> bool: 

378 # Docstring inherited. 

379 return True 

380 

381 # TODO: remove on DM-40067. 

382 @deprecated( 

383 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

384 " Please use Butler.get(). Will be removed after v26.0.", 

385 version="v26.0", 

386 category=FutureWarning, 

387 ) 

388 def getDirect( 

389 self, 

390 ref: DatasetRef, 

391 *, 

392 parameters: dict[str, Any] | None = None, 

393 storageClass: str | StorageClass | None = None, 

394 ) -> Any: 

395 # Docstring inherited. 

396 return self.get(ref, parameters=parameters, storageClass=storageClass) 

397 

398 def get( 

399 self, 

400 ref: DatasetRef, 

401 /, 

402 *, 

403 parameters: dict[str, Any] | None = None, 

404 storageClass: StorageClass | str | None = None, 

405 ) -> Any: 

406 try: 

407 obj = super().get( 

408 ref, 

409 parameters=parameters, 

410 storageClass=storageClass, 

411 ) 

412 except (LookupError, FileNotFoundError, OSError): 

413 self._unavailable_inputs.add(ref.id) 

414 raise 

415 if ref.id in self._predicted_inputs: 

416 # do this after delegating to super in case that raises. 

417 self._actual_inputs.add(ref.id) 

418 self._available_inputs.add(ref.id) 

419 return obj 

420 

421 # TODO: remove on DM-40067. 

422 @deprecated( 

423 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

424 "Please use Butler.getDeferred(). Will be removed after v26.0.", 

425 version="v26.0", 

426 category=FutureWarning, 

427 ) 

428 def getDirectDeferred( 

429 self, 

430 ref: DatasetRef, 

431 *, 

432 parameters: dict[str, Any] | None = None, 

433 storageClass: str | StorageClass | None = None, 

434 ) -> DeferredDatasetHandle: 

435 # Docstring inherited. 

436 return self.getDeferred(ref, parameters=parameters, storageClass=storageClass) 

437 

438 def getDeferred( 

439 self, 

440 ref: DatasetRef, 

441 /, 

442 *, 

443 parameters: dict[str, Any] | None = None, 

444 storageClass: str | StorageClass | None = None, 

445 ) -> DeferredDatasetHandle: 

446 if ref.id in self._predicted_inputs: 

447 # Unfortunately, we can't do this after the handle succeeds in 

448 # loading, so it's conceivable here that we're marking an input 

449 # as "actual" even when it's not even available. 

450 self._actual_inputs.add(ref.id) 

451 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass) 

452 

453 def stored(self, ref: DatasetRef) -> bool: 

454 # Docstring inherited. 

455 stored = super().stored(ref) 

456 if ref.id in self._predicted_inputs: 

457 if stored: 

458 self._available_inputs.add(ref.id) 

459 else: 

460 self._unavailable_inputs.add(ref.id) 

461 return stored 

462 

463 def stored_many( 

464 self, 

465 refs: Iterable[DatasetRef], 

466 ) -> dict[DatasetRef, bool]: 

467 # Docstring inherited. 

468 existence = super().stored_many(refs) 

469 

470 for ref, stored in existence.items(): 

471 if ref.id in self._predicted_inputs: 

472 if stored: 

473 self._available_inputs.add(ref.id) 

474 else: 

475 self._unavailable_inputs.add(ref.id) 

476 return existence 

477 

478 def markInputUnused(self, ref: DatasetRef) -> None: 

479 # Docstring inherited. 

480 self._actual_inputs.discard(ref.id) 

481 

482 @property 

483 def dimensions(self) -> DimensionUniverse: 

484 # Docstring inherited. 

485 return self._dimensions 

486 

487 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

488 # Docstring inherited. 

489 if ref.id not in self._predicted_outputs: 

490 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.") 

491 self._datastore.put(obj, ref) 

492 self._actual_output_refs.add(ref) 

493 return ref 

494 

495 def pruneDatasets( 

496 self, 

497 refs: Iterable[DatasetRef], 

498 *, 

499 disassociate: bool = True, 

500 unstore: bool = False, 

501 tags: Iterable[str] = (), 

502 purge: bool = False, 

503 ) -> None: 

504 # docstring inherited from LimitedButler 

505 

506 if purge: 

507 if not disassociate: 

508 raise TypeError("Cannot pass purge=True without disassociate=True.") 

509 if not unstore: 

510 raise TypeError("Cannot pass purge=True without unstore=True.") 

511 elif disassociate: 

512 # No tagged collections for this butler. 

513 raise TypeError("Cannot pass disassociate=True without purge=True.") 

514 

515 refs = list(refs) 

516 

517 # Pruning a component of a DatasetRef makes no sense. 

518 for ref in refs: 

519 if ref.datasetType.component(): 

520 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

521 

522 if unstore: 

523 self._datastore.trash(refs) 

524 if purge: 

525 for ref in refs: 

526 # We only care about removing them from actual output refs.

527 self._actual_output_refs.discard(ref) 

528 

529 if unstore: 

530 # Point of no return for removing artifacts 

531 self._datastore.emptyTrash() 

532 

533 def extract_provenance_data(self) -> QuantumProvenanceData: 

534 """Extract provenance information and datastore records from this 

535 butler. 

536 

537 Returns 

538 ------- 

539 provenance : `QuantumProvenanceData` 

540 A serializable struct containing input/output dataset IDs and 

541 datastore records. This assumes all dataset IDs are UUIDs (just to 

542 make it easier for `pydantic` to reason about the struct's types); 

543 the rest of this class makes no such assumption, but the approach 

544 to processing in which it's useful effectively requires UUIDs 

545 anyway. 

546 

547 Notes 

548 ----- 

549 `QuantumBackedButler` records this provenance information when its 

550 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask` 

551 authors from having to worry about while still recording very 

552 detailed information. But it has two small weaknesses: 

553 

554 - Calling `getDirectDeferred` or `getDirect` is enough to mark a 

555 dataset as an "actual input", which may mark some datasets that 

556 aren't actually used. We rely on task authors to use 

557 `markInputUnused` to address this. 

558 

559 - We assume that the execution system will call ``stored`` or ``stored_many``

560 on all predicted inputs prior to execution, in order to populate the 

561 "available inputs" set. This is what I envision 

562 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it

563 to use this class, but it feels fragile for this class to make such 

564 a strong assumption about how it will be used, even if I can't think 

565 of any other executor behavior that would make sense. 
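
Examples
--------
A sketch of the harness-side pattern described above (illustrative only;
``qbb``, ``quantum``, and ``unused_ref`` are placeholders):

>>> input_refs = [ref for refs in quantum.inputs.values() for ref in refs]
>>> existence = qbb.stored_many(input_refs)  # records available/unavailable inputs
>>> # ... execute the task against ``qbb`` ...
>>> qbb.markInputUnused(unused_ref)  # for any input that did not matter
>>> provenance = qbb.extract_provenance_data()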

566 """ 

567 if not self._actual_inputs.isdisjoint(self._unavailable_inputs): 

568 _LOG.warning( 

569 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) " 

570 "was obtained, but did not actually exist. This task should be be using markInputUnused " 

571 "directly to clarify its provenance.", 

572 self._actual_inputs & self._unavailable_inputs, 

573 ) 

574 self._actual_inputs -= self._unavailable_inputs 

575 checked_inputs = self._available_inputs | self._unavailable_inputs 

576 if self._predicted_inputs != checked_inputs: 

577 _LOG.warning( 

578 "Execution harness did not check predicted inputs %s for existence; available inputs " 

579 "recorded in provenance may be incomplete.", 

580 self._predicted_inputs - checked_inputs, 

581 ) 

582 datastore_records = self._datastore.export_records(self._actual_output_refs) 

583 provenance_records = { 

584 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items() 

585 } 

586 

587 return QuantumProvenanceData( 

588 predicted_inputs=self._predicted_inputs, 

589 available_inputs=self._available_inputs, 

590 actual_inputs=self._actual_inputs, 

591 predicted_outputs=self._predicted_outputs, 

592 actual_outputs={ref.id for ref in self._actual_output_refs}, 

593 datastore_records=provenance_records, 

594 ) 

595 

596 

597class QuantumProvenanceData(_BaseModelCompat): 

598 """A serializable struct for per-quantum provenance information and 

599 datastore records. 

600 

601 Notes 

602 ----- 

603 This class slightly duplicates information from the `Quantum` class itself 

604 (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the 

605 same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it 

606 assumes the original `Quantum` is also available to reconstruct the 

607 complete provenance (e.g. by associating dataset IDs with data IDs, 

608 dataset types, and `~CollectionType.RUN` names).

609 

610 Note that the ``pydantic`` method ``parse_raw()`` will not work

611 correctly for this class; use the `direct` method instead.
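
Examples
--------
A sketch of reading a serialized instance back without validation
(illustrative only; the filename is hypothetical):

>>> import json
>>> with open("quantum_provenance.json") as stream:
...     provenance = QuantumProvenanceData.direct(**json.load(stream))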

612 """ 

613 

614 # This class probably should have information about its execution 

615 # environment (anything not controlled and recorded at the 

616 `~CollectionType.RUN` level, such as the compute node ID), but adding it

617 # now is out of scope for this prototype. 

618 

619 predicted_inputs: set[uuid.UUID] 

620 """Unique IDs of datasets that were predicted as inputs to this quantum 

621 when the QuantumGraph was built. 

622 """ 

623 

624 available_inputs: set[uuid.UUID] 

625 """Unique IDs of input datasets that were actually present in the datastore 

626 when this quantum was executed. 

627 

628 This is a subset of ``predicted_inputs``, with the difference generally 

629 being datasets that were ``predicted_outputs`` but not ``actual_outputs`` of

630 some upstream task. 

631 """ 

632 

633 actual_inputs: set[uuid.UUID] 

634 """Unique IDs of datasets that were actually used as inputs by this task. 

635 

636 This is a subset of ``available_inputs``. 

637 

638 Notes 

639 ----- 

640 The criterion for marking an input as used is that rerunning the quantum

641 with only these ``actual_inputs`` available must yield identical outputs. 

642 This means that (for example) even just using an input to help determine 

643 an output rejection criterion and then rejecting it as an outlier qualifies

644 that input as actually used. 

645 """ 

646 

647 predicted_outputs: set[uuid.UUID] 

648 """Unique IDs of datasets that were predicted as outputs of this quantum 

649 when the QuantumGraph was built. 

650 """ 

651 

652 actual_outputs: set[uuid.UUID] 

653 """Unique IDs of datasets that were actually written when this quantum 

654 was executed. 

655 """ 

656 

657 datastore_records: dict[str, SerializedDatastoreRecordData] 

658 """Datastore records indexed by datastore name.""" 

659 

660 @staticmethod 

661 def collect_and_transfer( 

662 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData] 

663 ) -> None: 

664 """Transfer output datasets from multiple quanta to a more permanent 

665 `Butler` repository. 

666 

667 Parameters 

668 ---------- 

669 butler : `Butler` 

670 Full butler representing the data repository to transfer datasets 

671 to. 

672 quanta : `~collections.abc.Iterable` [ `Quantum` ] 

673 Iterable of `Quantum` objects that carry information about 

674 predicted outputs. May be a single-pass iterator. 

675 provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ] 

676 Provenance and datastore data for each of the given quanta, in the 

677 same order. May be a single-pass iterator. 

678 

679 Notes 

680 ----- 

681 Input-output provenance data is not actually transferred yet, because 

682 `Registry` has no place to store it. 

683 

684 This method probably works most efficiently if run on all quanta for a 

685 single task label at once, because this will gather all datasets of 

686 a particular type together into a single vectorized `Registry` import. 

687 It should still behave correctly if run on smaller groups of quanta 

688 or even quanta from multiple tasks. 

689 

690 Currently this method transfers datastore record data unchanged, with 

691 no possibility of actually moving (e.g.) files. Datastores that are 

692 present only in execution or only in the more permanent butler are 

693 ignored. 
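
Examples
--------
Illustrative only; assumes the execution harness saved one provenance JSON
file per quantum (filenames are hypothetical) and that ``quanta`` is in the
same order as those files:

>>> import json
>>> provenance = []
>>> for filename in provenance_files:
...     with open(filename) as stream:
...         provenance.append(QuantumProvenanceData.direct(**json.load(stream)))
>>> QuantumProvenanceData.collect_and_transfer(butler, quanta, provenance)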

694 """ 

695 grouped_refs = defaultdict(list) 

696 summary_records: dict[str, DatastoreRecordData] = {} 

697 for quantum, provenance_for_quantum in zip(quanta, provenance, strict=True): 

698 quantum_refs_by_id = { 

699 ref.id: ref 

700 for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

701 if ref.id in provenance_for_quantum.actual_outputs 

702 } 

703 for ref in quantum_refs_by_id.values(): 

704 grouped_refs[ref.datasetType, ref.run].append(ref) 

705 

706 # merge datastore records into a summary structure 

707 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items(): 

708 quantum_records = DatastoreRecordData.from_simple(serialized_records) 

709 if (records := summary_records.get(datastore_name)) is not None: 

710 records.update(quantum_records) 

711 else: 

712 summary_records[datastore_name] = quantum_records 

713 

714 for refs in grouped_refs.values(): 

715 butler.registry._importDatasets(refs) 

716 butler._datastore.import_records(summary_records) 

717 

718 @classmethod 

719 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData: 

720 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.") 

721 

722 @classmethod 

723 def direct( 

724 cls, 

725 *, 

726 predicted_inputs: Iterable[str | uuid.UUID], 

727 available_inputs: Iterable[str | uuid.UUID], 

728 actual_inputs: Iterable[str | uuid.UUID], 

729 predicted_outputs: Iterable[str | uuid.UUID], 

730 actual_outputs: Iterable[str | uuid.UUID], 

731 datastore_records: Mapping[str, Mapping], 

732 ) -> QuantumProvenanceData: 

733 """Construct an instance directly without validators. 

734 

735 This differs from the pydantic "construct" method in that the 

736 arguments are explicitly what the model requires, and it will recurse 

737 through members, constructing them from their corresponding `direct` 

738 methods. 

739 

740 This method should only be called when the inputs are trusted. 

741 """ 

742 

743 def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]: 

744 """Convert input UUIDs, which could be in string representation to 

745 a set of `UUID` instances. 

746 """ 

747 return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids} 

748 

749 data = cls.model_construct( 

750 predicted_inputs=_to_uuid_set(predicted_inputs), 

751 available_inputs=_to_uuid_set(available_inputs), 

752 actual_inputs=_to_uuid_set(actual_inputs), 

753 predicted_outputs=_to_uuid_set(predicted_outputs), 

754 actual_outputs=_to_uuid_set(actual_outputs), 

755 datastore_records={ 

756 key: SerializedDatastoreRecordData.direct(**records) 

757 for key, records in datastore_records.items() 

758 }, 

759 ) 

760 

761 return data