Coverage for python/lsst/daf/butler/_quantum_backed.py: 32%

184 statements  

coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30__all__ = ("QuantumBackedButler", "QuantumProvenanceData") 

31 

32import itertools 

33import logging 

34import uuid 

35from collections import defaultdict 

36from collections.abc import Iterable, Mapping 

37from typing import TYPE_CHECKING, Any 

38 

39from deprecated.sphinx import deprecated 

40from lsst.daf.butler._compat import _BaseModelCompat 

41from lsst.resources import ResourcePathExpression 

42 

43from ._butlerConfig import ButlerConfig 

44from ._deferredDatasetHandle import DeferredDatasetHandle 

45from ._limited_butler import LimitedButler 

46from .core import ( 

47 Config, 

48 DatasetId, 

49 DatasetRef, 

50 DatasetType, 

51 Datastore, 

52 DatastoreRecordData, 

53 DimensionUniverse, 

54 Quantum, 

55 SerializedDatastoreRecordData, 

56 StorageClass, 

57 StorageClassFactory, 

58 ddl, 

59) 

60from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager 

61from .registry.databases.sqlite import SqliteDatabase 

62from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager 

63from .registry.opaque import ByNameOpaqueTableStorageManager 

64 

65if TYPE_CHECKING: 

66 from ._butler import Butler 

67 

68_LOG = logging.getLogger(__name__) 

69 

70 

71class _DatasetRecordStorageManagerDatastoreConstructionMimic: 

72 """A partial implementation of `DatasetRecordStorageManager` that exists 

73 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`) 

74 to be constructed without a full `Registry`. 

75 

76 Notes 

77 ----- 

78 The interface implemented by this class should probably be its own ABC, 

79 and that ABC should probably be used in the definition of 

80 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep 

81 changes minimal. 

82 """ 

83 

84 @classmethod 

85 def getIdColumnType(cls) -> type: 

86 # Docstring inherited. 

87 return ddl.GUID 

88 

89 @classmethod 

90 def addDatasetForeignKey( 

91 cls, 

92 tableSpec: ddl.TableSpec, 

93 *, 

94 name: str = "dataset", 

95 constraint: bool = True, 

96 onDelete: str | None = None, 

97 **kwargs: Any, 

98 ) -> ddl.FieldSpec: 

99 # Docstring inherited. 

100 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs) 

101 tableSpec.fields.add(idFieldSpec) 

102 return idFieldSpec 

103 

104 

105class QuantumBackedButler(LimitedButler): 

106 """An implementation of `LimitedButler` intended to back execution of a 

107 single `Quantum`. 

108 

109 Parameters 

110 ---------- 

111 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

112 Dataset IDs for datasets that can be read from this butler. 

113 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

114 Dataset IDs for datasets that can be stored in this butler. 

115 dimensions : `DimensionUniverse` 

116 Object managing all dimension definitions. 

117 datastore : `Datastore` 

118 Datastore to use for all dataset I/O and existence checks. 

119 storageClasses : `StorageClassFactory` 

120 Object managing all storage class definitions. 

121 

122 Notes 

123 ----- 

124 Most callers should use the `initialize` `classmethod` to construct new 

125 instances instead of calling the constructor directly. 

126 

127 `QuantumBackedButler` uses a SQLite database internally, in order to reuse 

128 existing `DatastoreRegistryBridge` and `OpaqueTableStorage` 

129 implementations that rely on SQLAlchemy. If implementations are added in the 

130 future that don't rely on SQLAlchemy, it should be possible to swap them 

131 in by overriding the type arguments to `initialize` (though at present, 

132 `QuantumBackedButler` would still create at least an in-memory SQLite 

133 database that would then go unused). 

134 

135 We imagine `QuantumBackedButler` being used during (at least) batch 

136 execution to capture `Datastore` records and save them to per-quantum 

137 files, which are also a convenient place to store provenance for eventual 

138 upload to a SQL-backed `Registry` (once `Registry` has tables to store 

139 provenance, that is). 

140 These per-quantum files can be written in two ways: 

141 

142 - The SQLite file used internally by `QuantumBackedButler` can be used 

143 directly by customizing the ``filename`` argument to ``initialize``, and 

144 then transferring that file to the object store after execution completes 

145 (or fails; a ``try/finally`` pattern probably makes sense here). 

146 

147 - A JSON or YAML file can be written by calling `extract_provenance_data`, 

148 and using ``pydantic`` methods to write the returned 

149 `QuantumProvenanceData` to a file. 

150 

151 Note that at present, the SQLite file only contains datastore records, not 

152 provenance, but that should be easy to address (if desired) after we 

153 actually design a `Registry` schema for provenance. I also suspect that 

154 we'll want to explicitly close the SQLite file somehow before trying to 

155 transfer it. But I'm guessing we'd prefer to write the per-quantum files 

156 as JSON anyway. 

157 """ 

158 

159 def __init__( 

160 self, 

161 predicted_inputs: Iterable[DatasetId], 

162 predicted_outputs: Iterable[DatasetId], 

163 dimensions: DimensionUniverse, 

164 datastore: Datastore, 

165 storageClasses: StorageClassFactory, 

166 dataset_types: Mapping[str, DatasetType] | None = None, 

167 ): 

168 self._dimensions = dimensions 

169 self._predicted_inputs = set(predicted_inputs) 

170 self._predicted_outputs = set(predicted_outputs) 

171 self._available_inputs: set[DatasetId] = set() 

172 self._unavailable_inputs: set[DatasetId] = set() 

173 self._actual_inputs: set[DatasetId] = set() 

174 self._actual_output_refs: set[DatasetRef] = set() 

175 self._datastore = datastore 

176 self.storageClasses = storageClasses 

177 self._dataset_types: Mapping[str, DatasetType] = {} 

178 if dataset_types is not None: 

179 self._dataset_types = dataset_types 

180 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

181 

182 @classmethod 

183 def initialize( 

184 cls, 

185 config: Config | ResourcePathExpression, 

186 quantum: Quantum, 

187 dimensions: DimensionUniverse, 

188 filename: str = ":memory:", 

189 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

190 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

191 search_paths: list[str] | None = None, 

192 dataset_types: Mapping[str, DatasetType] | None = None, 

193 ) -> QuantumBackedButler: 

194 """Construct a new `QuantumBackedButler` from repository configuration 

195 and helper types. 

196 

197 Parameters 

198 ---------- 

199 config : `Config` or `~lsst.resources.ResourcePathExpression` 

200 A butler repository root, configuration filename, or configuration 

201 instance. 

202 quantum : `Quantum` 

203 Object describing the predicted input and output datasets relevant 

204 to this butler. This must have resolved `DatasetRef` instances for 

205 all inputs and outputs. 

206 dimensions : `DimensionUniverse` 

207 Object managing all dimension definitions. 

208 filename : `str`, optional 

209 Name for the SQLite database that will back this butler; defaults 

210 to an in-memory database. 

211 OpaqueManagerClass : `type`, optional 

212 A subclass of `OpaqueTableStorageManager` to use for datastore 

213 opaque records. Default is a SQL-backed implementation. 

214 BridgeManagerClass : `type`, optional 

215 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

216 location records. Default is a SQL-backed implementation. 

217 search_paths : `list` of `str`, optional 

218 Additional search paths for butler configuration. 

219 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \ 

220 optional 

221 Mapping of the dataset type name to its registry definition. 

222 """ 

223 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())] 

224 predicted_inputs += [ref.id for ref in quantum.initInputs.values()] 

225 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())] 

226 return cls._initialize( 

227 config=config, 

228 predicted_inputs=predicted_inputs, 

229 predicted_outputs=predicted_outputs, 

230 dimensions=dimensions, 

231 filename=filename, 

232 datastore_records=quantum.datastore_records, 

233 OpaqueManagerClass=OpaqueManagerClass, 

234 BridgeManagerClass=BridgeManagerClass, 

235 search_paths=search_paths, 

236 dataset_types=dataset_types, 

237 ) 

238 
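# Example (usage sketch, not part of the covered module): how an execution
# harness might construct a QuantumBackedButler for one quantum. The config
# path, the quantum, and the dimension universe are assumed to be provided by
# the caller; the helper name is hypothetical.
def _example_make_butler(
    config: str, quantum: Quantum, dimensions: DimensionUniverse
) -> QuantumBackedButler:
    # All refs in ``quantum`` must already be resolved; their dataset IDs
    # become the predicted inputs and outputs of the new butler.
    return QuantumBackedButler.initialize(
        config=config,
        quantum=quantum,
        dimensions=dimensions,
        filename=":memory:",  # the default; pass a real path to keep the SQLite file
    )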

239 @classmethod 

240 def from_predicted( 

241 cls, 

242 config: Config | ResourcePathExpression, 

243 predicted_inputs: Iterable[DatasetId], 

244 predicted_outputs: Iterable[DatasetId], 

245 dimensions: DimensionUniverse, 

246 datastore_records: Mapping[str, DatastoreRecordData], 

247 filename: str = ":memory:", 

248 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

249 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

250 search_paths: list[str] | None = None, 

251 dataset_types: Mapping[str, DatasetType] | None = None, 

252 ) -> QuantumBackedButler: 

253 """Construct a new `QuantumBackedButler` from sets of input and output 

254 dataset IDs. 

255 

256 Parameters 

257 ---------- 

258 config : `Config` or `~lsst.resources.ResourcePathExpression` 

259 A butler repository root, configuration filename, or configuration 

260 instance. 

261 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

262 Dataset IDs for datasets that can be read from this butler. 

263 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

264 Dataset IDs for datasets that can be stored in this butler; must be 

265 fully resolved. 

266 dimensions : `DimensionUniverse` 

267 Object managing all dimension definitions. 

268 filename : `str`, optional 

269 Name for the SQLite database that will back this butler; defaults 

270 to an in-memory database. 

271 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None` 

272 Datastore records to import into a datastore. 

273 OpaqueManagerClass : `type`, optional 

274 A subclass of `OpaqueTableStorageManager` to use for datastore 

275 opaque records. Default is a SQL-backed implementation. 

276 BridgeManagerClass : `type`, optional 

277 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

278 location records. Default is a SQL-backed implementation. 

279 search_paths : `list` of `str`, optional 

280 Additional search paths for butler configuration. 

281 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \ 

282 optional 

283 Mapping of the dataset type name to its registry definition. 

284 """ 

285 return cls._initialize( 

286 config=config, 

287 predicted_inputs=predicted_inputs, 

288 predicted_outputs=predicted_outputs, 

289 dimensions=dimensions, 

290 filename=filename, 

291 datastore_records=datastore_records, 

292 OpaqueManagerClass=OpaqueManagerClass, 

293 BridgeManagerClass=BridgeManagerClass, 

294 search_paths=search_paths, 

295 dataset_types=dataset_types, 

296 ) 

297 

298 @classmethod 

299 def _initialize( 

300 cls, 

301 *, 

302 config: Config | ResourcePathExpression, 

303 predicted_inputs: Iterable[DatasetId], 

304 predicted_outputs: Iterable[DatasetId], 

305 dimensions: DimensionUniverse, 

306 filename: str = ":memory:", 

307 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

308 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

309 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

310 search_paths: list[str] | None = None, 

311 dataset_types: Mapping[str, DatasetType] | None = None, 

312 ) -> QuantumBackedButler: 

313 """Initialize quantum-backed butler. 

314 

315 Internal method with common implementation used by `initialize` and 

316 `from_predicted`. 

317 

318 Parameters 

319 ---------- 

320 config : `Config` or `~lsst.resources.ResourcePathExpression` 

321 A butler repository root, configuration filename, or configuration 

322 instance. 

323 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

324 Dataset IDs for datasets that can be read from this butler. 

325 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

326 Dataset IDs for datasets that can be stored in this butler. 

327 dimensions : `DimensionUniverse` 

328 Object managing all dimension definitions. 

329 filename : `str`, optional 

330 Name for the SQLite database that will back this butler; defaults 

331 to an in-memory database. 

332 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None` 

333 Datastore records to import into a datastore. 

334 OpaqueManagerClass : `type`, optional 

335 A subclass of `OpaqueTableStorageManager` to use for datastore 

336 opaque records. Default is a SQL-backed implementation. 

337 BridgeManagerClass : `type`, optional 

338 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

339 location records. Default is a SQL-backed implementation. 

340 search_paths : `list` of `str`, optional 

341 Additional search paths for butler configuration. 

342 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`] 

343 Mapping of the dataset type name to its registry definition. 

344 """ 

345 butler_config = ButlerConfig(config, searchPaths=search_paths) 

346 butler_root = butler_config.get("root", butler_config.configDir) 

347 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0) 

348 with db.declareStaticTables(create=True) as context: 

349 opaque_manager = OpaqueManagerClass.initialize(db, context) 

350 bridge_manager = BridgeManagerClass.initialize( 

351 db, 

352 context, 

353 opaque=opaque_manager, 

354 # MyPy can tell it's a fake, but we know it shouldn't care. 

355 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore 

356 universe=dimensions, 

357 ) 

358 # TODO: We need to inform `Datastore` here that it needs to support 

359 # predictive reads; right now that's a configuration option, but after 

360 # execution butler is retired it could just be a kwarg we pass here. 

361 # For now just force this option as we cannot work without it. 

362 butler_config["datastore", "trust_get_request"] = True 

363 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root) 

364 if datastore_records is not None: 

365 datastore.import_records(datastore_records) 

366 storageClasses = StorageClassFactory() 

367 storageClasses.addFromConfig(butler_config) 

368 return cls( 

369 predicted_inputs, 

370 predicted_outputs, 

371 dimensions, 

372 datastore, 

373 storageClasses=storageClasses, 

374 dataset_types=dataset_types, 

375 ) 

376 

377 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

378 """Return DatasetType defined in registry given dataset type name.""" 

379 return self._dataset_types.get(name) 

380 

381 def isWriteable(self) -> bool: 

382 # Docstring inherited. 

383 return True 

384 

385 # TODO: remove on DM-40067. 

386 @deprecated( 

387 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

388 " Please use Butler.get(). Will be removed after v26.0.", 

389 version="v26.0", 

390 category=FutureWarning, 

391 ) 

392 def getDirect( 

393 self, 

394 ref: DatasetRef, 

395 *, 

396 parameters: dict[str, Any] | None = None, 

397 storageClass: str | StorageClass | None = None, 

398 ) -> Any: 

399 # Docstring inherited. 

400 return self.get(ref, parameters=parameters, storageClass=storageClass) 

401 

402 def get( 

403 self, 

404 ref: DatasetRef, 

405 /, 

406 *, 

407 parameters: dict[str, Any] | None = None, 

408 storageClass: StorageClass | str | None = None, 

409 ) -> Any: 

410 try: 

411 obj = super().get( 

412 ref, 

413 parameters=parameters, 

414 storageClass=storageClass, 

415 ) 

416 except (LookupError, FileNotFoundError, OSError): 

417 self._unavailable_inputs.add(ref.id) 

418 raise 

419 if ref.id in self._predicted_inputs: 

420 # do this after delegating to super in case that raises. 

421 self._actual_inputs.add(ref.id) 

422 self._available_inputs.add(ref.id) 

423 return obj 

424 

425 # TODO: remove on DM-40067. 

426 @deprecated( 

427 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

428 "Please use Butler.getDeferred(). Will be removed after v26.0.", 

429 version="v26.0", 

430 category=FutureWarning, 

431 ) 

432 def getDirectDeferred( 

433 self, 

434 ref: DatasetRef, 

435 *, 

436 parameters: dict[str, Any] | None = None, 

437 storageClass: str | StorageClass | None = None, 

438 ) -> DeferredDatasetHandle: 

439 # Docstring inherited. 

440 return self.getDeferred(ref, parameters=parameters, storageClass=storageClass) 

441 

442 def getDeferred( 

443 self, 

444 ref: DatasetRef, 

445 /, 

446 *, 

447 parameters: dict[str, Any] | None = None, 

448 storageClass: str | StorageClass | None = None, 

449 ) -> DeferredDatasetHandle: 

450 if ref.id in self._predicted_inputs: 

451 # Unfortunately, we can't do this after the handle succeeds in 

452 # loading, so it's conceivable here that we're marking an input 

453 # as "actual" even when it's not even available. 

454 self._actual_inputs.add(ref.id) 

455 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass) 

456 

457 def stored(self, ref: DatasetRef) -> bool: 

458 # Docstring inherited. 

459 stored = super().stored(ref) 

460 if ref.id in self._predicted_inputs: 

461 if stored: 

462 self._available_inputs.add(ref.id) 

463 else: 

464 self._unavailable_inputs.add(ref.id) 

465 return stored 

466 

467 def stored_many( 

468 self, 

469 refs: Iterable[DatasetRef], 

470 ) -> dict[DatasetRef, bool]: 

471 # Docstring inherited. 

472 existence = super().stored_many(refs) 

473 

474 for ref, stored in existence.items(): 

475 if ref.id in self._predicted_inputs: 

476 if stored: 

477 self._available_inputs.add(ref.id) 

478 else: 

479 self._unavailable_inputs.add(ref.id) 

480 return existence 

481 
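# Example (usage sketch, not part of the covered module): checking all
# predicted inputs for existence before running the quantum. This is the
# pre-flight step that populates the available/unavailable input sets later
# reported by extract_provenance_data(); the helper name is hypothetical and
# ``refs`` is assumed to be the quantum's resolved input DatasetRefs.
def _example_preflight_check(
    butler: QuantumBackedButler, refs: Iterable[DatasetRef]
) -> list[DatasetRef]:
    # stored_many() records availability as a side effect; return only the
    # refs that are actually present so the caller can decide how to proceed.
    existence = butler.stored_many(refs)
    return [ref for ref, stored in existence.items() if stored]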

482 def markInputUnused(self, ref: DatasetRef) -> None: 

483 # Docstring inherited. 

484 self._actual_inputs.discard(ref.id) 

485 

486 @property 

487 def dimensions(self) -> DimensionUniverse: 

488 # Docstring inherited. 

489 return self._dimensions 

490 

491 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

492 # Docstring inherited. 

493 if ref.id not in self._predicted_outputs: 

494 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.") 

495 self._datastore.put(obj, ref) 

496 self._actual_output_refs.add(ref) 

497 return ref 

498 
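# Example (usage sketch, not part of the covered module): the read/compute/
# write cycle a task wrapper might run against this butler. The input and
# output refs and the ``process`` callable are hypothetical stand-ins for
# whatever the PipelineTask actually does.
def _example_run_quantum(
    butler: QuantumBackedButler,
    input_ref: DatasetRef,
    output_ref: DatasetRef,
    process: Any,
) -> None:
    in_memory = butler.get(input_ref)  # records the actual input
    result = process(in_memory)  # task-specific computation
    butler.put(result, output_ref)  # records the actual output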

499 def pruneDatasets( 

500 self, 

501 refs: Iterable[DatasetRef], 

502 *, 

503 disassociate: bool = True, 

504 unstore: bool = False, 

505 tags: Iterable[str] = (), 

506 purge: bool = False, 

507 ) -> None: 

508 # docstring inherited from LimitedButler 

509 

510 if purge: 

511 if not disassociate: 

512 raise TypeError("Cannot pass purge=True without disassociate=True.") 

513 if not unstore: 

514 raise TypeError("Cannot pass purge=True without unstore=True.") 

515 elif disassociate: 

516 # No tagged collections for this butler. 

517 raise TypeError("Cannot pass disassociate=True without purge=True.") 

518 

519 refs = list(refs) 

520 

521 # Pruning a component of a DatasetRef makes no sense. 

522 for ref in refs: 

523 if ref.datasetType.component(): 

524 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

525 

526 if unstore: 

527 self._datastore.trash(refs) 

528 if purge: 

529 for ref in refs: 

530 # We only care about removing them from the actual output refs. 

531 self._actual_output_refs.discard(ref) 

532 

533 if unstore: 

534 # Point of no return for removing artifacts 

535 self._datastore.emptyTrash() 

536 

537 def extract_provenance_data(self) -> QuantumProvenanceData: 

538 """Extract provenance information and datastore records from this 

539 butler. 

540 

541 Returns 

542 ------- 

543 provenance : `QuantumProvenanceData` 

544 A serializable struct containing input/output dataset IDs and 

545 datastore records. This assumes all dataset IDs are UUIDs (just to 

546 make it easier for `pydantic` to reason about the struct's types); 

547 the rest of this class makes no such assumption, but the approach 

548 to processing in which it's useful effectively requires UUIDs 

549 anyway. 

550 

551 Notes 

552 ----- 

553 `QuantumBackedButler` records this provenance information when its 

554 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask` 

555 authors from having to worry about it while still recording very 

556 detailed information. But it has two small weaknesses: 

557 

558 - Calling `getDirectDeferred` or `getDirect` is enough to mark a 

559 dataset as an "actual input", which may mark some datasets that 

560 aren't actually used. We rely on task authors to use 

561 `markInputUnused` to address this. 

562 

563 - We assume that the execution system will call ``stored`` or ``stored_many`` 

564 on all predicted inputs prior to execution, in order to populate the 

565 "available inputs" set. This is what I envision 

566 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it 

567 to use this class, but it feels fragile for this class to make such 

568 a strong assumption about how it will be used, even if I can't think 

569 of any other executor behavior that would make sense. 

570 """ 

571 if not self._actual_inputs.isdisjoint(self._unavailable_inputs): 

572 _LOG.warning( 

573 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) " 

574 "was obtained, but did not actually exist. This task should be be using markInputUnused " 

575 "directly to clarify its provenance.", 

576 self._actual_inputs & self._unavailable_inputs, 

577 ) 

578 self._actual_inputs -= self._unavailable_inputs 

579 checked_inputs = self._available_inputs | self._unavailable_inputs 

580 if self._predicted_inputs != checked_inputs: 

581 _LOG.warning( 

582 "Execution harness did not check predicted inputs %s for existence; available inputs " 

583 "recorded in provenance may be incomplete.", 

584 self._predicted_inputs - checked_inputs, 

585 ) 

586 datastore_records = self._datastore.export_records(self._actual_output_refs) 

587 provenance_records = { 

588 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items() 

589 } 

590 

591 return QuantumProvenanceData( 

592 predicted_inputs=self._predicted_inputs, 

593 available_inputs=self._available_inputs, 

594 actual_inputs=self._actual_inputs, 

595 predicted_outputs=self._predicted_outputs, 

596 actual_outputs={ref.id for ref in self._actual_output_refs}, 

597 datastore_records=provenance_records, 

598 ) 

599 
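# Example (usage sketch, not part of the covered module): writing the
# per-quantum provenance file as JSON after execution, per the notes in the
# class docstring above. The output path and helper name are hypothetical, and
# the pydantic ``.json()`` accessor is assumed to be available through
# _BaseModelCompat.
def _example_write_provenance(butler: QuantumBackedButler, path: str) -> None:
    provenance = butler.extract_provenance_data()
    with open(path, "w") as stream:
        stream.write(provenance.json())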

600 

601class QuantumProvenanceData(_BaseModelCompat): 

602 """A serializable struct for per-quantum provenance information and 

603 datastore records. 

604 

605 Notes 

606 ----- 

607 This class slightly duplicates information from the `Quantum` class itself 

608 (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the 

609 same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it 

610 assumes the original `Quantum` is also available to reconstruct the 

611 complete provenance (e.g. by associating dataset IDs with data IDs, 

612 dataset types, and `~CollectionType.RUN` names). 

613 

614 Note that the ``pydantic`` method ``parse_raw()`` does not work 

615 correctly for this class; use the `direct` method instead. 

616 """ 

617 

618 # This class probably should have information about its execution 

619 # environment (anything not controlled and recorded at the 

620 `~CollectionType.RUN` level, such as the compute node ID), but adding it 

621 # now is out of scope for this prototype. 

622 

623 predicted_inputs: set[uuid.UUID] 

624 """Unique IDs of datasets that were predicted as inputs to this quantum 

625 when the QuantumGraph was built. 

626 """ 

627 

628 available_inputs: set[uuid.UUID] 

629 """Unique IDs of input datasets that were actually present in the datastore 

630 when this quantum was executed. 

631 

632 This is a subset of ``predicted_inputs``, with the difference generally 

633 being datasets that were ``predicted_outputs`` but not ``actual_outputs`` of 

634 some upstream task. 

635 """ 

636 

637 actual_inputs: set[uuid.UUID] 

638 """Unique IDs of datasets that were actually used as inputs by this task. 

639 

640 This is a subset of ``available_inputs``. 

641 

642 Notes 

643 ----- 

644 The criterion for marking an input as used is that rerunning the quantum 

645 with only these ``actual_inputs`` available must yield identical outputs. 

646 This means that (for example) even just using an input to help determine 

647 an output rejection criterion and then rejecting it as an outlier qualifies 

648 that input as actually used. 

649 """ 

650 

651 predicted_outputs: set[uuid.UUID] 

652 """Unique IDs of datasets that were predicted as outputs of this quantum 

653 when the QuantumGraph was built. 

654 """ 

655 

656 actual_outputs: set[uuid.UUID] 

657 """Unique IDs of datasets that were actually written when this quantum 

658 was executed. 

659 """ 

660 

661 datastore_records: dict[str, SerializedDatastoreRecordData] 

662 """Datastore records indexed by datastore name.""" 

663 

664 @staticmethod 

665 def collect_and_transfer( 

666 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData] 

667 ) -> None: 

668 """Transfer output datasets from multiple quanta to a more permantent 

669 `Butler` repository. 

670 

671 Parameters 

672 ---------- 

673 butler : `Butler` 

674 Full butler representing the data repository to transfer datasets 

675 to. 

676 quanta : `~collections.abc.Iterable` [ `Quantum` ] 

677 Iterable of `Quantum` objects that carry information about 

678 predicted outputs. May be a single-pass iterator. 

679 provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ] 

680 Provenance and datastore data for each of the given quanta, in the 

681 same order. May be a single-pass iterator. 

682 

683 Notes 

684 ----- 

685 Input-output provenance data is not actually transferred yet, because 

686 `Registry` has no place to store it. 

687 

688 This method probably works most efficiently if run on all quanta for a 

689 single task label at once, because this will gather all datasets of 

690 a particular type together into a single vectorized `Registry` import. 

691 It should still behave correctly if run on smaller groups of quanta 

692 or even quanta from multiple tasks. 

693 

694 Currently this method transfers datastore record data unchanged, with 

695 no possibility of actually moving (e.g.) files. Datastores that are 

696 present only in execution or only in the more permanent butler are 

697 ignored. 

698 """ 

699 grouped_refs = defaultdict(list) 

700 summary_records: dict[str, DatastoreRecordData] = {} 

701 for quantum, provenance_for_quantum in zip(quanta, provenance, strict=True): 

702 quantum_refs_by_id = { 

703 ref.id: ref 

704 for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

705 if ref.id in provenance_for_quantum.actual_outputs 

706 } 

707 for ref in quantum_refs_by_id.values(): 

708 grouped_refs[ref.datasetType, ref.run].append(ref) 

709 

710 # merge datastore records into a summary structure 

711 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items(): 

712 quantum_records = DatastoreRecordData.from_simple(serialized_records) 

713 if (records := summary_records.get(datastore_name)) is not None: 

714 records.update(quantum_records) 

715 else: 

716 summary_records[datastore_name] = quantum_records 

717 

718 for refs in grouped_refs.values(): 

719 butler.registry._importDatasets(refs) 

720 butler._datastore.import_records(summary_records) 

721 
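# Example (usage sketch, not part of the covered module): transferring outputs
# from several executed quanta back to a full Butler. ``quanta`` and
# ``provenance`` are assumed to be parallel sequences assembled by the
# execution harness (e.g. by reading the per-quantum JSON files back in); the
# helper name is hypothetical.
def _example_transfer_outputs(
    butler: Butler, quanta: list[Quantum], provenance: list[QuantumProvenanceData]
) -> None:
    # Grouping all quanta for a single task label into one call lets the
    # registry import each dataset type as a single vectorized operation.
    QuantumProvenanceData.collect_and_transfer(butler, quanta, provenance)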

722 @classmethod 

723 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData: 

724 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.") 

725 

726 @classmethod 

727 def direct( 

728 cls, 

729 *, 

730 predicted_inputs: Iterable[str | uuid.UUID], 

731 available_inputs: Iterable[str | uuid.UUID], 

732 actual_inputs: Iterable[str | uuid.UUID], 

733 predicted_outputs: Iterable[str | uuid.UUID], 

734 actual_outputs: Iterable[str | uuid.UUID], 

735 datastore_records: Mapping[str, Mapping], 

736 ) -> QuantumProvenanceData: 

737 """Construct an instance directly without validators. 

738 

739 This differs from the pydantic "construct" method in that the 

740 arguments are explicitly what the model requires, and it will recurse 

741 through members, constructing them from their corresponding `direct` 

742 methods. 

743 

744 This method should only be called when the inputs are trusted. 

745 """ 

746 

747 def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]: 

748 """Convert input UUIDs, which could be in string representation to 

749 a set of `UUID` instances. 

750 """ 

751 return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids} 

752 

753 data = cls.model_construct( 

754 predicted_inputs=_to_uuid_set(predicted_inputs), 

755 available_inputs=_to_uuid_set(available_inputs), 

756 actual_inputs=_to_uuid_set(actual_inputs), 

757 predicted_outputs=_to_uuid_set(predicted_outputs), 

758 actual_outputs=_to_uuid_set(actual_outputs), 

759 datastore_records={ 

760 key: SerializedDatastoreRecordData.direct(**records) 

761 for key, records in datastore_records.items() 

762 }, 

763 ) 

764 

765 return data
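# Example (usage sketch, not part of the covered module): reading a
# per-quantum provenance JSON file back in. ``parse_raw()`` is deliberately
# disabled above, so the JSON is decoded manually and handed to ``direct()``;
# the file layout is assumed to match what the model's own JSON serialization
# produced, and the helper name and ``json`` import are additions for the
# sketch.
import json


def _example_read_provenance(path: str) -> QuantumProvenanceData:
    with open(path) as stream:
        data = json.load(stream)
    # direct() converts string UUIDs back to uuid.UUID instances and rebuilds
    # the nested SerializedDatastoreRecordData objects without validation.
    return QuantumProvenanceData.direct(**data)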