Coverage for python/lsst/daf/butler/_quantum_backed.py: 32%

185 statements  

coverage.py v7.4.4, created at 2024-04-10 10:14 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30from . import ddl 

31 

32__all__ = ("QuantumBackedButler", "QuantumProvenanceData") 

33 

34import itertools 

35import logging 

36import uuid 

37from collections import defaultdict 

38from collections.abc import Iterable, Mapping 

39from typing import TYPE_CHECKING, Any 

40 

41import pydantic 

42from lsst.resources import ResourcePathExpression 

43 

44from ._butler_config import ButlerConfig 

45from ._config import Config 

46from ._dataset_ref import DatasetId, DatasetRef 

47from ._dataset_type import DatasetType 

48from ._deferredDatasetHandle import DeferredDatasetHandle 

49from ._limited_butler import LimitedButler 

50from ._quantum import Quantum 

51from ._storage_class import StorageClass, StorageClassFactory 

52from .datastore import Datastore 

53from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData 

54from .dimensions import DimensionUniverse 

55from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager 

56from .registry.databases.sqlite import SqliteDatabase 

57from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager 

58from .registry.opaque import ByNameOpaqueTableStorageManager 

59 

60if TYPE_CHECKING: 

61 from ._butler import Butler 

62 

63_LOG = logging.getLogger(__name__) 

64 

65 

66class _DatasetRecordStorageManagerDatastoreConstructionMimic: 

67 """A partial implementation of `DatasetRecordStorageManager` that exists 

68 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`) 

69 to be constructed without a full `Registry`. 

70 

71 Notes 

72 ----- 

73 The interface implemented by this class should probably be its own ABC, 

74 and that ABC should probably be used in the definition of 

75 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep 

76 changes minimal. 

77 """ 

78 

79 @classmethod 

80 def getIdColumnType(cls) -> type: 

81 # Docstring inherited. 

82 return ddl.GUID 

83 

84 @classmethod 

85 def addDatasetForeignKey( 

86 cls, 

87 tableSpec: ddl.TableSpec, 

88 *, 

89 name: str = "dataset", 

90 constraint: bool = True, 

91 onDelete: str | None = None, 

92 **kwargs: Any, 

93 ) -> ddl.FieldSpec: 

94 # Docstring inherited. 

95 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs) 

96 tableSpec.fields.add(idFieldSpec) 

97 return idFieldSpec 

98 

99 

100class QuantumBackedButler(LimitedButler): 

101 """An implementation of `LimitedButler` intended to back execution of a 

102 single `Quantum`. 

103 

104 Parameters 

105 ---------- 

106 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

107 Dataset IDs for datasets that can be read from this butler. 

108 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

109 Dataset IDs for datasets that can be stored in this butler. 

110 dimensions : `DimensionUniverse` 

111 Object managing all dimension definitions. 

112 datastore : `Datastore` 

113 Datastore to use for all dataset I/O and existence checks. 

114 storageClasses : `StorageClassFactory` 

115 Object managing all storage class definitions. 

116 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`] 

117 The registry dataset type definitions, indexed by name. 

118 

119 Notes 

120 ----- 

121 Most callers should use the `initialize` `classmethod` to construct new 

122 instances instead of calling the constructor directly. 

123 

124 `QuantumBackedButler` uses a SQLite database internally, in order to reuse 

125 existing `DatastoreRegistryBridge` and `OpaqueTableStorage` 

126 implementations that rely on SQLAlchemy. If implementations are added in the 

127 future that don't rely on SQLAlchemy, it should be possible to swap them 

128 in by overriding the type arguments to `initialize` (though at present, 

129 `QuantumBackedButler` would still create at least an in-memory SQLite 

130 database that would then go unused). 

131 

132 We imagine `QuantumBackedButler` being used during (at least) batch 

133 execution to capture `Datastore` records and save them to per-quantum 

134 files, which are also a convenient place to store provenance for eventual 

135 upload to a SQL-backed `Registry` (once `Registry` has tables to store 

136 provenance, that is). 

137 These per-quantum files can be written in two ways: 

138 

139 - The SQLite file used internally by `QuantumBackedButler` can be used 

140 directly by customizing the ``filename`` argument to ``initialize``, and 

141 then transferring that file to the object store after execution completes 

142 (or fails; a ``try/finally`` pattern probably makes sense here). 

143 

144 - A JSON or YAML file can be written by calling `extract_provenance_data`, 

145 and using ``pydantic`` methods to write the returned 

146 `QuantumProvenanceData` to a file. 

147 

148 Note that at present, the SQLite file only contains datastore records, not 

149 provenance, but that should be easy to address (if desired) after we 

150 actually design a `Registry` schema for provenance. I also suspect that 

151 we'll want to explicitly close the SQLite file somehow before trying to 

152 transfer it. But I'm guessing we'd prefer to write the per-quantum files 

153 as JSON anyway. 
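
    A minimal sketch of the JSON approach (the repository config path and
    output filename below are purely illustrative, and this assumes standard
    ``pydantic`` serialization)::

        qbb = QuantumBackedButler.initialize(
            config="/repo/butler.yaml",   # illustrative repository config
            quantum=quantum,              # a resolved Quantum from the graph
            dimensions=universe,          # the graph's DimensionUniverse
        )
        # ... execute the task against ``qbb`` via get/put ...
        provenance = qbb.extract_provenance_data()
        with open("provenance.json", "w") as stream:  # illustrative filename
            stream.write(provenance.model_dump_json())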

154 """ 

155 

156 def __init__( 

157 self, 

158 predicted_inputs: Iterable[DatasetId], 

159 predicted_outputs: Iterable[DatasetId], 

160 dimensions: DimensionUniverse, 

161 datastore: Datastore, 

162 storageClasses: StorageClassFactory, 

163 dataset_types: Mapping[str, DatasetType] | None = None, 

164 ): 

165 self._dimensions = dimensions 

166 self._predicted_inputs = set(predicted_inputs) 

167 self._predicted_outputs = set(predicted_outputs) 

168 self._available_inputs: set[DatasetId] = set() 

169 self._unavailable_inputs: set[DatasetId] = set() 

170 self._actual_inputs: set[DatasetId] = set() 

171 self._actual_output_refs: set[DatasetRef] = set() 

172 self._datastore = datastore 

173 self.storageClasses = storageClasses 

174 self._dataset_types: Mapping[str, DatasetType] = {} 

175 if dataset_types is not None: 

176 self._dataset_types = dataset_types 

177 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

178 

179 @classmethod 

180 def initialize( 

181 cls, 

182 config: Config | ResourcePathExpression, 

183 quantum: Quantum, 

184 dimensions: DimensionUniverse, 

185 filename: str = ":memory:", 

186 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

187 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

188 search_paths: list[str] | None = None, 

189 dataset_types: Mapping[str, DatasetType] | None = None, 

190 ) -> QuantumBackedButler: 

191 """Construct a new `QuantumBackedButler` from repository configuration 

192 and helper types. 

193 

194 Parameters 

195 ---------- 

196 config : `Config` or `~lsst.resources.ResourcePathExpression` 

197 A butler repository root, configuration filename, or configuration 

198 instance. 

199 quantum : `Quantum` 

200 Object describing the predicted input and output datasets relevant 

201 to this butler. This must have resolved `DatasetRef` instances for 

202 all inputs and outputs. 

203 dimensions : `DimensionUniverse` 

204 Object managing all dimension definitions. 

205 filename : `str`, optional 

206 Name for the SQLite database that will back this butler; defaults 

207 to an in-memory database. 

208 OpaqueManagerClass : `type`, optional 

209 A subclass of `OpaqueTableStorageManager` to use for datastore 

210 opaque records. Default is a SQL-backed implementation. 

211 BridgeManagerClass : `type`, optional 

212 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

213 location records. Default is a SQL-backed implementation. 

214 search_paths : `list` of `str`, optional 

215 Additional search paths for butler configuration. 

216 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \ 

217 optional 

218 Mapping of the dataset type name to its registry definition. 

219 """ 

220 predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())] 

221 predicted_inputs += [ref.id for ref in quantum.initInputs.values()] 

222 predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())] 

223 return cls._initialize( 

224 config=config, 

225 predicted_inputs=predicted_inputs, 

226 predicted_outputs=predicted_outputs, 

227 dimensions=dimensions, 

228 filename=filename, 

229 datastore_records=quantum.datastore_records, 

230 OpaqueManagerClass=OpaqueManagerClass, 

231 BridgeManagerClass=BridgeManagerClass, 

232 search_paths=search_paths, 

233 dataset_types=dataset_types, 

234 ) 

235 

236 @classmethod 

237 def from_predicted( 

238 cls, 

239 config: Config | ResourcePathExpression, 

240 predicted_inputs: Iterable[DatasetId], 

241 predicted_outputs: Iterable[DatasetId], 

242 dimensions: DimensionUniverse, 

243 datastore_records: Mapping[str, DatastoreRecordData], 

244 filename: str = ":memory:", 

245 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

246 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

247 search_paths: list[str] | None = None, 

248 dataset_types: Mapping[str, DatasetType] | None = None, 

249 ) -> QuantumBackedButler: 

250 """Construct a new `QuantumBackedButler` from sets of input and output 

251 dataset IDs. 

252 

253 Parameters 

254 ---------- 

255 config : `Config` or `~lsst.resources.ResourcePathExpression` 

256 A butler repository root, configuration filename, or configuration 

257 instance. 

258 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

259 Dataset IDs for datasets that can be read from this butler. 

260 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

261 Dataset IDs for datasets that can be stored in this butler; these must be 

262 fully resolved. 

263 dimensions : `DimensionUniverse` 

264 Object managing all dimension definitions. 

265 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None` 

266 Datastore records to import into a datastore. 

267 filename : `str`, optional 

268 Name for the SQLite database that will back this butler; defaults 

269 to an in-memory database. 

270 OpaqueManagerClass : `type`, optional 

271 A subclass of `OpaqueTableStorageManager` to use for datastore 

272 opaque records. Default is a SQL-backed implementation. 

273 BridgeManagerClass : `type`, optional 

274 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

275 location records. Default is a SQL-backed implementation. 

276 search_paths : `list` of `str`, optional 

277 Additional search paths for butler configuration. 

278 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \ 

279 optional 

280 Mapping of the dataset type name to its registry definition. 
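
        A minimal sketch, assuming the predicted dataset IDs and exported
        datastore records are already in hand (the config path and variable
        names here are illustrative)::

            qbb = QuantumBackedButler.from_predicted(
                config="/repo/butler.yaml",
                predicted_inputs=input_ids,
                predicted_outputs=output_ids,
                dimensions=universe,
                datastore_records=records,
            )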

281 """ 

282 return cls._initialize( 

283 config=config, 

284 predicted_inputs=predicted_inputs, 

285 predicted_outputs=predicted_outputs, 

286 dimensions=dimensions, 

287 filename=filename, 

288 datastore_records=datastore_records, 

289 OpaqueManagerClass=OpaqueManagerClass, 

290 BridgeManagerClass=BridgeManagerClass, 

291 search_paths=search_paths, 

292 dataset_types=dataset_types, 

293 ) 

294 

295 @classmethod 

296 def _initialize( 

297 cls, 

298 *, 

299 config: Config | ResourcePathExpression, 

300 predicted_inputs: Iterable[DatasetId], 

301 predicted_outputs: Iterable[DatasetId], 

302 dimensions: DimensionUniverse, 

303 filename: str = ":memory:", 

304 datastore_records: Mapping[str, DatastoreRecordData] | None = None, 

305 OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

306 BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

307 search_paths: list[str] | None = None, 

308 dataset_types: Mapping[str, DatasetType] | None = None, 

309 ) -> QuantumBackedButler: 

310 """Initialize quantum-backed butler. 

311 

312 Internal method with common implementation used by `initialize` and 

313 `from_predicted`. 

314 

315 Parameters 

316 ---------- 

317 config : `Config` or `~lsst.resources.ResourcePathExpression` 

318 A butler repository root, configuration filename, or configuration 

319 instance. 

320 predicted_inputs : `~collections.abc.Iterable` [`DatasetId`] 

321 Dataset IDs for datasets that can be read from this butler. 

322 predicted_outputs : `~collections.abc.Iterable` [`DatasetId`] 

323 Dataset IDs for datasets that can be stored in this butler. 

324 dimensions : `DimensionUniverse` 

325 Object managing all dimension definitions. 

326 filename : `str`, optional 

327 Name for the SQLite database that will back this butler; defaults 

328 to an in-memory database. 

329 datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None` 

330 Datastore records to import into a datastore. 

331 OpaqueManagerClass : `type`, optional 

332 A subclass of `OpaqueTableStorageManager` to use for datastore 

333 opaque records. Default is a SQL-backed implementation. 

334 BridgeManagerClass : `type`, optional 

335 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

336 location records. Default is a SQL-backed implementation. 

337 search_paths : `list` of `str`, optional 

338 Additional search paths for butler configuration. 

339 dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`] 

340 Mapping of the dataset type name to its registry definition. 

341 """ 

342 butler_config = ButlerConfig(config, searchPaths=search_paths) 

343 butler_root = butler_config.get("root", butler_config.configDir) 

344 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0) 

345 with db.declareStaticTables(create=True) as context: 

346 opaque_manager = OpaqueManagerClass.initialize(db, context) 

347 bridge_manager = BridgeManagerClass.initialize( 

348 db, 

349 context, 

350 opaque=opaque_manager, 

351 # MyPy can tell it's a fake, but we know it shouldn't care. 

352 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore 

353 universe=dimensions, 

354 ) 

355 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root) 

356 

357 # TODO: We need to inform `Datastore` here that it needs to support 

358 # predictive reads; this only really works for the file datastore, but 

359 # we need to try everything in case there is a chained datastore. 

360 datastore._set_trust_mode(True) 

361 

362 if datastore_records is not None: 

363 datastore.import_records(datastore_records) 

364 storageClasses = StorageClassFactory() 

365 storageClasses.addFromConfig(butler_config) 

366 return cls( 

367 predicted_inputs, 

368 predicted_outputs, 

369 dimensions, 

370 datastore, 

371 storageClasses=storageClasses, 

372 dataset_types=dataset_types, 

373 ) 

374 

375 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

376 """Return DatasetType defined in registry given dataset type name.""" 

377 return self._dataset_types.get(name) 

378 

379 def isWriteable(self) -> bool: 

380 # Docstring inherited. 

381 return True 

382 

383 def get( 

384 self, 

385 ref: DatasetRef, 

386 /, 

387 *, 

388 parameters: dict[str, Any] | None = None, 

389 storageClass: StorageClass | str | None = None, 

390 ) -> Any: 

391 try: 

392 obj = super().get( 

393 ref, 

394 parameters=parameters, 

395 storageClass=storageClass, 

396 ) 

397 except (LookupError, FileNotFoundError, OSError): 

398 self._unavailable_inputs.add(ref.id) 

399 raise 

400 if ref.id in self._predicted_inputs: 

401 # do this after delegating to super in case that raises. 

402 self._actual_inputs.add(ref.id) 

403 self._available_inputs.add(ref.id) 

404 return obj 

405 

406 def getDeferred( 

407 self, 

408 ref: DatasetRef, 

409 /, 

410 *, 

411 parameters: dict[str, Any] | None = None, 

412 storageClass: str | StorageClass | None = None, 

413 ) -> DeferredDatasetHandle: 

414 if ref.id in self._predicted_inputs: 

415 # Unfortunately, we can't do this after the handle succeeds in 

416 # loading, so it's conceivable here that we're marking an input 

417 # as "actual" even when it's not even available. 

418 self._actual_inputs.add(ref.id) 

419 return super().getDeferred(ref, parameters=parameters, storageClass=storageClass) 

420 

421 def stored(self, ref: DatasetRef) -> bool: 

422 # Docstring inherited. 

423 stored = super().stored(ref) 

424 if ref.id in self._predicted_inputs: 

425 if stored: 

426 self._available_inputs.add(ref.id) 

427 else: 

428 self._unavailable_inputs.add(ref.id) 

429 return stored 

430 

431 def stored_many( 

432 self, 

433 refs: Iterable[DatasetRef], 

434 ) -> dict[DatasetRef, bool]: 

435 # Docstring inherited. 

436 existence = super().stored_many(refs) 

437 

438 for ref, stored in existence.items(): 

439 if ref.id in self._predicted_inputs: 

440 if stored: 

441 self._available_inputs.add(ref.id) 

442 else: 

443 self._unavailable_inputs.add(ref.id) 

444 return existence 

445 

446 def markInputUnused(self, ref: DatasetRef) -> None: 

447 # Docstring inherited. 

448 self._actual_inputs.discard(ref.id) 

449 

450 @property 

451 def dimensions(self) -> DimensionUniverse: 

452 # Docstring inherited. 

453 return self._dimensions 

454 

455 def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

456 # Docstring inherited. 

457 if ref.id not in self._predicted_outputs: 

458 raise RuntimeError("Cannot `put` a dataset that was not predicted as an output.") 

459 self._datastore.put(obj, ref) 

460 self._actual_output_refs.add(ref) 

461 return ref 

462 

463 def pruneDatasets( 

464 self, 

465 refs: Iterable[DatasetRef], 

466 *, 

467 disassociate: bool = True, 

468 unstore: bool = False, 

469 tags: Iterable[str] = (), 

470 purge: bool = False, 

471 ) -> None: 

472 # docstring inherited from LimitedButler 

473 

474 if purge: 

475 if not disassociate: 

476 raise TypeError("Cannot pass purge=True without disassociate=True.") 

477 if not unstore: 

478 raise TypeError("Cannot pass purge=True without unstore=True.") 

479 elif disassociate: 

480 # No tagged collections for this butler. 

481 raise TypeError("Cannot pass disassociate=True without purge=True.") 

482 

483 refs = list(refs) 

484 

485 # Pruning a component of a DatasetRef makes no sense. 

486 for ref in refs: 

487 if ref.datasetType.component(): 

488 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

489 

490 if unstore: 

491 self._datastore.trash(refs) 

492 if purge: 

493 for ref in refs: 

494 # We only care about removing them from the actual output refs. 

495 self._actual_output_refs.discard(ref) 

496 

497 if unstore: 

498 # Point of no return for removing artifacts 

499 self._datastore.emptyTrash() 

500 

501 def extract_provenance_data(self) -> QuantumProvenanceData: 

502 """Extract provenance information and datastore records from this 

503 butler. 

504 

505 Returns 

506 ------- 

507 provenance : `QuantumProvenanceData` 

508 A serializable struct containing input/output dataset IDs and 

509 datastore records. This assumes all dataset IDs are UUIDs (just to 

510 make it easier for `pydantic` to reason about the struct's types); 

511 the rest of this class makes no such assumption, but the approach 

512 to processing in which it's useful effectively requires UUIDs 

513 anyway. 

514 

515 Notes 

516 ----- 

517 `QuantumBackedButler` records this provenance information when its 

518 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask` 

519 authors from having to worry about it, while still recording very 

520 detailed information. But it has two small weaknesses: 

521 

522 - Calling `getDeferred` or `get` is enough to mark a 

523 dataset as an "actual input", which may mark some datasets that 

524 aren't actually used. We rely on task authors to use 

525 `markInputUnused` to address this. 

526 

527 - We assume that the execution system will call ``stored`` 

528 on all predicted inputs prior to execution, in order to populate the 

529 "available inputs" set. This is what I envision 

530 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it 

531 to use this class, but it feels fragile for this class to make such 

532 a strong assumption about how it will be used, even if I can't think 

533 of any other executor behavior that would make sense. 
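
        A minimal sketch of the call pattern this assumes, with harness and
        task steps interleaved (``qbb``, ``quantum``, and ``unused_ref`` are
        illustrative names)::

            # Harness step, before running the task: check every predicted
            # input so the "available inputs" set gets populated.
            qbb.stored_many(itertools.chain.from_iterable(quantum.inputs.values()))

            # Task step: explicitly drop an input it decided not to use.
            qbb.markInputUnused(unused_ref)

            # Harness step, after the task finishes.
            provenance = qbb.extract_provenance_data()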

534 """ 

535 if not self._actual_inputs.isdisjoint(self._unavailable_inputs): 

536 _LOG.warning( 

537 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) " 

538 "was obtained, but did not actually exist. This task should be be using markInputUnused " 

539 "directly to clarify its provenance.", 

540 self._actual_inputs & self._unavailable_inputs, 

541 ) 

542 self._actual_inputs -= self._unavailable_inputs 

543 checked_inputs = self._available_inputs | self._unavailable_inputs 

544 if self._predicted_inputs != checked_inputs: 

545 _LOG.warning( 

546 "Execution harness did not check predicted inputs %s for existence; available inputs " 

547 "recorded in provenance may be incomplete.", 

548 self._predicted_inputs - checked_inputs, 

549 ) 

550 datastore_records = self._datastore.export_records(self._actual_output_refs) 

551 provenance_records = { 

552 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items() 

553 } 

554 

555 return QuantumProvenanceData( 

556 predicted_inputs=self._predicted_inputs, 

557 available_inputs=self._available_inputs, 

558 actual_inputs=self._actual_inputs, 

559 predicted_outputs=self._predicted_outputs, 

560 actual_outputs={ref.id for ref in self._actual_output_refs}, 

561 datastore_records=provenance_records, 

562 ) 

563 

564 

565class QuantumProvenanceData(pydantic.BaseModel): 

566 """A serializable struct for per-quantum provenance information and 

567 datastore records. 

568 

569 Notes 

570 ----- 

571 This class slightly duplicates information from the `Quantum` class itself 

572 (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the 

573 same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it 

574 assumes the original `Quantum` is also available to reconstruct the 

575 complete provenance (e.g. by associating dataset IDs with data IDs, 

576 dataset types, and `~CollectionType.RUN` names). 

577 

578 Note that ``pydantic`` method ``parse_raw()`` is not going to work 

579 correctly for this class; use the `direct` method instead. 
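
    A minimal round-trip sketch (the filename is illustrative, and this
    assumes standard ``pydantic`` serialization on the write side and that
    the serialized field layout matches what `direct` expects)::

        import json

        with open("provenance.json", "w") as stream:
            stream.write(provenance.model_dump_json())

        with open("provenance.json") as stream:
            provenance = QuantumProvenanceData.direct(**json.load(stream))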

580 """ 

581 

582 # This class probably should have information about its execution 

583 # environment (anything not controlled and recorded at the 

584 # `~CollectionType.RUN` level, such as the compute node ID), but adding it 

585 # now is out of scope for this prototype. 

586 

587 predicted_inputs: set[uuid.UUID] 

588 """Unique IDs of datasets that were predicted as inputs to this quantum 

589 when the QuantumGraph was built. 

590 """ 

591 

592 available_inputs: set[uuid.UUID] 

593 """Unique IDs of input datasets that were actually present in the datastore 

594 when this quantum was executed. 

595 

596 This is a subset of ``predicted_inputs``, with the difference generally 

597 being datasets that were ``predicted_outputs`` but not ``actual_outputs`` of 

598 some upstream task. 

599 """ 

600 

601 actual_inputs: set[uuid.UUID] 

602 """Unique IDs of datasets that were actually used as inputs by this task. 

603 

604 This is a subset of ``available_inputs``. 

605 

606 Notes 

607 ----- 

608 The criterion for marking an input as used is that rerunning the quantum 

609 with only these ``actual_inputs`` available must yield identical outputs. 

610 This means that (for example) even just using an input to help determine 

611 an output rejection criterion and then rejecting it as an outlier qualifies 

612 that input as actually used. 

613 """ 

614 

615 predicted_outputs: set[uuid.UUID] 

616 """Unique IDs of datasets that were predicted as outputs of this quantum 

617 when the QuantumGraph was built. 

618 """ 

619 

620 actual_outputs: set[uuid.UUID] 

621 """Unique IDs of datasets that were actually written when this quantum 

622 was executed. 

623 """ 

624 

625 datastore_records: dict[str, SerializedDatastoreRecordData] 

626 """Datastore records indexed by datastore name.""" 

627 

628 @staticmethod 

629 def collect_and_transfer( 

630 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData] 

631 ) -> None: 

632 """Transfer output datasets from multiple quanta to a more permanent 

633 `Butler` repository. 

634 

635 Parameters 

636 ---------- 

637 butler : `Butler` 

638 Full butler representing the data repository to transfer datasets 

639 to. 

640 quanta : `~collections.abc.Iterable` [ `Quantum` ] 

641 Iterable of `Quantum` objects that carry information about 

642 predicted outputs. May be a single-pass iterator. 

643 provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ] 

644 Provenance and datastore data for each of the given quanta, in the 

645 same order. May be a single-pass iterator. 

646 

647 Notes 

648 ----- 

649 Input-output provenance data is not actually transferred yet, because 

650 `Registry` has no place to store it. 

651 

652 This method probably works most efficiently if run on all quanta for a 

653 single task label at once, because this will gather all datasets of 

654 a particular type together into a single vectorized `Registry` import. 

655 It should still behave correctly if run on smaller groups of quanta 

656 or even quanta from multiple tasks. 

657 

658 Currently this method transfers datastore record data unchanged, with 

659 no possibility of actually moving (e.g.) files. Datastores that are 

660 present only in the execution butler or only in the more permanent butler are 

661 ignored. 
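
        A minimal sketch, assuming the quanta for one task label and the
        provenance read back from their per-quantum files are held in
        parallel sequences (variable names are illustrative)::

            QuantumProvenanceData.collect_and_transfer(
                butler,                           # full Butler for the permanent repo
                quanta=quanta_for_label,
                provenance=provenance_for_label,
            )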

662 """ 

663 grouped_refs = defaultdict(list) 

664 summary_records: dict[str, DatastoreRecordData] = {} 

665 for quantum, provenance_for_quantum in zip(quanta, provenance, strict=True): 

666 quantum_refs_by_id = { 

667 ref.id: ref 

668 for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

669 if ref.id in provenance_for_quantum.actual_outputs 

670 } 

671 for ref in quantum_refs_by_id.values(): 

672 grouped_refs[ref.datasetType, ref.run].append(ref) 

673 

674 # merge datastore records into a summary structure 

675 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items(): 

676 quantum_records = DatastoreRecordData.from_simple(serialized_records) 

677 if (records := summary_records.get(datastore_name)) is not None: 

678 records.update(quantum_records) 

679 else: 

680 summary_records[datastore_name] = quantum_records 

681 

682 for refs in grouped_refs.values(): 

683 butler.registry._importDatasets(refs) 

684 butler._datastore.import_records(summary_records) 

685 

686 @classmethod 

687 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData: 

688 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.") 

689 

690 @classmethod 

691 def direct( 

692 cls, 

693 *, 

694 predicted_inputs: Iterable[str | uuid.UUID], 

695 available_inputs: Iterable[str | uuid.UUID], 

696 actual_inputs: Iterable[str | uuid.UUID], 

697 predicted_outputs: Iterable[str | uuid.UUID], 

698 actual_outputs: Iterable[str | uuid.UUID], 

699 datastore_records: Mapping[str, Mapping], 

700 ) -> QuantumProvenanceData: 

701 """Construct an instance directly without validators. 

702 

703 Parameters 

704 ---------- 

705 predicted_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID` 

706 The predicted inputs. 

707 available_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID` 

708 The available inputs. 

709 actual_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID` 

710 The actual inputs. 

711 predicted_outputs : `~collections.abc.Iterable` of `str` or `uuid.UUID` 

712 The predicted outputs. 

713 actual_outputs : `~collections.abc.Iterable` of `str` or `uuid.UUID` 

714 The actual outputs. 

715 datastore_records : `~collections.abc.Mapping` [ `str`, \ 

716 `~collections.abc.Mapping` ] 

717 The datastore records. 

718 

719 Returns 

720 ------- 

721 provenance : `QuantumProvenanceData` 

722 Serializable model of the quantum provenance. 

723 

724 Notes 

725 ----- 

726 This differs from the Pydantic "construct" method in that the 

727 arguments are explicitly what the model requires, and it will recurse 

728 through members, constructing them from their corresponding `direct` 

729 methods. 

730 

731 This method should only be called when the inputs are trusted. 

732 """ 

733 

734 def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]: 

735 """Convert input UUIDs, which could be in string representation to 

736 a set of `UUID` instances. 

737 """ 

738 return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids} 

739 

740 data = cls.model_construct( 

741 predicted_inputs=_to_uuid_set(predicted_inputs), 

742 available_inputs=_to_uuid_set(available_inputs), 

743 actual_inputs=_to_uuid_set(actual_inputs), 

744 predicted_outputs=_to_uuid_set(predicted_outputs), 

745 actual_outputs=_to_uuid_set(actual_outputs), 

746 datastore_records={ 

747 key: SerializedDatastoreRecordData.direct(**records) 

748 for key, records in datastore_records.items() 

749 }, 

750 ) 

751 

752 return data