Coverage for python/lsst/daf/butler/_quantum_backed.py: 32%

185 statements  

coverage.py v7.4.0, created at 2024-01-16 10:44 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from . import ddl

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any

import pydantic
from lsst.resources import ResourcePathExpression

from ._butler_config import ButlerConfig
from ._config import Config
from ._dataset_ref import DatasetId, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from ._quantum import Quantum
from ._storage_class import StorageClass, StorageClassFactory
from .datastore import Datastore
from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData
from .dimensions import DimensionUniverse
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)


class _DatasetRecordStorageManagerDatastoreConstructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec


class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be read from this butler.
    predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be stored in this butler.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.
    dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
        The registry dataset type definitions, indexed by name.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is).
    These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution completes
      (or fails; a ``try/finally`` pattern probably makes sense here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
    """
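
    # A minimal sketch of the JSON-based workflow described in the notes
    # above; the repository path, the provenance file name, and the
    # task-running step are illustrative placeholders, not part of this API:
    #
    #     qbb = QuantumBackedButler.initialize(
    #         config="/repo/butler.yaml", quantum=quantum, dimensions=universe
    #     )
    #     try:
    #         ...  # Run the task, using qbb.get()/qbb.put() for all I/O.
    #     finally:
    #         provenance = qbb.extract_provenance_data()
    #         with open("provenance.json", "w") as stream:
    #             stream.write(provenance.model_dump_json())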

    def __init__(
        self,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore: Datastore,
        storageClasses: StorageClassFactory,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ):
        self._dimensions = dimensions
        self._predicted_inputs = set(predicted_inputs)
        self._predicted_outputs = set(predicted_outputs)
        self._available_inputs: set[DatasetId] = set()
        self._unavailable_inputs: set[DatasetId] = set()
        self._actual_inputs: set[DatasetId] = set()
        self._actual_output_refs: set[DatasetRef] = set()
        self._datastore = datastore
        self.storageClasses = storageClasses
        self._dataset_types: Mapping[str, DatasetType] = {}
        if dataset_types is not None:
            self._dataset_types = dataset_types
        self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)

    @classmethod
    def initialize(
        cls,
        config: Config | ResourcePathExpression,
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: list[str] | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
                optional
            Mapping of the dataset type name to its registry definition.
        """
        predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
        predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
        predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=quantum.datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def from_predicted(
        cls,
        config: Config | ResourcePathExpression,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore_records: Mapping[str, DatastoreRecordData],
        filename: str = ":memory:",
        OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: list[str] | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from sets of input and output
        dataset IDs.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler; must
            be fully resolved.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`], \
                optional
            Mapping of the dataset type name to its registry definition.
        """
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )

    @classmethod
    def _initialize(
        cls,
        *,
        config: Config | ResourcePathExpression,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
        OpaqueManagerClass: type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: list[str] | None = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Initialize quantum-backed butler.

        Internal method with common implementation used by `initialize` and
        `from_predicted`.

        Parameters
        ----------
        config : `Config` or `~lsst.resources.ResourcePathExpression`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `~collections.abc.Mapping` [`str`, `DatasetType`]
            Mapping of the dataset type name to its registry definition.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        butler_root = butler_config.get("root", butler_config.configDir)
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        if datastore_records is not None:
            datastore.import_records(datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(
            predicted_inputs,
            predicted_outputs,
            dimensions,
            datastore,
            storageClasses=storageClasses,
            dataset_types=dataset_types,
        )

    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        return self._dataset_types.get(name)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    def get(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        try:
            obj = super().get(
                ref,
                parameters=parameters,
                storageClass=storageClass,
            )
        except (LookupError, FileNotFoundError, OSError):
            self._unavailable_inputs.add(ref.id)
            raise
        if ref.id in self._predicted_inputs:
            # Do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    def getDeferred(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def stored(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        stored = super().stored(ref)
        if ref.id in self._predicted_inputs:
            if stored:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return stored

    def stored_many(
        self,
        refs: Iterable[DatasetRef],
    ) -> dict[DatasetRef, bool]:
        # Docstring inherited.
        existence = super().stored_many(refs)

        for ref, stored in existence.items():
            if ref.id in self._predicted_inputs:
                if stored:
                    self._available_inputs.add(ref.id)
                else:
                    self._unavailable_inputs.add(ref.id)
        return existence

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.id)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self._datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def pruneDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        disassociate: bool = True,
        unstore: bool = False,
        tags: Iterable[str] = (),
        purge: bool = False,
    ) -> None:
        # Docstring inherited from LimitedButler.

        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            # No tagged collections for this butler.
            raise TypeError("Cannot pass disassociate=True without purge=True.")

        refs = list(refs)

        # Pruning a component of a DatasetRef makes no sense.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")

        if unstore:
            self._datastore.trash(refs)
        if purge:
            for ref in refs:
                # We only care about removing them from actual output refs.
                self._actual_output_refs.discard(ref)

        if unstore:
            # Point of no return for removing artifacts.
            self._datastore.emptyTrash()
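
    # Illustrative call for fully discarding unwanted outputs; as enforced
    # above, purge=True requires both disassociate=True and unstore=True:
    #
    #     qbb.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)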

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDeferred` or `get` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``stored``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
        """
        if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
            _LOG.warning(
                "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle "
                "was obtained), but did not actually exist. This task should be using markInputUnused "
                "directly to clarify its provenance.",
                self._actual_inputs & self._unavailable_inputs,
            )
            self._actual_inputs -= self._unavailable_inputs
        checked_inputs = self._available_inputs | self._unavailable_inputs
        if self._predicted_inputs != checked_inputs:
            _LOG.warning(
                "Execution harness did not check predicted inputs %s for existence; available inputs "
                "recorded in provenance may be incomplete.",
                self._predicted_inputs - checked_inputs,
            )
        datastore_records = self._datastore.export_records(self._actual_output_refs)
        provenance_records = {
            datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
        }

        return QuantumProvenanceData(
            predicted_inputs=self._predicted_inputs,
            available_inputs=self._available_inputs,
            actual_inputs=self._actual_inputs,
            predicted_outputs=self._predicted_outputs,
            actual_outputs={ref.id for ref in self._actual_output_refs},
            datastore_records=provenance_records,
        )
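
    # Sketch of the harness interaction assumed by the notes above (the
    # task/executor objects are illustrative, not part of this API):
    #
    #     qbb.stored_many(predicted_input_refs)  # records available inputs
    #     task.run(...)                          # calls qbb.get()/qbb.put()
    #     qbb.markInputUnused(unused_ref)        # optional per-task cleanup
    #     provenance = qbb.extract_provenance_data()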


class QuantumProvenanceData(pydantic.BaseModel):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the ``predicted_inputs`` and ``predicted_outputs`` sets should have the
    same IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` method ``parse_raw()`` is not going to work
    correctly for this class; use the `direct` method instead.
    """
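
    # Round-trip sketch for the note above; the file name is illustrative and
    # ``json`` is assumed to be imported by the caller:
    #
    #     with open("provenance.json") as stream:
    #         provenance = QuantumProvenanceData.direct(**json.load(stream))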

    # This class probably should have information about its execution
    # environment (anything not controlled and recorded at the
    # `~CollectionType.RUN` level, such as the compute node ID), but adding it
    # now is out of scope for this prototype.

    predicted_inputs: set[uuid.UUID]
    """Unique IDs of datasets that were predicted as inputs to this quantum
    when the QuantumGraph was built.
    """

    available_inputs: set[uuid.UUID]
    """Unique IDs of input datasets that were actually present in the datastore
    when this quantum was executed.

    This is a subset of ``predicted_inputs``, with the difference generally
    being datasets that were ``predicted_outputs`` but not ``actual_outputs``
    of some upstream task.
    """

    actual_inputs: set[uuid.UUID]
    """Unique IDs of datasets that were actually used as inputs by this task.

    This is a subset of ``available_inputs``.

    Notes
    -----
    The criterion for marking an input as used is that rerunning the quantum
    with only these ``actual_inputs`` available must yield identical outputs.
    This means that (for example) even just using an input to help determine
    an output rejection criterion and then rejecting it as an outlier
    qualifies that input as actually used.
    """

    predicted_outputs: set[uuid.UUID]
    """Unique IDs of datasets that were predicted as outputs of this quantum
    when the QuantumGraph was built.
    """

    actual_outputs: set[uuid.UUID]
    """Unique IDs of datasets that were actually written when this quantum
    was executed.
    """

    datastore_records: dict[str, SerializedDatastoreRecordData]
    """Datastore records indexed by datastore name."""

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `~collections.abc.Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `~collections.abc.Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
        """
        grouped_refs = defaultdict(list)
        summary_records: dict[str, DatastoreRecordData] = {}
        for quantum, provenance_for_quantum in zip(quanta, provenance, strict=True):
            quantum_refs_by_id = {
                ref.id: ref
                for ref in itertools.chain.from_iterable(quantum.outputs.values())
                if ref.id in provenance_for_quantum.actual_outputs
            }
            for ref in quantum_refs_by_id.values():
                grouped_refs[ref.datasetType, ref.run].append(ref)

            # Merge datastore records into a summary structure.
            for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
                quantum_records = DatastoreRecordData.from_simple(serialized_records)
                if (records := summary_records.get(datastore_name)) is not None:
                    records.update(quantum_records)
                else:
                    summary_records[datastore_name] = quantum_records

        for refs in grouped_refs.values():
            butler.registry._importDatasets(refs)
        butler._datastore.import_records(summary_records)
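
    # Illustrative use of the per-task-label grouping recommended above;
    # ``quanta_for_label`` and ``provenance_for_label`` are assumed to be
    # assembled by the caller, in matching order:
    #
    #     QuantumProvenanceData.collect_and_transfer(
    #         butler, quanta_for_label, provenance_for_label
    #     )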

    @classmethod
    def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
        raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")

    @classmethod
    def direct(
        cls,
        *,
        predicted_inputs: Iterable[str | uuid.UUID],
        available_inputs: Iterable[str | uuid.UUID],
        actual_inputs: Iterable[str | uuid.UUID],
        predicted_outputs: Iterable[str | uuid.UUID],
        actual_outputs: Iterable[str | uuid.UUID],
        datastore_records: Mapping[str, Mapping],
    ) -> QuantumProvenanceData:
        """Construct an instance directly without validators.

        Parameters
        ----------
        predicted_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
            The predicted inputs.
        available_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
            The available inputs.
        actual_inputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
            The actual inputs.
        predicted_outputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
            The predicted outputs.
        actual_outputs : `~collections.abc.Iterable` of `str` or `uuid.UUID`
            The actual outputs.
        datastore_records : `~collections.abc.Mapping` [ `str`, \
                `~collections.abc.Mapping` ]
            The datastore records.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            Serializable model of the quantum provenance.

        Notes
        -----
        This differs from the Pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """

        def _to_uuid_set(uuids: Iterable[str | uuid.UUID]) -> set[uuid.UUID]:
            """Convert input UUIDs, which could be in string representation,
            to a set of `UUID` instances.
            """
            return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}

        data = cls.model_construct(
            predicted_inputs=_to_uuid_set(predicted_inputs),
            available_inputs=_to_uuid_set(available_inputs),
            actual_inputs=_to_uuid_set(actual_inputs),
            predicted_outputs=_to_uuid_set(predicted_outputs),
            actual_outputs=_to_uuid_set(actual_outputs),
            datastore_records={
                key: SerializedDatastoreRecordData.direct(**records)
                for key, records in datastore_records.items()
            },
        )

        return data