Coverage for python/lsst/daf/butler/_quantum_backed.py: 26%

183 statements  

coverage.py v7.2.6, created at 2023-05-26 02:11 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union

from deprecated.sphinx import deprecated
from pydantic import BaseModel

from ._butlerConfig import ButlerConfig
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    SerializedDatastoreRecordData,
    StorageClass,
    StorageClassFactory,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)


class _DatasetRecordStorageManagerDatastoreConstructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec


class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be read from this butler.
    predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be stored in this butler.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.
    dataset_types : `Mapping` [`str`, `DatasetType`], optional
        Mapping of the dataset type name to its registry definition.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is).
    These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
    """

    def __init__(
        self,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore: Datastore,
        storageClasses: StorageClassFactory,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ):
        self._dimensions = dimensions
        self._predicted_inputs = set(predicted_inputs)
        self._predicted_outputs = set(predicted_outputs)
        self._available_inputs: Set[DatasetId] = set()
        self._unavailable_inputs: Set[DatasetId] = set()
        self._actual_inputs: Set[DatasetId] = set()
        self._actual_output_refs: Set[DatasetRef] = set()
        self.datastore = datastore
        self.storageClasses = storageClasses
        self._dataset_types: Mapping[str, DatasetType] = {}
        if dataset_types is not None:
            self._dataset_types = dataset_types
        self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)

    @classmethod
    def initialize(
        cls,
        config: Union[Config, str],
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for
            datastore location records. Default is a SQL-backed
            implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        predicted_inputs = [ref.id for ref in itertools.chain.from_iterable(quantum.inputs.values())]
        predicted_inputs += [ref.id for ref in quantum.initInputs.values()]
        predicted_outputs = [ref.id for ref in itertools.chain.from_iterable(quantum.outputs.values())]
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=quantum.datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )
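
    # Editorial note: a minimal usage sketch, not part of daf_butler itself.
    # The repository path and the ``quantum``/``universe`` objects below are
    # hypothetical placeholders assumed to be supplied by an execution
    # harness:
    #
    #     qbb = QuantumBackedButler.initialize(
    #         config="/path/to/repo/butler.yaml",
    #         quantum=quantum,
    #         dimensions=universe,
    #     )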

    @classmethod
    def from_predicted(
        cls,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore_records: Mapping[str, DatastoreRecordData],
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from sets of input and output
        dataset IDs.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler; these
            must be fully resolved.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for
            datastore location records. Default is a SQL-backed
            implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
            dataset_types=dataset_types,
        )
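
    # Editorial note: an illustrative sketch, not part of daf_butler. When no
    # `Quantum` object is at hand, a butler can be built from explicit dataset
    # ID sets; ``input_ids``, ``output_ids``, and ``records`` here are
    # hypothetical values already known to the caller:
    #
    #     qbb = QuantumBackedButler.from_predicted(
    #         config="/path/to/repo",
    #         predicted_inputs=input_ids,
    #         predicted_outputs=output_ids,
    #         dimensions=universe,
    #         datastore_records=records,
    #     )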

    @classmethod
    def _initialize(
        cls,
        *,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
        dataset_types: Mapping[str, DatasetType] | None = None,
    ) -> QuantumBackedButler:
        """Internal method with the common implementation used by `initialize`
        and `from_predicted`.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for
            datastore location records. Default is a SQL-backed
            implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        dataset_types : `Mapping` [`str`, `DatasetType`], optional
            Mapping of the dataset type name to its registry definition.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        if "root" in butler_config:
            butler_root = butler_config["root"]
        else:
            butler_root = butler_config.configDir
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        if datastore_records is not None:
            datastore.import_records(datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(
            predicted_inputs,
            predicted_outputs,
            dimensions,
            datastore,
            storageClasses=storageClasses,
            dataset_types=dataset_types,
        )

    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        return self._dataset_types.get(name)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    @deprecated(
        reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
        " Please use Butler.get(). Will be removed after v27.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirect(
        self,
        ref: DatasetRef,
        *,
        parameters: Optional[Dict[str, Any]] = None,
        storageClass: str | StorageClass | None = None,
    ) -> Any:
        # Docstring inherited.
        return self.get(ref, parameters=parameters, storageClass=storageClass)

    def get(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        try:
            obj = super().get(
                ref,
                parameters=parameters,
                storageClass=storageClass,
            )
        except (LookupError, FileNotFoundError, IOError):
            self._unavailable_inputs.add(ref.id)
            raise
        if ref.id in self._predicted_inputs:
            # Do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    @deprecated(
        reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
        "Please use Butler.getDeferred(). Will be removed after v27.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirectDeferred(
        self,
        ref: DatasetRef,
        *,
        parameters: Union[dict, None] = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        return self.getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def getDeferred(
        self,
        ref: DatasetRef,
        /,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDeferred(ref, parameters=parameters, storageClass=storageClass)

    def datasetExistsDirect(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        exists = super().datasetExistsDirect(ref)
        if ref.id in self._predicted_inputs:
            if exists:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return exists

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.id)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self.datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref
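
    # Editorial note: an illustrative sketch, not part of daf_butler. During
    # task execution the harness reads predicted inputs and writes predicted
    # outputs through this butler; ``input_ref``, ``output_ref``, and
    # ``result`` are hypothetical placeholders:
    #
    #     in_memory = qbb.get(input_ref)    # records an "actual input"
    #     qbb.put(result, output_ref)       # records an "actual output"
    #     qbb.markInputUnused(input_ref)    # optional: drop it from provenance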

    def pruneDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        disassociate: bool = True,
        unstore: bool = False,
        tags: Iterable[str] = (),
        purge: bool = False,
    ) -> None:
        # Docstring inherited from LimitedButler.

        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            # No tagged collections for this butler.
            raise TypeError("Cannot pass disassociate=True without purge=True.")

        refs = list(refs)

        # Pruning a component of a DatasetRef makes no sense.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")

        if unstore:
            self.datastore.trash(refs)
        if purge:
            for ref in refs:
                # We only care about removing them from actual output refs.
                self._actual_output_refs.discard(ref)

        if unstore:
            # Point of no return for removing artifacts.
            self.datastore.emptyTrash()

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
        """
        if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
            _LOG.warning(
                "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle "
                "was obtained), but did not actually exist. This task should be using markInputUnused "
                "directly to clarify its provenance.",
                self._actual_inputs & self._unavailable_inputs,
            )
            self._actual_inputs -= self._unavailable_inputs
        checked_inputs = self._available_inputs | self._unavailable_inputs
        if self._predicted_inputs != checked_inputs:
            _LOG.warning(
                "Execution harness did not check predicted inputs %s for existence; available inputs "
                "recorded in provenance may be incomplete.",
                self._predicted_inputs - checked_inputs,
            )
        datastore_records = self.datastore.export_records(self._actual_output_refs)
        provenance_records = {
            datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
        }

        return QuantumProvenanceData(
            predicted_inputs=self._predicted_inputs,
            available_inputs=self._available_inputs,
            actual_inputs=self._actual_inputs,
            predicted_outputs=self._predicted_outputs,
            actual_outputs={ref.id for ref in self._actual_output_refs},
            datastore_records=provenance_records,
        )
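

# Editorial addition: a minimal end-to-end sketch of the per-quantum workflow
# described in the `QuantumBackedButler` docstring. It is illustrative only
# and not part of daf_butler; the repository config, the `Quantum`, the
# `DimensionUniverse`, and the output path are assumptions supplied by the
# caller, and the task-specific processing is reduced to an identity step.
def _example_execute_one_quantum_sketch(
    config: str, quantum: Quantum, universe: DimensionUniverse, provenance_path: str
) -> None:
    # Build a butler backed by an in-memory SQLite database for this quantum.
    qbb = QuantumBackedButler.initialize(config=config, quantum=quantum, dimensions=universe)
    # Check all predicted inputs so the "available inputs" set is populated.
    input_refs = [
        ref
        for ref in itertools.chain.from_iterable(quantum.inputs.values())
        if qbb.datasetExistsDirect(ref)
    ]
    # Read each available input and write it back out to a predicted output,
    # as a stand-in for real task execution.
    output_refs = list(itertools.chain.from_iterable(quantum.outputs.values()))
    for in_ref, out_ref in zip(input_refs, output_refs):
        obj = qbb.get(in_ref)
        qbb.put(obj, out_ref)
    # Serialize provenance and datastore records with pydantic and save them
    # to a per-quantum JSON file.
    provenance = qbb.extract_provenance_data()
    with open(provenance_path, "w") as stream:
        stream.write(provenance.json())
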

class QuantumProvenanceData(BaseModel):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the `predicted_inputs` and `predicted_outputs` sets should have the same
    IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` method ``parse_raw()`` does not work correctly
    for this class; use the `direct` method instead.
    """

    # This class probably should have information about its execution
    # environment (anything not controlled and recorded at the
    # `~CollectionType.RUN` level, such as the compute node ID), but adding it
    # now is out of scope for this prototype.

    predicted_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as inputs to this quantum
    when the QuantumGraph was built.
    """

    available_inputs: Set[uuid.UUID]
    """Unique IDs of input datasets that were actually present in the
    datastore when this quantum was executed.

    This is a subset of `predicted_inputs`, with the difference generally
    being datasets that were `predicted_outputs` but not `actual_outputs` of
    some upstream task.
    """

    actual_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually used as inputs by this task.

    This is a subset of `available_inputs`.

    Notes
    -----
    The criterion for marking an input as used is that rerunning the quantum
    with only these `actual_inputs` available must yield identical outputs.
    This means that (for example) even just using an input to help determine
    an output rejection criterion and then rejecting it as an outlier
    qualifies that input as actually used.
    """

    predicted_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as outputs of this quantum
    when the QuantumGraph was built.
    """

    actual_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually written when this quantum
    was executed.
    """

    datastore_records: Dict[str, SerializedDatastoreRecordData]
    """Datastore records indexed by datastore name."""

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
        """
        grouped_refs = defaultdict(list)
        summary_records: Dict[str, DatastoreRecordData] = {}
        for quantum, provenance_for_quantum in zip(quanta, provenance):
            quantum_refs_by_id = {
                ref.id: ref
                for ref in itertools.chain.from_iterable(quantum.outputs.values())
                if ref.id in provenance_for_quantum.actual_outputs
            }
            for ref in quantum_refs_by_id.values():
                grouped_refs[ref.datasetType, ref.run].append(ref)

            # Merge datastore records into a summary structure.
            for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
                quantum_records = DatastoreRecordData.from_simple(serialized_records)
                if (records := summary_records.get(datastore_name)) is not None:
                    records.update(quantum_records)
                else:
                    summary_records[datastore_name] = quantum_records

        for refs in grouped_refs.values():
            butler.registry._importDatasets(refs)
        butler.datastore.import_records(summary_records)
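
    # Editorial note: an illustrative sketch, not part of daf_butler. After a
    # batch of quanta has run, their outputs can be ingested into a full
    # butler; ``full_butler``, ``quanta``, and ``provenance_list`` are
    # hypothetical values gathered by the workflow system, with the two
    # sequences in the same order:
    #
    #     QuantumProvenanceData.collect_and_transfer(
    #         full_butler, quanta, provenance_list
    #     )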

    @classmethod
    def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
        raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")

    @classmethod
    def direct(
        cls,
        *,
        predicted_inputs: Iterable[Union[str, uuid.UUID]],
        available_inputs: Iterable[Union[str, uuid.UUID]],
        actual_inputs: Iterable[Union[str, uuid.UUID]],
        predicted_outputs: Iterable[Union[str, uuid.UUID]],
        actual_outputs: Iterable[Union[str, uuid.UUID]],
        datastore_records: Mapping[str, Mapping],
    ) -> QuantumProvenanceData:
        """Construct an instance directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """

        def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]:
            """Convert input UUIDs, which could be in string representation,
            to a set of `UUID` instances.
            """
            return {uuid.UUID(id) if isinstance(id, str) else id for id in uuids}

        data = QuantumProvenanceData.__new__(cls)
        setter = object.__setattr__
        setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
        setter(data, "available_inputs", _to_uuid_set(available_inputs))
        setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
        setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
        setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
        setter(
            data,
            "datastore_records",
            {
                key: SerializedDatastoreRecordData.direct(**records)
                for key, records in datastore_records.items()
            },
        )
        return data
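

# Editorial addition: an illustrative sketch, not part of daf_butler. Because
# ``parse_raw()`` is disabled for this model, a per-quantum provenance file is
# assumed here to have been written with ``QuantumProvenanceData.json()`` and
# is read back by handing the loaded keyword arguments to
# `QuantumProvenanceData.direct`. The file path is a hypothetical input.
def _example_read_provenance_sketch(provenance_path: str) -> QuantumProvenanceData:
    import json

    with open(provenance_path, "r") as stream:
        simple = json.load(stream)
    # ``direct`` converts string UUIDs back to `uuid.UUID` and rebuilds the
    # serialized datastore records without running pydantic validators.
    return QuantumProvenanceData.direct(**simple)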