Coverage for python/lsst/daf/butler/_quantum_backed.py: 25%

173 statements  


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union

from pydantic import BaseModel

from ._butlerConfig import ButlerConfig
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    SerializedDatastoreRecordData,
    StorageClass,
    StorageClassFactory,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)


class _DatasetRecordStorageManagerDatastoreConstructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec


class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be read from this butler.
    predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
        Dataset IDs for datasets that can be stored in this butler.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is).
    These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file, as sketched below.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
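
    As a rough sketch of that JSON-based workflow (the names here are
    illustrative; the ``config``, ``quantum``, and ``universe`` objects would
    be supplied by the execution harness)::

        butler = QuantumBackedButler.initialize(
            config=config, quantum=quantum, dimensions=universe
        )
        # ... run the task, reading inputs with getDirect() and writing
        # outputs with putDirect() ...
        provenance = butler.extract_provenance_data()
        with open("provenance.json", "w") as stream:
            stream.write(provenance.json())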

    """

    def __init__(
        self,
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore: Datastore,
        storageClasses: StorageClassFactory,
    ):
        self._dimensions = dimensions
        self._predicted_inputs = set(predicted_inputs)
        self._predicted_outputs = set(predicted_outputs)
        self._available_inputs: Set[DatasetId] = set()
        self._unavailable_inputs: Set[DatasetId] = set()
        self._actual_inputs: Set[DatasetId] = set()
        self._actual_output_refs: Set[DatasetRef] = set()
        self.datastore = datastore
        self.storageClasses = storageClasses

    @classmethod
    def initialize(
        cls,
        config: Union[Config, str],
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        """
        predicted_inputs = [
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.inputs.values())
        ]
        predicted_inputs += [ref.getCheckedId() for ref in quantum.initInputs.values()]
        predicted_outputs = [
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.outputs.values())
        ]
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=quantum.datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
        )

    @classmethod
    def from_predicted(
        cls,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        datastore_records: Mapping[str, DatastoreRecordData],
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from sets of input and output
        dataset IDs.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler; these
            must be fully resolved.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        """
        return cls._initialize(
            config=config,
            predicted_inputs=predicted_inputs,
            predicted_outputs=predicted_outputs,
            dimensions=dimensions,
            filename=filename,
            datastore_records=datastore_records,
            OpaqueManagerClass=OpaqueManagerClass,
            BridgeManagerClass=BridgeManagerClass,
            search_paths=search_paths,
        )

    @classmethod
    def _initialize(
        cls,
        *,
        config: Union[Config, str],
        predicted_inputs: Iterable[DatasetId],
        predicted_outputs: Iterable[DatasetId],
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
    ) -> QuantumBackedButler:
        """Internal method with common implementation used by `initialize`
        and `from_predicted`.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        predicted_inputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be read from this butler.
        predicted_outputs : `~collections.abc.Iterable` [`DatasetId`]
            Dataset IDs for datasets that can be stored in this butler.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        datastore_records : `dict` [`str`, `DatastoreRecordData`] or `None`
            Datastore records to import into a datastore.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        if "root" in butler_config:
            butler_root = butler_config["root"]
        else:
            butler_root = butler_config.configDir
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        if datastore_records is not None:
            datastore.import_records(datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(predicted_inputs, predicted_outputs, dimensions, datastore, storageClasses=storageClasses)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    def getDirect(
        self,
        ref: DatasetRef,
        *,
        parameters: Optional[Dict[str, Any]] = None,
        storageClass: str | StorageClass | None = None,
    ) -> Any:
        # Docstring inherited.
        try:
            obj = super().getDirect(ref, parameters=parameters, storageClass=storageClass)
        except (LookupError, FileNotFoundError, IOError):
            self._unavailable_inputs.add(ref.getCheckedId())
            raise
        if ref.id in self._predicted_inputs:
            # Do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    def getDirectDeferred(
        self,
        ref: DatasetRef,
        *,
        parameters: Union[dict, None] = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDirectDeferred(ref, parameters=parameters, storageClass=storageClass)

    def datasetExistsDirect(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        exists = super().datasetExistsDirect(ref)
        if ref.id in self._predicted_inputs:
            if exists:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return exists

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.getCheckedId())

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self.datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def pruneDatasets(
        self,
        refs: Iterable[DatasetRef],
        *,
        disassociate: bool = True,
        unstore: bool = False,
        tags: Iterable[str] = (),
        purge: bool = False,
    ) -> None:
        # Docstring inherited from LimitedButler.

        if purge:
            if not disassociate:
                raise TypeError("Cannot pass purge=True without disassociate=True.")
            if not unstore:
                raise TypeError("Cannot pass purge=True without unstore=True.")
        elif disassociate:
            # No tagged collections for this butler.
            raise TypeError("Cannot pass disassociate=True without purge=True.")

        refs = list(refs)

        # Pruning a component of a DatasetRef makes no sense.
        for ref in refs:
            if ref.datasetType.component():
                raise ValueError(f"Can not prune a component of a dataset (ref={ref})")

        if unstore:
            self.datastore.trash(refs)
        if purge:
            for ref in refs:
                # We only care about removing them from actual output refs.
                self._actual_output_refs.discard(ref)

        if unstore:
            # Point of no return for removing artifacts.
            self.datastore.emptyTrash()

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
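
        As a minimal, illustrative sketch of that assumed harness pattern
        (``refs`` here is a hypothetical iterable of the quantum's resolved
        input `DatasetRef` objects)::

            for ref in refs:
                butler.datasetExistsDirect(ref)
            # ... run the task ...
            provenance = butler.extract_provenance_data()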

        """
        if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
            _LOG.warning(
                "Inputs %s were marked as actually used (probably because a "
                "DeferredDatasetHandle was obtained), but did not actually exist. This task "
                "should be using markInputUnused directly to clarify its provenance.",
                self._actual_inputs & self._unavailable_inputs,
            )
            self._actual_inputs -= self._unavailable_inputs
        checked_inputs = self._available_inputs | self._unavailable_inputs
        if self._predicted_inputs != checked_inputs:
            _LOG.warning(
                "Execution harness did not check predicted inputs %s for existence; available inputs "
                "recorded in provenance may be incomplete.",
                self._predicted_inputs - checked_inputs,
            )
        datastore_records = self.datastore.export_records(self._actual_output_refs)
        provenance_records = {
            datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
        }

        return QuantumProvenanceData(
            predicted_inputs=self._predicted_inputs,
            available_inputs=self._available_inputs,
            actual_inputs=self._actual_inputs,
            predicted_outputs=self._predicted_outputs,
            actual_outputs={ref.getCheckedId() for ref in self._actual_output_refs},
            datastore_records=provenance_records,
        )


class QuantumProvenanceData(BaseModel):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the `predicted_inputs` and `predicted_outputs` sets should have the same
    IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` method ``parse_raw()`` is not going to work
    correctly for this class; use the `direct` method instead.
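
    A rough sketch of the intended round trip, assuming the provenance was
    written with the standard ``pydantic`` serializer (for example via
    ``QuantumBackedButler.extract_provenance_data().json()``)::

        import json

        with open("provenance.json") as stream:
            data = json.load(stream)
        provenance = QuantumProvenanceData.direct(**data)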

    """

    # This class probably should have information about its execution
    # environment (anything not controlled and recorded at the
    # `~CollectionType.RUN` level, such as the compute node ID), but adding it
    # now is out of scope for this prototype.

    predicted_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as inputs to this quantum
    when the QuantumGraph was built.
    """

    available_inputs: Set[uuid.UUID]
    """Unique IDs of input datasets that were actually present in the
    datastore when this quantum was executed.

    This is a subset of `predicted_inputs`, with the difference generally
    being datasets that were `predicted_outputs` but not `actual_outputs` of
    some upstream task.
    """

    actual_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually used as inputs by this task.

    This is a subset of `available_inputs`.

    Notes
    -----
    The criterion for marking an input as used is that rerunning the quantum
    with only these `actual_inputs` available must yield identical outputs.
    This means that (for example) even just using an input to help determine
    an output rejection criterion and then rejecting it as an outlier
    qualifies that input as actually used.
    """

    predicted_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as outputs of this quantum
    when the QuantumGraph was built.
    """

    actual_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually written when this quantum
    was executed.
    """

    datastore_records: Dict[str, SerializedDatastoreRecordData]
    """Datastore records indexed by datastore name."""

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
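
        As a rough sketch of the expected call pattern (``quanta`` and the
        per-quantum ``provenance_files``, given in the same order, are
        hypothetical products of an earlier execution step)::

            import json

            provenance = []
            for filename in provenance_files:
                with open(filename) as stream:
                    provenance.append(QuantumProvenanceData.direct(**json.load(stream)))
            QuantumProvenanceData.collect_and_transfer(butler, quanta, provenance)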

        """
        grouped_refs = defaultdict(list)
        summary_records: Dict[str, DatastoreRecordData] = {}
        for quantum, provenance_for_quantum in zip(quanta, provenance):
            quantum_refs_by_id = {
                ref.getCheckedId(): ref
                for ref in itertools.chain.from_iterable(quantum.outputs.values())
                if ref.getCheckedId() in provenance_for_quantum.actual_outputs
            }
            for ref in quantum_refs_by_id.values():
                grouped_refs[ref.datasetType, ref.run].append(ref)

            # Merge datastore records into a summary structure.
            for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
                quantum_records = DatastoreRecordData.from_simple(serialized_records)
                if (records := summary_records.get(datastore_name)) is not None:
                    records.update(quantum_records)
                else:
                    summary_records[datastore_name] = quantum_records

        for refs in grouped_refs.values():
            butler.registry._importDatasets(refs)
        butler.datastore.import_records(summary_records)

    @classmethod
    def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
        raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")

    @classmethod
    def direct(
        cls,
        *,
        predicted_inputs: Iterable[Union[str, uuid.UUID]],
        available_inputs: Iterable[Union[str, uuid.UUID]],
        actual_inputs: Iterable[Union[str, uuid.UUID]],
        predicted_outputs: Iterable[Union[str, uuid.UUID]],
        actual_outputs: Iterable[Union[str, uuid.UUID]],
        datastore_records: Mapping[str, Mapping],
    ) -> QuantumProvenanceData:
        """Construct an instance directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """

        def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]:
            """Convert input UUIDs, which could be in string representation,
            to a set of `UUID` instances.
            """
            return set(uuid.UUID(id) if isinstance(id, str) else id for id in uuids)

        data = QuantumProvenanceData.__new__(cls)
        setter = object.__setattr__
        setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
        setter(data, "available_inputs", _to_uuid_set(available_inputs))
        setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
        setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
        setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
        setter(
            data,
            "datastore_records",
            {
                key: SerializedDatastoreRecordData.direct(**records)
                for key, records in datastore_records.items()
            },
        )
        return data