Coverage for python/lsst/daf/butler/_quantum_backed.py: 25%

164 statements  

coverage.py v6.5.0, created at 2022-11-19 01:58 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("QuantumBackedButler", "QuantumProvenanceData") 

25 

26import itertools 

27import logging 

28import uuid 

29from collections import defaultdict 

30from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union 

31 

32from pydantic import BaseModel 

33 

34from ._butlerConfig import ButlerConfig 

35from ._deferredDatasetHandle import DeferredDatasetHandle 

36from ._limited_butler import LimitedButler 

37from .core import ( 

38 Config, 

39 DatasetId, 

40 DatasetRef, 

41 Datastore, 

42 DatastoreRecordData, 

43 DimensionUniverse, 

44 Quantum, 

45 SerializedDatastoreRecordData, 

46 StorageClass, 

47 StorageClassFactory, 

48 ddl, 

49) 

50from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager 

51from .registry.databases.sqlite import SqliteDatabase 

52from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager 

53from .registry.opaque import ByNameOpaqueTableStorageManager 

54 

55if TYPE_CHECKING:  # coverage: line 55 never jumped to line 56 (condition never true at runtime)

56 from ._butler import Butler 

57 

58_LOG = logging.getLogger(__name__) 

59 

60 

61class _DatasetRecordStorageManagerDatastoreConstructionMimic: 

62 """A partial implementation of `DatasetRecordStorageManager` that exists 

63 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`) 

64 to be constructed without a full `Registry`. 

65 

66 Notes 

67 ----- 

68 The interface implemented by this class should probably be its own ABC, 

69 and that ABC should probably be used in the definition of 

70 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep 

71 changes minimal. 

72 """ 

73 

74 @classmethod 

75 def getIdColumnType(cls) -> type: 

76 # Docstring inherited. 

77 return ddl.GUID 

78 

79 @classmethod 

80 def addDatasetForeignKey( 

81 cls, 

82 tableSpec: ddl.TableSpec, 

83 *, 

84 name: str = "dataset", 

85 constraint: bool = True, 

86 onDelete: Optional[str] = None, 

87 **kwargs: Any, 

88 ) -> ddl.FieldSpec: 

89 # Docstring inherited. 

90 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs) 

91 tableSpec.fields.add(idFieldSpec) 

92 return idFieldSpec 

93 

94 

95class QuantumBackedButler(LimitedButler): 

96 """An implementation of `LimitedButler` intended to back execution of a 

97 single `Quantum`. 

98 

99 Parameters 

100 ---------- 

101 quantum : `Quantum` 

102 Object describing the predicted input and output datasets relevant to 

103 this butler. This must have resolved `DatasetRef` instances for all 

104 inputs and outputs. 

105 dimensions : `DimensionUniverse` 

106 Object managing all dimension definitions. 

107 datastore : `Datastore` 

108 Datastore to use for all dataset I/O and existence checks. 

109 storageClasses : `StorageClassFactory` 

110 Object managing all storage class definitions. 

111 

112 Notes 

113 ----- 

114 Most callers should use the `initialize` `classmethod` to construct new 

115 instances instead of calling the constructor directly. 

116 

117 `QuantumBackedButler` uses a SQLite database internally, in order to reuse 

118 existing `DatastoreRegistryBridge` and `OpaqueTableStorage` 

119 implementations that rely on SQLAlchemy. If implementations are added in the 

120 future that don't rely on SQLAlchemy, it should be possible to swap them 

121 in by overriding the type arguments to `initialize` (though at present, 

122 `QuantumBackedButler` would still create at least an in-memory SQLite 

123 database that would then go unused). 

124 

125 We imagine `QuantumBackedButler` being used during (at least) batch 

126 execution to capture `Datastore` records and save them to per-quantum 

127 files, which are also a convenient place to store provenance for eventual 

128 upload to a SQL-backed `Registry` (once `Registry` has tables to store 

129 provenance, that is). 

130 These per-quantum files can be written in two ways: 

131 

132 - The SQLite file used internally by `QuantumBackedButler` can be used 

133 directly by customizing the ``filename`` argument to ``initialize``, and 

134 then transferring that file to the object store after execution completes 

135 (or fails; a ``try/finally`` pattern probably makes sense here). 

136 

137 - A JSON or YAML file can be written by calling `extract_provenance_data`, 

138 and using ``pydantic`` methods to write the returned 

139 `QuantumProvenanceData` to a file. 

140 

141 Note that at present, the SQLite file only contains datastore records, not 

142 provenance, but that should be easy to address (if desired) after we 

143 actually design a `Registry` schema for provenance. I also suspect that 

144 we'll want to explicitly close the SQLite file somehow before trying to 

145 transfer it. But I'm guessing we'd prefer to write the per-quantum files 

146 as JSON anyway. 
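
Examples
--------
A minimal sketch of the JSON-based workflow described above; ``config``,
``quantum``, and ``universe`` are assumed to already exist, and the output
file name is purely illustrative::

    butler = QuantumBackedButler.initialize(
        config=config, quantum=quantum, dimensions=universe
    )
    # ... execute the task, reading inputs with ``getDirect`` and
    # writing outputs with ``putDirect`` ...
    provenance = butler.extract_provenance_data()
    with open("provenance.json", "w") as stream:
        stream.write(provenance.json())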

147 """ 

148 

149 def __init__( 

150 self, 

151 quantum: Quantum, 

152 dimensions: DimensionUniverse, 

153 datastore: Datastore, 

154 storageClasses: StorageClassFactory, 

155 ): 

156 self._quantum = quantum 

157 self._dimensions = dimensions 

158 self._predicted_inputs: Set[DatasetId] = { 

159 ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.inputs.values()) 

160 } 

161 self._predicted_outputs: Set[DatasetId] = { 

162 ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

163 } 

164 self._available_inputs: Set[DatasetId] = set() 

165 self._unavailable_inputs: Set[DatasetId] = set() 

166 self._actual_inputs: Set[DatasetId] = set() 

167 self._actual_output_refs: Set[DatasetRef] = set() 

168 self.datastore = datastore 

169 self.storageClasses = storageClasses 

170 

171 @classmethod 

172 def initialize( 

173 cls, 

174 config: Union[Config, str], 

175 quantum: Quantum, 

176 dimensions: DimensionUniverse, 

177 filename: str = ":memory:", 

178 OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

179 BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

180 search_paths: Optional[List[str]] = None, 

181 ) -> QuantumBackedButler: 

182 """Construct a new `QuantumBackedButler` from repository configuration 

183 and helper types. 

184 

185 Parameters 

186 ---------- 

187 config : `Config` or `str` 

188 A butler repository root, configuration filename, or configuration 

189 instance. 

190 quantum : `Quantum` 

191 Object describing the predicted input and output datasets relevant 

192 to this butler. This must have resolved `DatasetRef` instances for 

193 all inputs and outputs. 

194 dimensions : `DimensionUniverse` 

195 Object managing all dimension definitions. 

196 filename : `str`, optional 

197 Name for the SQLite database that will back this butler; defaults 

198 to an in-memory database. 

199 OpaqueManagerClass : `type`, optional 

200 A subclass of `OpaqueTableStorageManager` to use for datastore 

201 opaque records. Default is a SQL-backed implementation. 

202 BridgeManagerClass : `type`, optional 

203 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

204 location records. Default is a SQL-backed implementation. 

205 search_paths : `list` of `str`, optional 

206 Additional search paths for butler configuration. 
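
Examples
--------
A hedged sketch of backing the butler with an on-disk SQLite file (the
repository path and ``quantum.sqlite`` name are illustrative), so the file
itself can be transferred after execution as described in the class
documentation::

    butler = QuantumBackedButler.initialize(
        config="/path/to/repo",
        quantum=quantum,
        dimensions=universe,
        filename="quantum.sqlite",
    )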

207 """ 

208 butler_config = ButlerConfig(config, searchPaths=search_paths) 

209 if "root" in butler_config: 

210 butler_root = butler_config["root"] 

211 else: 

212 butler_root = butler_config.configDir 

213 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0) 

214 with db.declareStaticTables(create=True) as context: 

215 opaque_manager = OpaqueManagerClass.initialize(db, context) 

216 bridge_manager = BridgeManagerClass.initialize( 

217 db, 

218 context, 

219 opaque=opaque_manager, 

220 # MyPy can tell it's a fake, but we know it shouldn't care. 

221 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore 

222 universe=dimensions, 

223 ) 

224 # TODO: We need to inform `Datastore` here that it needs to support 

225 # predictive reads; right now that's a configuration option, but after 

226 # execution butler is retired it could just be a kwarg we pass here. 

227 # For now just force this option as we cannot work without it. 

228 butler_config["datastore", "trust_get_request"] = True 

229 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root) 

230 datastore.import_records(quantum.datastore_records) 

231 storageClasses = StorageClassFactory() 

232 storageClasses.addFromConfig(butler_config) 

233 return cls(quantum, dimensions, datastore, storageClasses=storageClasses) 

234 

235 def isWriteable(self) -> bool: 

236 # Docstring inherited. 

237 return True 

238 

239 def getDirect( 

240 self, 

241 ref: DatasetRef, 

242 *, 

243 parameters: Optional[Dict[str, Any]] = None, 

244 storageClass: str | StorageClass | None = None, 

245 ) -> Any: 

246 # Docstring inherited. 

247 try: 

248 obj = super().getDirect(ref, parameters=parameters, storageClass=storageClass) 

249 except (LookupError, FileNotFoundError, IOError): 

250 self._unavailable_inputs.add(ref.getCheckedId()) 

251 raise 

252 if ref.id in self._predicted_inputs: 

253 # do this after delegating to super in case that raises. 

254 self._actual_inputs.add(ref.id) 

255 self._available_inputs.add(ref.id) 

256 return obj 

257 

258 def getDirectDeferred( 

259 self, 

260 ref: DatasetRef, 

261 *, 

262 parameters: Union[dict, None] = None, 

263 storageClass: str | StorageClass | None = None, 

264 ) -> DeferredDatasetHandle: 

265 # Docstring inherited. 

266 if ref.id in self._predicted_inputs: 

267 # Unfortunately, we can't do this after the handle succeeds in 

268 # loading, so it's conceivable here that we're marking an input 

269 # as "actual" even when it's not even available. 

270 self._actual_inputs.add(ref.id) 

271 return super().getDirectDeferred(ref, parameters=parameters, storageClass=storageClass) 

272 

273 def datasetExistsDirect(self, ref: DatasetRef) -> bool: 

274 # Docstring inherited. 

275 exists = super().datasetExistsDirect(ref) 

276 if ref.id in self._predicted_inputs: 

277 if exists: 

278 self._available_inputs.add(ref.id) 

279 else: 

280 self._unavailable_inputs.add(ref.id) 

281 return exists 

282 

283 def markInputUnused(self, ref: DatasetRef) -> None: 

284 # Docstring inherited. 

285 self._actual_inputs.discard(ref.getCheckedId()) 

286 

287 @property 

288 def dimensions(self) -> DimensionUniverse: 

289 # Docstring inherited. 

290 return self._dimensions 

291 

292 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef: 

293 # Docstring inherited. 

294 if ref.id not in self._predicted_outputs: 

295 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.") 

296 self.datastore.put(obj, ref) 

297 self._actual_output_refs.add(ref) 

298 return ref 

299 

300 def pruneDatasets( 

301 self, 

302 refs: Iterable[DatasetRef], 

303 *, 

304 disassociate: bool = True, 

305 unstore: bool = False, 

306 tags: Iterable[str] = (), 

307 purge: bool = False, 

308 ) -> None: 

309 # docstring inherited from LimitedButler 

310 

311 if purge: 

312 if not disassociate: 

313 raise TypeError("Cannot pass purge=True without disassociate=True.") 

314 if not unstore: 

315 raise TypeError("Cannot pass purge=True without unstore=True.") 

316 elif disassociate: 

317 # No tagged collections for this butler. 

318 raise TypeError("Cannot pass disassociate=True without purge=True.") 

319 

320 refs = list(refs) 

321 

322 # Pruning a component of a DatasetRef makes no sense. 

323 for ref in refs: 

324 if ref.datasetType.component(): 

325 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

326 

327 if unstore: 

328 self.datastore.trash(refs) 

329 if purge: 

330 for ref in refs: 

331 # We only care about removing them from actual output refs. 

332 self._actual_output_refs.discard(ref) 

333 

334 if unstore: 

335 # Point of no return for removing artifacts 

336 self.datastore.emptyTrash() 

337 

338 def extract_provenance_data(self) -> QuantumProvenanceData: 

339 """Extract provenance information and datastore records from this 

340 butler. 

341 

342 Returns 

343 ------- 

344 provenance : `QuantumProvenanceData` 

345 A serializable struct containing input/output dataset IDs and 

346 datastore records. This assumes all dataset IDs are UUIDs (just to 

347 make it easier for `pydantic` to reason about the struct's types); 

348 the rest of this class makes no such assumption, but the approach 

349 to processing in which it's useful effectively requires UUIDs 

350 anyway. 

351 

352 Notes 

353 ----- 

354 `QuantumBackedButler` records this provenance information when its 

355 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask` 

356 authors from having to worry about provenance, while still recording very 

357 detailed information. But it has two small weaknesses: 

358 

359 - Calling `getDirectDeferred` or `getDirect` is enough to mark a 

360 dataset as an "actual input", which may mark some datasets that 

361 aren't actually used. We rely on task authors to use 

362 `markInputUnused` to address this. 

363 

364 - We assume that the execution system will call ``datasetExistsDirect`` 

365 on all predicted inputs prior to execution, in order to populate the 

366 "available inputs" set. This is what I envision 

367 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it 

368 to use this class, but it feels fragile for this class to make such 

369 a strong assumption about how it will be used, even if I can't think 

370 of any other executor behavior that would make sense. 
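
Examples
--------
A hedged executor-side sketch (``butler`` and ``quantum`` are assumed to
exist already; this is not the actual `~lsst.ctrl.mpexec` code) showing the
existence checks assumed above, followed by provenance extraction::

    import itertools

    for ref in itertools.chain.from_iterable(quantum.inputs.values()):
        butler.datasetExistsDirect(ref)  # records (un)available inputs
    # ... run the task ...
    provenance = butler.extract_provenance_data()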

371 """ 

372 if not self._actual_inputs.isdisjoint(self._unavailable_inputs): 

373 _LOG.warning( 

374 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) " 

375 "was obtained, but did not actually exist. This task should be be using markInputUnused " 

376 "directly to clarify its provenance.", 

377 self._actual_inputs & self._unavailable_inputs, 

378 ) 

379 self._actual_inputs -= self._unavailable_inputs 

380 checked_inputs = self._available_inputs | self._unavailable_inputs 

381 if self._predicted_inputs != checked_inputs: 

382 _LOG.warning( 

383 "Execution harness did not check predicted inputs %s for existence; available inputs " 

384 "recorded in provenance may be incomplete.", 

385 self._predicted_inputs - checked_inputs, 

386 ) 

387 datastore_records = self.datastore.export_records(self._actual_output_refs) 

388 provenance_records = { 

389 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items() 

390 } 

391 

392 return QuantumProvenanceData( 

393 predicted_inputs=self._predicted_inputs, 

394 available_inputs=self._available_inputs, 

395 actual_inputs=self._actual_inputs, 

396 predicted_outputs=self._predicted_outputs, 

397 actual_outputs={ref.getCheckedId() for ref in self._actual_output_refs}, 

398 datastore_records=provenance_records, 

399 ) 

400 

401 

402class QuantumProvenanceData(BaseModel): 

403 """A serializable struct for per-quantum provenance information and 

404 datastore records. 

405 

406 Notes 

407 ----- 

408 This class slightly duplicates information from the `Quantum` class itself 

409 (the `predicted_inputs` and `predicted_outputs` sets should have the same 

410 IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it 

411 assumes the original `Quantum` is also available to reconstruct the 

412 complete provenance (e.g. by associating dataset IDs with data IDs, 

413 dataset types, and `~CollectionType.RUN` names). 

414 

415 Note that the ``pydantic`` method ``parse_raw()`` does not work 

416 correctly for this class; use the `direct` method instead. 
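
A hedged sketch of that round trip (``provenance`` is an existing instance;
whether `direct` accepts the raw JSON mapping for ``datastore_records``
unchanged depends on `SerializedDatastoreRecordData.direct`, so treat this
as illustrative only)::

    import json

    blob = provenance.json()
    restored = QuantumProvenanceData.direct(**json.loads(blob))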

417 """ 

418 

419 # This class probably should have information about its execution 

420 # environment (anything not controlled and recorded at the 

421 `~CollectionType.RUN` level, such as the compute node ID), but adding it 

422 # now is out of scope for this prototype. 

423 

424 predicted_inputs: Set[uuid.UUID] 

425 """Unique IDs of datasets that were predicted as inputs to this quantum 

426 when the QuantumGraph was built. 

427 """ 

428 

429 available_inputs: Set[uuid.UUID] 

430 """Unique IDs of input datasets that were actually present in the datastore 

431 when this quantum was executed. 

432 

433 This is a subset of `predicted_inputs`, with the difference generally being 

434 datasets that were `predicted_outputs` but not `actual_outputs` of some upstream 

435 task. 

436 """ 

437 

438 actual_inputs: Set[uuid.UUID] 

439 """Unique IDs of datasets that were actually used as inputs by this task. 

440 

441 This is a subset of `available_inputs`. 

442 

443 Notes 

444 ----- 

445 The criterion for marking an input as used is that rerunning the quantum 

446 with only these `actual_inputs` available must yield identical outputs. 

447 This means that (for example) even just using an input to help determine 

448 an output rejection criterion and then rejecting it as an outlier qualifies 

449 that input as actually used. 

450 """ 

451 

452 predicted_outputs: Set[uuid.UUID] 

453 """Unique IDs of datasets that were predicted as outputs of this quantum 

454 when the QuantumGraph was built. 

455 """ 

456 

457 actual_outputs: Set[uuid.UUID] 

458 """Unique IDs of datasets that were actually written when this quantum 

459 was executed. 

460 """ 

461 

462 datastore_records: Dict[str, SerializedDatastoreRecordData] 

463 """Datastore records indexed by datastore name.""" 

464 

465 @staticmethod 

466 def collect_and_transfer( 

467 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData] 

468 ) -> None: 

469 """Transfer output datasets from multiple quanta to a more permantent 

470 `Butler` repository. 

471 

472 Parameters 

473 ---------- 

474 butler : `Butler` 

475 Full butler representing the data repository to transfer datasets 

476 to. 

477 quanta : `Iterable` [ `Quantum` ] 

478 Iterable of `Quantum` objects that carry information about 

479 predicted outputs. May be a single-pass iterator. 

480 provenance : `Iterable` [ `QuantumProvenanceData` ] 

481 Provenance and datastore data for each of the given quanta, in the 

482 same order. May be a single-pass iterator. 

483 

484 Notes 

485 ----- 

486 Input-output provenance data is not actually transferred yet, because 

487 `Registry` has no place to store it. 

488 

489 This method probably works most efficiently if run on all quanta for a 

490 single task label at once, because this will gather all datasets of 

491 a particular type together into a single vectorized `Registry` import. 

492 It should still behave correctly if run on smaller groups of quanta 

493 or even quanta from multiple tasks. 

494 

495 Currently this method transfers datastore record data unchanged, with 

496 no possibility of actually moving (e.g.) files. Datastores that are 

497 present only in execution or only in the more permanent butler are 

498 ignored. 
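
Examples
--------
A minimal sketch, assuming ``butler`` is a full `Butler`, ``quanta`` is a
sequence of executed `Quantum` objects, and ``provenances`` holds the
matching `QuantumProvenanceData` instances in the same order (e.g. read
back from per-quantum JSON files)::

    QuantumProvenanceData.collect_and_transfer(butler, quanta, provenances)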

499 """ 

500 grouped_refs = defaultdict(list) 

501 summary_records: Dict[str, DatastoreRecordData] = {} 

502 for quantum, provenance_for_quantum in zip(quanta, provenance): 

503 quantum_refs_by_id = { 

504 ref.getCheckedId(): ref 

505 for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

506 if ref.getCheckedId() in provenance_for_quantum.actual_outputs 

507 } 

508 for ref in quantum_refs_by_id.values(): 

509 grouped_refs[ref.datasetType, ref.run].append(ref) 

510 

511 # merge datastore records into a summary structure 

512 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items(): 

513 quantum_records = DatastoreRecordData.from_simple(serialized_records) 

514 if (records := summary_records.get(datastore_name)) is not None: 

515 records.update(quantum_records) 

516 else: 

517 summary_records[datastore_name] = quantum_records 

518 

519 for refs in grouped_refs.values(): 

520 butler.registry._importDatasets(refs) 

521 butler.datastore.import_records(summary_records) 

522 

523 @classmethod 

524 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData: 

525 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.") 

526 

527 @classmethod 

528 def direct( 

529 cls, 

530 *, 

531 predicted_inputs: Iterable[Union[str, uuid.UUID]], 

532 available_inputs: Iterable[Union[str, uuid.UUID]], 

533 actual_inputs: Iterable[Union[str, uuid.UUID]], 

534 predicted_outputs: Iterable[Union[str, uuid.UUID]], 

535 actual_outputs: Iterable[Union[str, uuid.UUID]], 

536 datastore_records: Mapping[str, Mapping], 

537 ) -> QuantumProvenanceData: 

538 """Construct an instance directly without validators. 

539 

540 This differs from the pydantic "construct" method in that the 

541 arguments are explicitly what the model requires, and it will recurse 

542 through members, constructing them from their corresponding `direct` 

543 methods. 

544 

545 This method should only be called when the inputs are trusted. 

546 """ 

547 

548 def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]: 

549 """Convert input UUIDs, which could be in string representation to 

550 a set of `UUID` instances. 

551 """ 

552 return set(uuid.UUID(id) if isinstance(id, str) else id for id in uuids) 

553 

554 data = QuantumProvenanceData.__new__(cls) 

555 setter = object.__setattr__ 

556 setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs)) 

557 setter(data, "available_inputs", _to_uuid_set(available_inputs)) 

558 setter(data, "actual_inputs", _to_uuid_set(actual_inputs)) 

559 setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs)) 

560 setter(data, "actual_outputs", _to_uuid_set(actual_outputs)) 

561 setter( 

562 data, 

563 "datastore_records", 

564 { 

565 key: SerializedDatastoreRecordData.direct(**records) 

566 for key, records in datastore_records.items() 

567 }, 

568 ) 

569 return data