Coverage for python/lsst/daf/butler/_quantum_backed.py: 25%

164 statements  

coverage.py v6.5.0, created at 2022-11-08 22:06 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("QuantumBackedButler", "QuantumProvenanceData") 

25 

26import itertools 

27import logging 

28import uuid 

29from collections import defaultdict 

30from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union 

31 

32from pydantic import BaseModel 

33 

34from ._butlerConfig import ButlerConfig 

35from ._deferredDatasetHandle import DeferredDatasetHandle 

36from ._limited_butler import LimitedButler 

37from .core import ( 

38 Config, 

39 DatasetId, 

40 DatasetRef, 

41 Datastore, 

42 DatastoreRecordData, 

43 DimensionUniverse, 

44 Quantum, 

45 SerializedDatastoreRecordData, 

46 StorageClassFactory, 

47 ddl, 

48) 

49from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager 

50from .registry.databases.sqlite import SqliteDatabase 

51from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager 

52from .registry.opaque import ByNameOpaqueTableStorageManager 

53 

54if TYPE_CHECKING: 54 ↛ 55 (line 54 didn't jump to line 55 because the condition on line 54 was never true)

55 from ._butler import Butler 

56 

57_LOG = logging.getLogger(__name__) 

58 

59 

60class _DatasetRecordStorageManagerDatastoreConstructionMimic: 

61 """A partial implementation of `DatasetRecordStorageManager` that exists 

62 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`) 

63 to be constructed without a full `Registry`. 

64 

65 Notes 

66 ----- 

67 The interface implemented by this class should probably be its own ABC, 

68 and that ABC should probably be used in the definition of 

69 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep 

70 changes minimal. 

71 """ 

72 

73 @classmethod 

74 def getIdColumnType(cls) -> type: 

75 # Docstring inherited. 

76 return ddl.GUID 

77 

78 @classmethod 

79 def addDatasetForeignKey( 

80 cls, 

81 tableSpec: ddl.TableSpec, 

82 *, 

83 name: str = "dataset", 

84 constraint: bool = True, 

85 onDelete: Optional[str] = None, 

86 **kwargs: Any, 

87 ) -> ddl.FieldSpec: 

88 # Docstring inherited. 

89 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs) 

90 tableSpec.fields.add(idFieldSpec) 

91 return idFieldSpec 

92 

93 
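# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the mimic above lets
# a `DatastoreRegistryBridgeManager` be built with nothing but a database,
# mirroring what `QuantumBackedButler.initialize` does further down.  The
# helper name below is hypothetical.


def _example_bridge_manager_without_registry(
    dimensions: DimensionUniverse,
) -> DatastoreRegistryBridgeManager:
    """Construct a bridge manager against an in-memory SQLite database
    (sketch only).
    """
    db = SqliteDatabase.fromUri("sqlite:///:memory:", origin=0)
    with db.declareStaticTables(create=True) as context:
        opaque_manager = ByNameOpaqueTableStorageManager.initialize(db, context)
        # The mimic stands in for a real DatasetRecordStorageManager here.
        return MonolithicDatastoreRegistryBridgeManager.initialize(
            db,
            context,
            opaque=opaque_manager,
            datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic,  # type: ignore
            universe=dimensions,
        )
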

94class QuantumBackedButler(LimitedButler): 

95 """An implementation of `LimitedButler` intended to back execution of a 

96 single `Quantum`. 

97 

98 Parameters 

99 ---------- 

100 quantum : `Quantum` 

101 Object describing the predicted input and output datasets relevant to 

102 this butler. This must have resolved `DatasetRef` instances for all 

103 inputs and outputs. 

104 dimensions : `DimensionUniverse` 

105 Object managing all dimension definitions. 

106 datastore : `Datastore` 

107 Datastore to use for all dataset I/O and existence checks. 

108 storageClasses : `StorageClassFactory` 

109 Object managing all storage class definitions. 

110 

111 Notes 

112 ----- 

113 Most callers should use the `initialize` `classmethod` to construct new 

114 instances instead of calling the constructor directly. 

115 

116 `QuantumBackedButler` uses a SQLite database internally, in order to reuse 

117 existing `DatastoreRegistryBridge` and `OpaqueTableStorage` 

118 implementations that rely on SQLAlchemy. If implementations are added in the 

119 future that don't rely on SQLAlchemy, it should be possible to swap them 

120 in by overriding the type arguments to `initialize` (though at present, 

121 `QuantumBackedButler` would still create at least an in-memory SQLite 

122 database that would then go unused). 

123 

124 We imagine `QuantumBackedButler` being used during (at least) batch 

125 execution to capture `Datastore` records and save them to per-quantum 

126 files, which are also a convenient place to store provenance for eventual 

127 upload to a SQL-backed `Registry` (once `Registry` has tables to store 

128 provenance, that is). 

129 These per-quantum files can be written in two ways: 

130 

131 - The SQLite file used internally by `QuantumBackedButler` can be used 

132 directly by customizing the ``filename`` argument to ``initialize``, and 

133 then transferring that file to the object store after execution completes 

134 (or fails; a ``try/finally`` pattern probably makes sense here). 

135 

136 - A JSON or YAML file can be written by calling `extract_provenance_data`, 

137 and using ``pydantic`` methods to write the returned 

138 `QuantumProvenanceData` to a file (see the usage sketch after this class). 

139 

140 Note that at present, the SQLite file only contains datastore records, not 

141 provenance, but that should be easy to address (if desired) after we 

142 actually design a `Registry` schema for provenance. I also suspect that 

143 we'll want to explicitly close the SQLite file somehow before trying to 

144 transfer it. But I'm guessing we'd prefer to write the per-quantum files 

145 as JSON anyway. 

146 """ 

147 

148 def __init__( 

149 self, 

150 quantum: Quantum, 

151 dimensions: DimensionUniverse, 

152 datastore: Datastore, 

153 storageClasses: StorageClassFactory, 

154 ): 

155 self._quantum = quantum 

156 self._dimensions = dimensions 

157 self._predicted_inputs: Set[DatasetId] = { 

158 ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.inputs.values()) 

159 } 

160 self._predicted_outputs: Set[DatasetId] = { 

161 ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

162 } 

163 self._available_inputs: Set[DatasetId] = set() 

164 self._unavailable_inputs: Set[DatasetId] = set() 

165 self._actual_inputs: Set[DatasetId] = set() 

166 self._actual_output_refs: Set[DatasetRef] = set() 

167 self.datastore = datastore 

168 self.storageClasses = storageClasses 

169 

170 @classmethod 

171 def initialize( 

172 cls, 

173 config: Union[Config, str], 

174 quantum: Quantum, 

175 dimensions: DimensionUniverse, 

176 filename: str = ":memory:", 

177 OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager, 

178 BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager, 

179 search_paths: Optional[List[str]] = None, 

180 ) -> QuantumBackedButler: 

181 """Construct a new `QuantumBackedButler` from repository configuration 

182 and helper types. 

183 

184 Parameters 

185 ---------- 

186 config : `Config` or `str` 

187 A butler repository root, configuration filename, or configuration 

188 instance. 

189 quantum : `Quantum` 

190 Object describing the predicted input and output datasets relevant 

191 to this butler. This must have resolved `DatasetRef` instances for 

192 all inputs and outputs. 

193 dimensions : `DimensionUniverse` 

194 Object managing all dimension definitions. 

195 filename : `str`, optional 

196 Name for the SQLite database that will back this butler; defaults 

197 to an in-memory database. 

198 OpaqueManagerClass : `type`, optional 

199 A subclass of `OpaqueTableStorageManager` to use for datastore 

200 opaque records. Default is a SQL-backed implementation. 

201 BridgeManagerClass : `type`, optional 

202 A subclass of `DatastoreRegistryBridgeManager` to use for datastore 

203 location records. Default is a SQL-backed implementation. 

204 search_paths : `list` of `str`, optional 

205 Additional search paths for butler configuration. 

206 """ 

207 butler_config = ButlerConfig(config, searchPaths=search_paths) 

208 if "root" in butler_config: 

209 butler_root = butler_config["root"] 

210 else: 

211 butler_root = butler_config.configDir 

212 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0) 

213 with db.declareStaticTables(create=True) as context: 

214 opaque_manager = OpaqueManagerClass.initialize(db, context) 

215 bridge_manager = BridgeManagerClass.initialize( 

216 db, 

217 context, 

218 opaque=opaque_manager, 

219 # MyPy can tell it's a fake, but we know it shouldn't care. 

220 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore 

221 universe=dimensions, 

222 ) 

223 # TODO: We need to inform `Datastore` here that it needs to support 

224 # predictive reads; right now that's a configuration option, but after 

225 # execution butler is retired it could just be a kwarg we pass here. 

226 # For now just force this option as we cannot work without it. 

227 butler_config["datastore", "trust_get_request"] = True 

228 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root) 

229 datastore.import_records(quantum.datastore_records) 

230 storageClasses = StorageClassFactory() 

231 storageClasses.addFromConfig(butler_config) 

232 return cls(quantum, dimensions, datastore, storageClasses=storageClasses) 

233 

234 def isWriteable(self) -> bool: 

235 # Docstring inherited. 

236 return True 

237 

238 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

239 # Docstring inherited. 

240 try: 

241 obj = super().getDirect(ref, parameters=parameters) 

242 except (LookupError, FileNotFoundError, IOError): 

243 self._unavailable_inputs.add(ref.getCheckedId()) 

244 raise 

245 if ref.id in self._predicted_inputs: 

246 # do this after delegating to super in case that raises. 

247 self._actual_inputs.add(ref.id) 

248 self._available_inputs.add(ref.id) 

249 return obj 

250 

251 def getDirectDeferred( 

252 self, ref: DatasetRef, *, parameters: Union[dict, None] = None 

253 ) -> DeferredDatasetHandle: 

254 # Docstring inherited. 

255 if ref.id in self._predicted_inputs: 

256 # Unfortunately, we can't do this after the handle succeeds in 

257 # loading, so it's conceivable here that we're marking an input 

258 # as "actual" even when it's not even available. 

259 self._actual_inputs.add(ref.id) 

260 return super().getDirectDeferred(ref, parameters=parameters) 

261 

262 def datasetExistsDirect(self, ref: DatasetRef) -> bool: 

263 # Docstring inherited. 

264 exists = super().datasetExistsDirect(ref) 

265 if ref.id in self._predicted_inputs: 

266 if exists: 

267 self._available_inputs.add(ref.id) 

268 else: 

269 self._unavailable_inputs.add(ref.id) 

270 return exists 

271 

272 def markInputUnused(self, ref: DatasetRef) -> None: 

273 # Docstring inherited. 

274 self._actual_inputs.discard(ref.getCheckedId()) 

275 

276 @property 

277 def dimensions(self) -> DimensionUniverse: 

278 # Docstring inherited. 

279 return self._dimensions 

280 

281 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef: 

282 # Docstring inherited. 

283 if ref.id not in self._predicted_outputs: 

284 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.") 

285 self.datastore.put(obj, ref) 

286 self._actual_output_refs.add(ref) 

287 return ref 

288 

289 def pruneDatasets( 

290 self, 

291 refs: Iterable[DatasetRef], 

292 *, 

293 disassociate: bool = True, 

294 unstore: bool = False, 

295 tags: Iterable[str] = (), 

296 purge: bool = False, 

297 ) -> None: 

298 # docstring inherited from LimitedButler 

299 

300 if purge: 

301 if not disassociate: 

302 raise TypeError("Cannot pass purge=True without disassociate=True.") 

303 if not unstore: 

304 raise TypeError("Cannot pass purge=True without unstore=True.") 

305 elif disassociate: 

306 # No tagged collections for this butler. 

307 raise TypeError("Cannot pass disassociate=True without purge=True.") 

308 

309 refs = list(refs) 

310 

311 # Pruning a component of a DatasetRef makes no sense. 

312 for ref in refs: 

313 if ref.datasetType.component(): 

314 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})") 

315 

316 if unstore: 

317 self.datastore.trash(refs) 

318 if purge: 

319 for ref in refs: 

320 # We only care about removing them from actual output refs. 

321 self._actual_output_refs.discard(ref) 

322 

323 if unstore: 

324 # Point of no return for removing artifacts 

325 self.datastore.emptyTrash() 

326 

327 def extract_provenance_data(self) -> QuantumProvenanceData: 

328 """Extract provenance information and datastore records from this 

329 butler. 

330 

331 Returns 

332 ------- 

333 provenance : `QuantumProvenanceData` 

334 A serializable struct containing input/output dataset IDs and 

335 datastore records. This assumes all dataset IDs are UUIDs (just to 

336 make it easier for `pydantic` to reason about the struct's types); 

337 the rest of this class makes no such assumption, but the approach 

338 to processing in which it's useful effectively requires UUIDs 

339 anyway. 

340 

341 Notes 

342 ----- 

343 `QuantumBackedButler` records this provenance information when its 

344 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask` 

345 authors from having to worry about it while still recording very 

346 detailed information. But it has two small weaknesses: 

347 

348 - Calling `getDirectDeferred` or `getDirect` is enough to mark a 

349 dataset as an "actual input", which may mark some datasets that 

350 aren't actually used. We rely on task authors to use 

351 `markInputUnused` to address this. 

352 

353 - We assume that the execution system will call ``datasetExistsDirect`` 

354 on all predicted inputs prior to execution, in order to populate the 

355 "available inputs" set. This is what I envision 

356 `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it 

357 to use this class, but it feels fragile for this class to make such 

358 a strong assumption about how it will be used, even if I can't think 

359 of any other executor behavior that would make sense. 

360 """ 

361 if not self._actual_inputs.isdisjoint(self._unavailable_inputs): 

362 _LOG.warning( 

363 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) " 

364 "was obtained, but did not actually exist. This task should be be using markInputUnused " 

365 "directly to clarify its provenance.", 

366 self._actual_inputs & self._unavailable_inputs, 

367 ) 

368 self._actual_inputs -= self._unavailable_inputs 

369 checked_inputs = self._available_inputs | self._unavailable_inputs 

370 if self._predicted_inputs != checked_inputs: 

371 _LOG.warning( 

372 "Execution harness did not check predicted inputs %s for existence; available inputs " 

373 "recorded in provenance may be incomplete.", 

374 self._predicted_inputs - checked_inputs, 

375 ) 

376 datastore_records = self.datastore.export_records(self._actual_output_refs) 

377 provenance_records = { 

378 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items() 

379 } 

380 

381 return QuantumProvenanceData( 

382 predicted_inputs=self._predicted_inputs, 

383 available_inputs=self._available_inputs, 

384 actual_inputs=self._actual_inputs, 

385 predicted_outputs=self._predicted_outputs, 

386 actual_outputs={ref.getCheckedId() for ref in self._actual_output_refs}, 

387 datastore_records=provenance_records, 

388 ) 

389 

390 
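# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): one way an executor
# might drive the class above through the pattern described in its docstring.
# The function name and its arguments are hypothetical placeholders; the task
# invocation itself is elided.


def _example_execute_quantum(
    config: str, quantum: Quantum, dimensions: DimensionUniverse
) -> QuantumProvenanceData:
    """Hypothetical single-quantum execution loop (sketch only)."""
    butler = QuantumBackedButler.initialize(
        config=config, quantum=quantum, dimensions=dimensions
    )
    # Check every predicted input up front so the "available inputs"
    # provenance set is complete; extract_provenance_data assumes the
    # execution harness does this.
    for ref in itertools.chain.from_iterable(quantum.inputs.values()):
        butler.datasetExistsDirect(ref)
    # ... run the task here, calling butler.getDirect()/getDirectDeferred()
    # for the inputs it needs, butler.putDirect() for each output it
    # produces, and butler.markInputUnused() for any input it ends up not
    # using ...
    # Capture provenance and datastore records; the caller can then persist
    # the result with, e.g., ``provenance.json()``.
    return butler.extract_provenance_data()
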

391class QuantumProvenanceData(BaseModel): 

392 """A serializable struct for per-quantum provenance information and 

393 datastore records. 

394 

395 Notes 

396 ----- 

397 This class slightly duplicates information from the `Quantum` class itself 

398 (the `predicted_inputs` and `predicted_outputs` sets should have the same 

399 IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it 

400 assumes the original `Quantum` is also available to reconstruct the 

401 complete provenance (e.g. by associating dataset IDs with data IDs, 

402 dataset types, and `~CollectionType.RUN` names). 

403 

404 Note that the ``pydantic`` method ``parse_raw()`` does not work correctly 

405 for this class; use the `direct` method instead (see the sketch at the end of this module). 

406 """ 

407 

408 # This class probably should have information about its execution 

409 # environment (anything not controlled and recorded at the 

410 `~CollectionType.RUN` level, such as the compute node ID), but adding it 

411 # now is out of scope for this prototype. 

412 

413 predicted_inputs: Set[uuid.UUID] 

414 """Unique IDs of datasets that were predicted as inputs to this quantum 

415 when the QuantumGraph was built. 

416 """ 

417 

418 available_inputs: Set[uuid.UUID] 

419 """Unique IDs of input datasets that were actually present in the datastore 

420 when this quantum was executed. 

421 

422 This is a subset of `predicted_inputs`, with the difference generally being 

423 datasets that were `predicted_outputs` but not `actual_outputs` of some upstream 

424 task. 

425 """ 

426 

427 actual_inputs: Set[uuid.UUID] 

428 """Unique IDs of datasets that were actually used as inputs by this task. 

429 

430 This is a subset of `available_inputs`. 

431 

432 Notes 

433 ----- 

434 The criterion for marking an input as used is that rerunning the quantum 

435 with only these `actual_inputs` available must yield identical outputs. 

436 This means that (for example) even just using an input to help determine 

437 an output rejection criterion and then rejecting it as an outlier qualifies 

438 that input as actually used. 

439 """ 

440 

441 predicted_outputs: Set[uuid.UUID] 

442 """Unique IDs of datasets that were predicted as outputs of this quantum 

443 when the QuantumGraph was built. 

444 """ 

445 

446 actual_outputs: Set[uuid.UUID] 

447 """Unique IDs of datasets that were actually written when this quantum 

448 was executed. 

449 """ 

450 

451 datastore_records: Dict[str, SerializedDatastoreRecordData] 

452 """Datastore records indexed by datastore name.""" 

453 

454 @staticmethod 

455 def collect_and_transfer( 

456 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData] 

457 ) -> None: 

458 """Transfer output datasets from multiple quanta to a more permantent 

459 `Butler` repository. 

460 

461 Parameters 

462 ---------- 

463 butler : `Butler` 

464 Full butler representing the data repository to transfer datasets 

465 to. 

466 quanta : `Iterable` [ `Quantum` ] 

467 Iterable of `Quantum` objects that carry information about 

468 predicted outputs. May be a single-pass iterator. 

469 provenance : `Iterable` [ `QuantumProvenanceData` ] 

470 Provenance and datastore data for each of the given quanta, in the 

471 same order. May be a single-pass iterator. 

472 

473 Notes 

474 ----- 

475 Input-output provenance data is not actually transferred yet, because 

476 `Registry` has no place to store it. 

477 

478 This method probably works most efficiently if run on all quanta for a 

479 single task label at once, because this will gather all datasets of 

480 a particular type together into a single vectorized `Registry` import. 

481 It should still behave correctly if run on smaller groups of quanta 

482 or even quanta from multiple tasks. 

483 

484 Currently this method transfers datastore record data unchanged, with 

485 no possibility of actually moving (e.g.) files. Datastores that are 

486 present only in execution or only in the more permanent butler are 

487 ignored. 

488 """ 

489 grouped_refs = defaultdict(list) 

490 summary_records: Dict[str, DatastoreRecordData] = {} 

491 for quantum, provenance_for_quantum in zip(quanta, provenance): 

492 quantum_refs_by_id = { 

493 ref.getCheckedId(): ref 

494 for ref in itertools.chain.from_iterable(quantum.outputs.values()) 

495 if ref.getCheckedId() in provenance_for_quantum.actual_outputs 

496 } 

497 for ref in quantum_refs_by_id.values(): 

498 grouped_refs[ref.datasetType, ref.run].append(ref) 

499 

500 # merge datastore records into a summary structure 

501 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items(): 

502 quantum_records = DatastoreRecordData.from_simple(serialized_records) 

503 if (records := summary_records.get(datastore_name)) is not None: 

504 records.update(quantum_records) 

505 else: 

506 summary_records[datastore_name] = quantum_records 

507 

508 for refs in grouped_refs.values(): 

509 butler.registry._importDatasets(refs) 

510 butler.datastore.import_records(summary_records) 

511 

512 @classmethod 

513 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData: 

514 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.") 

515 

516 @classmethod 

517 def direct( 

518 cls, 

519 *, 

520 predicted_inputs: Iterable[Union[str, uuid.UUID]], 

521 available_inputs: Iterable[Union[str, uuid.UUID]], 

522 actual_inputs: Iterable[Union[str, uuid.UUID]], 

523 predicted_outputs: Iterable[Union[str, uuid.UUID]], 

524 actual_outputs: Iterable[Union[str, uuid.UUID]], 

525 datastore_records: Mapping[str, Mapping], 

526 ) -> QuantumProvenanceData: 

527 """Construct an instance directly without validators. 

528 

529 This differs from the pydantic "construct" method in that the 

530 arguments are explicitly what the model requires, and it will recurse 

531 through members, constructing them from their corresponding `direct` 

532 methods. 

533 

534 This method should only be called when the inputs are trusted. 

535 """ 

536 

537 def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]: 

538 """Convert input UUIDs, which could be in string representation to 

539 a set of `UUID` instances. 

540 """ 

541 return {uuid.UUID(item) if isinstance(item, str) else item for item in uuids} 

542 

543 data = QuantumProvenanceData.__new__(cls) 

544 setter = object.__setattr__ 

545 setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs)) 

546 setter(data, "available_inputs", _to_uuid_set(available_inputs)) 

547 setter(data, "actual_inputs", _to_uuid_set(actual_inputs)) 

548 setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs)) 

549 setter(data, "actual_outputs", _to_uuid_set(actual_outputs)) 

550 setter( 

551 data, 

552 "datastore_records", 

553 { 

554 key: SerializedDatastoreRecordData.direct(**records) 

555 for key, records in datastore_records.items() 

556 }, 

557 ) 

558 return data
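
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): harvesting per-quantum
# provenance files back into a full `Butler`.  The pairing of `Quantum` objects
# with JSON filenames is a hypothetical input; because `parse_raw` is disabled
# above, the files are reconstructed through `direct`.


def _example_collect_provenance(
    butler: Butler, quanta_and_files: Iterable[tuple]
) -> None:
    """Hypothetical harvesting step after batch execution (sketch only)."""
    import json

    quanta: List[Quantum] = []
    provenance: List[QuantumProvenanceData] = []
    for quantum, filename in quanta_and_files:
        with open(filename) as stream:
            # Each file is assumed to have been written with
            # ``QuantumProvenanceData.json()``.
            provenance.append(QuantumProvenanceData.direct(**json.load(stream)))
        quanta.append(quantum)
    # Register actual outputs and import their datastore records into the
    # permanent repository, grouped per dataset type and run.
    QuantumProvenanceData.collect_and_transfer(butler, quanta, provenance)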