Coverage for python/lsst/daf/butler/_quantum_backed.py: 27%
164 statements
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-17 02:08 -0700
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-17 02:08 -0700
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("QuantumBackedButler", "QuantumProvenanceData")
26import itertools
27import logging
28import uuid
29from collections import defaultdict
30from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union
32from pydantic import BaseModel
34from ._butlerConfig import ButlerConfig
35from ._deferredDatasetHandle import DeferredDatasetHandle
36from ._limited_butler import LimitedButler
37from .core import (
38 Config,
39 DatasetId,
40 DatasetRef,
41 Datastore,
42 DatastoreRecordData,
43 DimensionUniverse,
44 Quantum,
45 SerializedDatastoreRecordData,
46 StorageClassFactory,
47 ddl,
48)
49from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
50from .registry.databases.sqlite import SqliteDatabase
51from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
52from .registry.opaque import ByNameOpaqueTableStorageManager
54if TYPE_CHECKING: 54 ↛ 55line 54 didn't jump to line 55, because the condition on line 54 was never true
55 from ._butler import Butler
57_LOG = logging.getLogger(__name__)
60class _DatasetRecordStorageManagerDatastoreConstructionMimic:
61 """A partial implementation of `DatasetRecordStorageManager` that exists
62 only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
63 to be constructed without a full `Registry`.
65 Notes
66 -----
67 The interface implemented by this class should probably be its own ABC,
68 and that ABC should probably be used in the definition of
69 `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
70 changes minimal.
71 """
73 @classmethod
74 def getIdColumnType(cls) -> type:
75 # Docstring inherited.
76 return ddl.GUID
78 @classmethod
79 def addDatasetForeignKey(
80 cls,
81 tableSpec: ddl.TableSpec,
82 *,
83 name: str = "dataset",
84 constraint: bool = True,
85 onDelete: Optional[str] = None,
86 **kwargs: Any,
87 ) -> ddl.FieldSpec:
88 # Docstring inherited.
89 idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
90 tableSpec.fields.add(idFieldSpec)
91 return idFieldSpec
94class QuantumBackedButler(LimitedButler):
95 """An implementation of `LimitedButler` intended to back execution of a
96 single `Quantum`.
98 Parameters
99 ----------
100 quantum : `Quantum`
101 Object describing the predicted input and output dataset relevant to
102 this butler. This must have resolved `DatasetRef` instances for all
103 inputs and outputs.
104 dimensions : `DimensionUniverse`
105 Object managing all dimension definitions.
106 datastore : `Datastore`
107 Datastore to use for all dataset I/O and existence checks.
108 storageClasses : `StorageClassFactory`
109 Object managing all storage class definitions.
111 Notes
112 -----
113 Most callers should use the `initialize` `classmethod` to construct new
114 instances instead of calling the constructor directly.
116 `QuantumBackedButler` uses a SQLite database internally, in order to reuse
117 existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
118 implementations that rely SQLAlchemy. If implementations are added in the
119 future that don't rely on SQLAlchemy, it should be possible to swap them
120 in by overriding the type arguments to `initialize` (though at present,
121 `QuantumBackedButler` would still create at least an in-memory SQLite
122 database that would then go unused).`
124 We imagine `QuantumBackedButler` being used during (at least) batch
125 execution to capture `Datastore` records and save them to per-quantum
126 files, which are also a convenient place to store provenance for eventual
127 upload to a SQL-backed `Registry` (once `Registry` has tables to store
128 provenance, that is).
129 These per-quantum files can be written in two ways:
131 - The SQLite file used internally by `QuantumBackedButler` can be used
132 directly but customizing the ``filename`` argument to ``initialize``, and
133 then transferring that file to the object store after execution completes
134 (or fails; a ``try/finally`` pattern probably makes sense here).
136 - A JSON or YAML file can be written by calling `extract_provenance_data`,
137 and using ``pydantic`` methods to write the returned
138 `QuantumProvenanceData` to a file.
140 Note that at present, the SQLite file only contains datastore records, not
141 provenance, but that should be easy to address (if desired) after we
142 actually design a `Registry` schema for provenance. I also suspect that
143 we'll want to explicitly close the SQLite file somehow before trying to
144 transfer it. But I'm guessing we'd prefer to write the per-quantum files
145 as JSON anyway.
146 """
148 def __init__(
149 self,
150 quantum: Quantum,
151 dimensions: DimensionUniverse,
152 datastore: Datastore,
153 storageClasses: StorageClassFactory,
154 ):
155 self._quantum = quantum
156 self._dimensions = dimensions
157 self._predicted_inputs: Set[DatasetId] = {
158 ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.inputs.values())
159 }
160 self._predicted_outputs: Set[DatasetId] = {
161 ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.outputs.values())
162 }
163 self._available_inputs: Set[DatasetId] = set()
164 self._unavailable_inputs: Set[DatasetId] = set()
165 self._actual_inputs: Set[DatasetId] = set()
166 self._actual_output_refs: Set[DatasetRef] = set()
167 self.datastore = datastore
168 self.storageClasses = storageClasses
170 @classmethod
171 def initialize(
172 cls,
173 config: Union[Config, str],
174 quantum: Quantum,
175 dimensions: DimensionUniverse,
176 filename: str = ":memory:",
177 OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
178 BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
179 search_paths: Optional[List[str]] = None,
180 ) -> QuantumBackedButler:
181 """Construct a new `QuantumBackedButler` from repository configuration
182 and helper types.
184 Parameters
185 ----------
186 config : `Config` or `str`
187 A butler repository root, configuration filename, or configuration
188 instance.
189 quantum : `Quantum`
190 Object describing the predicted input and output dataset relevant
191 to this butler. This must have resolved `DatasetRef` instances for
192 all inputs and outputs.
193 dimensions : `DimensionUniverse`
194 Object managing all dimension definitions.
195 filename : `str`, optional
196 Name for the SQLite database that will back this butler; defaults
197 to an in-memory database.
198 OpaqueManagerClass : `type`, optional
199 A subclass of `OpaqueTableStorageManager` to use for datastore
200 opaque records. Default is a SQL-backed implementation.
201 BridgeManagerClass : `type`, optional
202 A subclass of `DatastoreRegistryBridgeManager` to use for datastore
203 location records. Default is a SQL-backed implementation.
204 search_paths : `list` of `str`, optional
205 Additional search paths for butler configuration.
206 """
207 butler_config = ButlerConfig(config, searchPaths=search_paths)
208 if "root" in butler_config:
209 butler_root = butler_config["root"]
210 else:
211 butler_root = butler_config.configDir
212 db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
213 with db.declareStaticTables(create=True) as context:
214 opaque_manager = OpaqueManagerClass.initialize(db, context)
215 bridge_manager = BridgeManagerClass.initialize(
216 db,
217 context,
218 opaque=opaque_manager,
219 # MyPy can tell it's a fake, but we know it shouldn't care.
220 datasets=_DatasetRecordStorageManagerDatastoreConstructionMimic, # type: ignore
221 universe=dimensions,
222 )
223 # TODO: We need to inform `Datastore` here that it needs to support
224 # predictive reads; right now that's a configuration option, but after
225 # execution butler is retired it could just be a kwarg we pass here.
226 # For now just force this option as we cannot work without it.
227 butler_config["datastore", "trust_get_request"] = True
228 datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
229 datastore.import_records(quantum.datastore_records)
230 storageClasses = StorageClassFactory()
231 storageClasses.addFromConfig(butler_config)
232 return cls(quantum, dimensions, datastore, storageClasses=storageClasses)
234 def isWriteable(self) -> bool:
235 # Docstring inherited.
236 return True
238 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
239 # Docstring inherited.
240 try:
241 obj = super().getDirect(ref, parameters=parameters)
242 except (LookupError, FileNotFoundError, IOError):
243 self._unavailable_inputs.add(ref.getCheckedId())
244 raise
245 if ref.id in self._predicted_inputs:
246 # do this after delegating to super in case that raises.
247 self._actual_inputs.add(ref.id)
248 self._available_inputs.add(ref.id)
249 return obj
251 def getDirectDeferred(
252 self, ref: DatasetRef, *, parameters: Union[dict, None] = None
253 ) -> DeferredDatasetHandle:
254 # Docstring inherited.
255 if ref.id in self._predicted_inputs:
256 # Unfortunately, we can't do this after the handle succeeds in
257 # loading, so it's conceivable here that we're marking an input
258 # as "actual" even when it's not even available.
259 self._actual_inputs.add(ref.id)
260 return super().getDirectDeferred(ref, parameters=parameters)
262 def datasetExistsDirect(self, ref: DatasetRef) -> bool:
263 # Docstring inherited.
264 exists = super().datasetExistsDirect(ref)
265 if ref.id in self._predicted_inputs:
266 if exists:
267 self._available_inputs.add(ref.id)
268 else:
269 self._unavailable_inputs.add(ref.id)
270 return exists
272 def markInputUnused(self, ref: DatasetRef) -> None:
273 # Docstring inherited.
274 self._actual_inputs.discard(ref.getCheckedId())
276 @property
277 def dimensions(self) -> DimensionUniverse:
278 # Docstring inherited.
279 return self._dimensions
281 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
282 # Docstring inherited.
283 if ref.id not in self._predicted_outputs:
284 raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
285 self.datastore.put(obj, ref)
286 self._actual_output_refs.add(ref)
287 return ref
289 def pruneDatasets(
290 self,
291 refs: Iterable[DatasetRef],
292 *,
293 disassociate: bool = True,
294 unstore: bool = False,
295 tags: Iterable[str] = (),
296 purge: bool = False,
297 ) -> None:
298 # docstring inherited from LimitedButler
300 if purge:
301 if not disassociate:
302 raise TypeError("Cannot pass purge=True without disassociate=True.")
303 if not unstore:
304 raise TypeError("Cannot pass purge=True without unstore=True.")
305 elif disassociate:
306 # No tagged collections for this butler.
307 raise TypeError("Cannot pass disassociate=True without purge=True.")
309 refs = list(refs)
311 # Pruning a component of a DatasetRef makes no sense.
312 for ref in refs:
313 if ref.datasetType.component():
314 raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
316 if unstore:
317 self.datastore.trash(refs)
318 if purge:
319 for ref in refs:
320 # We only care about removing them from actual output refs,
321 self._actual_output_refs.discard(ref)
323 if unstore:
324 # Point of no return for removing artifacts
325 self.datastore.emptyTrash()
327 def extract_provenance_data(self) -> QuantumProvenanceData:
328 """Extract provenance information and datastore records from this
329 butler.
331 Returns
332 -------
333 provenance : `QuantumProvenanceData`
334 A serializable struct containing input/output dataset IDs and
335 datastore records. This assumes all dataset IDs are UUIDs (just to
336 make it easier for `pydantic` to reason about the struct's types);
337 the rest of this class makes no such assumption, but the approach
338 to processing in which it's useful effectively requires UUIDs
339 anyway.
341 Notes
342 -----
343 `QuantumBackedButler` records this provenance information when its
344 methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
345 authors from having to worry about while still recording very
346 detailed information. But it has two small weaknesses:
348 - Calling `getDirectDeferred` or `getDirect` is enough to mark a
349 dataset as an "actual input", which may mark some datasets that
350 aren't actually used. We rely on task authors to use
351 `markInputUnused` to address this.
353 - We assume that the execution system will call ``datasetExistsDirect``
354 on all predicted inputs prior to execution, in order to populate the
355 "available inputs" set. This is what I envision
356 '`~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
357 to use this class, but it feels fragile for this class to make such
358 a strong assumption about how it will be used, even if I can't think
359 of any other executor behavior that would make sense.
360 """
361 if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
362 _LOG.warning(
363 "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle) "
364 "was obtained, but did not actually exist. This task should be be using markInputUnused "
365 "directly to clarify its provenance.",
366 self._actual_inputs & self._unavailable_inputs,
367 )
368 self._actual_inputs -= self._unavailable_inputs
369 checked_inputs = self._available_inputs | self._unavailable_inputs
370 if not self._predicted_inputs == checked_inputs:
371 _LOG.warning(
372 "Execution harness did not check predicted inputs %s for existence; available inputs "
373 "recorded in provenance may be incomplete.",
374 self._predicted_inputs - checked_inputs,
375 )
376 datastore_records = self.datastore.export_records(self._actual_output_refs)
377 provenance_records = {
378 datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
379 }
381 return QuantumProvenanceData(
382 predicted_inputs=self._predicted_inputs,
383 available_inputs=self._available_inputs,
384 actual_inputs=self._actual_inputs,
385 predicted_outputs=self._predicted_outputs,
386 actual_outputs={ref.getCheckedId() for ref in self._actual_output_refs},
387 datastore_records=provenance_records,
388 )
391class QuantumProvenanceData(BaseModel):
392 """A serializable struct for per-quantum provenance information and
393 datastore records.
395 Notes
396 -----
397 This class slightly duplicates information from the `Quantum` class itself
398 (the `predicted_inputs` and `predicted_outputs` sets should have the same
399 IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
400 assumes the original `Quantum` is also available to reconstruct the
401 complete provenance (e.g. by associating dataset IDs with data IDs,
402 dataset types, and `~CollectionType.RUN` names.
404 Note that ``pydantic`` method ``parse_raw()`` is not going to work
405 correctly for this class, use `direct` method instead.
406 """
408 # This class probably should have information about its execution
409 # environment (anything not controlled and recorded at the
410 # `~CollectionType.RUN` level, such as the compute node ID). but adding it
411 # now is out of scope for this prototype.
413 predicted_inputs: Set[uuid.UUID]
414 """Unique IDs of datasets that were predicted as inputs to this quantum
415 when the QuantumGraph was built.
416 """
418 available_inputs: Set[uuid.UUID]
419 """Unique IDs of input datasets that were actually present in the datastore
420 when this quantum was executed.
422 This is a subset of `predicted_inputs`, with the difference generally being
423 datasets were `predicted_outputs` but not `actual_outputs` of some upstream
424 task.
425 """
427 actual_inputs: Set[uuid.UUID]
428 """Unique IDs of datasets that were actually used as inputs by this task.
430 This is a subset of `available_inputs`.
432 Notes
433 -----
434 The criteria for marking an input as used is that rerunning the quantum
435 with only these `actual_inputs` available must yield identical outputs.
436 This means that (for example) even just using an input to help determine
437 an output rejection criteria and then rejecting it as an outlier qualifies
438 that input as actually used.
439 """
441 predicted_outputs: Set[uuid.UUID]
442 """Unique IDs of datasets that were predicted as outputs of this quantum
443 when the QuantumGraph was built.
444 """
446 actual_outputs: Set[uuid.UUID]
447 """Unique IDs of datasets that were actually written when this quantum
448 was executed.
449 """
451 datastore_records: Dict[str, SerializedDatastoreRecordData]
452 """Datastore records indexed by datastore name."""
454 @staticmethod
455 def collect_and_transfer(
456 butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
457 ) -> None:
458 """Transfer output datasets from multiple quanta to a more permantent
459 `Butler` repository.
461 Parameters
462 ----------
463 butler : `Butler`
464 Full butler representing the data repository to transfer datasets
465 to.
466 quanta : `Iterable` [ `Quantum` ]
467 Iterable of `Quantum` objects that carry information about
468 predicted outputs. May be a single-pass iterator.
469 provenance : `Iterable` [ `QuantumProvenanceData` ]
470 Provenance and datastore data for each of the given quanta, in the
471 same order. May be a single-pass iterator.
473 Notes
474 -----
475 Input-output provenance data is not actually transferred yet, because
476 `Registry` has no place to store it.
478 This method probably works most efficiently if run on all quanta for a
479 single task label at once, because this will gather all datasets of
480 a particular type together into a single vectorized `Registry` import.
481 It should still behave correctly if run on smaller groups of quanta
482 or even quanta from multiple tasks.
484 Currently this method transfers datastore record data unchanged, with
485 no possibility of actually moving (e.g.) files. Datastores that are
486 present only in execution or only in the more permanent butler are
487 ignored.
488 """
489 grouped_refs = defaultdict(list)
490 summary_records: Dict[str, DatastoreRecordData] = {}
491 for quantum, provenance_for_quantum in zip(quanta, provenance):
492 quantum_refs_by_id = {
493 ref.getCheckedId(): ref
494 for ref in itertools.chain.from_iterable(quantum.outputs.values())
495 if ref.getCheckedId() in provenance_for_quantum.actual_outputs
496 }
497 for ref in quantum_refs_by_id.values():
498 grouped_refs[ref.datasetType, ref.run].append(ref)
500 # merge datastore records into a summary structure
501 for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
502 quantum_records = DatastoreRecordData.from_simple(serialized_records)
503 if (records := summary_records.get(datastore_name)) is not None:
504 records.update(quantum_records)
505 else:
506 summary_records[datastore_name] = quantum_records
508 for refs in grouped_refs.values():
509 butler.registry._importDatasets(refs)
510 butler.datastore.import_records(summary_records)
512 @classmethod
513 def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
514 raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")
516 @classmethod
517 def direct(
518 cls,
519 *,
520 predicted_inputs: Iterable[Union[str, uuid.UUID]],
521 available_inputs: Iterable[Union[str, uuid.UUID]],
522 actual_inputs: Iterable[Union[str, uuid.UUID]],
523 predicted_outputs: Iterable[Union[str, uuid.UUID]],
524 actual_outputs: Iterable[Union[str, uuid.UUID]],
525 datastore_records: Mapping[str, Mapping],
526 ) -> QuantumProvenanceData:
527 """Construct an instance directly without validators.
529 This differs from the pydantic "construct" method in that the
530 arguments are explicitly what the model requires, and it will recurse
531 through members, constructing them from their corresponding `direct`
532 methods.
534 This method should only be called when the inputs are trusted.
535 """
537 def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]:
538 """Convert input UUIDs, which could be in string representation to
539 a set of `UUID` instances.
540 """
541 return set(uuid.UUID(id) if isinstance(id, str) else id for id in uuids)
543 data = QuantumProvenanceData.__new__(cls)
544 setter = object.__setattr__
545 setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
546 setter(data, "available_inputs", _to_uuid_set(available_inputs))
547 setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
548 setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
549 setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
550 setter(
551 data,
552 "datastore_records",
553 {
554 key: SerializedDatastoreRecordData.direct(**records)
555 for key, records in datastore_records.items()
556 },
557 )
558 return data