Coverage for python/lsst/daf/butler/_quantum_backed.py: 32% (145 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import itertools
import logging
import uuid
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Type, Union

from pydantic import BaseModel

from ._butlerConfig import ButlerConfig
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    SerializedDatastoreRecordData,
    StorageClassFactory,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler

_LOG = logging.getLogger(__name__)


class _DatasetRecordStorageManagerDatastoreContructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec


class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    quantum : `Quantum`
        Object describing the predicted input and output datasets relevant to
        this butler. This must have resolved `DatasetRef` instances for all
        inputs and outputs.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is). These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
    """
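
    # Illustrative sketch only (not exercised by this module): the first
    # persistence option described above, with the backing SQLite file
    # written to a caller-chosen path and transferred afterwards. The names
    # ``quantum``, ``universe``, and both paths are hypothetical placeholders.
    #
    #     butler = QuantumBackedButler.initialize(
    #         config="/path/to/repo/butler.yaml",
    #         quantum=quantum,
    #         dimensions=universe,
    #         filename="/tmp/quantum.sqlite3",
    #     )
    #     try:
    #         ...  # run the task against ``butler``
    #     finally:
    #         ...  # transfer /tmp/quantum.sqlite3 to the object store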

    def __init__(
        self,
        quantum: Quantum,
        dimensions: DimensionUniverse,
        datastore: Datastore,
        storageClasses: StorageClassFactory,
    ):
        self._quantum = quantum
        self._dimensions = dimensions
        self._predicted_inputs: Set[DatasetId] = {
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.inputs.values())
        }
        self._predicted_outputs: Set[DatasetId] = {
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.outputs.values())
        }
        self._available_inputs: Set[DatasetId] = set()
        self._unavailable_inputs: Set[DatasetId] = set()
        self._actual_inputs: Set[DatasetId] = set()
        self._actual_output_refs: Set[DatasetRef] = set()
        self.datastore = datastore
        self.storageClasses = storageClasses

    @classmethod
    def initialize(
        cls,
        config: Union[Config, str],
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for
            datastore location records. Default is a SQL-backed
            implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        if "root" in butler_config:
            butler_root = butler_config["root"]
        else:
            butler_root = butler_config.configDir
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreContructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        datastore.import_records(quantum.datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(quantum, dimensions, datastore, storageClasses=storageClasses)
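
    # A minimal usage sketch only (``repo_config``, ``quantum``, and
    # ``universe`` are hypothetical placeholders); with the defaults above,
    # the backing SQLite database exists only in memory:
    #
    #     butler = QuantumBackedButler.initialize(
    #         config=repo_config, quantum=quantum, dimensions=universe
    #     )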

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
        # Docstring inherited.
        try:
            obj = super().getDirect(ref, parameters=parameters)
        except (LookupError, FileNotFoundError, IOError):
            self._unavailable_inputs.add(ref.getCheckedId())
            raise
        if ref.id in self._predicted_inputs:
            # do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    def getDirectDeferred(
        self, ref: DatasetRef, *, parameters: Union[dict, None] = None
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDirectDeferred(ref, parameters=parameters)

    def datasetExistsDirect(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        exists = super().datasetExistsDirect(ref)
        if ref.id in self._predicted_inputs:
            if exists:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return exists

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.getCheckedId())

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self.datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information as its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
        """
        if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
            _LOG.warning(
                "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle "
                "was obtained), but did not actually exist. This task should be using markInputUnused "
                "directly to clarify its provenance.",
                self._actual_inputs & self._unavailable_inputs,
            )
            self._actual_inputs -= self._unavailable_inputs
        checked_inputs = self._available_inputs | self._unavailable_inputs
        if self._predicted_inputs != checked_inputs:
            _LOG.warning(
                "Execution harness did not check predicted inputs %s for existence; available inputs "
                "recorded in provenance may be incomplete.",
                self._predicted_inputs - checked_inputs,
            )
        datastore_records = self.datastore.export_records(self._actual_output_refs)
        provenance_records = {
            datastore_name: records.to_simple() for datastore_name, records in datastore_records.items()
        }

        return QuantumProvenanceData(
            predicted_inputs=self._predicted_inputs,
            available_inputs=self._available_inputs,
            actual_inputs=self._actual_inputs,
            predicted_outputs=self._predicted_outputs,
            actual_outputs={ref.getCheckedId() for ref in self._actual_output_refs},
            datastore_records=provenance_records,
        )
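
    # Illustrative sketch only of the flow the Notes above assume, ending
    # with the pydantic JSON serialization mentioned in the class docstring.
    # Here ``butler`` is a `QuantumBackedButler`, and ``quantum`` and
    # ``provenance_path`` are hypothetical placeholders.
    #
    #     for ref in itertools.chain.from_iterable(quantum.inputs.values()):
    #         butler.datasetExistsDirect(ref)
    #     ...  # task execution: getDirect/putDirect/markInputUnused calls
    #     provenance = butler.extract_provenance_data()
    #     with open(provenance_path, "w") as stream:
    #         stream.write(provenance.json())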


class QuantumProvenanceData(BaseModel):
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the `predicted_inputs` and `predicted_outputs` sets should have the same
    IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).

    Note that the ``pydantic`` method ``parse_raw()`` does not work correctly
    for this class; use the `direct` method instead.
    """

    # This class probably should have information about its execution
    # environment (anything not controlled and recorded at the
    # `~CollectionType.RUN` level, such as the compute node ID), but adding it
    # now is out of scope for this prototype.

    predicted_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as inputs to this quantum
    when the QuantumGraph was built.
    """

    available_inputs: Set[uuid.UUID]
    """Unique IDs of input datasets that were actually present in the
    datastore when this quantum was executed.

    This is a subset of `predicted_inputs`, with the difference generally
    being datasets that were `predicted_outputs` but not `actual_outputs` of
    some upstream task.
    """

    actual_inputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually used as inputs by this task.

    This is a subset of `available_inputs`.

    Notes
    -----
    The criterion for marking an input as used is that rerunning the quantum
    with only these `actual_inputs` available must yield identical outputs.
    This means that (for example) even just using an input to help determine
    an output rejection criterion and then rejecting it as an outlier
    qualifies that input as actually used.
    """

    predicted_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were predicted as outputs of this quantum
    when the QuantumGraph was built.
    """

    actual_outputs: Set[uuid.UUID]
    """Unique IDs of datasets that were actually written when this quantum
    was executed.
    """

    datastore_records: Dict[str, SerializedDatastoreRecordData]
    """Datastore records indexed by datastore name."""

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
        """
        grouped_refs = defaultdict(list)
        summary_records: Dict[str, DatastoreRecordData] = {}
        for quantum, provenance_for_quantum in zip(quanta, provenance):
            quantum_refs_by_id = {
                ref.getCheckedId(): ref
                for ref in itertools.chain.from_iterable(quantum.outputs.values())
                if ref.getCheckedId() in provenance_for_quantum.actual_outputs
            }
            for ref in quantum_refs_by_id.values():
                grouped_refs[ref.datasetType, ref.run].append(ref)

            # merge datastore records into a summary structure
            for datastore_name, serialized_records in provenance_for_quantum.datastore_records.items():
                quantum_records = DatastoreRecordData.from_simple(serialized_records)
                if (records := summary_records.get(datastore_name)) is not None:
                    records.update(quantum_records)
                else:
                    summary_records[datastore_name] = quantum_records

        for refs in grouped_refs.values():
            butler.registry._importDatasets(refs)
        butler.datastore.import_records(summary_records)
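
    # Illustrative sketch only (``full_butler``, ``quanta``, and
    # ``provenance_structs`` are hypothetical placeholders; the latter holds
    # one `QuantumProvenanceData` per quantum, in the same order):
    #
    #     QuantumProvenanceData.collect_and_transfer(
    #         full_butler, quanta, provenance_structs
    #     )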

    @classmethod
    def parse_raw(cls, *args: Any, **kwargs: Any) -> QuantumProvenanceData:
        raise NotImplementedError("parse_raw() is not usable for this class, use direct() instead.")

    @classmethod
    def direct(
        cls,
        *,
        predicted_inputs: Iterable[Union[str, uuid.UUID]],
        available_inputs: Iterable[Union[str, uuid.UUID]],
        actual_inputs: Iterable[Union[str, uuid.UUID]],
        predicted_outputs: Iterable[Union[str, uuid.UUID]],
        actual_outputs: Iterable[Union[str, uuid.UUID]],
        datastore_records: Mapping[str, Mapping],
    ) -> QuantumProvenanceData:
        """Construct an instance directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """

        def _to_uuid_set(uuids: Iterable[Union[str, uuid.UUID]]) -> Set[uuid.UUID]:
            """Convert input UUIDs, which may be in string representation, to
            a set of `UUID` instances.
            """
            return set(uuid.UUID(id) if isinstance(id, str) else id for id in uuids)

        data = QuantumProvenanceData.__new__(cls)
        setter = object.__setattr__
        setter(data, "predicted_inputs", _to_uuid_set(predicted_inputs))
        setter(data, "available_inputs", _to_uuid_set(available_inputs))
        setter(data, "actual_inputs", _to_uuid_set(actual_inputs))
        setter(data, "predicted_outputs", _to_uuid_set(predicted_outputs))
        setter(data, "actual_outputs", _to_uuid_set(actual_outputs))
        setter(
            data,
            "datastore_records",
            {
                key: SerializedDatastoreRecordData.direct(**records)
                for key, records in datastore_records.items()
            },
        )
        return data
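
    # Illustrative round-trip sketch only (``path`` is a hypothetical
    # `~pathlib.Path`): because `parse_raw` is disabled above, a provenance
    # file written with pydantic's ``json()`` is re-read by parsing it
    # ourselves and handing the result to `direct`.
    #
    #     import json
    #
    #     provenance = QuantumProvenanceData.direct(**json.loads(path.read_text()))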