Coverage for python/lsst/daf/butler/_quantum_backed.py: 25%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QuantumBackedButler", "QuantumProvenanceData")

import dataclasses
import functools
import itertools
import logging
import uuid
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union

from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name
from ._butlerConfig import ButlerConfig
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from .core import (
    Config,
    DatasetId,
    DatasetRef,
    Datastore,
    DatastoreRecordData,
    DimensionUniverse,
    Quantum,
    StorageClassFactory,
    StoredDatastoreItemInfo,
    ddl,
)
from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager
from .registry.databases.sqlite import SqliteDatabase
from .registry.interfaces import DatastoreRegistryBridgeManager, OpaqueTableStorageManager
from .registry.opaque import ByNameOpaqueTableStorageManager

if TYPE_CHECKING:
    from ._butler import Butler
    from .registry import Registry

_LOG = logging.getLogger(__name__)

class _DatasetRecordStorageManagerDatastoreContructionMimic:
    """A partial implementation of `DatasetRecordStorageManager` that exists
    only to allow a `DatastoreRegistryBridgeManager` (and hence a `Datastore`)
    to be constructed without a full `Registry`.

    Notes
    -----
    The interface implemented by this class should probably be its own ABC,
    and that ABC should probably be used in the definition of
    `DatastoreRegistryBridgeManager`, but while prototyping I'm trying to keep
    changes minimal.
    """

    @classmethod
    def getIdColumnType(cls) -> type:
        # Docstring inherited.
        return ddl.GUID

    @classmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        # Docstring inherited.
        idFieldSpec = ddl.FieldSpec(f"{name}_id", dtype=ddl.GUID, **kwargs)
        tableSpec.fields.add(idFieldSpec)
        return idFieldSpec

class QuantumBackedButler(LimitedButler):
    """An implementation of `LimitedButler` intended to back execution of a
    single `Quantum`.

    Parameters
    ----------
    quantum : `Quantum`
        Object describing the predicted input and output datasets relevant to
        this butler. This must have resolved `DatasetRef` instances for all
        inputs and outputs.
    dimensions : `DimensionUniverse`
        Object managing all dimension definitions.
    datastore : `Datastore`
        Datastore to use for all dataset I/O and existence checks.
    storageClasses : `StorageClassFactory`
        Object managing all storage class definitions.

    Notes
    -----
    Most callers should use the `initialize` `classmethod` to construct new
    instances instead of calling the constructor directly.

    `QuantumBackedButler` uses a SQLite database internally, in order to reuse
    existing `DatastoreRegistryBridge` and `OpaqueTableStorage`
    implementations that rely on SQLAlchemy. If implementations are added in
    the future that don't rely on SQLAlchemy, it should be possible to swap
    them in by overriding the type arguments to `initialize` (though at
    present, `QuantumBackedButler` would still create at least an in-memory
    SQLite database that would then go unused).

    We imagine `QuantumBackedButler` being used during (at least) batch
    execution to capture `Datastore` records and save them to per-quantum
    files, which are also a convenient place to store provenance for eventual
    upload to a SQL-backed `Registry` (once `Registry` has tables to store
    provenance, that is).

    These per-quantum files can be written in two ways:

    - The SQLite file used internally by `QuantumBackedButler` can be used
      directly by customizing the ``filename`` argument to ``initialize``, and
      then transferring that file to the object store after execution
      completes (or fails; a ``try/finally`` pattern probably makes sense
      here).

    - A JSON or YAML file can be written by calling `extract_provenance_data`,
      and using ``pydantic`` methods to write the returned
      `QuantumProvenanceData` to a file.

    Note that at present, the SQLite file only contains datastore records, not
    provenance, but that should be easy to address (if desired) after we
    actually design a `Registry` schema for provenance. I also suspect that
    we'll want to explicitly close the SQLite file somehow before trying to
    transfer it. But I'm guessing we'd prefer to write the per-quantum files
    as JSON anyway.
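
    Examples
    --------
    A minimal sketch of the JSON-based approach described above; the
    configuration path, the ``quantum`` object, and the ``universe`` it was
    built with are illustrative stand-ins, not part of this module::

        import json

        butler = QuantumBackedButler.initialize(
            config="/path/to/butler.yaml",
            quantum=quantum,
            dimensions=universe,
        )
        # ... run the task, reading inputs with getDirect/getDirectDeferred
        # and writing outputs with putDirect ...
        provenance = butler.extract_provenance_data()
        with open("provenance.json", "w") as stream:
            json.dump(provenance.to_simple(), stream)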
    """

    def __init__(
        self,
        quantum: Quantum,
        dimensions: DimensionUniverse,
        datastore: Datastore,
        storageClasses: StorageClassFactory,
    ):
        self._quantum = quantum
        self._dimensions = dimensions
        self._predicted_inputs: Set[DatasetId] = {
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.inputs.values())
        }
        self._predicted_outputs: Set[DatasetId] = {
            ref.getCheckedId() for ref in itertools.chain.from_iterable(quantum.outputs.values())
        }
        self._available_inputs: Set[DatasetId] = set()
        self._unavailable_inputs: Set[DatasetId] = set()
        self._actual_inputs: Set[DatasetId] = set()
        self._actual_output_refs: Set[DatasetRef] = set()
        self.datastore = datastore
        self.storageClasses = storageClasses

    @classmethod
    def initialize(
        cls,
        config: Union[Config, str],
        quantum: Quantum,
        dimensions: DimensionUniverse,
        filename: str = ":memory:",
        OpaqueManagerClass: Type[OpaqueTableStorageManager] = ByNameOpaqueTableStorageManager,
        BridgeManagerClass: Type[DatastoreRegistryBridgeManager] = MonolithicDatastoreRegistryBridgeManager,
        search_paths: Optional[List[str]] = None,
    ) -> QuantumBackedButler:
        """Construct a new `QuantumBackedButler` from repository configuration
        and helper types.

        Parameters
        ----------
        config : `Config` or `str`
            A butler repository root, configuration filename, or configuration
            instance.
        quantum : `Quantum`
            Object describing the predicted input and output datasets relevant
            to this butler. This must have resolved `DatasetRef` instances for
            all inputs and outputs.
        dimensions : `DimensionUniverse`
            Object managing all dimension definitions.
        filename : `str`, optional
            Name for the SQLite database that will back this butler; defaults
            to an in-memory database.
        OpaqueManagerClass : `type`, optional
            A subclass of `OpaqueTableStorageManager` to use for datastore
            opaque records. Default is a SQL-backed implementation.
        BridgeManagerClass : `type`, optional
            A subclass of `DatastoreRegistryBridgeManager` to use for datastore
            location records. Default is a SQL-backed implementation.
        search_paths : `list` of `str`, optional
            Additional search paths for butler configuration.
        """
        butler_config = ButlerConfig(config, searchPaths=search_paths)
        if "root" in butler_config:
            butler_root = butler_config["root"]
        else:
            butler_root = butler_config.configDir
        db = SqliteDatabase.fromUri(f"sqlite:///{filename}", origin=0)
        with db.declareStaticTables(create=True) as context:
            opaque_manager = OpaqueManagerClass.initialize(db, context)
            bridge_manager = BridgeManagerClass.initialize(
                db,
                context,
                opaque=opaque_manager,
                # MyPy can tell it's a fake, but we know it shouldn't care.
                datasets=_DatasetRecordStorageManagerDatastoreContructionMimic,  # type: ignore
                universe=dimensions,
            )
        # TODO: We need to inform `Datastore` here that it needs to support
        # predictive reads; right now that's a configuration option, but after
        # execution butler is retired it could just be a kwarg we pass here.
        # For now just force this option as we cannot work without it.
        butler_config["datastore", "trust_get_request"] = True
        datastore = Datastore.fromConfig(butler_config, bridge_manager, butler_root)
        datastore.import_records(quantum.datastore_records)
        storageClasses = StorageClassFactory()
        storageClasses.addFromConfig(butler_config)
        return cls(quantum, dimensions, datastore, storageClasses=storageClasses)

    def isWriteable(self) -> bool:
        # Docstring inherited.
        return True

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
        # Docstring inherited.
        try:
            obj = super().getDirect(ref, parameters=parameters)
        except (LookupError, FileNotFoundError, IOError):
            self._unavailable_inputs.add(ref.getCheckedId())
            raise
        if ref.id in self._predicted_inputs:
            # do this after delegating to super in case that raises.
            self._actual_inputs.add(ref.id)
            self._available_inputs.add(ref.id)
        return obj

    def getDirectDeferred(
        self, ref: DatasetRef, *, parameters: Union[dict, None] = None
    ) -> DeferredDatasetHandle:
        # Docstring inherited.
        if ref.id in self._predicted_inputs:
            # Unfortunately, we can't do this after the handle succeeds in
            # loading, so it's conceivable here that we're marking an input
            # as "actual" even when it's not even available.
            self._actual_inputs.add(ref.id)
        return super().getDirectDeferred(ref, parameters=parameters)

    def datasetExistsDirect(self, ref: DatasetRef) -> bool:
        # Docstring inherited.
        exists = super().datasetExistsDirect(ref)
        if ref.id in self._predicted_inputs:
            if exists:
                self._available_inputs.add(ref.id)
            else:
                self._unavailable_inputs.add(ref.id)
        return exists

    def markInputUnused(self, ref: DatasetRef) -> None:
        # Docstring inherited.
        self._actual_inputs.discard(ref.getCheckedId())

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self.datastore.put(obj, ref)
        self._actual_output_refs.add(ref)
        return ref

    def extract_provenance_data(self) -> QuantumProvenanceData:
        """Extract provenance information and datastore records from this
        butler.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            A serializable struct containing input/output dataset IDs and
            datastore records. This assumes all dataset IDs are UUIDs (just to
            make it easier for `pydantic` to reason about the struct's types);
            the rest of this class makes no such assumption, but the approach
            to processing in which it's useful effectively requires UUIDs
            anyway.

        Notes
        -----
        `QuantumBackedButler` records this provenance information when its
        methods are used, which mostly saves `~lsst.pipe.base.PipelineTask`
        authors from having to worry about it while still recording very
        detailed information. But it has two small weaknesses:

        - Calling `getDirectDeferred` or `getDirect` is enough to mark a
          dataset as an "actual input", which may mark some datasets that
          aren't actually used. We rely on task authors to use
          `markInputUnused` to address this.

        - We assume that the execution system will call ``datasetExistsDirect``
          on all predicted inputs prior to execution, in order to populate the
          "available inputs" set. This is what I envision
          `~lsst.ctrl.mpexec.SingleQuantumExecutor` doing after we update it
          to use this class, but it feels fragile for this class to make such
          a strong assumption about how it will be used, even if I can't think
          of any other executor behavior that would make sense.
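
        Examples
        --------
        A sketch of the execution-harness pattern this method assumes, where
        ``butler`` is a `QuantumBackedButler` and ``quantum`` and the task
        invocation are illustrative stand-ins::

            import itertools

            # Check every predicted input up front so that the "available
            # inputs" set is populated before execution.
            for ref in itertools.chain.from_iterable(quantum.inputs.values()):
                butler.datasetExistsDirect(ref)
            # ... run the task against this butler ...
            provenance = butler.extract_provenance_data()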
        """
        if not self._actual_inputs.isdisjoint(self._unavailable_inputs):
            _LOG.warning(
                "Inputs %s were marked as actually used (probably because a DeferredDatasetHandle "
                "was obtained), but did not actually exist. This task should be using markInputUnused "
                "directly to clarify its provenance.",
                self._actual_inputs & self._unavailable_inputs,
            )
            self._actual_inputs -= self._unavailable_inputs
        checked_inputs = self._available_inputs | self._unavailable_inputs
        if not self._predicted_inputs == checked_inputs:
            _LOG.warning(
                "Execution harness did not check predicted inputs %s for existence; available inputs "
                "recorded in provenance may be incomplete.",
                self._predicted_inputs - checked_inputs,
            )
        datastore_records = self.datastore.export_records(self._actual_output_refs)
        locations: Dict[str, Set[DatasetId]] = defaultdict(set)
        records: Dict[str, List[StoredDatastoreItemInfo]] = defaultdict(list)
        for datastore_name, record_data in datastore_records.items():
            locations[datastore_name].update(ref.getCheckedId() for ref in record_data.refs)
            for table_name, table_records in record_data.records.items():
                records[table_name].extend(table_records)

        return QuantumProvenanceData(
            predicted_inputs=self._predicted_inputs,
            available_inputs=self._available_inputs,
            actual_inputs=self._actual_inputs,
            predicted_outputs=self._predicted_outputs,
            actual_outputs={ref.getCheckedId() for ref in self._actual_output_refs},
            locations=dict(locations),
            records=dict(records),
        )


@dataclasses.dataclass(frozen=True)
class QuantumProvenanceData:
    """A serializable struct for per-quantum provenance information and
    datastore records.

    Notes
    -----
    This class slightly duplicates information from the `Quantum` class itself
    (the `predicted_inputs` and `predicted_outputs` sets should have the same
    IDs present in `Quantum.inputs` and `Quantum.outputs`), but overall it
    assumes the original `Quantum` is also available to reconstruct the
    complete provenance (e.g. by associating dataset IDs with data IDs,
    dataset types, and `~CollectionType.RUN` names).
    """

    # This class probably should have information about its execution
    # environment (anything not controlled and recorded at the
    # `~CollectionType.RUN` level, such as the compute node ID), but adding it
    # now is out of scope for this prototype.

    predicted_inputs: Set[DatasetId]
    """Unique IDs of datasets that were predicted as inputs to this quantum
    when the QuantumGraph was built.
    """

    available_inputs: Set[DatasetId]
    """Unique IDs of input datasets that were actually present in the
    datastore when this quantum was executed.

    This is a subset of `predicted_inputs`, with the difference generally
    being datasets that were `predicted_outputs` but not `actual_outputs` of
    some upstream task.
    """

    actual_inputs: Set[DatasetId]
    """Unique IDs of datasets that were actually used as inputs by this task.

    This is a subset of `available_inputs`.

    Notes
    -----
    The criterion for marking an input as used is that rerunning the quantum
    with only these `actual_inputs` available must yield identical outputs.
    This means that (for example) even just using an input to help determine
    an output rejection criterion and then rejecting it as an outlier
    qualifies that input as actually used.
    """

    predicted_outputs: Set[DatasetId]
    """Unique IDs of datasets that were predicted as outputs of this quantum
    when the QuantumGraph was built.
    """

    actual_outputs: Set[DatasetId]
    """Unique IDs of datasets that were actually written when this quantum
    was executed.
    """

    locations: Dict[str, Set[DatasetId]]
    """Mapping from datastore name to the set of `actual_outputs` dataset IDs
    written to that datastore by this quantum.
    """

    records: Dict[str, List[StoredDatastoreItemInfo]]
    """Rows from the opaque tables used by datastores for the `actual_outputs`
    datasets written by this quantum, indexed by opaque table name.
    """

    def to_simple(self, minimal: bool = False) -> Dict[str, Any]:
        """Make a representation of the provenance suitable for serialization.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        minimal : `bool`, optional
            If `True`, produce a minimal representation; not used by this
            method.

        Returns
        -------
        simple : `dict`
            Representation of this instance as a simple dictionary.
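
        Examples
        --------
        A sketch of a JSON round trip, where ``provenance`` is an existing
        instance and the filename is illustrative::

            import json

            with open("provenance.json", "w") as stream:
                json.dump(provenance.to_simple(), stream)
            with open("provenance.json") as stream:
                restored = QuantumProvenanceData.from_simple(json.load(stream))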
        """
        # dataclasses.asdict does not know how to handle some types, have to
        # do it manually. Also have to replace sets with lists as some
        # serializers do not support set type.
        def _serialize_dataset_id(id: DatasetId) -> Union[int, str]:
            return id if isinstance(id, int) else f"urn:uuid:{id}"

        def _serialize_dataset_ids(ids: Set[DatasetId]) -> List[Union[int, str]]:
            return [_serialize_dataset_id(id) for id in ids]

        records: Dict[str, List[Dict[str, Any]]] = {}
        for table_name, table_records in self.records.items():
            records[table_name] = []
            for record in table_records:
                record_dict = record.to_record()
                # Have to remember actual class name of the record.
                record_dict["__class__"] = get_full_type_name(record)
                if "dataset_id" in record_dict:
                    record_dict["dataset_id"] = _serialize_dataset_id(record_dict["dataset_id"])
                records[table_name].append(record_dict)
        locations = {datastore: _serialize_dataset_ids(ids) for datastore, ids in self.locations.items()}
        return dict(
            predicted_inputs=_serialize_dataset_ids(self.predicted_inputs),
            available_inputs=_serialize_dataset_ids(self.available_inputs),
            actual_inputs=_serialize_dataset_ids(self.actual_inputs),
            predicted_outputs=_serialize_dataset_ids(self.predicted_outputs),
            actual_outputs=_serialize_dataset_ids(self.actual_outputs),
            locations=locations,
            records=records,
        )

    @classmethod
    def from_simple(
        cls,
        simple: Dict[str, Any],
        universe: Optional[DimensionUniverse] = None,
        registry: Optional[Registry] = None,
    ) -> QuantumProvenanceData:
        """Make an instance of this class from serialized data.

        Implements the `~lsst.daf.butler.core.json.SupportsSimple` protocol.

        Parameters
        ----------
        simple : `dict`
            Serialized representation returned from the `to_simple` method.
        universe : `DimensionUniverse`, optional
            Dimension universe; not used by this method.
        registry : `Registry`, optional
            Registry instance; not used by this method.

        Returns
        -------
        provenance : `QuantumProvenanceData`
            De-serialized instance of `QuantumProvenanceData`.
        """

        def _deserialize_dataset_id(id: Union[int, str]) -> DatasetId:
            return id if isinstance(id, int) else uuid.UUID(id)

        def _deserialize_dataset_ids(ids: List[Union[int, str]]) -> Set[DatasetId]:
            return set(_deserialize_dataset_id(id) for id in ids)

        @functools.lru_cache(maxsize=None)
        def _get_class(class_name: str) -> Type:
            """Get class type for a given class name."""
            return doImportType(class_name)

        # Unpack records; use a local name for the record class to avoid
        # shadowing this classmethod's ``cls`` argument.
        records: Dict[str, List[StoredDatastoreItemInfo]] = {}
        for table_name, table_records in simple["records"].items():
            records[table_name] = []
            for record in table_records:
                record_class = _get_class(record.pop("__class__"))
                if "dataset_id" in record:
                    record["dataset_id"] = _deserialize_dataset_id(record["dataset_id"])
                records[table_name].append(record_class.from_record(record))
        locations = {
            datastore: _deserialize_dataset_ids(ids) for datastore, ids in simple["locations"].items()
        }

        return QuantumProvenanceData(
            predicted_inputs=_deserialize_dataset_ids(simple["predicted_inputs"]),
            available_inputs=_deserialize_dataset_ids(simple["available_inputs"]),
            actual_inputs=_deserialize_dataset_ids(simple["actual_inputs"]),
            predicted_outputs=_deserialize_dataset_ids(simple["predicted_outputs"]),
            actual_outputs=_deserialize_dataset_ids(simple["actual_outputs"]),
            locations=locations,
            records=records,
        )

    @staticmethod
    def collect_and_transfer(
        butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
    ) -> None:
        """Transfer output datasets from multiple quanta to a more permanent
        `Butler` repository.

        Parameters
        ----------
        butler : `Butler`
            Full butler representing the data repository to transfer datasets
            to.
        quanta : `Iterable` [ `Quantum` ]
            Iterable of `Quantum` objects that carry information about
            predicted outputs. May be a single-pass iterator.
        provenance : `Iterable` [ `QuantumProvenanceData` ]
            Provenance and datastore data for each of the given quanta, in the
            same order. May be a single-pass iterator.

        Notes
        -----
        Input-output provenance data is not actually transferred yet, because
        `Registry` has no place to store it.

        This method probably works most efficiently if run on all quanta for a
        single task label at once, because this will gather all datasets of
        a particular type together into a single vectorized `Registry` import.
        It should still behave correctly if run on smaller groups of quanta
        or even quanta from multiple tasks.

        Currently this method transfers datastore record data unchanged, with
        no possibility of actually moving (e.g.) files. Datastores that are
        present only in execution or only in the more permanent butler are
        ignored.
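
        Examples
        --------
        A sketch of gathering per-quantum provenance files written during
        execution and importing their outputs into a full butler; ``butler``
        is a full `Butler`, while ``quanta`` and ``provenance_files`` are
        illustrative stand-ins::

            import json

            provenance = []
            for filename in provenance_files:
                with open(filename) as stream:
                    provenance.append(QuantumProvenanceData.from_simple(json.load(stream)))
            QuantumProvenanceData.collect_and_transfer(butler, quanta, provenance)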
        """
        grouped_refs = defaultdict(list)
        datastore_records: Dict[str, DatastoreRecordData] = defaultdict(DatastoreRecordData)
        for quantum, provenance_for_quantum in zip(quanta, provenance):
            quantum_refs_by_id = {
                ref.getCheckedId(): ref
                for ref in itertools.chain.from_iterable(quantum.outputs.values())
                if ref.getCheckedId() in provenance_for_quantum.actual_outputs
            }
            for ref in quantum_refs_by_id.values():
                grouped_refs[ref.datasetType, ref.run].append(ref)
            for datastore_name in set(butler.datastore.names) & provenance_for_quantum.locations.keys():
                datastore_records[datastore_name].refs.extend(
                    quantum_refs_by_id[id] for id in provenance_for_quantum.locations[datastore_name]
                )
                for opaque_table_name, records_for_table in provenance_for_quantum.records.items():
                    datastore_records[datastore_name].records[opaque_table_name].extend(records_for_table)
        for refs in grouped_refs.values():
            butler.registry._importDatasets(refs)
        butler.datastore.import_records(datastore_records)