Coverage for python/lsst/daf/butler/core/quantum.py: 18%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from __future__ import annotations
24__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")
26from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Type, Union
28from lsst.utils import doImportType
29from pydantic import BaseModel
31from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType
32from .datastore import DatastoreRecordData
33from .dimensions import (
34 DataCoordinate,
35 DimensionRecord,
36 DimensionUniverse,
37 SerializedDataCoordinate,
38 SerializedDimensionRecord,
39)
40from .named import NamedKeyDict, NamedKeyMapping
def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: Optional[DatasetType],
    ids: Iterable[int],
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]],
    reconstitutedDimensions: Dict[int, Tuple[str, DimensionRecord]],
    universe: DimensionUniverse,
) -> DatasetRef:
    """Rebuild a `DatasetRef` that was stored inside a `SerializedQuantum`.

    Parameters
    ----------
    simple : `SerializedDatasetRef`
        Serialized form of the dataset ref.
    type_ : `DatasetType` or `None`
        Dataset type forwarded to `DatasetRef.from_simple`.
    ids : iterable of `int`
        Keys of the dimension records that belong to this ref's data ID.
    dimensionRecords : `dict` or `None`
        Serialized dimension records indexed by key. Only consulted for keys
        that are missing from ``reconstitutedDimensions``.
    reconstitutedDimensions : `dict`
        Cache mapping a key to a ``(definition, record)`` tuple. Updated in
        place with every record deserialized by this call.
    universe : `DimensionUniverse`
        Universe handed to the ``from_simple`` constructors.

    Returns
    -------
    ref : `DatasetRef`
        The rebuilt ref; its data ID is expanded with the dimension records
        when ``ids`` yielded at least one.

    Raises
    ------
    ValueError
        Raised if a key is not cached and ``dimensionRecords`` is `None`.
    """
    # Gather the dimension records, keyed by their definition.
    recordsByDefinition = {}
    for recordId in ids:
        cached = reconstitutedDimensions.get(recordId)
        if cached is None:
            # Cache miss: deserialize the record and remember it so it is
            # not deserialized again for another ref.
            if dimensionRecords is None:
                raise ValueError(
                    "Cannot construct from a SerializedQuantum with no dimension records. "
                    "Reconstituted Dimensions must be supplied and populated in method call."
                )
            serializedRecord = dimensionRecords[recordId]
            record = DimensionRecord.from_simple(serializedRecord, universe=universe)
            cached = (serializedRecord.definition, record)
            reconstitutedDimensions[recordId] = cached
        definition, record = cached
        recordsByDefinition[definition] = record

    # Deserialize the ref itself; without records there is nothing to attach.
    ref = DatasetRef.from_simple(simple, universe, datasetType=type_)
    if not recordsByDefinition:
        return ref
    # NOTE(review): object.__setattr__ is presumably needed because DatasetRef
    # rejects ordinary attribute assignment -- confirm against DatasetRef.
    object.__setattr__(ref, "dataId", ref.dataId.expanded(recordsByDefinition))
    return ref
class SerializedQuantum(BaseModel):
    """Simplified model of a `Quantum` suitable for serialization."""

    # Name of the task associated with the quantum.
    taskName: str
    # Serialized data ID of the quantum, if it has one.
    dataId: Optional[SerializedDataCoordinate]
    # Serialized dataset types, indexed by dataset type name.
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    # Dataset type name -> (serialized ref, keys of its dimension records).
    initInputs: Mapping[str, Tuple[SerializedDatasetRef, List[int]]]
    # Dataset type name -> list of (serialized ref, dimension record keys).
    inputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    # Dataset type name -> list of (serialized ref, dimension record keys).
    outputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    # Serialized dimension records indexed by key; may be `None`.
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]] = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str,
        dataId: Optional[Dict],
        datasetTypeMapping: Mapping[str, Dict],
        initInputs: Mapping[str, Tuple[Dict, List[int]]],
        inputs: Mapping[str, List[Tuple[Dict, List[int]]]],
        outputs: Mapping[str, List[Tuple[Dict, List[int]]]],
        dimensionRecords: Optional[Dict[int, Dict]],
    ) -> SerializedQuantum:
        """Build a `SerializedQuantum` without running any validators.

        Unlike pydantic's ``construct``, the arguments are exactly what the
        model requires, and nested members are built recursively through
        their own ``direct`` constructors.

        This method should only be called when the inputs are trusted.

        Parameters
        ----------
        taskName : `str`
            Stored unchanged.
        dataId : `dict` or `None`
            Keyword arguments for `SerializedDataCoordinate.direct`.
        datasetTypeMapping : `~collections.abc.Mapping`
            Keyword arguments for `SerializedDatasetType.direct`, per name.
        initInputs : `~collections.abc.Mapping`
            Per name, a ``(ref keyword arguments, record keys)`` tuple.
        inputs, outputs : `~collections.abc.Mapping`
            Per name, a list of ``(ref keyword arguments, record keys)``.
        dimensionRecords : `dict` or `None`
            Keyword arguments for `SerializedDimensionRecord.direct`, per
            key; keys are converted with `int`.

        Returns
        -------
        node : `SerializedQuantum`
            The new instance with every field marked as set.
        """
        node = SerializedQuantum.__new__(cls)

        def convertRefLists(
            refLists: Mapping[str, List[Tuple[Dict, List[int]]]]
        ) -> Dict[str, List[Tuple[SerializedDatasetRef, List[int]]]]:
            # Shared by ``inputs`` and ``outputs``, which have the same shape.
            return {
                name: [(SerializedDatasetRef.direct(**rawRef), recordIds) for rawRef, recordIds in pairs]
                for name, pairs in refLists.items()
            }

        if dataId is None:
            directDataId = None
        else:
            directDataId = SerializedDataCoordinate.direct(**dataId)

        if dimensionRecords is None:
            directRecords = None
        else:
            directRecords = {
                int(key): SerializedDimensionRecord.direct(**rawRecord)
                for key, rawRecord in dimensionRecords.items()
            }

        fieldValues = {
            "taskName": taskName,
            "dataId": directDataId,
            "datasetTypeMapping": {
                name: SerializedDatasetType.direct(**rawType) for name, rawType in datasetTypeMapping.items()
            },
            "initInputs": {
                name: (SerializedDatasetRef.direct(**rawRef), recordIds)
                for name, (rawRef, recordIds) in initInputs.items()
            },
            "inputs": convertRefLists(inputs),
            "outputs": convertRefLists(outputs),
            "dimensionRecords": directRecords,
        }

        # Bypass the model's own attribute handling, as nothing is validated.
        assign = object.__setattr__
        for fieldName, fieldValue in fieldValues.items():
            assign(node, fieldName, fieldValue)
        assign(node, "__fields_set__", set(fieldValues))
        return node
class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``'s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    datastore_records : `DatastoreRecordData`, optional
        Datastore record data for input or initInput datasets that already
        exist.
    """

    __slots__ = (
        "_taskName",
        "_taskClass",
        "_dataId",
        "_initInputs",
        "_inputs",
        "_outputs",
        "_hash",
        "_datastore_records",
    )

    def __init__(
        self,
        *,
        taskName: Optional[str] = None,
        taskClass: Optional[Type] = None,
        dataId: Optional[DataCoordinate] = None,
        initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]] = None,
        inputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
        outputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
        datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
    ):
        # An explicit class wins over a name: the name is derived from it.
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            # A flat iterable of refs is keyed by each ref's dataset type.
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, List[DatasetRef]](inputs).freeze()
        self._outputs = NamedKeyDict[DatasetType, List[DatasetRef]](outputs).freeze()
        if datastore_records is None:
            datastore_records = {}
        self._datastore_records = datastore_records

    @staticmethod
    def _simplifyRef(
        ref: DatasetRef, accumulator: DimensionRecordsAccumulator
    ) -> Tuple[SerializedDatasetRef, List[int]]:
        """Serialize one `DatasetRef`, moving its records into ``accumulator``.

        Parameters
        ----------
        ref : `DatasetRef`
            The dataset ref to serialize.
        accumulator : `DimensionRecordsAccumulator`
            Receives every non-`None` dimension record of the ref's data ID.

        Returns
        -------
        simple : `SerializedDatasetRef`
            Serialized ref with its dimension records and dataset type set to
            `None`.
        recIds : `list` of `int`
            Accumulator keys of the ref's dimension records, needed to
            reconstruct the ref.
        """
        simple = ref.to_simple()
        # Dimension records repeat in almost every DatasetRef, so storing
        # them with each ref would be wasteful; store only their keys.
        recIds: List[int] = []
        if simple.dataId is not None and simple.dataId.records is not None:
            for rec in ref.dataId.records.values():
                if rec is not None:
                    recIds.append(accumulator.addRecord(rec))
            # The records now live in the accumulator.
            simple.dataId.records = None
        # The dataset type is serialized once, in the quantum's type mapping.
        simple.datasetType = None
        return simple, recIds

    def to_simple(self, accumulator: Optional[DimensionRecordsAccumulator] = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is None, the default, dimension records
            are serialized with this Quantum. If an accumulator is supplied it
            is assumed something else is responsible for serializing the
            records, and they will not be stored with the SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.

        Notes
        -----
        ``datastore_records`` are not included in the serialized form.
        """
        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Dataset types are recorded once in their own mapping, indexed by
        # name, so the same type is not saved with every ref.
        typeMapping = {}

        initInputs = {}
        for key, value in self._initInputs.items():
            typeMapping[key.name] = key.to_simple()
            initInputs[key.name] = self._simplifyRef(value, accumulator)

        # Containers for the serialized refs, keyed on DatasetType name.
        inputs: Dict[str, List[Tuple[SerializedDatasetRef, List[int]]]] = {}
        outputs: Dict[str, List[Tuple[SerializedDatasetRef, List[int]]]] = {}
        for container, refMapping in ((inputs, self._inputs), (outputs, self._outputs)):
            for key, values in refMapping.items():
                # Collect the type if it is not already in the mapping.
                if key.name not in typeMapping:
                    typeMapping[key.name] = key.to_simple()
                container[key.name] = [self._simplifyRef(ref, accumulator) for ref in values]

        dimensionRecords: Optional[Mapping[int, SerializedDimensionRecord]]
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
        )

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: Optional[Dict[int, Tuple[str, DimensionRecord]]] = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` or `None`, optional
            A mapping of ids to ``(definition, DimensionRecord)`` tuples to
            be used when populating dimensions for this Quantum. If a
            required record has already been loaded into it, that record is
            used in place of the one stored with the SerializedQuantum.
            Otherwise the record is unpersisted from the SerializedQuantum
            and added to this dict. Defaults to `None`, in which case a
            fresh dict is used for this call.

        Returns
        -------
        quantum : `Quantum`
            The reconstructed quantum. It is constructed without
            ``datastore_records``.
        """
        loadedTypes: MutableMapping[str, DatasetType] = {}
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is None:
            reconstitutedDimensions = {}

        # Unpersist all the init inputs
        for key, (value, dimensionIds) in simple.initInputs.items():
            # If a datasetType has already been created use that instead of
            # unpersisting.
            if (type_ := loadedTypes.get(key)) is None:
                type_ = loadedTypes.setdefault(
                    key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                )
            # reconstruct the dimension records
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, reconstitutedDimensions, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # containers for the dataset refs
        inputs: MutableMapping[DatasetType, List[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, List[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                # If a datasetType has already been created use that instead of
                # unpersisting.
                if (type_ := loadedTypes.get(key)) is None:
                    type_ = loadedTypes.setdefault(
                        key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                    )
                # reconstruct the list of DatasetRefs for this DatasetType
                tmp: List[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, reconstitutedDimensions, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )
        return Quantum(
            taskName=simple.taskName, dataId=dataId, initInputs=initInputs, inputs=inputs, outputs=outputs
        )

    @property
    def taskClass(self) -> Optional[Type]:
        """Task class associated with this `Quantum` (`type`).

        Imported from ``taskName`` on first access and then cached. Raises
        `ValueError` if neither a class nor a name is available.
        """
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> Optional[str]:
        """Return Fully-qualified name of the task associated with `Quantum`.

        (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> Optional[DataCoordinate]:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integers IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this quantum.

        Has the same form as `inputs`.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integers IDs and others do not.
        """
        return self._outputs

    @property
    def datastore_records(self) -> Mapping[str, DatastoreRecordData]:
        """Tabular data stored with this quantum (`dict`).

        This attribute may be modified in place, but not assigned to.
        """
        return self._datastore_records

    def __eq__(self, other: object) -> bool:
        # Datastore records do not take part in the comparison.
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> Union[str, Tuple[Any, ...]]:
        # The frozen named-key mappings are passed as plain dicts and rebuilt
        # by the constructor. The datastore records must be passed as well,
        # otherwise they are lost on a pickle/copy round trip.
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
                self.datastore_records,
            ),
        )

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: Optional[str],
        taskClass: Optional[Type],
        dataId: Optional[DataCoordinate],
        initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]],
        inputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
        outputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
        datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
    ) -> Quantum:
        """Rebuild a `Quantum` from the positional arguments of `__reduce__`.

        ``datastore_records`` is optional so that a call supplying only the
        first six arguments still works.
        """
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
class DimensionRecordsAccumulator:
    """Collect dimension records so that each is serialized only once.

    Every distinct dimension record added to the accumulator receives an
    auto-incrementing integer key. The record can then be serialized a single
    time and referred to by that key as often as needed.
    """

    def __init__(self) -> None:
        # Key that the next previously unseen record will receive.
        self._counter = 0
        # record -> (key, serialized form of the record)
        self.mapping: MutableMapping[DimensionRecord, Tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Register a dimension record and return its key.

        A record seen for the first time is serialized and assigned a new
        unique integer key; a record that was added before keeps the key it
        already has.

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key associated with the supplied record (either newly
            allocated or pre-existing).
        """
        known = self.mapping.get(record)
        if known is not None:
            return known[0]
        # Serialize before touching any state, so that a failure in
        # ``to_simple`` leaves the counter and the mapping unchanged.
        serialized = record.to_simple()
        newKey = self._counter
        self._counter += 1
        self.mapping[record] = (newKey, serialized)
        return newKey

    def makeSerializedDimensionRecordMapping(self) -> Mapping[int, SerializedDimensionRecord]:
        """Return the accumulated records in serialized form.

        Returns
        -------
        records : `~collections.abc.Mapping`
            Serialized dimension records indexed by the integer key that
            `addRecord` assigned to each of them.
        """
        # Every stored value is already a (key, serialized record) pair.
        return dict(self.mapping.values())