Coverage for python/lsst/daf/butler/core/quantum.py: 18%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Tuple,
    Type,
    Union,
)

from pydantic import BaseModel

from lsst.utils import doImportType

from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)
from .named import NamedKeyDict, NamedKeyMapping


def _reconstructDatasetRef(simple: SerializedDatasetRef, type_: Optional[DatasetType],
                           ids: Iterable[int],
                           dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]],
                           reconstitutedDimensions: Dict[int, Tuple[str, DimensionRecord]],
                           universe: DimensionUniverse) -> DatasetRef:
    """Reconstruct a `DatasetRef` stored in a `SerializedQuantum`."""
    # Reconstruct the dimension records.
    records = {}
    for dId in ids:
        # If the dimension record has been loaded previously, use that;
        # otherwise load it from the dict of SerializedDimensionRecords.
        if (recId := reconstitutedDimensions.get(dId)) is None:
            if dimensionRecords is None:
                raise ValueError("Cannot construct from a SerializedQuantum with no dimension "
                                 "records. Reconstituted dimensions must be supplied and "
                                 "populated in the method call.")
            tmpSerialized = dimensionRecords[dId]
            reconstructedDim = DimensionRecord.from_simple(tmpSerialized, universe=universe)
            definition = tmpSerialized.definition
            reconstitutedDimensions[dId] = (definition, reconstructedDim)
        else:
            definition, reconstructedDim = recId
        records[definition] = reconstructedDim
    # Turn the serialized form into an object and attach the dimension
    # records.
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    if records:
        object.__setattr__(rebuiltDatasetRef, 'dataId',
                           rebuiltDatasetRef.dataId.expanded(records))
    return rebuiltDatasetRef
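
# Illustrative sketch (not executed at import): callers are expected to
# share one ``reconstitutedDimensions`` dict across calls so that each
# dimension record is deserialized at most once. Here ``serializedRefs``,
# ``type_``, ``simple``, and ``universe`` are hypothetical names standing
# in for values a caller would already have:
#
#     cache: Dict[int, Tuple[str, DimensionRecord]] = {}
#     refs = [
#         _reconstructDatasetRef(ref, type_, ids, simple.dimensionRecords,
#                                cache, universe)
#         for ref, ids in serializedRefs
#     ]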


class SerializedQuantum(BaseModel):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str
    dataId: Optional[SerializedDataCoordinate]
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, Tuple[SerializedDatasetRef, List[int]]]
    inputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    outputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]] = None

    @classmethod
    def direct(cls, *,
               taskName: str,
               dataId: Optional[Dict],
               datasetTypeMapping: Mapping[str, Dict],
               initInputs: Mapping[str, Tuple[Dict, List[int]]],
               inputs: Mapping[str, List[Tuple[Dict, List[int]]]],
               outputs: Mapping[str, List[Tuple[Dict, List[int]]]],
               dimensionRecords: Optional[Dict[int, Dict]]
               ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will
        recurse through members, constructing them from their corresponding
        `direct` methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedQuantum.__new__(cls)
        setter = object.__setattr__
        setter(node, 'taskName', taskName)
        setter(node, 'dataId',
               dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(node, "datasetTypeMapping",
               {k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()})
        setter(node, "initInputs",
               {k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()})
        setter(node, "inputs",
               {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()})
        setter(node, "outputs",
               {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()})
        setter(node, "dimensionRecords", dimensionRecords if dimensionRecords is None else
               {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()})
        setter(node, '__fields_set__', {'taskName', 'dataId', 'datasetTypeMapping', 'initInputs',
                                        'inputs', 'outputs', 'dimensionRecords'})
        return node
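
    # Illustrative sketch (not executed at import): ``direct`` is meant for
    # rebuilding a model from already-validated output, e.g. a payload
    # previously produced by ``SerializedQuantum.json()`` (the standard
    # pydantic v1 serializer). ``data`` is a hypothetical str/bytes payload:
    #
    #     import json
    #     node = SerializedQuantum.direct(**json.loads(data))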


class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``’s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    """

    __slots__ = ("_taskName", "_taskClass", "_dataId", "_initInputs", "_inputs", "_outputs", "_hash")

    def __init__(self, *, taskName: Optional[str] = None,
                 taskClass: Optional[Type] = None,
                 dataId: Optional[DataCoordinate] = None,
                 initInputs: Optional[Union[Mapping[DatasetType, DatasetRef],
                                            Iterable[DatasetRef]]] = None,
                 inputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
                 outputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
                 ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, List[DatasetRef]](inputs).freeze()
        self._outputs = NamedKeyDict[DatasetType, List[DatasetRef]](outputs).freeze()
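
    # Illustrative sketch (not executed at import; names hypothetical):
    # ``initInputs`` may be passed as a flat iterable of `DatasetRef`, in
    # which case it is normalized to a mapping keyed on each ref's dataset
    # type:
    #
    #     quantum = Quantum(taskName="some.pipeline.ExampleTask",
    #                       dataId=dataId,
    #                       initInputs=[schemaRef],
    #                       inputs={rawType: [rawRef]},
    #                       outputs={calexpType: [calexpRef]})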

    def to_simple(self, accumulator: Optional[DimensionRecordsAccumulator] = None) -> SerializedQuantum:
        """Convert this object to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records
            across multiple Quanta. If this is None, the default, dimension
            records are serialized with this Quantum. If an accumulator is
            supplied it is assumed something else is responsible for
            serializing the records, and they will not be stored with the
            SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types
        # into their own mapping, used throughout to minimize saving the
        # same object multiple times.  The string name of the type is used
        # to index mappings.
        for key, value in self._initInputs.items():
            # Add the type to the typeMapping.
            typeMapping[key.name] = key.to_simple()
            # Convert to a simple DatasetRef representation.
            simple = value.to_simple()
            # Extract the dimension records.
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # For each dimension record, get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set properties to None to save space.
                simple.dataId.records = None
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # Collect the inputs.
        for key, values in self._inputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # For each input type there is a list of inputs; collect them.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold the ids (hashes) that point to
                # all the dimension records within the SerializedDatasetRef
                # dataId.  These dimension records repeat in almost every
                # DatasetRef, so it is hugely wasteful in terms of disk and
                # CPU time to store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record, get an id by adding it
                        # to the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # The dataset type is the same as the key in _inputs; no
                # need to serialize it out multiple times, so set it to
                # None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef,
                # along with the list of all the keys for the dimension
                # records needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # For each output type there is a list of outputs; collect them.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold the ids (hashes) that point to
                # all the dimension records within the SerializedDatasetRef
                # dataId.  These dimension records repeat in almost every
                # DatasetRef, so it is hugely wasteful in terms of disk and
                # CPU time to store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record, get an id by adding it
                        # to the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # The dataset type is the same as the key in _outputs; no
                # need to serialize it out multiple times, so set it to
                # None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef,
                # along with the list of all the keys for the dimension
                # records needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Optional[Mapping[int, SerializedDimensionRecord]]
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        return SerializedQuantum(taskName=self._taskName,
                                 dataId=self.dataId.to_simple() if self.dataId is not None else None,
                                 datasetTypeMapping=typeMapping,
                                 initInputs=initInputs,
                                 inputs=inputs,
                                 outputs=outputs,
                                 dimensionRecords=dimensionRecords)
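
    # Illustrative sketch (not executed at import): when serializing many
    # quanta, share one accumulator so each dimension record is written
    # exactly once, and dump the accumulated records separately. ``quanta``
    # is a hypothetical iterable of `Quantum` instances:
    #
    #     accumulator = DimensionRecordsAccumulator()
    #     serialized = [q.to_simple(accumulator=accumulator) for q in quanta]
    #     records = accumulator.makeSerializedDimensionRecordMapping()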

    @classmethod
    def from_simple(cls, simple: SerializedQuantum, universe: DimensionUniverse,
                    reconstitutedDimensions: Optional[Dict[int, Tuple[str, DimensionRecord]]] = None
                    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or `None`
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum.  If supplied it will be used in
            place of the dimension records stored with the
            SerializedQuantum, if a required dimension has already been
            loaded.  Otherwise the record will be unpersisted from the
            SerializedQuantum and added to the reconstitutedDimensions dict
            (if not None).  Defaults to None.

        Returns
        -------
        quantum : `Quantum`
            The reconstructed `Quantum`.
        """
        loadedTypes: MutableMapping[str, DatasetType] = {}
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is None:
            reconstitutedDimensions = {}

        # Unpersist all the init inputs.
        for key, (value, dimensionIds) in simple.initInputs.items():
            # If a datasetType has already been created, use that instead
            # of unpersisting.
            if (type_ := loadedTypes.get(key)) is None:
                type_ = loadedTypes.setdefault(key,
                                               DatasetType.from_simple(simple.datasetTypeMapping[key],
                                                                       universe=universe))
            # Reconstruct the dimension records.
            rebuiltDatasetRef = _reconstructDatasetRef(value, type_, dimensionIds,
                                                       simple.dimensionRecords,
                                                       reconstitutedDimensions, universe)
            initInputs[type_] = rebuiltDatasetRef

        # Containers for the dataset refs.
        inputs: MutableMapping[DatasetType, List[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, List[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                # If a datasetType has already been created, use that
                # instead of unpersisting.
                if (type_ := loadedTypes.get(key)) is None:
                    type_ = loadedTypes.setdefault(key,
                                                   DatasetType.from_simple(simple.datasetTypeMapping[key],
                                                                           universe=universe))
                # Reconstruct the list of DatasetRefs for this DatasetType.
                tmp: List[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(v, type_, recIds,
                                                               simple.dimensionRecords,
                                                               reconstitutedDimensions, universe)
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = DataCoordinate.from_simple(simple.dataId,
                                            universe=universe) if simple.dataId is not None else None
        return Quantum(taskName=simple.taskName, dataId=dataId, initInputs=initInputs,
                       inputs=inputs, outputs=outputs)
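
    # Illustrative sketch (not executed at import): a full serialization
    # round trip, assuming ``quantum`` and a ``universe`` from a butler
    # registry already exist in the caller's context:
    #
    #     simple = quantum.to_simple()
    #     restored = Quantum.from_simple(simple, universe)
    #     assert restored == quantum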

    @property
    def taskClass(self) -> Optional[Type]:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> Optional[str]:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> Optional[DataCoordinate]:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container
        because `DatasetRef` instances cannot be compared reliably when
        some have integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this
        quantum.

        Has the same form as `inputs`.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container
        because `DatasetRef` instances cannot be compared reliably when
        some have integer IDs and others do not.
        """
        return self._outputs

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> Union[str, Tuple[Any, ...]]:
        return (self._reduceFactory,
                (self.taskName, self.taskClass, self.dataId, dict(self.initInputs.items()),
                 dict(self.inputs), dict(self.outputs)))

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(taskName: Optional[str],
                       taskClass: Optional[Type],
                       dataId: Optional[DataCoordinate],
                       initInputs: Optional[Union[Mapping[DatasetType, DatasetRef],
                                                  Iterable[DatasetRef]]],
                       inputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
                       outputs: Optional[Mapping[DatasetType, List[DatasetRef]]]
                       ) -> Quantum:
        return Quantum(taskName=taskName, taskClass=taskClass, dataId=dataId, initInputs=initInputs,
                       inputs=inputs, outputs=outputs)
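
    # Illustrative sketch (not executed at import): ``__reduce__`` and
    # ``_reduceFactory`` make `Quantum` picklable, and a round trip
    # preserves equality as defined by ``__eq__``. ``quantum`` is a
    # hypothetical existing instance:
    #
    #     import pickle
    #     assert pickle.loads(pickle.dumps(quantum)) == quantum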


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it.  This allows serialization of dimension records to
    occur once for each record but be referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, Tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added.

        When a record is inserted for the first time it is assigned a
        unique integer key.  This function returns the key associated with
        the record (either the newly allocated key, or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> Mapping[int, SerializedDimensionRecord]:
        return {id_: serializeRef for id_, serializeRef in self.mapping.values()}
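
    # Illustrative sketch (not executed at import): repeated records map to
    # the same key, so each unique record is serialized once. ``record`` is
    # a hypothetical `DimensionRecord`:
    #
    #     accumulator = DimensionRecordsAccumulator()
    #     key1 = accumulator.addRecord(record)
    #     key2 = accumulator.addRecord(record)  # same record -> same key
    #     assert key1 == key2
    #     mapping = accumulator.makeSerializedDimensionRecordMapping()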