Coverage for python/lsst/daf/butler/core/quantum.py: 18%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Type, Union

from lsst.utils import doImportType
from pydantic import BaseModel

from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)
from .named import NamedKeyDict, NamedKeyMapping


def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: Optional[DatasetType],
    ids: Iterable[int],
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]],
    reconstitutedDimensions: Dict[int, Tuple[str, DimensionRecord]],
    universe: DimensionUniverse,
) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a SerializedQuantum."""
    # Reconstruct the dimension records.
    records = {}
    for dId in ids:
        # If the dimension record has been loaded previously, use that;
        # otherwise load it from the dict of SerializedDimensionRecords.
        if (recId := reconstitutedDimensions.get(dId)) is None:
            if dimensionRecords is None:
                raise ValueError(
                    "Cannot construct from a SerializedQuantum with no dimension records. "
                    "Reconstituted Dimensions must be supplied and populated in method call."
                )
            tmpSerialized = dimensionRecords[dId]
            reconstructedDim = DimensionRecord.from_simple(tmpSerialized, universe=universe)
            definition = tmpSerialized.definition
            reconstitutedDimensions[dId] = (definition, reconstructedDim)
        else:
            definition, reconstructedDim = recId
        records[definition] = reconstructedDim
    # Turn the serialized form into an object and attach the dimension records.
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    if records:
        object.__setattr__(rebuiltDatasetRef, "dataId", rebuiltDatasetRef.dataId.expanded(records))
    return rebuiltDatasetRef
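

# A minimal sketch (editor's illustration, not part of the original module)
# of the caching contract assumed by `_reconstructDatasetRef`: one
# ``reconstitutedDimensions`` dict is shared across every ref rebuilt from a
# `SerializedQuantum`, so each dimension record is deserialized at most once.
def _example_rebuild_init_inputs(
    simple: SerializedQuantum, universe: DimensionUniverse
) -> List[DatasetRef]:
    cache: Dict[int, Tuple[str, DimensionRecord]] = {}
    refs = []
    for name, (serializedRef, recordIds) in simple.initInputs.items():
        # Rebuild the dataset type from the shared mapping, mirroring what
        # `Quantum.from_simple` does below.
        datasetType = DatasetType.from_simple(simple.datasetTypeMapping[name], universe=universe)
        refs.append(
            _reconstructDatasetRef(
                serializedRef, datasetType, recordIds, simple.dimensionRecords, cache, universe
            )
        )
    return refs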


class SerializedQuantum(BaseModel):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str
    dataId: Optional[SerializedDataCoordinate]
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, Tuple[SerializedDatasetRef, List[int]]]
    inputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    outputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]] = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str,
        dataId: Optional[Dict],
        datasetTypeMapping: Mapping[str, Dict],
        initInputs: Mapping[str, Tuple[Dict, List[int]]],
        inputs: Mapping[str, List[Tuple[Dict, List[int]]]],
        outputs: Mapping[str, List[Tuple[Dict, List[int]]]],
        dimensionRecords: Optional[Dict[int, Dict]],
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedQuantum.__new__(cls)
        setter = object.__setattr__
        setter(node, "taskName", taskName)
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(
            node,
            "datasetTypeMapping",
            {k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()},
        )
        setter(
            node,
            "initInputs",
            {k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()},
        )
        setter(
            node,
            "inputs",
            {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()},
        )
        setter(
            node,
            "outputs",
            {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()},
        )
        setter(
            node,
            "dimensionRecords",
            dimensionRecords
            if dimensionRecords is None
            else {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()},
        )
        setter(
            node,
            "__fields_set__",
            {
                "taskName",
                "dataId",
                "datasetTypeMapping",
                "initInputs",
                "inputs",
                "outputs",
                "dimensionRecords",
            },
        )
        return node
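

# A short sketch (editor's illustration, not part of the original module)
# contrasting pydantic's validated parsing with the trusted ``direct`` fast
# path above. ``json_str`` is assumed to hold JSON previously produced by
# ``SerializedQuantum.json()``.
def _example_deserialize(json_str: str) -> SerializedQuantum:
    import json

    # Validated path: pydantic parses the payload and checks every field.
    checked = SerializedQuantum.parse_raw(json_str)
    # Trusted path: skip validation when the payload is known to be
    # well-formed, e.g. because this module wrote it.
    trusted = SerializedQuantum.direct(**json.loads(json_str))
    assert checked.taskName == trusted.taskName
    return trusted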


class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``’s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    """

    __slots__ = ("_taskName", "_taskClass", "_dataId", "_initInputs", "_inputs", "_outputs", "_hash")

    def __init__(
        self,
        *,
        taskName: Optional[str] = None,
        taskClass: Optional[Type] = None,
        dataId: Optional[DataCoordinate] = None,
        initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]] = None,
        inputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
        outputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, List[DatasetRef]](inputs).freeze()
        self._outputs = NamedKeyDict[DatasetType, List[DatasetRef]](outputs).freeze()

    def to_simple(self, accumulator: Optional[DimensionRecordsAccumulator] = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is `None` (the default), dimension
            records are serialized with this Quantum. If an accumulator is
            supplied it is assumed something else is responsible for
            serializing the records, and they will not be stored with the
            SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same
        # object multiple times. The string name of the type is used to index
        # the mappings.
        for key, value in self._initInputs.items():
            # Add the type to the typeMapping.
            typeMapping[key.name] = key.to_simple()
            # Convert to a simple DatasetRef representation.
            simple = value.to_simple()
            # Extract the dimension records.
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # For each dimension record get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set properties to None to save space.
                simple.dataId.records = None
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # Collect the inputs.
        for key, values in self._inputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # For each input type there is a list of inputs; collect them.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # The dataset type is the same as the key in _inputs; no need
                # to serialize it out multiple times, so set it to None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # For each output type there is a list of outputs; collect them.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # The dataset type is the same as the key in _outputs; no need
                # to serialize it out multiple times, so set it to None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Optional[Mapping[int, SerializedDimensionRecord]]
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
        )
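
    # A sketch of the shared-accumulator pattern described above (editor's
    # illustration; ``quanta`` is a hypothetical iterable of `Quantum`)::
    #
    #     accumulator = DimensionRecordsAccumulator()
    #     serialized = [q.to_simple(accumulator=accumulator) for q in quanta]
    #     # The caller is now responsible for persisting the shared records,
    #     # exactly once, alongside the serialized quanta.
    #     records = accumulator.makeSerializedDimensionRecordMapping()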

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: Optional[Dict[int, Tuple[str, DimensionRecord]]] = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.
        """
        loadedTypes: MutableMapping[str, DatasetType] = {}
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is None:
            reconstitutedDimensions = {}

        # Unpersist all the init inputs.
        for key, (value, dimensionIds) in simple.initInputs.items():
            # If a datasetType has already been created, use that instead of
            # unpersisting.
            if (type_ := loadedTypes.get(key)) is None:
                type_ = loadedTypes.setdefault(
                    key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                )
            # Reconstruct the dimension records.
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, reconstitutedDimensions, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # Containers for the dataset refs.
        inputs: MutableMapping[DatasetType, List[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, List[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                # If a datasetType has already been created, use that instead
                # of unpersisting.
                if (type_ := loadedTypes.get(key)) is None:
                    type_ = loadedTypes.setdefault(
                        key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                    )
                # Reconstruct the list of DatasetRefs for this DatasetType.
                tmp: List[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, reconstitutedDimensions, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )
        return Quantum(
            taskName=simple.taskName, dataId=dataId, initInputs=initInputs, inputs=inputs, outputs=outputs
        )
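
    # A round-trip sketch (editor's illustration), assuming ``universe`` is
    # the repository's `DimensionUniverse`::
    #
    #     simple = quantum.to_simple()
    #     restored = Quantum.from_simple(simple, universe)
    #     assert restored == quantum
    #
    # When deserializing many quanta that share dimension records, pass one
    # ``reconstitutedDimensions`` dict to every call so each record is
    # rebuilt at most once.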

    @property
    def taskClass(self) -> Optional[Type]:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> Optional[str]:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> Optional[DataCoordinate]:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this
        quantum.

        Has the same form as `inputs`.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> Union[str, Tuple[Any, ...]]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
            ),
        )
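
    # Editor's note, as a sketch: the ``__reduce__`` above is what makes
    # `Quantum` picklable despite ``__slots__`` and its frozen mappings::
    #
    #     import pickle
    #     clone = pickle.loads(pickle.dumps(quantum))
    #     assert clone == quantum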

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: Optional[str],
        taskClass: Optional[Type],
        dataId: Optional[DataCoordinate],
        initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]],
        inputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
        outputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
        )
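

# A minimal construction sketch (editor's illustration, not part of the
# original module) following the `Quantum` class docstring. ``taskClass``,
# ``dataId``, ``initRef`` and ``inputRefs`` are hypothetical stand-ins for
# real pipeline objects.
def _example_build_quantum(taskClass, dataId, initRef, inputRefs):
    return Quantum(
        taskClass=taskClass,  # overrides taskName when both are supplied
        dataId=dataId,
        # A flat iterable of refs is re-keyed by datasetType internally.
        initInputs=[initRef],
        inputs={ref.datasetType: [ref] for ref in inputRefs},
    )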


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record but be referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, Tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added. When a record is inserted for the first time it is
        assigned a unique integer key.

        This function returns the key associated with the record (either the
        newly allocated key or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> Mapping[int, SerializedDimensionRecord]:
        return {id_: serializedRecord for id_, serializedRecord in self.mapping.values()}
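

# A short sketch (editor's illustration, not part of the original module) of
# the accumulator's deduplication: adding the same `DimensionRecord` twice
# returns the same integer key, and the serialized mapping holds one entry.
def _example_accumulator_dedup(record: DimensionRecord) -> None:
    accumulator = DimensionRecordsAccumulator()
    first = accumulator.addRecord(record)
    again = accumulator.addRecord(record)
    assert first == again
    mapping = accumulator.makeSerializedDimensionRecordMapping()
    assert set(mapping) == {first}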