Coverage for python/lsst/daf/butler/core/quantum.py: 16% (212 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Type, Union

from lsst.utils import doImportType
from pydantic import BaseModel

from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType
from .datastoreRecordData import DatastoreRecordData, SerializedDatastoreRecordData
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)
from .named import NamedKeyDict, NamedKeyMapping


def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: Optional[DatasetType],
    ids: Iterable[int],
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]],
    reconstitutedDimensions: Dict[int, Tuple[str, DimensionRecord]],
    universe: DimensionUniverse,
) -> DatasetRef:
51 """Reconstruct a DatasetRef stored in a Serialized Quantum"""
52 # Reconstruct the dimension records
53 records = {}
54 for dId in ids:
55 # if the dimension record has been loaded previously use that,
56 # otherwise load it from the dict of Serialized DimensionRecords
57 if (recId := reconstitutedDimensions.get(dId)) is None:
58 if dimensionRecords is None:
59 raise ValueError(
60 "Cannot construct from a SerializedQuantum with no dimension records. "
61 "Reconstituted Dimensions must be supplied and populated in method call."
62 )
63 tmpSerialized = dimensionRecords[dId]
64 reconstructedDim = DimensionRecord.from_simple(tmpSerialized, universe=universe)
65 definition = tmpSerialized.definition
66 reconstitutedDimensions[dId] = (definition, reconstructedDim)
67 else:
68 definition, reconstructedDim = recId
69 records[definition] = reconstructedDim
70 # turn the serialized form into an object and attach the dimension records
71 rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
72 if records:
73 object.__setattr__(rebuiltDatasetRef, "dataId", rebuiltDatasetRef.dataId.expanded(records))
74 return rebuiltDatasetRef


class SerializedQuantum(BaseModel):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str | None
    dataId: Optional[SerializedDataCoordinate]
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, Tuple[SerializedDatasetRef, List[int]]]
    inputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    outputs: Mapping[str, List[Tuple[SerializedDatasetRef, List[int]]]]
    dimensionRecords: Optional[Dict[int, SerializedDimensionRecord]] = None
    datastoreRecords: Optional[Dict[str, SerializedDatastoreRecordData]] = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str | None,
        dataId: Optional[Dict],
        datasetTypeMapping: Mapping[str, Dict],
        initInputs: Mapping[str, Tuple[Dict, List[int]]],
        inputs: Mapping[str, List[Tuple[Dict, List[int]]]],
        outputs: Mapping[str, List[Tuple[Dict, List[int]]]],
        dimensionRecords: Optional[Dict[int, Dict]],
        datastoreRecords: Optional[Dict[str, Dict]],
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedQuantum.__new__(cls)
        setter = object.__setattr__
        setter(node, "taskName", taskName)
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(
            node,
            "datasetTypeMapping",
            {k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()},
        )
        setter(
            node,
            "initInputs",
            {k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()},
        )
        setter(
            node,
            "inputs",
            {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()},
        )
        setter(
            node,
            "outputs",
            {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()},
        )
        setter(
            node,
            "dimensionRecords",
            dimensionRecords
            if dimensionRecords is None
            else {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()},
        )
        setter(
            node,
            "datastoreRecords",
            datastoreRecords
            if datastoreRecords is None
            else {k: SerializedDatastoreRecordData.direct(**v) for k, v in datastoreRecords.items()},
        )
        setter(
            node,
            "__fields_set__",
            {
                "taskName",
                "dataId",
                "datasetTypeMapping",
                "initInputs",
                "inputs",
                "outputs",
                "dimensionRecords",
                "datastoreRecords",
            },
        )
        return node
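
    # Hedged note (not from the original file): ``direct`` skips pydantic
    # validation and is intended for already-validated payloads, for example
    # data previously produced by ``to_simple().dict()``.  For untrusted
    # input, the normal validating constructor should be used instead.
    # ``quantum`` here is a hypothetical Quantum instance.
    #
    #     payload = quantum.to_simple().dict()
    #     model = SerializedQuantum.parse_obj(payload)  # validating path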


class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``'s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    datastore_records : `DatastoreRecordData`, optional
        Datastore record data for input or initInput datasets that already
        exist.
    """

    __slots__ = (
        "_taskName",
        "_taskClass",
        "_dataId",
        "_initInputs",
        "_inputs",
        "_outputs",
        "_hash",
        "_datastore_records",
    )

    def __init__(
        self,
        *,
        taskName: Optional[str] = None,
        taskClass: Optional[Type] = None,
        dataId: Optional[DataCoordinate] = None,
        initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]] = None,
        inputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
        outputs: Optional[Mapping[DatasetType, List[DatasetRef]]] = None,
        datastore_records: Optional[Mapping[str, DatastoreRecordData]] = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, List[DatasetRef]](inputs).freeze()
        self._outputs = NamedKeyDict[DatasetType, List[DatasetRef]](outputs).freeze()
        if datastore_records is None:
            datastore_records = {}
        self._datastore_records = datastore_records
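
    # A minimal usage sketch (not from the original module): constructing a
    # Quantum directly.  ``MyTask``, ``data_id``, and the dataset refs are
    # hypothetical placeholders; in real pipelines Quanta are normally built
    # by the graph-generation code rather than by hand.
    #
    #     quantum = Quantum(
    #         taskClass=MyTask,
    #         dataId=data_id,
    #         initInputs=[init_ref],
    #         inputs={input_ref.datasetType: [input_ref]},
    #         outputs={output_ref.datasetType: [output_ref]},
    #     )
    #
    # Passing ``taskClass`` also sets ``taskName`` to the fully-qualified
    # class name, and the mappings are frozen into ``NamedKeyDict`` views.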

    def to_simple(self, accumulator: Optional[DimensionRecordsAccumulator] = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is None, the default, dimension records
            are serialized with this Quantum. If an accumulator is supplied it
            is assumed something else is responsible for serializing the
            records, and they will not be stored with the SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same
        # object multiple times. The string name of the type is used to index
        # the mappings.
        for key, value in self._initInputs.items():
            # Add the type to the typeMapping.
            typeMapping[key.name] = key.to_simple()
            # Convert to a simple DatasetRef representation.
            simple = value.to_simple()
            # Extract the dimension records.
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # For each dimension record get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set the records to None to save space.
                simple.dataId.records = None
            # The dataset type is stored in typeMapping, so drop it here.
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # Collect the inputs.
        for key, values in self._inputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # For each input type there is a list of inputs, collect them.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # Dataset type is the same as the key in _inputs, no need
                # to serialize it out multiple times, set it to None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # For each output type there is a list of outputs, collect them.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # Dataset type is the same as the key in _outputs, no need
                # to serialize it out multiple times, set it to None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Optional[Mapping[int, SerializedDimensionRecord]]
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        datastore_records: Optional[Dict[str, SerializedDatastoreRecordData]] = None
        if self.datastore_records is not None:
            datastore_records = {
                datastore_name: record_data.to_simple()
                for datastore_name, record_data in self.datastore_records.items()
            }

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
            datastoreRecords=datastore_records,
        )
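
    # A hedged sketch (not part of the original file) of how the
    # ``accumulator`` argument is meant to be used: when serializing many
    # quanta that share dimension records, pass one accumulator to every
    # ``to_simple`` call and serialize the record mapping once at the end.
    # ``quanta`` is a hypothetical iterable of Quantum instances.
    #
    #     accumulator = DimensionRecordsAccumulator()
    #     serialized = [q.to_simple(accumulator=accumulator) for q in quanta]
    #     record_mapping = accumulator.makeSerializedDimensionRecordMapping()
    #
    # The caller is then responsible for persisting ``record_mapping`` and
    # for making the records available again (for example through the
    # ``reconstitutedDimensions`` argument of ``from_simple``) when
    # deserializing.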

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: Optional[Dict[int, Tuple[str, DimensionRecord]]] = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.
        """
        loadedTypes: MutableMapping[str, DatasetType] = {}
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is None:
            reconstitutedDimensions = {}

        # Unpersist all the init inputs.
        for key, (value, dimensionIds) in simple.initInputs.items():
            # If a datasetType has already been created use that instead of
            # unpersisting.
            if (type_ := loadedTypes.get(key)) is None:
                type_ = loadedTypes.setdefault(
                    key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                )
            # Reconstruct the dimension records.
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, reconstitutedDimensions, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # Containers for the dataset refs.
        inputs: MutableMapping[DatasetType, List[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, List[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                # If a datasetType has already been created use that instead
                # of unpersisting.
                if (type_ := loadedTypes.get(key)) is None:
                    type_ = loadedTypes.setdefault(
                        key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                    )
                # Reconstruct the list of DatasetRefs for this DatasetType.
                tmp: List[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, reconstitutedDimensions, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )

        datastore_records: Optional[Dict[str, DatastoreRecordData]] = None
        if simple.datastoreRecords is not None:
            datastore_records = {
                datastore_name: DatastoreRecordData.from_simple(record_data)
                for datastore_name, record_data in simple.datastoreRecords.items()
            }

        return Quantum(
            taskName=simple.taskName,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
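
    # An illustrative round trip, offered as a hedged sketch rather than
    # verbatim documentation from this module.  ``quantum`` and ``universe``
    # are assumed to exist already (for example from a butler registry).
    #
    #     serialized = quantum.to_simple()          # records stored inline
    #     restored = Quantum.from_simple(serialized, universe)
    #     assert restored == quantum
    #
    # Equality compares task class, data ID, and the input/output mappings,
    # so a faithful serialization round trip should compare equal.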

    @property
    def taskClass(self) -> Optional[Type]:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> Optional[str]:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> Optional[DataCoordinate]:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, List[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this quantum.

        Has the same form as `inputs`.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    @property
    def datastore_records(self) -> Mapping[str, DatastoreRecordData]:
        """Tabular data stored with this quantum (`dict`).

        This attribute may be modified in place, but not assigned to.
        """
        return self._datastore_records

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> Union[str, Tuple[Any, ...]]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
                self.datastore_records,
            ),
        )

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"
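
    # Hedged usage note (not from the original source): ``__reduce__`` makes
    # Quantum picklable by rebuilding it through ``_reduceFactory``.
    # ``quantum`` is a hypothetical instance.
    #
    #     import pickle
    #     restored = pickle.loads(pickle.dumps(quantum))
    #     assert restored == quantum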

    @staticmethod
    def _reduceFactory(
        taskName: Optional[str],
        taskClass: Optional[Type],
        dataId: Optional[DataCoordinate],
        initInputs: Optional[Union[Mapping[DatasetType, DatasetRef], Iterable[DatasetRef]]],
        inputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
        outputs: Optional[Mapping[DatasetType, List[DatasetRef]]],
        datastore_records: Mapping[str, DatastoreRecordData],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record but be referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, Tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added. When a record is inserted for the first time it is
        assigned a unique integer key.

        This function returns the key associated with the record (either the
        newly allocated key, or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> dict[int, SerializedDimensionRecord]:
        return {id_: serializeRef for id_, serializeRef in self.mapping.values()}
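
# A minimal illustrative sketch (an assumption, not from the original
# module): adding the same DimensionRecord twice returns the same key, so
# each unique record is serialized only once.  ``record`` is a hypothetical
# DimensionRecord instance.
#
#     accumulator = DimensionRecordsAccumulator()
#     key1 = accumulator.addRecord(record)
#     key2 = accumulator.addRecord(record)
#     assert key1 == key2
#     mapping = accumulator.makeSerializedDimensionRecordMapping()
#     # ``mapping`` maps each integer key to its SerializedDimensionRecord.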