Coverage for python/lsst/daf/butler/core/quantum.py: 16%
213 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

from collections.abc import Iterable, Mapping, MutableMapping
from typing import Any

from lsst.utils import doImportType
from pydantic import BaseModel

from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType
from .datastoreRecordData import DatastoreRecordData, SerializedDatastoreRecordData
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)
from .named import NamedKeyDict, NamedKeyMapping


def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: DatasetType | None,
    ids: Iterable[int],
    dimensionRecords: dict[int, SerializedDimensionRecord] | None,
    reconstitutedDimensions: dict[int, tuple[str, DimensionRecord]],
    universe: DimensionUniverse,
) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a SerializedQuantum."""
    # Reconstruct the dimension records.
    records = {}
    for dId in ids:
        # If the dimension record has been loaded previously, use that;
        # otherwise load it from the dict of serialized DimensionRecords.
        if (recId := reconstitutedDimensions.get(dId)) is None:
            if dimensionRecords is None:
                raise ValueError(
                    "Cannot construct from a SerializedQuantum with no dimension records. "
                    "Reconstituted Dimensions must be supplied and populated in method call."
                )
            tmpSerialized = dimensionRecords[dId]
            reconstructedDim = DimensionRecord.from_simple(tmpSerialized, universe=universe)
            definition = tmpSerialized.definition
            reconstitutedDimensions[dId] = (definition, reconstructedDim)
        else:
            definition, reconstructedDim = recId
        records[definition] = reconstructedDim
    # Turn the serialized form into an object and attach the dimension
    # records.
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    if records:
        object.__setattr__(rebuiltDatasetRef, "dataId", rebuiltDatasetRef.dataId.expanded(records))
    return rebuiltDatasetRef


class SerializedQuantum(BaseModel):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str | None
    dataId: SerializedDataCoordinate | None
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, tuple[SerializedDatasetRef, list[int]]]
    inputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    outputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    dimensionRecords: dict[int, SerializedDimensionRecord] | None = None
    datastoreRecords: dict[str, SerializedDatastoreRecordData] | None = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str | None,
        dataId: dict | None,
        datasetTypeMapping: Mapping[str, dict],
        initInputs: Mapping[str, tuple[dict, list[int]]],
        inputs: Mapping[str, list[tuple[dict, list[int]]]],
        outputs: Mapping[str, list[tuple[dict, list[int]]]],
        dimensionRecords: dict[int, dict] | None,
        datastoreRecords: dict[str, dict] | None,
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedQuantum.__new__(cls)
        setter = object.__setattr__
        setter(node, "taskName", taskName)
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))
        setter(
            node,
            "datasetTypeMapping",
            {k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()},
        )
        setter(
            node,
            "initInputs",
            {k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()},
        )
        setter(
            node,
            "inputs",
            {k: [(SerializedDatasetRef.direct(**ref), ids) for ref, ids in v] for k, v in inputs.items()},
        )
        setter(
            node,
            "outputs",
            {k: [(SerializedDatasetRef.direct(**ref), ids) for ref, ids in v] for k, v in outputs.items()},
        )
        setter(
            node,
            "dimensionRecords",
            dimensionRecords
            if dimensionRecords is None
            else {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()},
        )
        setter(
            node,
            "datastoreRecords",
            datastoreRecords
            if datastoreRecords is None
            else {k: SerializedDatastoreRecordData.direct(**v) for k, v in datastoreRecords.items()},
        )
        setter(
            node,
            "__fields_set__",
            {
                "taskName",
                "dataId",
                "datasetTypeMapping",
                "initInputs",
                "inputs",
                "outputs",
                "dimensionRecords",
                "datastoreRecords",
            },
        )
        return node
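

# Illustrative sketch (not part of the original module): ``direct`` skips
# pydantic validation, so it is only safe on trusted, already-validated data
# such as JSON previously produced by ``to_simple``. The task name below is a
# hypothetical placeholder; a minimal trusted payload with no datasets looks
# like this.
def _example_serialized_quantum_direct() -> SerializedQuantum:
    """Build an empty SerializedQuantum from trusted primitives (sketch)."""
    return SerializedQuantum.direct(
        taskName="lsst.example.ExampleTask",  # hypothetical task name
        dataId=None,
        datasetTypeMapping={},
        initInputs={},
        inputs={},
        outputs={},
        dimensionRecords=None,
        datastoreRecords=None,
    )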


class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``'s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum.  If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum.  If not
        provided, ``taskName`` must be.  Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task.  May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    datastore_records : `DatastoreRecordData`, optional
        Datastore record data for input or initInput datasets that already
        exist.
    """

    __slots__ = (
        "_taskName",
        "_taskClass",
        "_dataId",
        "_initInputs",
        "_inputs",
        "_outputs",
        "_hash",
        "_datastore_records",
    )

    def __init__(
        self,
        *,
        taskName: str | None = None,
        taskClass: type | None = None,
        dataId: DataCoordinate | None = None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None = None,
        inputs: Mapping[DatasetType, list[DatasetRef]] | None = None,
        outputs: Mapping[DatasetType, list[DatasetRef]] | None = None,
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, list[DatasetRef]](inputs).freeze()
        self._outputs = NamedKeyDict[DatasetType, list[DatasetRef]](outputs).freeze()
        if datastore_records is None:
            datastore_records = {}
        self._datastore_records = datastore_records
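
    # Construction note (illustrative, not part of the original module):
    # passing ``taskClass`` derives ``taskName`` automatically, e.g.
    #
    #     q = Quantum(taskName="lsst.example.ExampleTask")  # hypothetical name
    #     q = Quantum(taskClass=SomeTask)  # taskName becomes "module.SomeTask"
    #
    # where ``SomeTask`` stands for any task class; the input and output
    # containers are frozen after construction.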

    def to_simple(self, accumulator: DimensionRecordsAccumulator | None = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is None, the default, dimension records
            are serialized with this Quantum. If an accumulator is supplied it
            is assumed something else is responsible for serializing the
            records, and they will not be stored with the SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same
        # object multiple times. The string name of the type is used to index
        # the mappings.
        for key, value in self._initInputs.items():
            # Add the type to the typeMapping.
            typeMapping[key.name] = key.to_simple()
            # Convert to a simple DatasetRef representation.
            simple = value.to_simple()
            # Extract the dimension records.
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # For each dimension record get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set the records to None to save space.
                simple.dataId.records = None
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # Collect the inputs.
        for key, values in self._inputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # Each input type maps to a list of refs; collect them all.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # The dataset type is the same as the key in _inputs, so there
                # is no need to serialize it out multiple times; set it to
                # None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # Each output type maps to a list of refs; collect them all.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # The dataset type is the same as the key in _outputs, so
                # there is no need to serialize it out multiple times; set it
                # to None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Mapping[int, SerializedDimensionRecord] | None
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        datastore_records: dict[str, SerializedDatastoreRecordData] | None = None
        if self.datastore_records is not None:
            datastore_records = {
                datastore_name: record_data.to_simple()
                for datastore_name, record_data in self.datastore_records.items()
            }

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
            datastoreRecords=datastore_records,
        )
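
    # Serialization note (illustrative, not part of the original module):
    # when many quanta are serialized together, a shared accumulator keeps
    # each DimensionRecord once, e.g.
    #
    #     accumulator = DimensionRecordsAccumulator()
    #     simplified = [q.to_simple(accumulator=accumulator) for q in quanta]
    #     records = accumulator.makeSerializedDimensionRecordMapping()
    #
    # where ``quanta`` is any iterable of Quantum instances; ``records`` must
    # then be persisted separately by the caller.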

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: dict[int, tuple[str, DimensionRecord]] | None = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.

        Returns
        -------
        quantum : `Quantum`
            The reconstructed `Quantum`.
        """
        loadedTypes: MutableMapping[str, DatasetType] = {}
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is None:
            reconstitutedDimensions = {}

        # Unpersist all the init inputs.
        for key, (value, dimensionIds) in simple.initInputs.items():
            # If a datasetType has already been created use that instead of
            # unpersisting.
            if (type_ := loadedTypes.get(key)) is None:
                type_ = loadedTypes.setdefault(
                    key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                )
            # Reconstruct the dimension records.
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, reconstitutedDimensions, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # Containers for the dataset refs.
        inputs: MutableMapping[DatasetType, list[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, list[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                # If a datasetType has already been created use that instead
                # of unpersisting.
                if (type_ := loadedTypes.get(key)) is None:
                    type_ = loadedTypes.setdefault(
                        key, DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                    )
                # Reconstruct the list of DatasetRefs for this DatasetType.
                tmp: list[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, reconstitutedDimensions, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )

        datastore_records: dict[str, DatastoreRecordData] | None = None
        if simple.datastoreRecords is not None:
            datastore_records = {
                datastore_name: DatastoreRecordData.from_simple(record_data)
                for datastore_name, record_data in simple.datastoreRecords.items()
            }

        return Quantum(
            taskName=simple.taskName,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
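
    # Deserialization note (illustrative, not part of the original module):
    # a shared ``reconstitutedDimensions`` dict lets successive from_simple
    # calls reuse records already unpersisted by earlier calls, e.g.
    #
    #     seen: dict[int, tuple[str, DimensionRecord]] = {}
    #     quanta = [Quantum.from_simple(s, universe, seen) for s in simplified]
    #
    # where ``universe`` is a configured DimensionUniverse and ``simplified``
    # is a list of SerializedQuantum instances produced by to_simple.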

    @property
    def taskClass(self) -> type | None:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> str | None:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> DataCoordinate | None:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, list[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, list[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this
        quantum.

        Has the same form as `inputs`.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    @property
    def datastore_records(self) -> Mapping[str, DatastoreRecordData]:
        """Tabular data stored with this quantum (`dict`).

        This attribute may be modified in place, but not assigned to.
        """
        return self._datastore_records

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> str | tuple[Any, ...]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
                self.datastore_records,
            ),
        )

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: str | None,
        taskClass: type | None,
        dataId: DataCoordinate | None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None,
        inputs: Mapping[DatasetType, list[DatasetRef]] | None,
        outputs: Mapping[DatasetType, list[DatasetRef]] | None,
        datastore_records: Mapping[str, DatastoreRecordData],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
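
    # The __reduce__/_reduceFactory pair above exists so that a Quantum
    # pickles cleanly despite using __slots__ and frozen containers. The
    # sketch below is illustrative and not part of the original module.


def _example_pickle_round_trip(quantum: Quantum) -> Quantum:
    """Round-trip a Quantum through pickle (illustrative sketch)."""
    import pickle

    # __reduce__ returns (_reduceFactory, args); pickle calls the factory
    # with those args to rebuild an equivalent Quantum.
    return pickle.loads(pickle.dumps(quantum))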


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record but be referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added.

        When a record is inserted for the first time it is assigned a unique
        integer key.

        This function returns the key associated with the record (either the
        newly allocated key, or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> dict[int, SerializedDimensionRecord]:
        return {id_: serializedRef for id_, serializedRef in self.mapping.values()}