Coverage for python/lsst/daf/butler/_quantum.py: 23%
206 statements
coverage.py v7.5.1, created at 2024-05-11 03:16 -0700
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

import sys
from collections.abc import Iterable, Mapping, MutableMapping, Sequence
from typing import Any

import pydantic
from lsst.utils import doImportType

from ._dataset_ref import DatasetRef, SerializedDatasetRef
from ._dataset_type import DatasetType, SerializedDatasetType
from ._named import NamedKeyDict, NamedKeyMapping
from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)


def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: DatasetType | None,
    ids: Iterable[int],
    dimensionRecords: dict[int, SerializedDimensionRecord] | None,
    universe: DimensionUniverse,
) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a SerializedQuantum."""
    # Reconstruct the dimension records: if a dimension record has been
    # loaded previously, use that; otherwise load it from the dict of
    # SerializedDimensionRecords.
    if dimensionRecords is None and ids:
        raise ValueError("Cannot construct from a SerializedQuantum with no dimension records.")
    records = {}
    for dId in ids:
        # Ignore typing because it does not know that the check above
        # guarantees dimensionRecords is not None inside this loop.
        tmpSerialized = dimensionRecords[dId]  # type: ignore
        records[tmpSerialized.definition] = tmpSerialized
    if simple.dataId is not None:
        simple.dataId.records = records or None
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    return rebuiltDatasetRef


class SerializedQuantum(pydantic.BaseModel):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str | None = None
    dataId: SerializedDataCoordinate | None = None
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, tuple[SerializedDatasetRef, list[int]]]
    inputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    outputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    dimensionRecords: dict[int, SerializedDimensionRecord] | None = None
    datastoreRecords: dict[str, SerializedDatastoreRecordData] | None = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str | None,
        dataId: dict | None,
        datasetTypeMapping: Mapping[str, dict],
        initInputs: Mapping[str, tuple[dict, list[int]]],
        inputs: Mapping[str, list[tuple[dict, list[int]]]],
        outputs: Mapping[str, list[tuple[dict, list[int]]]],
        dimensionRecords: dict[int, dict] | None,
        datastoreRecords: dict[str, dict] | None,
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        Parameters
        ----------
        taskName : `str` or `None`
            The name of the task.
        dataId : `dict` or `None`
            The dataId of the quantum.
        datasetTypeMapping : `~collections.abc.Mapping` [`str`, `dict`]
            Dataset type definitions.
        initInputs : `~collections.abc.Mapping`
            The quantum init inputs.
        inputs : `~collections.abc.Mapping`
            The quantum inputs.
        outputs : `~collections.abc.Mapping`
            The quantum outputs.
        dimensionRecords : `dict` [`int`, `dict`] or `None`
            The dimension records.
        datastoreRecords : `dict` [`str`, `dict`] or `None`
            The datastore records.

        Returns
        -------
        quantum : `SerializedQuantum`
            Serializable model of the quantum.

        Notes
        -----
        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        serialized_dataId = SerializedDataCoordinate.direct(**dataId) if dataId is not None else None
        serialized_datasetTypeMapping = {
            k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()
        }
        serialized_initInputs = {
            k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()
        }
        serialized_inputs = {
            k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()
        }
        serialized_outputs = {
            k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()
        }
        serialized_records = (
            {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()}
            if dimensionRecords is not None
            else None
        )
        serialized_datastore_records = (
            {k: SerializedDatastoreRecordData.direct(**v) for k, v in datastoreRecords.items()}
            if datastoreRecords is not None
            else None
        )

        node = cls.model_construct(
            taskName=sys.intern(taskName or ""),
            dataId=serialized_dataId,
            datasetTypeMapping=serialized_datasetTypeMapping,
            initInputs=serialized_initInputs,
            inputs=serialized_inputs,
            outputs=serialized_outputs,
            dimensionRecords=serialized_records,
            datastoreRecords=serialized_datastore_records,
        )

        return node


class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``'s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    datastore_records : `DatastoreRecordData`, optional
        Datastore record data for input or initInput datasets that already
        exist.
    """

    __slots__ = (
        "_taskName",
        "_taskClass",
        "_dataId",
        "_initInputs",
        "_inputs",
        "_outputs",
        "_datastore_records",
    )

    def __init__(
        self,
        *,
        taskName: str | None = None,
        taskClass: type | None = None,
        dataId: DataCoordinate | None = None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None = None,
        inputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        outputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in inputs.items()
        ).freeze()
        self._outputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in outputs.items()
        ).freeze()
        if datastore_records is None:
            datastore_records = {}
        self._datastore_records = datastore_records
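
    # Illustrative usage sketch (hypothetical names, not from this module):
    # constructing a Quantum by hand. ``ExampleTask``, ``data_id``, and
    # ``input_ref``/``output_ref`` stand in for objects that would normally
    # come from a pipeline graph builder and a Butler registry.
    #
    #   quantum = Quantum(
    #       taskClass=ExampleTask,  # or taskName="mypkg.tasks.ExampleTask"
    #       dataId=data_id,
    #       inputs={input_ref.datasetType: [input_ref]},
    #       outputs={output_ref.datasetType: [output_ref]},
    #   )
    #
    # ``taskClass`` overrides ``taskName`` when both are given, and the
    # input/output containers are frozen after construction.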

    def to_simple(self, accumulator: DimensionRecordsAccumulator | None = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records
            across multiple Quanta. If this is None, the default, dimension
            records are serialized with this Quantum. If an accumulator is
            supplied it is assumed something else is responsible for
            serializing the records, and they will not be stored with the
            SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same
        # object multiple times. The string name of the type is used to index
        # the mappings.
        for key, value in self._initInputs.items():
            # add the type to the typeMapping
            typeMapping[key.name] = key.to_simple()
            # convert to a simple DatasetRef representation
            simple = value.to_simple()
            # extract the dimension records
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # for each dimension record get an id by adding it to the
                # record accumulator.
                for element_name in value.dataId.dimensions.elements:
                    rec = value.dataId.records[element_name]
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set properties to None to save space
                simple.dataId.records = None
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # collect the inputs
        for key, values in self._inputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each input type there is a list of inputs; collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for element_name in e.dataId.dimensions.elements:
                        rec = e.dataId.records[element_name]
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # The dataset type is the same as the key in _inputs; no need
                # to serialize it out multiple times, so set it to None.
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef,
                # along with the list of all the keys for the dimension
                # records needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each output type there is a list of outputs; collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for element_name in e.dataId.dimensions.elements:
                        rec = e.dataId.records[element_name]
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # The dataset type is the same as the key in _outputs; no
                # need to serialize it out multiple times, so set it to None.
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef,
                # along with the list of all the keys for the dimension
                # records needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Mapping[int, SerializedDimensionRecord] | None
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        datastore_records: dict[str, SerializedDatastoreRecordData] | None = None
        if self.datastore_records is not None:
            datastore_records = {
                datastore_name: record_data.to_simple()
                for datastore_name, record_data in self.datastore_records.items()
            }

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
            datastoreRecords=datastore_records,
        )
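
    # Illustrative usage sketch (hypothetical names, not from this module):
    # serializing many quanta with a shared accumulator so the heavily
    # repeated dimension records are written only once. ``quanta`` stands in
    # for an iterable of Quantum instances.
    #
    #   accumulator = DimensionRecordsAccumulator()
    #   serialized = [q.to_simple(accumulator=accumulator) for q in quanta]
    #   # Because an accumulator was supplied, each SerializedQuantum has
    #   # dimensionRecords=None; the caller must persist the shared mapping:
    #   shared_records = accumulator.makeSerializedDimensionRecordMapping()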

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        """
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}

        # Unpersist all the init inputs
        for key, (value, dimensionIds) in simple.initInputs.items():
            type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
            # reconstruct the dimension records
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # containers for the dataset refs
        inputs: MutableMapping[DatasetType, list[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, list[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                # reconstruct the list of DatasetRefs for this DatasetType
                tmp: list[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )

        datastore_records: dict[str, DatastoreRecordData] | None = None
        if simple.datastoreRecords is not None:
            datastore_records = {
                datastore_name: DatastoreRecordData.from_simple(record_data)
                for datastore_name, record_data in simple.datastoreRecords.items()
            }

        quant = Quantum(
            taskName=simple.taskName,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
        return quant
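
    # Illustrative usage sketch (hypothetical names, not from this module):
    # round trip through the simple form. ``universe`` stands in for a
    # DimensionUniverse (e.g. from a Butler registry); with no accumulator,
    # to_simple embeds the dimension records that from_simple needs.
    #
    #   simple = quantum.to_simple()
    #   restored = Quantum.from_simple(simple, universe)
    #   assert restored == quantum  # compares task, dataId, and datasets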

    @property
    def taskClass(self) -> type | None:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> str | None:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> DataCoordinate | None:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container
        because `DatasetRef` instances cannot be compared reliably when some
        have integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this
        quantum.

        Has the same form as ``inputs``.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container
        because `DatasetRef` instances cannot be compared reliably when some
        have integer IDs and others do not.
        """
        return self._outputs

    @property
    def datastore_records(self) -> Mapping[str, DatastoreRecordData]:
        """Tabular data stored with this quantum (`dict`).

        This attribute may be modified in place, but not assigned to.
        """
        return self._datastore_records

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> str | tuple[Any, ...]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
                self.datastore_records,
            ),
        )

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: str | None,
        taskClass: type | None,
        dataId: DataCoordinate | None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None,
        inputs: Mapping[DatasetType, list[DatasetRef]] | None,
        outputs: Mapping[DatasetType, list[DatasetRef]] | None,
        datastore_records: Mapping[str, DatastoreRecordData],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-incrementing key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record but be referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added. When a record is inserted for the first time it is
        assigned a unique integer key.

        This function returns the key associated with the record (either the
        newly allocated key, or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> dict[int, SerializedDimensionRecord]:
        return dict(self.mapping.values())
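

# Illustrative usage sketch (hypothetical names, not from this module): the
# accumulator deduplicates records, so adding the same DimensionRecord twice
# returns the same key. ``record`` stands in for a DimensionRecord obtained
# from a data ID.
#
#   acc = DimensionRecordsAccumulator()
#   first = acc.addRecord(record)
#   second = acc.addRecord(record)
#   assert first == second
#   # Maps each allocated key to its SerializedDimensionRecord:
#   mapping = acc.makeSerializedDimensionRecordMapping()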