Coverage for python/lsst/daf/butler/core/quantum.py: 23% of 206 statements (coverage.py v7.2.7, created at 2023-08-12 09:20 +0000)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

import sys
import warnings
from collections.abc import Iterable, Mapping, MutableMapping, Sequence
from typing import Any

from lsst.daf.butler._compat import _BaseModelCompat
from lsst.utils import doImportType
from lsst.utils.introspection import find_outside_stacklevel

from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType
from .datastoreRecordData import DatastoreRecordData, SerializedDatastoreRecordData
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)
from .named import NamedKeyDict, NamedKeyMapping

def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: DatasetType | None,
    ids: Iterable[int],
    dimensionRecords: dict[int, SerializedDimensionRecord] | None,
    universe: DimensionUniverse,
) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a Serialized Quantum."""
    # Reconstruct the dimension records: if a dimension record has been loaded
    # previously use that, otherwise load it from the dict of
    # SerializedDimensionRecords.
    if dimensionRecords is None and ids:
        raise ValueError("Cannot construct from a SerializedQuantum with no dimension records.")
    records = {}
    for dId in ids:
        # Ignore typing because it is missing that the above if statement
        # ensures that if there is a loop then dimensionRecords is not None.
        tmpSerialized = dimensionRecords[dId]  # type: ignore
        records[tmpSerialized.definition] = tmpSerialized
    if simple.dataId is not None:
        simple.dataId.records = records or None
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    return rebuiltDatasetRef

class SerializedQuantum(_BaseModelCompat):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str | None = None
    dataId: SerializedDataCoordinate | None = None
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, tuple[SerializedDatasetRef, list[int]]]
    inputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    outputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    dimensionRecords: dict[int, SerializedDimensionRecord] | None = None
    datastoreRecords: dict[str, SerializedDatastoreRecordData] | None = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str | None,
        dataId: dict | None,
        datasetTypeMapping: Mapping[str, dict],
        initInputs: Mapping[str, tuple[dict, list[int]]],
        inputs: Mapping[str, list[tuple[dict, list[int]]]],
        outputs: Mapping[str, list[tuple[dict, list[int]]]],
        dimensionRecords: dict[int, dict] | None,
        datastoreRecords: dict[str, dict] | None,
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        serialized_dataId = SerializedDataCoordinate.direct(**dataId) if dataId is not None else None
        serialized_datasetTypeMapping = {
            k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()
        }
        serialized_initInputs = {
            k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()
        }
        serialized_inputs = {
            k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()
        }
        serialized_outputs = {
            k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()
        }
        serialized_records = (
            {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()}
            if dimensionRecords is not None
            else None
        )
        serialized_datastore_records = (
            {k: SerializedDatastoreRecordData.direct(**v) for k, v in datastoreRecords.items()}
            if datastoreRecords is not None
            else None
        )

        node = cls.model_construct(
            taskName=sys.intern(taskName or ""),
            dataId=serialized_dataId,
            datasetTypeMapping=serialized_datasetTypeMapping,
            initInputs=serialized_initInputs,
            inputs=serialized_inputs,
            outputs=serialized_outputs,
            dimensionRecords=serialized_records,
            datastoreRecords=serialized_datastore_records,
        )

        return node

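# Example (hypothetical sketch, not executed on import): ``direct`` bypasses
# pydantic validation, so it is only appropriate for payloads that this code
# wrote earlier via ``to_simple``; untrusted input should go through normal
# validated construction instead (assuming the pydantic-style entry points
# exposed by ``_BaseModelCompat``), e.g.:
#
#     import json
#
#     payload = json.loads(blob)                      # blob written by us
#     trusted = SerializedQuantum.direct(**payload)   # fast, no validation
#     checked = SerializedQuantum.parse_obj(payload)  # validated alternative
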
class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``’s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    datastore_records : `DatastoreRecordData`, optional
        Datastore record data for input or initInput datasets that already
        exist.
    """

    __slots__ = (
        "_taskName",
        "_taskClass",
        "_dataId",
        "_initInputs",
        "_inputs",
        "_outputs",
        "_datastore_records",
    )

    def __init__(
        self,
        *,
        taskName: str | None = None,
        taskClass: type | None = None,
        dataId: DataCoordinate | None = None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None = None,
        inputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        outputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in inputs.items()
        ).freeze()
        self._outputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in outputs.items()
        ).freeze()
        if datastore_records is None:
            datastore_records = {}
        self._datastore_records = datastore_records

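    # Example (hypothetical sketch): quanta are normally assembled by the
    # quantum-graph generation code in ``pipe_base``, but the constructor can
    # be used directly. The dataset types, refs, and data ID below are
    # placeholders, not real registry contents:
    #
    #     quantum = Quantum(
    #         taskName="lsst.example.MyTask",
    #         dataId=my_data_id,                      # a DataCoordinate
    #         initInputs=[schema_ref],                # flat iterable is fine
    #         inputs={calexp_type: [calexp_ref]},
    #         outputs={measurement_type: [meas_ref]},
    #     )
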
    def to_simple(self, accumulator: DimensionRecordsAccumulator | None = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is None, the default, dimension records
            are serialized with this Quantum. If an accumulator is supplied it
            is assumed something else is responsible for serializing the
            records, and they will not be stored with the SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same
        # object multiple times. The string name of the type is used to index
        # the mappings.
        for key, value in self._initInputs.items():
            # add the type to the typeMapping
            typeMapping[key.name] = key.to_simple()
            # convert to a simple DatasetRef representation
            simple = value.to_simple()
            # extract the dimension records
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # for each dimension record get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set properties to None to save space
                simple.dataId.records = None
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # collect the inputs
        for key, values in self._inputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each input type there is a list of inputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # The dataset type is the same as the key in _inputs, no need
                # to serialize it out multiple times, so set it to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each output type there is a list of outputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # The dataset type is the same as the key in _outputs, no need
                # to serialize it out multiple times, so set it to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Mapping[int, SerializedDimensionRecord] | None
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        datastore_records: dict[str, SerializedDatastoreRecordData] | None = None
        if self.datastore_records is not None:
            datastore_records = {
                datastore_name: record_data.to_simple()
                for datastore_name, record_data in self.datastore_records.items()
            }

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
            datastoreRecords=datastore_records,
        )

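    # Example (hypothetical sketch): when many quanta are serialized together,
    # a shared DimensionRecordsAccumulator keeps one copy of each dimension
    # record instead of repeating it inside every SerializedQuantum:
    #
    #     accumulator = DimensionRecordsAccumulator()
    #     serialized = [q.to_simple(accumulator=accumulator) for q in quanta]
    #     shared_records = accumulator.makeSerializedDimensionRecordMapping()
    #     # shared_records must then be persisted alongside the serialized
    #     # quanta by whatever container owns them (e.g. a quantum graph).
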
    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: dict[int, tuple[str, DimensionRecord]] | None = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : SerializedQuantum
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.
            Deprecated; any argument will be ignored. Will be removed after
            v26.
        """
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is not None:
            # TODO: remove this argument on DM-40150.
            warnings.warn(
                "The reconstitutedDimensions argument is now ignored and may be removed after v26",
                category=FutureWarning,
                stacklevel=find_outside_stacklevel("lsst.daf.butler"),
            )

        # Unpersist all the init inputs
        for key, (value, dimensionIds) in simple.initInputs.items():
            type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
            # reconstruct the dimension records
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # containers for the dataset refs
        inputs: MutableMapping[DatasetType, list[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, list[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                # reconstruct the list of DatasetRefs for this DatasetType
                tmp: list[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )

        datastore_records: dict[str, DatastoreRecordData] | None = None
        if simple.datastoreRecords is not None:
            datastore_records = {
                datastore_name: DatastoreRecordData.from_simple(record_data)
                for datastore_name, record_data in simple.datastoreRecords.items()
            }

        quant = Quantum(
            taskName=simple.taskName,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
        return quant

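    # Example (hypothetical sketch, assuming the pydantic v1-style ``json``
    # and ``parse_raw`` helpers provided through ``_BaseModelCompat``): a
    # single quantum can be round-tripped without an external accumulator,
    # because the dimension records are then embedded in the model itself:
    #
    #     blob = quantum.to_simple().json()
    #     restored = Quantum.from_simple(
    #         SerializedQuantum.parse_raw(blob), universe=butler.dimensions
    #     )
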
    @property
    def taskClass(self) -> type | None:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> str | None:
        """Return fully-qualified name of the task associated with `Quantum`.

        (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> DataCoordinate | None:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this quantum.

        Has the same form as ``predictedInputs``.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    @property
    def datastore_records(self) -> Mapping[str, DatastoreRecordData]:
        """Tabular data stored with this quantum (`dict`).

        This attribute may be modified in place, but not assigned to.
        """
        return self._datastore_records

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> str | tuple[Any, ...]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
                self.datastore_records,
            ),
        )

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: str | None,
        taskClass: type | None,
        dataId: DataCoordinate | None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None,
        inputs: Mapping[DatasetType, list[DatasetRef]] | None,
        outputs: Mapping[DatasetType, list[DatasetRef]] | None,
        datastore_records: Mapping[str, DatastoreRecordData],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )

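# Example (hypothetical sketch): ``__reduce__`` and ``_reduceFactory`` let
# quanta survive pickling, e.g. when a quantum graph is shipped to worker
# processes; equality after the round trip compares the task, data ID, and
# the input/output mappings:
#
#     import pickle
#
#     assert pickle.loads(pickle.dumps(quantum)) == quantum
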
class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record but be referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added. When a record is inserted for the first time it is
        assigned a unique integer key.

        This function returns the key associated with the record (either the
        newly allocated key, or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : int
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> dict[int, SerializedDimensionRecord]:
        return dict(self.mapping.values())

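# Example (hypothetical sketch): ``addRecord`` deduplicates, so adding the
# same DimensionRecord twice returns the same key and the final mapping
# contains a single serialized copy:
#
#     accumulator = DimensionRecordsAccumulator()
#     key1 = accumulator.addRecord(record)
#     key2 = accumulator.addRecord(record)   # same record, same key
#     assert key1 == key2
#     assert len(accumulator.makeSerializedDimensionRecordMapping()) == 1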