Coverage for python/lsst/daf/butler/_quantum.py: 23%
210 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

import sys
import warnings
from collections.abc import Iterable, Mapping, MutableMapping, Sequence
from typing import Any

from lsst.daf.butler._compat import _BaseModelCompat
from lsst.utils import doImportType
from lsst.utils.introspection import find_outside_stacklevel

from ._dataset_ref import DatasetRef, SerializedDatasetRef
from ._dataset_type import DatasetType, SerializedDatasetType
from ._named import NamedKeyDict, NamedKeyMapping
from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)


def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: DatasetType | None,
    ids: Iterable[int],
    dimensionRecords: dict[int, SerializedDimensionRecord] | None,
    universe: DimensionUniverse,
) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a `SerializedQuantum`."""
    # Reconstruct the dimension records:
    # if a dimension record has been loaded previously, use that;
    # otherwise load it from the dict of SerializedDimensionRecords.
    if dimensionRecords is None and ids:
        raise ValueError("Cannot construct from a SerializedQuantum with no dimension records.")
    records = {}
    for dId in ids:
        # Ignore typing because it misses that the above if statement
        # ensures dimensionRecords is not None whenever this loop runs.
        tmpSerialized = dimensionRecords[dId]  # type: ignore
        records[tmpSerialized.definition] = tmpSerialized
    if simple.dataId is not None:
        simple.dataId.records = records or None
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    return rebuiltDatasetRef


class SerializedQuantum(_BaseModelCompat):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str | None = None
    dataId: SerializedDataCoordinate | None = None
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, tuple[SerializedDatasetRef, list[int]]]
    inputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    outputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    dimensionRecords: dict[int, SerializedDimensionRecord] | None = None
    datastoreRecords: dict[str, SerializedDatastoreRecordData] | None = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str | None,
        dataId: dict | None,
        datasetTypeMapping: Mapping[str, dict],
        initInputs: Mapping[str, tuple[dict, list[int]]],
        inputs: Mapping[str, list[tuple[dict, list[int]]]],
        outputs: Mapping[str, list[tuple[dict, list[int]]]],
        dimensionRecords: dict[int, dict] | None,
        datastoreRecords: dict[str, dict] | None,
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
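
        Examples
        --------
        A minimal sketch with empty containers; the task name here is purely
        illustrative:

        >>> sq = SerializedQuantum.direct(
        ...     taskName="module.ExampleTask",
        ...     dataId=None,
        ...     datasetTypeMapping={},
        ...     initInputs={},
        ...     inputs={},
        ...     outputs={},
        ...     dimensionRecords=None,
        ...     datastoreRecords=None,
        ... )
        >>> sq.taskName
        'module.ExampleTask'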
111 """
112 serialized_dataId = SerializedDataCoordinate.direct(**dataId) if dataId is not None else None
113 serialized_datasetTypeMapping = {
114 k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()
115 }
116 serialized_initInputs = {
117 k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()
118 }
119 serialized_inputs = {
120 k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()
121 }
122 serialized_outputs = {
123 k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()
124 }
125 serialized_records = (
126 {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()}
127 if dimensionRecords is not None
128 else None
129 )
130 serialized_datastore_records = (
131 {k: SerializedDatastoreRecordData.direct(**v) for k, v in datastoreRecords.items()}
132 if datastoreRecords is not None
133 else None
134 )
136 node = cls.model_construct(
137 taskName=sys.intern(taskName or ""),
138 dataId=serialized_dataId,
139 datasetTypeMapping=serialized_datasetTypeMapping,
140 initInputs=serialized_initInputs,
141 inputs=serialized_inputs,
142 outputs=serialized_outputs,
143 dimensionRecords=serialized_records,
144 datastoreRecords=serialized_datastore_records,
145 )
147 return node


class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``’s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    datastore_records : `DatastoreRecordData`, optional
        Datastore record data for input or initInput datasets that already
        exist.
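
    Examples
    --------
    A minimal sketch; the task name is purely illustrative and no inputs or
    outputs are attached:

    >>> quantum = Quantum(taskName="module.ExampleTask")
    >>> quantum.taskName
    'module.ExampleTask'
    >>> dict(quantum.inputs)
    {}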
185 """
187 __slots__ = (
188 "_taskName",
189 "_taskClass",
190 "_dataId",
191 "_initInputs",
192 "_inputs",
193 "_outputs",
194 "_datastore_records",
195 )

    def __init__(
        self,
        *,
        taskName: str | None = None,
        taskClass: type | None = None,
        dataId: DataCoordinate | None = None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None = None,
        inputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        outputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in inputs.items()
        ).freeze()
        self._outputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in outputs.items()
        ).freeze()
        if datastore_records is None:
            datastore_records = {}
        self._datastore_records = datastore_records

    def to_simple(self, accumulator: DimensionRecordsAccumulator | None = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is None, the default, dimension records
            are serialized with this Quantum. If an accumulator is supplied it
            is assumed something else is responsible for serializing the
            records, and they will not be stored with the SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
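
        Examples
        --------
        A sketch of sharing one accumulator across several quanta so that each
        dimension record is serialized only once; ``quanta`` is assumed to be
        an iterable of `Quantum` instances whose records are serialized
        elsewhere:

        >>> accumulator = DimensionRecordsAccumulator()
        >>> serialized = [q.to_simple(accumulator=accumulator) for q in quanta]
        >>> all(s.dimensionRecords is None for s in serialized)
        True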
250 """
251 typeMapping = {}
252 initInputs = {}
254 if accumulator is None:
255 accumulator = DimensionRecordsAccumulator()
256 writeDimensionRecords = True
257 else:
258 writeDimensionRecords = False
260 # collect the init inputs for serialization, recording the types into
261 # their own mapping, used throughout to minimize saving the same object
262 # multiple times. String name of the type used to index mappings.
263 for key, value in self._initInputs.items():
264 # add the type to the typeMapping
265 typeMapping[key.name] = key.to_simple()
266 # convert to a simple DatasetRef representation
267 simple = value.to_simple()
268 # extract the dimension records
269 recIds = []
270 if simple.dataId is not None and simple.dataId.records is not None:
271 # for each dimension record get a id by adding it to the
272 # record accumulator.
273 for element_name in value.dataId.dimensions.elements:
274 rec = value.dataId.records[element_name]
275 if rec is not None:
276 recordId = accumulator.addRecord(rec)
277 recIds.append(recordId)
278 # Set properties to None to save space
279 simple.dataId.records = None
280 simple.datasetType = None
281 initInputs[key.name] = (simple, recIds)

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # collect the inputs
        for key, values in self._inputs.items():
            # collect type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each input type there is a list of inputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for element_name in e.dataId.dimensions.elements:
                        rec = e.dataId.records[element_name]
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # Dataset type is the same as the key in _inputs, no need
                # to serialize it out multiple times, set it to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # collect type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each output type there is a list of outputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for element_name in e.dataId.dimensions.elements:
                        rec = e.dataId.records[element_name]
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # Dataset type is the same as the key in _outputs, no need
                # to serialize it out multiple times, set it to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Mapping[int, SerializedDimensionRecord] | None
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        datastore_records: dict[str, SerializedDatastoreRecordData] | None = None
        if self.datastore_records is not None:
            datastore_records = {
                datastore_name: record_data.to_simple()
                for datastore_name, record_data in self.datastore_records.items()
            }

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
            datastoreRecords=datastore_records,
        )

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: dict[int, tuple[str, DimensionRecord]] | None = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.
            Deprecated: any argument will be ignored. Will be removed after
            v26.
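
        Examples
        --------
        A round-trip sketch, assuming an existing `Quantum` instance
        ``quantum`` and a `DimensionUniverse` ``universe``:

        >>> simple = quantum.to_simple()
        >>> restored = Quantum.from_simple(simple, universe)
        >>> restored == quantum
        True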
407 """
408 initInputs: MutableMapping[DatasetType, DatasetRef] = {}
409 if reconstitutedDimensions is not None:
410 # TODO: remove this argument on DM-40150.
411 warnings.warn(
412 "The reconstitutedDimensions argument is now ignored and may be removed after v26",
413 category=FutureWarning,
414 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
415 )
417 # Unpersist all the init inputs
418 for key, (value, dimensionIds) in simple.initInputs.items():
419 type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
420 # reconstruct the dimension records
421 rebuiltDatasetRef = _reconstructDatasetRef(
422 value, type_, dimensionIds, simple.dimensionRecords, universe
423 )
424 initInputs[type_] = rebuiltDatasetRef
426 # containers for the dataset refs
427 inputs: MutableMapping[DatasetType, list[DatasetRef]] = {}
428 outputs: MutableMapping[DatasetType, list[DatasetRef]] = {}
430 for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
431 for key, values in simpleRefs.items():
432 type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
433 # reconstruct the list of DatasetRefs for this DatasetType
434 tmp: list[DatasetRef] = []
435 for v, recIds in values:
436 rebuiltDatasetRef = _reconstructDatasetRef(
437 v, type_, recIds, simple.dimensionRecords, universe
438 )
439 tmp.append(rebuiltDatasetRef)
440 container[type_] = tmp
442 dataId = (
443 DataCoordinate.from_simple(simple.dataId, universe=universe)
444 if simple.dataId is not None
445 else None
446 )
448 datastore_records: dict[str, DatastoreRecordData] | None = None
449 if simple.datastoreRecords is not None:
450 datastore_records = {
451 datastore_name: DatastoreRecordData.from_simple(record_data)
452 for datastore_name, record_data in simple.datastoreRecords.items()
453 }
455 quant = Quantum(
456 taskName=simple.taskName,
457 dataId=dataId,
458 initInputs=initInputs,
459 inputs=inputs,
460 outputs=outputs,
461 datastore_records=datastore_records,
462 )
463 return quant

    @property
    def taskClass(self) -> type | None:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> str | None:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> DataCoordinate | None:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this
        quantum.

        Has the same form as ``predictedInputs``.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    @property
    def datastore_records(self) -> Mapping[str, DatastoreRecordData]:
        """Tabular data stored with this quantum (`dict`).

        This attribute may be modified in place, but not assigned to.
        """
        return self._datastore_records

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> str | tuple[Any, ...]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
                self.datastore_records,
            ),
        )

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: str | None,
        taskClass: type | None,
        dataId: DataCoordinate | None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None,
        inputs: Mapping[DatasetType, list[DatasetRef]] | None,
        outputs: Mapping[DatasetType, list[DatasetRef]] | None,
        datastore_records: Mapping[str, DatastoreRecordData],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record but be referred to multiple times.
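
    Examples
    --------
    A sketch of the deduplication behavior, assuming an existing
    `DimensionRecord` instance ``record``:

    >>> accumulator = DimensionRecordsAccumulator()
    >>> first = accumulator.addRecord(record)
    >>> second = accumulator.addRecord(record)  # same record, same key
    >>> first == second
    True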
589 """
591 def __init__(self) -> None:
592 self._counter = 0
593 self.mapping: MutableMapping[DimensionRecord, tuple[int, SerializedDimensionRecord]] = {}
595 def addRecord(self, record: DimensionRecord) -> int:
596 """Add a dimension record to the accumulator if it has not already been
597 added. When a record is inserted for the first time it is assigned
598 a unique integer key.
600 This function returns the key associated with the record (either the
601 newly allocated key, or the existing one)
603 Parameters
604 ----------
605 record : `DimensionRecord`
606 The record to add to the accumulator
608 Returns
609 -------
610 accumulatorKey : int
611 The key that is associated with the supplied record
612 """
613 if (mappingValue := self.mapping.get(record)) is None:
614 simple = record.to_simple()
615 mappingValue = (self._counter, simple)
616 self._counter += 1
617 self.mapping[record] = mappingValue
618 return mappingValue[0]
620 def makeSerializedDimensionRecordMapping(self) -> dict[int, SerializedDimensionRecord]:
621 return dict(self.mapping.values())