Coverage for python/lsst/daf/butler/core/quantum.py: 23%
206 statements
coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

import sys
import warnings
from collections.abc import Iterable, Mapping, MutableMapping, Sequence
from typing import Any

from lsst.daf.butler._compat import _BaseModelCompat
from lsst.utils import doImportType
from lsst.utils.introspection import find_outside_stacklevel

from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType
from .datastoreRecordData import DatastoreRecordData, SerializedDatastoreRecordData
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)
from .named import NamedKeyDict, NamedKeyMapping


def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: DatasetType | None,
    ids: Iterable[int],
    dimensionRecords: dict[int, SerializedDimensionRecord] | None,
    universe: DimensionUniverse,
) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a SerializedQuantum."""
    # Reconstruct the dimension records: if a dimension record has been
    # loaded previously, use that; otherwise load it from the dict of
    # SerializedDimensionRecords.
    if dimensionRecords is None and ids:
        raise ValueError("Cannot construct from a SerializedQuantum with no dimension records.")
    records = {}
    for dId in ids:
        # Ignore typing because the type checker cannot see that the check
        # above guarantees dimensionRecords is not None whenever this loop
        # runs.
        tmpSerialized = dimensionRecords[dId]  # type: ignore
        records[tmpSerialized.definition] = tmpSerialized
    if simple.dataId is not None:
        simple.dataId.records = records or None
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    return rebuiltDatasetRef
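

# A minimal sketch of how the helper above is typically driven, assuming
# ``simple`` is a SerializedQuantum whose dimension records were serialized
# inline; ``Quantum.from_simple`` below is the real caller, and the name
# ``_rebuild_init_inputs`` is illustrative only, not part of this module.
def _rebuild_init_inputs(
    simple: SerializedQuantum, universe: DimensionUniverse
) -> dict[DatasetType, DatasetRef]:
    rebuilt = {}
    for key, (ref, recordIds) in simple.initInputs.items():
        # Look up the full DatasetType from the shared per-quantum mapping,
        # then reattach the dimension records referenced by integer id.
        type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
        rebuilt[type_] = _reconstructDatasetRef(
            ref, type_, recordIds, simple.dimensionRecords, universe
        )
    return rebuilt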


class SerializedQuantum(_BaseModelCompat):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str | None = None
    dataId: SerializedDataCoordinate | None = None
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, tuple[SerializedDatasetRef, list[int]]]
    inputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    outputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    dimensionRecords: dict[int, SerializedDimensionRecord] | None = None
    datastoreRecords: dict[str, SerializedDatastoreRecordData] | None = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str | None,
        dataId: dict | None,
        datasetTypeMapping: Mapping[str, dict],
        initInputs: Mapping[str, tuple[dict, list[int]]],
        inputs: Mapping[str, list[tuple[dict, list[int]]]],
        outputs: Mapping[str, list[tuple[dict, list[int]]]],
        dimensionRecords: dict[int, dict] | None,
        datastoreRecords: dict[str, dict] | None,
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the arguments
        are explicitly what the model requires, and it will recurse through
        members, constructing them from their corresponding `direct` methods.

        This method should only be called when the inputs are trusted.
        """
        serialized_dataId = SerializedDataCoordinate.direct(**dataId) if dataId is not None else None
        serialized_datasetTypeMapping = {
            k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()
        }
        serialized_initInputs = {
            k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()
        }
        serialized_inputs = {
            k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()
        }
        serialized_outputs = {
            k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()
        }
        serialized_records = (
            {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()}
            if dimensionRecords is not None
            else None
        )
        serialized_datastore_records = (
            {k: SerializedDatastoreRecordData.direct(**v) for k, v in datastoreRecords.items()}
            if datastoreRecords is not None
            else None
        )

        node = cls.model_construct(
            taskName=sys.intern(taskName or ""),
            dataId=serialized_dataId,
            datasetTypeMapping=serialized_datasetTypeMapping,
            initInputs=serialized_initInputs,
            inputs=serialized_inputs,
            outputs=serialized_outputs,
            dimensionRecords=serialized_records,
            datastoreRecords=serialized_datastore_records,
        )

        return node
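

# A minimal sketch of the intended use of ``direct``, assuming ``serialized``
# is a dict previously produced by dumping a trusted SerializedQuantum (for
# example with pydantic's ``dict()``/``model_dump()``); the helper name
# ``_rebuild_trusted`` is illustrative only, not part of this module.
def _rebuild_trusted(serialized: dict) -> SerializedQuantum:
    # ``direct`` skips pydantic validation entirely, so it should only be
    # fed data that originally came from a SerializedQuantum.
    return SerializedQuantum.direct(
        taskName=serialized["taskName"],
        dataId=serialized["dataId"],
        datasetTypeMapping=serialized["datasetTypeMapping"],
        initInputs=serialized["initInputs"],
        inputs=serialized["inputs"],
        outputs=serialized["outputs"],
        dimensionRecords=serialized["dimensionRecords"],
        datastoreRecords=serialized["datastoreRecords"],
    )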


class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``’s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    datastore_records : `~collections.abc.Mapping`, optional
        Mapping from datastore name to `DatastoreRecordData` for input or
        initInput datasets that already exist.
    """

    __slots__ = (
        "_taskName",
        "_taskClass",
        "_dataId",
        "_initInputs",
        "_inputs",
        "_outputs",
        "_datastore_records",
    )

    def __init__(
        self,
        *,
        taskName: str | None = None,
        taskClass: type | None = None,
        dataId: DataCoordinate | None = None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None = None,
        inputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        outputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in inputs.items()
        ).freeze()
        self._outputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in outputs.items()
        ).freeze()
        if datastore_records is None:
            datastore_records = {}
        self._datastore_records = datastore_records

    def to_simple(self, accumulator: DimensionRecordsAccumulator | None = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is None, the default, dimension records
            are serialized with this Quantum. If an accumulator is supplied it
            is assumed something else is responsible for serializing the
            records, and they will not be stored with the SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same
        # object multiple times. The string name of the type is used to index
        # the mappings.
        for key, value in self._initInputs.items():
            # add the type to the typeMapping
            typeMapping[key.name] = key.to_simple()
            # convert to a simple DatasetRef representation
            simple = value.to_simple()
            # extract the dimension records
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # for each dimension record get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set properties to None to save space
                simple.dataId.records = None
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # collect the inputs
        for key, values in self._inputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each input type there is a list of inputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # The dataset type is the same as the key in _inputs, so
                # there is no need to serialize it out multiple times; set it
                # to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each output type there is a list of outputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # The dataset type is the same as the key in _outputs, so
                # there is no need to serialize it out multiple times; set it
                # to None
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Mapping[int, SerializedDimensionRecord] | None
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        datastore_records: dict[str, SerializedDatastoreRecordData] | None = None
        if self.datastore_records is not None:
            datastore_records = {
                datastore_name: record_data.to_simple()
                for datastore_name, record_data in self.datastore_records.items()
            }

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
            datastoreRecords=datastore_records,
        )

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: dict[int, tuple[str, DimensionRecord]] | None = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.
            Deprecated: any argument will be ignored. Will be removed after
            v26.
        """
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is not None:
            # TODO: remove this argument on DM-40150.
            warnings.warn(
                "The reconstitutedDimensions argument is now ignored and may be removed after v26",
                category=FutureWarning,
                stacklevel=find_outside_stacklevel("lsst.daf.butler"),
            )

        # Unpersist all the init inputs
        for key, (value, dimensionIds) in simple.initInputs.items():
            type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
            # reconstruct the dimension records
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # containers for the dataset refs
        inputs: MutableMapping[DatasetType, list[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, list[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                # reconstruct the list of DatasetRefs for this DatasetType
                tmp: list[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )

        datastore_records: dict[str, DatastoreRecordData] | None = None
        if simple.datastoreRecords is not None:
            datastore_records = {
                datastore_name: DatastoreRecordData.from_simple(record_data)
                for datastore_name, record_data in simple.datastoreRecords.items()
            }

        quant = Quantum(
            taskName=simple.taskName,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
        return quant

    @property
    def taskClass(self) -> type | None:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> str | None:
        """Return fully-qualified name of the task associated with `Quantum`.

        (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> DataCoordinate | None:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this quantum.

        Has the same form as ``predictedInputs``.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    @property
    def datastore_records(self) -> Mapping[str, DatastoreRecordData]:
        """Tabular data stored with this quantum (`dict`).

        This attribute may be modified in place, but not assigned to.
        """
        return self._datastore_records

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> str | tuple[Any, ...]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
                self.datastore_records,
            ),
        )

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: str | None,
        taskClass: type | None,
        dataId: DataCoordinate | None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None,
        inputs: Mapping[DatasetType, list[DatasetRef]] | None,
        outputs: Mapping[DatasetType, list[DatasetRef]] | None,
        datastore_records: Mapping[str, DatastoreRecordData],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
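

# A minimal sketch of the serialization round trip described in ``to_simple``
# and ``from_simple`` above, assuming the caller already has a
# ``DimensionUniverse`` and concrete ``DatasetRef`` mappings (constructing
# those requires a butler registry and is out of scope here); the name
# ``_round_trip`` is illustrative only, not part of this module.
def _round_trip(
    taskName: str,
    inputs: Mapping[DatasetType, Sequence[DatasetRef]],
    outputs: Mapping[DatasetType, Sequence[DatasetRef]],
    universe: DimensionUniverse,
) -> Quantum:
    quantum = Quantum(taskName=taskName, inputs=inputs, outputs=outputs)
    # With no accumulator supplied, dimension records are embedded in the
    # SerializedQuantum itself, so it can be reconstructed standalone.
    simple = quantum.to_simple()
    return Quantum.from_simple(simple, universe)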


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record but be referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added. When a record is inserted for the first time it is
        assigned a unique integer key.

        This function returns the key associated with the record (either the
        newly allocated key, or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> dict[int, SerializedDimensionRecord]:
        return dict(self.mapping.values())
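

# A minimal sketch of sharing one accumulator across several quanta, as the
# class docstring above describes; ``_serialize_many`` is an illustrative
# name, and the caller is assumed to persist the returned record mapping
# alongside the serialized quanta so the records can be restored later.
def _serialize_many(
    quanta: Iterable[Quantum],
) -> tuple[list[SerializedQuantum], dict[int, SerializedDimensionRecord]]:
    accumulator = DimensionRecordsAccumulator()
    # Each quantum stores only integer keys for its dimension records; the
    # records themselves are written once, in the mapping returned below.
    serialized = [quantum.to_simple(accumulator=accumulator) for quantum in quanta]
    return serialized, accumulator.makeSerializedDimensionRecordMapping()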