# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

import sys
import warnings
from collections.abc import Iterable, Mapping, MutableMapping, Sequence
from typing import Any

from lsst.utils import doImportType
from lsst.utils.introspection import find_outside_stacklevel

try:
    from pydantic.v1 import BaseModel
except ModuleNotFoundError:
    from pydantic import BaseModel  # type: ignore

from .datasets import DatasetRef, DatasetType, SerializedDatasetRef, SerializedDatasetType
from .datastoreRecordData import DatastoreRecordData, SerializedDatastoreRecordData
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)
from .named import NamedKeyDict, NamedKeyMapping


def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: DatasetType | None,
    ids: Iterable[int],
    dimensionRecords: dict[int, SerializedDimensionRecord] | None,
    universe: DimensionUniverse,
) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a SerializedQuantum."""
    # Reconstruct the dimension records.
    records = {}
    for dId in ids:
        # If the dimension record has been loaded previously use that,
        # otherwise load it from the dict of serialized DimensionRecords.
        if dimensionRecords is None:
            raise ValueError("Cannot construct from a SerializedQuantum with no dimension records.")
        tmpSerialized = dimensionRecords[dId]
        reconstructedDim = DimensionRecord.from_simple(tmpSerialized, universe=universe)
        records[sys.intern(reconstructedDim.definition.name)] = reconstructedDim
    # Turn the serialized form into an object and attach the dimension records.
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    if records:
        object.__setattr__(rebuiltDatasetRef, "dataId", rebuiltDatasetRef.dataId.expanded(records))
    return rebuiltDatasetRef


class SerializedQuantum(BaseModel):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str | None = None
    dataId: SerializedDataCoordinate | None = None
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, tuple[SerializedDatasetRef, list[int]]]
    inputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    outputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    dimensionRecords: dict[int, SerializedDimensionRecord] | None = None
    datastoreRecords: dict[str, SerializedDatastoreRecordData] | None = None
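    # For orientation, a serialized quantum rendered as JSON has roughly the
    # following shape (a sketch only; the task name, dataset type name, and
    # record ids below are illustrative, not taken from a real repository):
    #
    #     {
    #       "taskName": "lsst.pipe.tasks.calibrate.CalibrateTask",
    #       "dataId": {...},
    #       "datasetTypeMapping": {"calexp": {...}},
    #       "initInputs": {},
    #       "inputs": {"calexp": [[{...}, [0, 1]]]},
    #       "outputs": {},
    #       "dimensionRecords": {"0": {...}, "1": {...}},
    #       "datastoreRecords": null
    #     }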

    @classmethod
    def direct(
        cls,
        *,
        taskName: str | None,
        dataId: dict | None,
        datasetTypeMapping: Mapping[str, dict],
        initInputs: Mapping[str, tuple[dict, list[int]]],
        inputs: Mapping[str, list[tuple[dict, list[int]]]],
        outputs: Mapping[str, list[tuple[dict, list[int]]]],
        dimensionRecords: dict[int, dict] | None,
        datastoreRecords: dict[str, dict] | None,
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the arguments
        are explicitly what the model requires, and it will recurse through
        members, constructing them from their corresponding `direct` methods.

        This method should only be called when the inputs are trusted.
        """
        node = SerializedQuantum.__new__(cls)
        setter = object.__setattr__
        setter(node, "taskName", sys.intern(taskName or ""))
        setter(node, "dataId", dataId if dataId is None else SerializedDataCoordinate.direct(**dataId))

        setter(
            node,
            "datasetTypeMapping",
            {k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()},
        )

        setter(
            node,
            "initInputs",
            {k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()},
        )
        setter(
            node,
            "inputs",
            {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()},
        )
        setter(
            node,
            "outputs",
            {k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()},
        )
        setter(
            node,
            "dimensionRecords",
            dimensionRecords
            if dimensionRecords is None
            else {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()},
        )
        setter(
            node,
            "datastoreRecords",
            datastoreRecords
            if datastoreRecords is None
            else {k: SerializedDatastoreRecordData.direct(**v) for k, v in datastoreRecords.items()},
        )
        setter(
            node,
            "__fields_set__",
            {
                "taskName",
                "dataId",
                "datasetTypeMapping",
                "initInputs",
                "inputs",
                "outputs",
                "dimensionRecords",
                "datastoreRecords",
            },
        )
        return node


class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``’s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    datastore_records : `DatastoreRecordData`, optional
        Datastore record data for input or initInput datasets that already
        exist.
    """
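    # A minimal construction example (a sketch only; the task name and the
    # ``input_type``/``input_refs`` and ``output_type``/``output_refs`` names
    # are hypothetical placeholders for real DatasetType and DatasetRef
    # objects):
    #
    #     quantum = Quantum(
    #         taskName="lsst.pipe.tasks.characterizeImage.CharacterizeImageTask",
    #         dataId=data_id,
    #         inputs={input_type: input_refs},
    #         outputs={output_type: output_refs},
    #     )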

    __slots__ = (
        "_taskName",
        "_taskClass",
        "_dataId",
        "_initInputs",
        "_inputs",
        "_outputs",
        "_datastore_records",
    )

    def __init__(
        self,
        *,
        taskName: str | None = None,
        taskClass: type | None = None,
        dataId: DataCoordinate | None = None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None = None,
        inputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        outputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in inputs.items()
        ).freeze()
        self._outputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in outputs.items()
        ).freeze()
        if datastore_records is None:
            datastore_records = {}
        self._datastore_records = datastore_records

    def to_simple(self, accumulator: DimensionRecordsAccumulator | None = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is None, the default, dimension records
            are serialized with this Quantum. If an accumulator is supplied it
            is assumed something else is responsible for serializing the
            records, and they will not be stored with the SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
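        # Typical use when serializing several quanta together (a sketch;
        # ``quanta`` is a hypothetical iterable of Quantum instances):
        #
        #     accumulator = DimensionRecordsAccumulator()
        #     serialized = [q.to_simple(accumulator=accumulator) for q in quanta]
        #     records = accumulator.makeSerializedDimensionRecordMapping()
        #
        # The caller is then responsible for storing ``records`` alongside the
        # serialized quanta, since none of them carry dimension records.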
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same
        # object multiple times. The string name of the type is used to index
        # the mappings.
        for key, value in self._initInputs.items():
            # Add the type to the typeMapping.
            typeMapping[key.name] = key.to_simple()
            # Convert to a simple DatasetRef representation.
            simple = value.to_simple()
            # Extract the dimension records.
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # For each dimension record get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set properties to None to save space.
                simple.dataId.records = None
                simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # Collect the inputs.
        for key, values in self._inputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # For each input type there is a list of inputs; collect them.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and CPU time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # The dataset type is the same as the key in _inputs, so there
                # is no need to serialize it out multiple times; set it to
                # None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # Container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # Collect the type if it is not already in the mapping.
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # For each output type there is a list of outputs; collect them.
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and CPU time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # For each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them.
                    simp.dataId.records = None
                # The dataset type is the same as the key in _outputs, so
                # there is no need to serialize it out multiple times; set it
                # to None.
                simp.datasetType = None
                # Append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Mapping[int, SerializedDimensionRecord] | None
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        datastore_records: dict[str, SerializedDatastoreRecordData] | None = None
        if self.datastore_records is not None:
            datastore_records = {
                datastore_name: record_data.to_simple()
                for datastore_name, record_data in self.datastore_records.items()
            }

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
            datastoreRecords=datastore_records,
        )

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: dict[int, tuple[str, DimensionRecord]] | None = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or `None`
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.
            Deprecated; any argument will be ignored.
        """
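        # Round-trip sketch (``universe`` is assumed to be the repository's
        # DimensionUniverse and ``quantum`` an existing Quantum instance):
        #
        #     serialized = quantum.to_simple()
        #     restored = Quantum.from_simple(serialized, universe)
        #
        # Note that ``to_simple`` must have been called without an external
        # accumulator for the dimension records to be embedded in
        # ``serialized`` and therefore available here.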
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is not None:
            warnings.warn(
                "The reconstitutedDimensions argument is now ignored and may be removed after v27",
                category=FutureWarning,
                stacklevel=find_outside_stacklevel("lsst.daf.butler"),
            )

        # Unpersist all the init inputs.
        for key, (value, dimensionIds) in simple.initInputs.items():
            type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
            # Reconstruct the dimension records.
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # Containers for the dataset refs.
        inputs: MutableMapping[DatasetType, list[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, list[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                # Reconstruct the list of DatasetRefs for this DatasetType.
                tmp: list[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )

        datastore_records: dict[str, DatastoreRecordData] | None = None
        if simple.datastoreRecords is not None:
            datastore_records = {
                datastore_name: DatastoreRecordData.from_simple(record_data)
                for datastore_name, record_data in simple.datastoreRecords.items()
            }

        quant = Quantum(
            taskName=simple.taskName,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
        return quant

    @property
    def taskClass(self) -> type | None:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> str | None:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> DataCoordinate | None:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this quantum.

        Has the same form as ``inputs``.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    @property
    def datastore_records(self) -> Mapping[str, DatastoreRecordData]:
        """Tabular data stored with this quantum (`dict`).

        This attribute may be modified in place, but not assigned to.
        """
        return self._datastore_records

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> str | tuple[Any, ...]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
                self.datastore_records,
            ),
        )

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: str | None,
        taskClass: type | None,
        dataId: DataCoordinate | None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None,
        inputs: Mapping[DatasetType, list[DatasetRef]] | None,
        outputs: Mapping[DatasetType, list[DatasetRef]] | None,
        datastore_records: Mapping[str, DatastoreRecordData],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record while being referred to multiple times.
    """
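    # Intended usage (a sketch; ``record`` stands for any DimensionRecord
    # instance):
    #
    #     accumulator = DimensionRecordsAccumulator()
    #     key = accumulator.addRecord(record)  # same record -> same key
    #     mapping = accumulator.makeSerializedDimensionRecordMapping()
    #     assert key in mapping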

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added. When a record is inserted for the first time it is
        assigned a unique integer key.

        This function returns the key associated with the record (either the
        newly allocated key, or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> dict[int, SerializedDimensionRecord]:
        return {id_: serializeRef for id_, serializeRef in self.mapping.values()}