Coverage for python/lsst/daf/butler/_quantum.py: 23%
207 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("Quantum", "SerializedQuantum", "DimensionRecordsAccumulator")

import sys
import warnings
from collections.abc import Iterable, Mapping, MutableMapping, Sequence
from typing import Any

from lsst.daf.butler._compat import _BaseModelCompat
from lsst.utils import doImportType
from lsst.utils.introspection import find_outside_stacklevel

from ._dataset_ref import DatasetRef, SerializedDatasetRef
from ._dataset_type import DatasetType, SerializedDatasetType
from ._named import NamedKeyDict, NamedKeyMapping
from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData
from .dimensions import (
    DataCoordinate,
    DimensionRecord,
    DimensionUniverse,
    SerializedDataCoordinate,
    SerializedDimensionRecord,
)


def _reconstructDatasetRef(
    simple: SerializedDatasetRef,
    type_: DatasetType | None,
    ids: Iterable[int],
    dimensionRecords: dict[int, SerializedDimensionRecord] | None,
    universe: DimensionUniverse,
) -> DatasetRef:
    """Reconstruct a DatasetRef stored in a SerializedQuantum."""
    # Reconstruct the dimension records: if a dimension record has been loaded
    # previously use that, otherwise load it from the dict of
    # SerializedDimensionRecords.
    if dimensionRecords is None and ids:
        raise ValueError("Cannot construct from a SerializedQuantum with no dimension records.")
    records = {}
    for dId in ids:
        # Ignore typing: the check above guarantees that dimensionRecords is
        # not None whenever this loop executes.
        tmpSerialized = dimensionRecords[dId]  # type: ignore
        records[tmpSerialized.definition] = tmpSerialized
    if simple.dataId is not None:
        simple.dataId.records = records or None
    rebuiltDatasetRef = DatasetRef.from_simple(simple, universe, datasetType=type_)
    return rebuiltDatasetRef


class SerializedQuantum(_BaseModelCompat):
    """Simplified model of a `Quantum` suitable for serialization."""

    taskName: str | None = None
    dataId: SerializedDataCoordinate | None = None
    datasetTypeMapping: Mapping[str, SerializedDatasetType]
    initInputs: Mapping[str, tuple[SerializedDatasetRef, list[int]]]
    inputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    outputs: Mapping[str, list[tuple[SerializedDatasetRef, list[int]]]]
    dimensionRecords: dict[int, SerializedDimensionRecord] | None = None
    datastoreRecords: dict[str, SerializedDatastoreRecordData] | None = None

    @classmethod
    def direct(
        cls,
        *,
        taskName: str | None,
        dataId: dict | None,
        datasetTypeMapping: Mapping[str, dict],
        initInputs: Mapping[str, tuple[dict, list[int]]],
        inputs: Mapping[str, list[tuple[dict, list[int]]]],
        outputs: Mapping[str, list[tuple[dict, list[int]]]],
        dimensionRecords: dict[int, dict] | None,
        datastoreRecords: dict[str, dict] | None,
    ) -> SerializedQuantum:
        """Construct a `SerializedQuantum` directly without validators.

        This differs from the pydantic "construct" method in that the
        arguments are explicitly what the model requires, and it will recurse
        through members, constructing them from their corresponding `direct`
        methods.

        This method should only be called when the inputs are trusted.
        """
        serialized_dataId = SerializedDataCoordinate.direct(**dataId) if dataId is not None else None
        serialized_datasetTypeMapping = {
            k: SerializedDatasetType.direct(**v) for k, v in datasetTypeMapping.items()
        }
        serialized_initInputs = {
            k: (SerializedDatasetRef.direct(**v), refs) for k, (v, refs) in initInputs.items()
        }
        serialized_inputs = {
            k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in inputs.items()
        }
        serialized_outputs = {
            k: [(SerializedDatasetRef.direct(**ref), id) for ref, id in v] for k, v in outputs.items()
        }
        serialized_records = (
            {int(k): SerializedDimensionRecord.direct(**v) for k, v in dimensionRecords.items()}
            if dimensionRecords is not None
            else None
        )
        serialized_datastore_records = (
            {k: SerializedDatastoreRecordData.direct(**v) for k, v in datastoreRecords.items()}
            if datastoreRecords is not None
            else None
        )

        node = cls.model_construct(
            taskName=sys.intern(taskName or ""),
            dataId=serialized_dataId,
            datasetTypeMapping=serialized_datasetTypeMapping,
            initInputs=serialized_initInputs,
            inputs=serialized_inputs,
            outputs=serialized_outputs,
            dimensionRecords=serialized_records,
            datastoreRecords=serialized_datastore_records,
        )

        return node
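
    # Example (illustrative sketch, not executed): building a SerializedQuantum
    # from already-validated dictionaries, e.g. when reading JSON that this
    # package itself produced. The task name and empty mappings below are
    # placeholders, not values from a real repository.
    #
    #     node = SerializedQuantum.direct(
    #         taskName="mypkg.tasks.ExampleTask",
    #         dataId=None,
    #         datasetTypeMapping={},
    #         initInputs={},
    #         inputs={},
    #         outputs={},
    #         dimensionRecords=None,
    #         datastoreRecords=None,
    #     )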


class Quantum:
    """Class representing a discrete unit of work.

    A Quantum may depend on one or more datasets and produce one or more
    datasets.

    Most Quanta will be executions of a particular ``PipelineTask``’s
    ``runQuantum`` method, but they can also be used to represent discrete
    units of work performed manually by human operators or other software
    agents.

    Parameters
    ----------
    taskName : `str`, optional
        Fully-qualified name of the Task class that executed or will execute
        this Quantum. If not provided, ``taskClass`` must be.
    taskClass : `type`, optional
        The Task class that executed or will execute this Quantum. If not
        provided, ``taskName`` must be. Overrides ``taskName`` if both are
        provided.
    dataId : `DataId`, optional
        The dimension values that identify this `Quantum`.
    initInputs : collection of `DatasetRef`, optional
        Datasets that are needed to construct an instance of the Task. May
        be a flat iterable of `DatasetRef` instances or a mapping from
        `DatasetType` to `DatasetRef`.
    inputs : `~collections.abc.Mapping`, optional
        Inputs identified prior to execution, organized as a mapping from
        `DatasetType` to a list of `DatasetRef`.
    outputs : `~collections.abc.Mapping`, optional
        Outputs from executing this quantum of work, organized as a mapping
        from `DatasetType` to a list of `DatasetRef`.
    datastore_records : `DatastoreRecordData`, optional
        Datastore record data for input or initInput datasets that already
        exist.
    """

    __slots__ = (
        "_taskName",
        "_taskClass",
        "_dataId",
        "_initInputs",
        "_inputs",
        "_outputs",
        "_datastore_records",
    )

    def __init__(
        self,
        *,
        taskName: str | None = None,
        taskClass: type | None = None,
        dataId: DataCoordinate | None = None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None = None,
        inputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        outputs: Mapping[DatasetType, Sequence[DatasetRef]] | None = None,
        datastore_records: Mapping[str, DatastoreRecordData] | None = None,
    ):
        if taskClass is not None:
            taskName = f"{taskClass.__module__}.{taskClass.__name__}"
        self._taskName = taskName
        self._taskClass = taskClass
        self._dataId = dataId
        if initInputs is None:
            initInputs = {}
        elif not isinstance(initInputs, Mapping):
            initInputs = {ref.datasetType: ref for ref in initInputs}
        if inputs is None:
            inputs = {}
        if outputs is None:
            outputs = {}
        self._initInputs = NamedKeyDict[DatasetType, DatasetRef](initInputs).freeze()
        self._inputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in inputs.items()
        ).freeze()
        self._outputs = NamedKeyDict[DatasetType, tuple[DatasetRef]](
            (k, tuple(v)) for k, v in outputs.items()
        ).freeze()
        if datastore_records is None:
            datastore_records = {}
        self._datastore_records = datastore_records
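
    # Example (illustrative sketch, not executed): a Quantum built with only a
    # task name and no datasets. ``mypkg.tasks.ExampleTask`` is a placeholder;
    # real quanta key their input/output mappings by DatasetType instances
    # from a butler repository.
    #
    #     q = Quantum(taskName="mypkg.tasks.ExampleTask")
    #     q.taskName    # -> "mypkg.tasks.ExampleTask"
    #     q.initInputs  # -> empty, frozen NamedKeyDict
    #     q.inputs      # -> empty, frozen NamedKeyDict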

    def to_simple(self, accumulator: DimensionRecordsAccumulator | None = None) -> SerializedQuantum:
        """Convert this class to a simple python type.

        This makes it suitable for serialization.

        Parameters
        ----------
        accumulator : `DimensionRecordsAccumulator`, optional
            This accumulator can be used to aggregate dimension records across
            multiple Quanta. If this is None, the default, dimension records
            are serialized with this Quantum. If an accumulator is supplied it
            is assumed something else is responsible for serializing the
            records, and they will not be stored with the SerializedQuantum.

        Returns
        -------
        simple : `SerializedQuantum`
            This object converted to a serializable representation.
        """
        typeMapping = {}
        initInputs = {}

        if accumulator is None:
            accumulator = DimensionRecordsAccumulator()
            writeDimensionRecords = True
        else:
            writeDimensionRecords = False

        # Collect the init inputs for serialization, recording the types into
        # their own mapping, used throughout to minimize saving the same
        # object multiple times. The string name of the type is used to index
        # the mappings.
        for key, value in self._initInputs.items():
            # add the type to the typeMapping
            typeMapping[key.name] = key.to_simple()
            # convert to a simple DatasetRef representation
            simple = value.to_simple()
            # extract the dimension records
            recIds = []
            if simple.dataId is not None and simple.dataId.records is not None:
                # for each dimension record get an id by adding it to the
                # record accumulator.
                for rec in value.dataId.records.values():
                    if rec is not None:
                        recordId = accumulator.addRecord(rec)
                        recIds.append(recordId)
                # Set properties to None to save space
                simple.dataId.records = None
            simple.datasetType = None
            initInputs[key.name] = (simple, recIds)

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        inputs = {}

        # collect the inputs
        for key, values in self._inputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each input type there is a list of inputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # The dataset type is the same as the key in _inputs, so there
                # is no need to serialize it out multiple times; set it to
                # None.
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            inputs[key.name] = tmp

        # container for all the SerializedDatasetRefs, keyed on the
        # DatasetType name.
        outputs = {}
        for key, values in self._outputs.items():
            # collect the type if it is not already in the mapping
            if key.name not in typeMapping:
                typeMapping[key.name] = key.to_simple()
            # for each output type there is a list of outputs, collect them
            tmp = []
            for e in values:
                simp = e.to_simple()
                # This container will hold ids (hashes) that point to all the
                # dimension records within the SerializedDatasetRef dataId.
                # These dimension records repeat in almost every DatasetRef,
                # so it is hugely wasteful in terms of disk and cpu time to
                # store them over and over again.
                recIds = []
                if simp.dataId is not None and simp.dataId.records is not None:
                    for rec in e.dataId.records.values():
                        # for each dimension record get an id by adding it to
                        # the record accumulator.
                        if rec is not None:
                            recordId = accumulator.addRecord(rec)
                            recIds.append(recordId)
                    # Set the records to None to avoid serializing them
                    simp.dataId.records = None
                # The dataset type is the same as the key in _outputs, so
                # there is no need to serialize it out multiple times; set it
                # to None.
                simp.datasetType = None
                # append a tuple of the simplified SerializedDatasetRef, along
                # with the list of all the keys for the dimension records
                # needed for reconstruction.
                tmp.append((simp, recIds))
            outputs[key.name] = tmp

        dimensionRecords: Mapping[int, SerializedDimensionRecord] | None
        if writeDimensionRecords:
            dimensionRecords = accumulator.makeSerializedDimensionRecordMapping()
        else:
            dimensionRecords = None

        datastore_records: dict[str, SerializedDatastoreRecordData] | None = None
        if self.datastore_records is not None:
            datastore_records = {
                datastore_name: record_data.to_simple()
                for datastore_name, record_data in self.datastore_records.items()
            }

        return SerializedQuantum(
            taskName=self._taskName,
            dataId=self.dataId.to_simple() if self.dataId is not None else None,
            datasetTypeMapping=typeMapping,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            dimensionRecords=dimensionRecords,
            datastoreRecords=datastore_records,
        )
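
    # Example (illustrative sketch, not executed): serializing a single
    # quantum on its own. ``quantum`` and ``universe`` are placeholders for a
    # populated Quantum and the repository's DimensionUniverse.
    #
    #     simple = quantum.to_simple()           # dimension records embedded
    #     restored = Quantum.from_simple(simple, universe)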

    @classmethod
    def from_simple(
        cls,
        simple: SerializedQuantum,
        universe: DimensionUniverse,
        reconstitutedDimensions: dict[int, tuple[str, DimensionRecord]] | None = None,
    ) -> Quantum:
        """Construct a new object from a simplified form.

        Generally this is data returned from the `to_simple` method.

        Parameters
        ----------
        simple : `SerializedQuantum`
            The value returned by a call to `to_simple`.
        universe : `DimensionUniverse`
            The special graph of all known dimensions.
        reconstitutedDimensions : `dict` of `int` to `DimensionRecord` or None
            A mapping of ids to dimension records to be used when populating
            dimensions for this Quantum. If supplied it will be used in place
            of the dimension records stored with the SerializedQuantum, if a
            required dimension has already been loaded. Otherwise the record
            will be unpersisted from the SerializedQuantum and added to the
            reconstitutedDimensions dict (if not None). Defaults to None.
            Deprecated: this argument is ignored and will be removed after
            v26.
        """
        initInputs: MutableMapping[DatasetType, DatasetRef] = {}
        if reconstitutedDimensions is not None:
            # TODO: remove this argument on DM-40150.
            warnings.warn(
                "The reconstitutedDimensions argument is now ignored and may be removed after v26",
                category=FutureWarning,
                stacklevel=find_outside_stacklevel("lsst.daf.butler"),
            )

        # Unpersist all the init inputs
        for key, (value, dimensionIds) in simple.initInputs.items():
            type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
            # reconstruct the dimension records
            rebuiltDatasetRef = _reconstructDatasetRef(
                value, type_, dimensionIds, simple.dimensionRecords, universe
            )
            initInputs[type_] = rebuiltDatasetRef

        # containers for the dataset refs
        inputs: MutableMapping[DatasetType, list[DatasetRef]] = {}
        outputs: MutableMapping[DatasetType, list[DatasetRef]] = {}

        for container, simpleRefs in ((inputs, simple.inputs), (outputs, simple.outputs)):
            for key, values in simpleRefs.items():
                type_ = DatasetType.from_simple(simple.datasetTypeMapping[key], universe=universe)
                # reconstruct the list of DatasetRefs for this DatasetType
                tmp: list[DatasetRef] = []
                for v, recIds in values:
                    rebuiltDatasetRef = _reconstructDatasetRef(
                        v, type_, recIds, simple.dimensionRecords, universe
                    )
                    tmp.append(rebuiltDatasetRef)
                container[type_] = tmp

        dataId = (
            DataCoordinate.from_simple(simple.dataId, universe=universe)
            if simple.dataId is not None
            else None
        )

        datastore_records: dict[str, DatastoreRecordData] | None = None
        if simple.datastoreRecords is not None:
            datastore_records = {
                datastore_name: DatastoreRecordData.from_simple(record_data)
                for datastore_name, record_data in simple.datastoreRecords.items()
            }

        quant = Quantum(
            taskName=simple.taskName,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )
        return quant
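
    # Example (illustrative sketch, not executed): when quanta were serialized
    # with a shared DimensionRecordsAccumulator, each SerializedQuantum has
    # dimensionRecords=None, so the shared record mapping (placeholder name
    # ``shared_records``) must be reattached before reconstruction.
    #
    #     simple.dimensionRecords = shared_records
    #     quantum = Quantum.from_simple(simple, universe)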

    @property
    def taskClass(self) -> type | None:
        """Task class associated with this `Quantum` (`type`)."""
        if self._taskClass is None:
            if self._taskName is None:
                raise ValueError("No task class defined and task name is None")
            task_class = doImportType(self._taskName)
            self._taskClass = task_class
        return self._taskClass

    @property
    def taskName(self) -> str | None:
        """Return the fully-qualified name of the task associated with this
        `Quantum` (`str`).
        """
        return self._taskName

    @property
    def dataId(self) -> DataCoordinate | None:
        """Return dimension values of the unit of processing (`DataId`)."""
        return self._dataId

    @property
    def initInputs(self) -> NamedKeyMapping[DatasetType, DatasetRef]:
        """Return mapping of datasets used to construct the Task.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and `DatasetRef` instances as values.
        """
        return self._initInputs

    @property
    def inputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of input datasets that were expected to be used.

        Has `DatasetType` instances as keys (names can also be used for
        lookups) and a list of `DatasetRef` instances as values.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._inputs

    @property
    def outputs(self) -> NamedKeyMapping[DatasetType, tuple[DatasetRef]]:
        """Return mapping of output datasets (to be) generated by this quantum.

        Has the same form as ``inputs``.

        Notes
        -----
        We cannot use `set` instead of `list` for the nested container because
        `DatasetRef` instances cannot be compared reliably when some have
        integer IDs and others do not.
        """
        return self._outputs

    @property
    def datastore_records(self) -> Mapping[str, DatastoreRecordData]:
        """Tabular data stored with this quantum (`dict`).

        This attribute may be modified in place, but not assigned to.
        """
        return self._datastore_records

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Quantum):
            return False
        for item in ("taskClass", "dataId", "initInputs", "inputs", "outputs"):
            if getattr(self, item) != getattr(other, item):
                return False
        return True

    def __hash__(self) -> int:
        return hash((self.taskClass, self.dataId))

    def __reduce__(self) -> str | tuple[Any, ...]:
        return (
            self._reduceFactory,
            (
                self.taskName,
                self.taskClass,
                self.dataId,
                dict(self.initInputs.items()),
                dict(self.inputs),
                dict(self.outputs),
                self.datastore_records,
            ),
        )
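
    # Example (illustrative sketch, not executed): __reduce__ makes Quantum
    # picklable by rebuilding instances through _reduceFactory. Equality and
    # hashing go through the ``taskClass`` property, which imports the task by
    # name, so comparisons assume the named task class is importable.
    #
    #     import pickle
    #     restored = pickle.loads(pickle.dumps(quantum))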

    def __str__(self) -> str:
        return f"{self.__class__.__name__}(taskName={self.taskName}, dataId={self.dataId})"

    @staticmethod
    def _reduceFactory(
        taskName: str | None,
        taskClass: type | None,
        dataId: DataCoordinate | None,
        initInputs: Mapping[DatasetType, DatasetRef] | Iterable[DatasetRef] | None,
        inputs: Mapping[DatasetType, list[DatasetRef]] | None,
        outputs: Mapping[DatasetType, list[DatasetRef]] | None,
        datastore_records: Mapping[str, DatastoreRecordData],
    ) -> Quantum:
        return Quantum(
            taskName=taskName,
            taskClass=taskClass,
            dataId=dataId,
            initInputs=initInputs,
            inputs=inputs,
            outputs=outputs,
            datastore_records=datastore_records,
        )


class DimensionRecordsAccumulator:
    """Class used to accumulate dimension records for serialization.

    This class generates an auto-increment key for each unique dimension
    record added to it. This allows serialization of dimension records to
    occur once for each record while being referred to multiple times.
    """

    def __init__(self) -> None:
        self._counter = 0
        self.mapping: MutableMapping[DimensionRecord, tuple[int, SerializedDimensionRecord]] = {}

    def addRecord(self, record: DimensionRecord) -> int:
        """Add a dimension record to the accumulator if it has not already
        been added. When a record is inserted for the first time it is
        assigned a unique integer key.

        This function returns the key associated with the record (either the
        newly allocated key or the existing one).

        Parameters
        ----------
        record : `DimensionRecord`
            The record to add to the accumulator.

        Returns
        -------
        accumulatorKey : `int`
            The key that is associated with the supplied record.
        """
        if (mappingValue := self.mapping.get(record)) is None:
            simple = record.to_simple()
            mappingValue = (self._counter, simple)
            self._counter += 1
            self.mapping[record] = mappingValue
        return mappingValue[0]

    def makeSerializedDimensionRecordMapping(self) -> dict[int, SerializedDimensionRecord]:
        return dict(self.mapping.values())
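
# Example (illustrative sketch, not executed): sharing one accumulator across
# several quanta so each dimension record is serialized only once. ``quanta``
# is a placeholder for an iterable of Quantum instances.
#
#     accumulator = DimensionRecordsAccumulator()
#     serialized = [q.to_simple(accumulator) for q in quanta]
#     # The SerializedQuantum objects above carry no dimensionRecords; the
#     # caller persists this shared mapping alongside them instead.
#     shared_records = accumulator.makeSerializedDimensionRecordMapping()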