Coverage for python/lsst/pipe/base/graph/_versionDeserializers.py: 32%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("DESERIALIZER_MAP",)
25import json
26import lzma
27import pickle
28import struct
29import uuid
30from abc import ABC, abstractmethod
31from collections import defaultdict
32from dataclasses import dataclass
33from types import SimpleNamespace
34from typing import TYPE_CHECKING, Callable, ClassVar, DefaultDict, Dict, Mapping, Optional, Set, Tuple
36import networkx as nx
37from lsst.daf.butler import DimensionRecord, DimensionUniverse, Quantum, SerializedDimensionRecord
38from lsst.pex.config import Config
39from lsst.utils import doImport
41from ..pipeline import TaskDef
42from ..pipelineTask import PipelineTask
43from ._implDetails import DatasetTypeName, _DatasetTracker
44from .quantumNode import QuantumNode, SerializedQuantumNode
46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true
47 from .graph import QuantumGraph
class StructSizeDescriptor:
    """A class-level "property" that reports the size, in bytes, of a
    deserializer's ``struct`` format string.

    Accessing the attribute on either the owning class or one of its
    instances evaluates ``struct.calcsize`` on the owner's ``FMT_STRING``.
    """

    def __get__(self, instance, klass) -> int:
        fmt = klass.FMT_STRING()
        return struct.calcsize(fmt)
@dataclass
class DeserializerBase(ABC):
    """Base class for the QuantumGraph save-format deserializers.

    The two dataclass fields are common to every format version:
    ``preambleSize`` is the number of bytes in the magic preamble at the
    start of the file, and ``sizeBytes`` holds the raw bytes that encode
    the header size(s), packed according to ``FMT_STRING``.
    """

    @classmethod
    @abstractmethod
    def FMT_STRING(cls) -> str:  # noqa: N805 # flake8 wants self
        # struct format string describing how the header size(s) are packed
        raise NotImplementedError("Base class does not implement this method")

    # Class-level descriptor reporting struct.calcsize(FMT_STRING());
    # assigned on each subclass in __init_subclass__ below.
    structSize: ClassVar[StructSizeDescriptor]

    preambleSize: int
    sizeBytes: bytes

    def __init_subclass__(cls) -> None:
        # attach the size descriptor
        cls.structSize = StructSizeDescriptor()
        super().__init_subclass__()

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        """Transforms the raw bytes corresponding to the header of a save into
        a string of the header information. Returns None if the save format has
        no header string implementation (such as save format 1 that is all
        pickle)

        Parameters
        ----------
        rawHeader : bytes
            The bytes that are to be parsed into the header information. These
            are the bytes after the preamble and structsize number of bytes
            and before the headerSize bytes
        """
        raise NotImplementedError("Base class does not implement this method")

    @property
    def headerSize(self) -> int:
        """Returns the number of bytes from the beginning of the file to the
        end of the metadata.
        """
        raise NotImplementedError("Base class does not implement this method")

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the supplied raw bytes into the header information and
        byte ranges of specific TaskDefs and QuantumNodes

        Parameters
        ----------
        rawHeader : bytes
            The bytes that are to be parsed into the header information. These
            are the bytes after the preamble and structsize number of bytes
            and before the headerSize bytes
        """
        raise NotImplementedError("Base class does not implement this method")

    def constructGraph(
        self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes], universe: DimensionUniverse
    ) -> QuantumGraph:
        """Constructs a graph from the deserialized information.

        Parameters
        ----------
        nodes : `set` of `uuid.UUID`
            The nodes to include in the graph
        _readBytes : callable
            A callable that can be used to read bytes from the file handle.
            The callable will take two ints, start and stop, to use as the
            numerical bounds to read and returns a byte stream.
        universe : `~lsst.daf.butler.DimensionUniverse`
            The singleton of all dimensions known to the middleware registry
        """
        raise NotImplementedError("Base class does not implement this method")

    def description(self) -> str:
        """Return the description of the serialized data format"""
        raise NotImplementedError("Base class does not implement this method")
# Human-readable description of save format version 1, returned by
# DeserializerV1.description().
Version1Description = """
The save file starts with the first few bytes corresponding to the magic bytes
in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.

The next few bytes are 2 big endian unsigned 64 bit integers.

The first unsigned 64 bit integer corresponds to the number of bytes of a
python mapping of TaskDef labels to the byte ranges in the save file where the
definition can be loaded.

The second unsigned 64 bit integer corresponds to the number of bytes of a
python mapping of QuantumGraph Node number to the byte ranges in the save file
where the node can be loaded. The byte range is indexed starting after
the `header` bytes of the magic bytes, size bytes, and bytes of the two
mappings.

Each of the above mappings are pickled and then lzma compressed, so to
deserialize the bytes, first lzma decompression must be performed and the
results passed to python pickle loader.

As stated above, each map contains byte ranges of the corresponding
datastructure. These bytes are also lzma compressed pickles, and should
be deserialized in a similar manner. The byte range is indexed starting after
the `header` bytes of the magic bytes, size bytes, and bytes of the two
mappings.

In addition to the TaskDef byte locations, the TaskDef map also contains
an additional key '__GraphBuildID'. The value associated with this is the
unique id assigned to the graph at its creation time.
"""
@dataclass
class DeserializerV1(DeserializerBase):
    """Deserializer for QuantumGraph save format version 1, in which the
    header maps and all payloads are pickles (see ``Version1Description``).
    """

    @classmethod
    def FMT_STRING(cls) -> str:
        # two big endian unsigned 64 bit integers: the sizes in bytes of the
        # pickled TaskDef map and the pickled node map
        return ">QQ"

    def __post_init__(self):
        # unpack the sizes of the two header maps from the raw size bytes
        self.taskDefMapSize, self.nodeMapSize = struct.unpack(self.FMT_STRING(), self.sizeBytes)

    @property
    def headerSize(self) -> int:
        """Returns the number of bytes from the beginning of the file to the
        end of the metadata: preamble, packed sizes, and both header maps.
        """
        return self.preambleSize + self.structSize + self.taskDefMapSize + self.nodeMapSize

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the supplied raw bytes into the header information and
        byte ranges of specific TaskDefs and QuantumNodes.

        Parameters
        ----------
        rawHeader : bytes
            The bytes after the preamble and struct-size bytes and before the
            ``headerSize`` offset. The first ``taskDefMapSize`` bytes are the
            pickled TaskDef map; the remainder is the pickled node map.
        """
        returnValue = SimpleNamespace()
        # NOTE(review): these maps are unpickled without the lzma
        # decompression used for per-node payloads — presumably the caller
        # supplies already-decompressed header bytes; confirm against writer.
        returnValue.taskDefMap = pickle.loads(rawHeader[: self.taskDefMapSize])
        # the build id is stored as an extra key inside the TaskDef map
        returnValue._buildId = returnValue.taskDefMap["__GraphBuildID"]
        returnValue.map = pickle.loads(rawHeader[self.taskDefMapSize :])
        # format 1 carries no metadata
        returnValue.metadata = None
        # stash the parsed header for later use by constructGraph
        self.returnValue = returnValue
        return returnValue

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        # format 1 is all pickle and has no string header representation
        return None

    def constructGraph(
        self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes], universe: DimensionUniverse
    ):
        """Construct a QuantumGraph from the nodes recorded in the header
        previously parsed by ``readHeaderInfo``.

        Parameters
        ----------
        nodes : `set` of `uuid.UUID`
            The nodes to include in the graph
        _readBytes : callable
            A callable taking start and stop byte offsets and returning the
            bytes between them.
        universe : `~lsst.daf.butler.DimensionUniverse`
            The singleton of all dimensions known to the middleware registry
        """
        # need to import here to avoid cyclic imports
        from . import QuantumGraph

        quanta: DefaultDict[TaskDef, Set[Quantum]] = defaultdict(set)
        quantumToNodeId: Dict[Quantum, uuid.UUID] = {}
        loadedTaskDef = {}
        # loop over the nodes specified above
        for node in nodes:
            # Get the bytes to read from the map; recorded ranges are
            # relative to the end of the header
            start, stop = self.returnValue.map[node]
            start += self.headerSize
            stop += self.headerSize

            # read the specified bytes, will be overloaded by subclasses
            # bytes are compressed, so decompress them
            dump = lzma.decompress(_readBytes(start, stop))

            # reconstruct node; a fresh uuid4 is assigned on load
            qNode = pickle.loads(dump)
            object.__setattr__(qNode, "nodeId", uuid.uuid4())

            # read the saved node, name. If it has been loaded, attach it, if
            # not read in the taskDef first, and then load it
            nodeTask = qNode.taskDef
            if nodeTask not in loadedTaskDef:
                # Get the byte ranges corresponding to this taskDef
                start, stop = self.returnValue.taskDefMap[nodeTask]
                start += self.headerSize
                stop += self.headerSize

                # load the taskDef, this method call will be overloaded by
                # subclasses.
                # bytes are compressed, so decompress them
                taskDef = pickle.loads(lzma.decompress(_readBytes(start, stop)))
                loadedTaskDef[nodeTask] = taskDef
            # Explicitly overload the "frozen-ness" of nodes to attach the
            # taskDef back into the un-persisted node
            object.__setattr__(qNode, "taskDef", loadedTaskDef[nodeTask])
            quanta[qNode.taskDef].add(qNode.quantum)

            # record the node for later processing
            quantumToNodeId[qNode.quantum] = qNode.nodeId

        # construct an empty new QuantumGraph object, and run the associated
        # creation method with the un-persisted data
        qGraph = object.__new__(QuantumGraph)
        qGraph._buildGraphs(
            quanta,
            _quantumToNodeId=quantumToNodeId,
            _buildId=self.returnValue._buildId,
            metadata=self.returnValue.metadata,
        )
        return qGraph

    def description(self) -> str:
        """Return the description of the serialized data format"""
        return Version1Description
# Human-readable description of save format version 2, returned by
# DeserializerV2.description().
Version2Description = """
The save file starts with the first few bytes corresponding to the magic bytes
in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.

The next few bytes are a big endian unsigned long long.

The unsigned long long corresponds to the number of bytes of a python mapping
of header information. This mapping is encoded into json and then lzma
compressed, meaning the operations must be performed in the opposite order to
deserialize.

The json encoded header mapping contains 4 fields: TaskDefs, GraphBuildID,
Nodes, and Metadata.

The `TaskDefs` key corresponds to a value which is a mapping of Task label to
task data. The task data is a mapping of key to value, where the only key is
`bytes` and it corresponds to a tuple of a byte range of the start, stop
bytes (indexed after all the header bytes)

The `GraphBuildID` corresponds with a string that is the unique id assigned to
this graph when it was created.

The `Nodes` key is like the `TaskDefs` key except it corresponds to
QuantumNodes instead of TaskDefs. Another important difference is that JSON
formatting does not allow using numbers as keys, and this mapping is keyed by
the node number. Thus it is stored in JSON as two equal length lists, the first
being the keys, and the second the values associated with those keys.

The `Metadata` key is a mapping of strings to associated values. This metadata
may be anything that is important to be transported alongside the graph.

As stated above, each map contains byte ranges of the corresponding
datastructure. These bytes are also lzma compressed pickles, and should
be deserialized in a similar manner.
"""
@dataclass
class DeserializerV2(DeserializerBase):
    """Deserializer for QuantumGraph save format version 2, in which the
    header is a single lzma-compressed json mapping (see
    ``Version2Description``).
    """

    @classmethod
    def FMT_STRING(cls) -> str:
        # one big endian unsigned 64 bit integer: the size in bytes of the
        # compressed header mapping
        return ">Q"

    def __post_init__(self):
        # unpack the size of the compressed header map from the raw size bytes
        (self.mapSize,) = struct.unpack(self.FMT_STRING(), self.sizeBytes)

    @property
    def headerSize(self) -> int:
        """Returns the number of bytes from the beginning of the file to the
        end of the metadata: preamble, packed size, and compressed header map.
        """
        return self.preambleSize + self.structSize + self.mapSize

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the supplied raw bytes into the header information and
        byte ranges of specific TaskDefs and QuantumNodes.

        Parameters
        ----------
        rawHeader : bytes
            The lzma-compressed, json-encoded header mapping: the bytes after
            the preamble and struct-size bytes and before the ``headerSize``
            offset.
        """
        uncompressedHeaderMap = self.unpackHeader(rawHeader)
        if uncompressedHeaderMap is None:
            raise ValueError(
                "This error is not possible because self.unpackHeader cannot return None,"
                " but is done to satisfy type checkers"
            )
        header = json.loads(uncompressedHeaderMap)
        returnValue = SimpleNamespace()
        returnValue.taskDefMap = header["TaskDefs"]
        returnValue._buildId = header["GraphBuildID"]
        # Nodes is stored as (key, value) pairs because json does not allow
        # numeric mapping keys; rebuild the dict here
        returnValue.map = dict(header["Nodes"])
        returnValue.metadata = header["Metadata"]
        # stash the parsed header for later use by constructGraph
        self.returnValue = returnValue
        return returnValue

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        # header is lzma compressed, utf-8 encoded json text
        return lzma.decompress(rawHeader).decode()

    def constructGraph(
        self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes], universe: DimensionUniverse
    ):
        """Construct a QuantumGraph from the nodes recorded in the header
        previously parsed by ``readHeaderInfo``.

        Parameters
        ----------
        nodes : `set` of `uuid.UUID`
            The nodes to include in the graph
        _readBytes : callable
            A callable taking start and stop byte offsets and returning the
            bytes between them.
        universe : `~lsst.daf.butler.DimensionUniverse`
            The singleton of all dimensions known to the middleware registry
        """
        # need to import here to avoid cyclic imports
        from . import QuantumGraph

        quanta: DefaultDict[TaskDef, Set[Quantum]] = defaultdict(set)
        quantumToNodeId: Dict[Quantum, uuid.UUID] = {}
        loadedTaskDef = {}
        # loop over the nodes specified above
        for node in nodes:
            # Get the bytes to read from the map; recorded ranges are
            # relative to the end of the header
            start, stop = self.returnValue.map[node]["bytes"]
            start += self.headerSize
            stop += self.headerSize

            # read the specified bytes, will be overloaded by subclasses
            # bytes are compressed, so decompress them
            dump = lzma.decompress(_readBytes(start, stop))

            # reconstruct node; a fresh uuid4 is assigned on load
            qNode = pickle.loads(dump)
            object.__setattr__(qNode, "nodeId", uuid.uuid4())

            # read the saved node, name. If it has been loaded, attach it, if
            # not read in the taskDef first, and then load it
            nodeTask = qNode.taskDef
            if nodeTask not in loadedTaskDef:
                # Get the byte ranges corresponding to this taskDef
                start, stop = self.returnValue.taskDefMap[nodeTask]["bytes"]
                start += self.headerSize
                stop += self.headerSize

                # load the taskDef, this method call will be overloaded by
                # subclasses.
                # bytes are compressed, so decompress them
                taskDef = pickle.loads(lzma.decompress(_readBytes(start, stop)))
                loadedTaskDef[nodeTask] = taskDef
            # Explicitly overload the "frozen-ness" of nodes to attach the
            # taskDef back into the un-persisted node
            object.__setattr__(qNode, "taskDef", loadedTaskDef[nodeTask])
            quanta[qNode.taskDef].add(qNode.quantum)

            # record the node for later processing
            quantumToNodeId[qNode.quantum] = qNode.nodeId

        # construct an empty new QuantumGraph object, and run the associated
        # creation method with the un-persisted data
        qGraph = object.__new__(QuantumGraph)
        qGraph._buildGraphs(
            quanta,
            _quantumToNodeId=quantumToNodeId,
            _buildId=self.returnValue._buildId,
            metadata=self.returnValue.metadata,
        )
        return qGraph

    def description(self) -> str:
        """Return the description of the serialized data format"""
        return Version2Description
# Human-readable description of save format version 3, returned by
# DeserializerV3.description().
Version3Description = """
The save file starts with the first few bytes corresponding to the magic bytes
in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.

The next few bytes are a big endian unsigned long long.

The unsigned long long corresponds to the number of bytes of a mapping
of header information. This mapping is encoded into json and then lzma
compressed, meaning the operations must be performed in the opposite order to
deserialize.

The json encoded header mapping contains 5 fields: GraphBuildId, TaskDefs,
Nodes, Metadata, and DimensionRecords.

The `GraphBuildId` key corresponds with a string that is the unique id assigned
to this graph when it was created.

The `TaskDefs` key corresponds to a value which is a mapping of Task label to
task data. The task data is a mapping of key to value. The keys of this mapping
are `bytes`, `inputs`, and `outputs`.

The `TaskDefs` `bytes` key corresponds to a tuple of a byte range of the
start, stop bytes (indexed after all the header bytes). This byte range
corresponds to a lzma compressed json mapping. This mapping has keys of
`taskName`, corresponding to a fully qualified python class, `config` a
pex_config string that is used to configure the class, and `label` which
corresponds to a string that uniquely identifies the task within a given
execution pipeline.

The `TaskDefs` `inputs` key is associated with a list of tuples where each
tuple is a label of a task that is considered coming before a given task, and
the name of the dataset that is shared between the tasks (think node and edge
in a graph sense).

The `TaskDefs` `outputs` key is like inputs except the values in a list
correspond to all the output connections of a task.

The `Nodes` key is also a json mapping with keys corresponding to the UUIDs of
QuantumNodes. The values associated with these keys is another mapping with
the keys `bytes`, `inputs`, and `outputs`.

`Nodes` key `bytes` corresponds to a tuple of a byte range of the start, stop
bytes (indexed after all the header bytes). These bytes are a lzma compressed
json mapping which contains many sub elements, this mapping will be referred to
as the SerializedQuantumNode (related to the python class it corresponds to).

SerializedQuantumNodes have 3 keys, `quantum` corresponding to a json mapping
(described below) referred to as a SerializedQuantum, `taskLabel` a string
which corresponds to a label in the `TaskDefs` mapping, and `nodeId`.

A SerializedQuantum has many keys; taskName, dataId, datasetTypeMapping,
initInputs, inputs, outputs, dimensionRecords.

like the `TaskDefs` key except it corresponds to
QuantumNodes instead of TaskDefs, and the keys of the mappings are string
representations of the UUIDs of the QuantumNodes.

The `Metadata` key is a mapping of strings to associated values. This metadata
may be anything that is important to be transported alongside the graph.

As stated above, each map contains byte ranges of the corresponding
datastructure. These bytes are also lzma compressed pickles, and should
be deserialized in a similar manner.
"""
@dataclass
class DeserializerV3(DeserializerBase):
    """Deserializer for QuantumGraph save format version 3, in which the
    header and all node/taskDef payloads are lzma-compressed json (see
    ``Version3Description``).
    """

    @classmethod
    def FMT_STRING(cls) -> str:
        # one big endian unsigned 64 bit integer: the size in bytes of the
        # compressed header info mapping
        return ">Q"

    def __post_init__(self):
        # unpack the size of the compressed header from the raw size bytes
        self.infoSize: int
        (self.infoSize,) = struct.unpack(self.FMT_STRING(), self.sizeBytes)

    @property
    def headerSize(self) -> int:
        """Returns the number of bytes from the beginning of the file to the
        end of the metadata: preamble, packed size, and compressed header.
        """
        return self.preambleSize + self.structSize + self.infoSize

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the supplied raw bytes into the header information and
        byte ranges of specific TaskDefs and QuantumNodes.

        Parameters
        ----------
        rawHeader : bytes
            The lzma-compressed, json-encoded header mapping: the bytes after
            the preamble and struct-size bytes and before the ``headerSize``
            offset.
        """
        uncompressedinfoMap = self.unpackHeader(rawHeader)
        assert uncompressedinfoMap is not None  # for python typing, this variant can't be None
        infoMap = json.loads(uncompressedinfoMap)
        infoMappings = SimpleNamespace()
        infoMappings.taskDefMap = infoMap["TaskDefs"]
        infoMappings._buildId = infoMap["GraphBuildID"]
        # Nodes is stored as (key, value) pairs; rebuild the mapping with
        # real uuid.UUID keys
        infoMappings.map = {uuid.UUID(k): v for k, v in infoMap["Nodes"]}
        infoMappings.metadata = infoMap["Metadata"]
        # dimension records are stored once, keyed by integer id, and shared
        # by all nodes that reference them
        infoMappings.dimensionRecords = {}
        for k, v in infoMap["DimensionRecords"].items():
            infoMappings.dimensionRecords[int(k)] = SerializedDimensionRecord(**v)
        # stash the parsed header for later use by constructGraph
        self.infoMappings = infoMappings
        return infoMappings

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        # header is lzma compressed, utf-8 encoded json text
        return lzma.decompress(rawHeader).decode()

    def constructGraph(
        self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes], universe: DimensionUniverse
    ):
        """Construct a QuantumGraph from the nodes recorded in the header
        previously parsed by ``readHeaderInfo``.

        Parameters
        ----------
        nodes : `set` of `uuid.UUID`
            The nodes to include in the graph
        _readBytes : callable
            A callable taking start and stop byte offsets and returning the
            bytes between them.
        universe : `~lsst.daf.butler.DimensionUniverse`
            The singleton of all dimensions known to the middleware registry
        """
        # need to import here to avoid cyclic imports
        from . import QuantumGraph

        graph = nx.DiGraph()
        # annotation is Dict (not Mapping) because entries are added below
        loadedTaskDef: Dict[str, TaskDef] = {}
        container = {}
        datasetDict = _DatasetTracker[DatasetTypeName, TaskDef](createInverse=True)
        taskToQuantumNode: DefaultDict[TaskDef, Set[QuantumNode]] = defaultdict(set)
        recontitutedDimensions: Dict[int, Tuple[str, DimensionRecord]] = {}

        for node in nodes:
            # byte ranges in the map are relative to the end of the header
            start, stop = self.infoMappings.map[node]["bytes"]
            start, stop = start + self.headerSize, stop + self.headerSize
            # Read in the bytes corresponding to the node to load and
            # decompress it
            dump = json.loads(lzma.decompress(_readBytes(start, stop)))

            # Turn the json back into the pydantic model
            nodeDeserialized = SerializedQuantumNode.direct(**dump)
            # attach the dictionary of dimension records to the pydantic model
            # these are stored separately because they are stored over and
            # over and this saves a lot of space and time.
            nodeDeserialized.quantum.dimensionRecords = self.infoMappings.dimensionRecords
            # get the label for the current task
            nodeTaskLabel = nodeDeserialized.taskLabel

            if nodeTaskLabel not in loadedTaskDef:
                # Get the byte ranges corresponding to this taskDef
                start, stop = self.infoMappings.taskDefMap[nodeTaskLabel]["bytes"]
                start, stop = start + self.headerSize, stop + self.headerSize

                # bytes are compressed, so decompress them
                taskDefDump = json.loads(lzma.decompress(_readBytes(start, stop)))
                # import the task class and rebuild its configuration from
                # the persisted pex_config stream
                taskClass: PipelineTask = doImport(taskDefDump["taskName"])  # type: ignore
                config: Config = taskClass.ConfigClass()  # type: ignore
                config.loadFromStream(taskDefDump["config"])
                # Rebuild TaskDef
                recreatedTaskDef = TaskDef(
                    taskName=taskDefDump["taskName"],
                    taskClass=taskClass,
                    config=config,
                    label=taskDefDump["label"],
                )
                loadedTaskDef[nodeTaskLabel] = recreatedTaskDef

                # rebuild the mappings that associate dataset type names with
                # TaskDefs
                for _, input in self.infoMappings.taskDefMap[nodeTaskLabel]["inputs"]:
                    datasetDict.addConsumer(DatasetTypeName(input), recreatedTaskDef)

                # register each distinct output dataset type exactly once
                added = set()
                for outputConnection in self.infoMappings.taskDefMap[nodeTaskLabel]["outputs"]:
                    typeName = outputConnection[1]
                    if typeName not in added:
                        added.add(typeName)
                        datasetDict.addProducer(DatasetTypeName(typeName), recreatedTaskDef)

            # reconstitute the node, passing in the dictionaries for the
            # loaded TaskDefs and dimension records. These are used to ensure
            # that each unique record is only loaded once
            node = QuantumNode.from_simple(nodeDeserialized, loadedTaskDef, universe, recontitutedDimensions)
            container[node.nodeId] = node
            taskToQuantumNode[loadedTaskDef[nodeTaskLabel]].add(node)

            # recreate the relations between each node from stored info
            graph.add_node(node)
            for id in self.infoMappings.map[node.nodeId]["inputs"]:
                # uuid is stored as a string, turn it back into a uuid
                id = uuid.UUID(id)
                # if the id is not yet in the container, don't make a
                # connection; this is not an issue, because once it is, that
                # id will add the reverse connection
                if id in container:
                    graph.add_edge(container[id], node)
            for id in self.infoMappings.map[node.nodeId]["outputs"]:
                # uuid is stored as a string, turn it back into a uuid
                id = uuid.UUID(id)
                # if the id is not yet in the container, don't make a
                # connection; this is not an issue, because once it is, that
                # id will add the reverse connection
                if id in container:
                    graph.add_edge(node, container[id])

        # build a new QuantumGraph without invoking __init__, filling in its
        # private state directly from the reconstructed pieces
        newGraph = object.__new__(QuantumGraph)
        newGraph._metadata = self.infoMappings.metadata
        newGraph._buildId = self.infoMappings._buildId
        newGraph._datasetDict = datasetDict
        newGraph._nodeIdMap = container
        newGraph._count = len(nodes)
        newGraph._taskToQuantumNode = dict(taskToQuantumNode.items())
        newGraph._taskGraph = datasetDict.makeNetworkXGraph()
        newGraph._connectedQuanta = graph
        return newGraph

    def description(self) -> str:
        """Return the description of the serialized data format"""
        return Version3Description
# Mapping from save-format version number to the deserializer class that
# understands that version; consulted when loading a saved QuantumGraph.
DESERIALIZER_MAP = {1: DeserializerV1, 2: DeserializerV2, 3: DeserializerV3}