Coverage for python/lsst/pipe/base/graph/_versionDeserializers.py: 32%
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("DESERIALIZER_MAP",)
25from abc import ABC, abstractmethod
26from dataclasses import dataclass
27from types import SimpleNamespace
28from typing import Callable, ClassVar, Mapping, DefaultDict, Set, Dict, Tuple, Optional
30import json
31import lzma
32import networkx as nx
33import pickle
34import struct
35import uuid
37from collections import defaultdict
38from typing import TYPE_CHECKING
40from lsst.utils import doImport
41from lsst.pex.config import Config
42from lsst.daf.butler import DimensionUniverse, Quantum, SerializedDimensionRecord, DimensionRecord
44from .quantumNode import QuantumNode, SerializedQuantumNode
45from ..pipeline import TaskDef
46from ..pipelineTask import PipelineTask
47from ._implDetails import _DatasetTracker, DatasetTypeName
if TYPE_CHECKING:
51 from .graph import QuantumGraph
class StructSizeDescriptor:
    """Descriptor acting as a class-level property.

    Accessing it on a deserializer class (or instance) reports the number of
    bytes occupied by that class's struct format string.
    """

    def __get__(self, inst, owner) -> int:
        # Only the owning class matters: the size is computed from the
        # class's format string, so ``inst`` (None for class access) is
        # deliberately ignored.
        fmt = owner.FMT_STRING()
        return struct.calcsize(fmt)
@dataclass
class DeserializerBase(ABC):
    """Common interface shared by all QuantumGraph save-format deserializers.

    Parameters
    ----------
    preambleSize : `int`
        Number of bytes occupied by the magic-byte preamble of the save file.
    sizeBytes : `bytes`
        Raw bytes encoding the size(s) of the header section, to be unpacked
        according to `FMT_STRING`.
    """

    # Class-level property: struct.calcsize of the subclass's FMT_STRING.
    # Attached to each concrete subclass in __init_subclass__ below.
    structSize: ClassVar[StructSizeDescriptor]

    preambleSize: int
    sizeBytes: bytes

    @classmethod
    @abstractmethod
    def FMT_STRING(cls) -> str:  # noqa: N805 # flake8 wants self
        raise NotImplementedError("Base class does not implement this method")

    def __init_subclass__(cls) -> None:
        # Give every concrete deserializer its own size descriptor so that
        # cls.structSize reflects that subclass's format string.
        cls.structSize = StructSizeDescriptor()
        super().__init_subclass__()

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        """Transform the raw header bytes into a header string.

        Returns `None` when the save format has no string representation of
        its header (such as save format 1, which is all pickle).

        Parameters
        ----------
        rawHeader : `bytes`
            The bytes after the preamble and the size struct, and before the
            ``headerSize`` offset.
        """
        raise NotImplementedError("Base class does not implement this method")

    @property
    def headerSize(self) -> int:
        """The number of bytes from the beginning of the file to the end of
        the metadata section.
        """
        raise NotImplementedError("Base class does not implement this method")

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the supplied raw bytes into the header information and the
        byte ranges of the stored TaskDefs and QuantumNodes.

        Parameters
        ----------
        rawHeader : `bytes`
            The bytes after the preamble and the size struct, and before the
            ``headerSize`` offset.
        """
        raise NotImplementedError("Base class does not implement this method")

    def constructGraph(self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes],
                       universe: DimensionUniverse) -> QuantumGraph:
        """Construct a graph from the deserialized information.

        Parameters
        ----------
        nodes : `set` of `uuid.UUID`
            The nodes to include in the graph.
        _readBytes : callable
            Callable taking start and stop offsets and returning the bytes in
            that range of the file.
        universe : `~lsst.daf.butler.DimensionUniverse`
            The singleton of all dimensions known to the middleware registry.
        """
        raise NotImplementedError("Base class does not implement this method")

    def description(self) -> str:
        """Return a human-readable description of the serialized format."""
        raise NotImplementedError("Base class does not implement this method")
# Human-readable description of save format 1, returned by
# DeserializerV1.description().
Version1Description = """
The save file starts with the first few bytes corresponding to the magic bytes
in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.

The next few bytes are 2 big endian unsigned 64 bit integers.

The first unsigned 64 bit integer corresponds to the number of bytes of a
python mapping of TaskDef labels to the byte ranges in the save file where the
definition can be loaded.

The second unsigned 64 bit integer corresponds to the number of bytes of a
python mapping of QuantumGraph Node number to the byte ranges in the save file
where the node can be loaded. The byte range is indexed starting after
the `header` bytes of the magic bytes, size bytes, and bytes of the two
mappings.

Each of the above mappings are pickled and then lzma compressed, so to
deserialize the bytes, first lzma decompression must be performed and the
results passed to python pickle loader.

As stated above, each map contains byte ranges of the corresponding
datastructure. These bytes are also lzma compressed pickles, and should
be deserialized in a similar manner. The byte range is indexed starting after
the `header` bytes of the magic bytes, size bytes, and bytes of the two
mappings.

In addition to the TaskDef byte locations, the TaskDef map also contains
an additional key '__GraphBuildID'. The value associated with this is the
unique id assigned to the graph at its creation time.
"""
@dataclass
class DeserializerV1(DeserializerBase):
    """Deserializer for the version 1 save format, in which the TaskDef and
    QuantumNode payloads are stored as lzma compressed pickles.

    `readHeaderInfo` must be called before `constructGraph`, as the latter
    relies on the byte-range maps cached on ``self.returnValue``.
    """
    @classmethod
    def FMT_STRING(cls) -> str:
        # Two big endian unsigned 64 bit integers: the byte sizes of the
        # TaskDef map and of the node map, respectively.
        return '>QQ'

    def __post_init__(self) -> None:
        # Decode the two header-map sizes from the raw size bytes.
        self.taskDefMapSize, self.nodeMapSize = struct.unpack(self.FMT_STRING(), self.sizeBytes)

    @property
    def headerSize(self) -> int:
        # Everything before the payload: magic preamble, the size struct,
        # then the two pickled header maps.
        return self.preambleSize + self.structSize + self.taskDefMapSize + self.nodeMapSize

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the raw header bytes into the TaskDef and node byte-range
        maps, caching the result on ``self.returnValue`` for use by
        `constructGraph`.
        """
        # NOTE(review): the header maps are unpickled directly here, without
        # an lzma decompression step — confirm against the format 1 writer
        # before changing either side.
        returnValue = SimpleNamespace()
        # The first taskDefMapSize bytes hold the TaskDef -> byte-range map.
        returnValue.taskDefMap = pickle.loads(rawHeader[:self.taskDefMapSize])
        # The graph build id is stored inside the TaskDef map under a
        # reserved key.
        returnValue._buildId = returnValue.taskDefMap['__GraphBuildID']
        # The remaining bytes hold the node -> byte-range map.
        returnValue.map = pickle.loads(rawHeader[self.taskDefMapSize:])
        # Format 1 predates metadata support.
        returnValue.metadata = None
        self.returnValue = returnValue
        return returnValue

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        # Format 1 is all pickle and has no string form of its header.
        return None

    def constructGraph(self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes],
                       universe: DimensionUniverse):
        """Reconstruct a `QuantumGraph` containing the requested ``nodes``
        from their pickled byte ranges.
        """
        # need to import here to avoid cyclic imports
        from . import QuantumGraph
        quanta: DefaultDict[TaskDef, Set[Quantum]] = defaultdict(set)
        quantumToNodeId: Dict[Quantum, uuid.UUID] = {}
        loadedTaskDef = {}
        # loop over the nodes specified above
        for node in nodes:
            # Get the byte range to read from the map; stored ranges are
            # relative to the end of the header.
            start, stop = self.returnValue.map[node]
            start += self.headerSize
            stop += self.headerSize

            # read the specified bytes; they are lzma compressed, so
            # decompress them first
            dump = lzma.decompress(_readBytes(start, stop))

            # reconstruct node; format 1 stores no node ids, so a fresh uuid
            # is assigned (node ids are therefore not stable across loads)
            qNode = pickle.loads(dump)
            object.__setattr__(qNode, 'nodeId', uuid.uuid4())

            # If this node's TaskDef has already been loaded attach it,
            # otherwise read in and cache the taskDef first
            nodeTask = qNode.taskDef
            if nodeTask not in loadedTaskDef:
                # Get the byte range corresponding to this taskDef
                start, stop = self.returnValue.taskDefMap[nodeTask]
                start += self.headerSize
                stop += self.headerSize

                # TaskDef payloads are lzma compressed pickles as well
                taskDef = pickle.loads(lzma.decompress(_readBytes(start, stop)))
                loadedTaskDef[nodeTask] = taskDef
            # Explicitly work around the "frozen-ness" of nodes to attach the
            # loaded taskDef back onto the un-persisted node
            object.__setattr__(qNode, 'taskDef', loadedTaskDef[nodeTask])
            quanta[qNode.taskDef].add(qNode.quantum)

            # record the node for later processing
            quantumToNodeId[qNode.quantum] = qNode.nodeId

        # construct an empty new QuantumGraph object, and run the associated
        # creation method with the un-persisted data
        qGraph = object.__new__(QuantumGraph)
        qGraph._buildGraphs(quanta, _quantumToNodeId=quantumToNodeId, _buildId=self.returnValue._buildId,
                            metadata=self.returnValue.metadata)
        return qGraph

    def description(self) -> str:
        # Human readable description of the version 1 format.
        return Version1Description
# Human-readable description of save format 2, returned by
# DeserializerV2.description(). Key names match those read by
# DeserializerV2.readHeaderInfo (notably 'GraphBuildID').
Version2Description = """
The save file starts with the first few bytes corresponding to the magic bytes
in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.

The next few bytes are a big endian unsigned long long.

The unsigned long long corresponds to the number of bytes of a python mapping
of header information. This mapping is encoded into json and then lzma
compressed, meaning the operations must be performed in the opposite order to
deserialize.

The json encoded header mapping contains 4 fields: TaskDefs, GraphBuildID,
Nodes, and Metadata.

The `TaskDefs` key corresponds to a value which is a mapping of Task label to
task data. The task data is a mapping of key to value, where the only key is
`bytes` and it corresponds to a tuple of a byte range of the start, stop
bytes (indexed after all the header bytes)

The `GraphBuildID` corresponds with a string that is the unique id assigned to
this graph when it was created.

The `Nodes` key is like the `TaskDefs` key except it corresponds to
QuantumNodes instead of TaskDefs. Another important difference is that JSON
formatting does not allow using numbers as keys, and this mapping is keyed by
the node number. Thus it is stored in JSON as two equal length lists, the first
being the keys, and the second the values associated with those keys.

The `Metadata` key is a mapping of strings to associated values. This metadata
may be anything that is important to be transported alongside the graph.

As stated above, each map contains byte ranges of the corresponding
datastructure. These bytes are also lzma compressed pickles, and should
be deserialized in a similar manner.
"""
@dataclass
class DeserializerV2(DeserializerBase):
    """Deserializer for the version 2 save format, in which the header is a
    single lzma compressed json mapping while the TaskDef and QuantumNode
    payloads remain lzma compressed pickles.

    `readHeaderInfo` must be called before `constructGraph`, as the latter
    relies on the byte-range maps cached on ``self.returnValue``.
    """
    @classmethod
    def FMT_STRING(cls) -> str:
        # One big endian unsigned 64 bit integer: the byte size of the
        # compressed header mapping.
        return '>Q'

    def __post_init__(self) -> None:
        # Decode the header-map size from the raw size bytes.
        self.mapSize, = struct.unpack(self.FMT_STRING(), self.sizeBytes)

    @property
    def headerSize(self) -> int:
        # Magic preamble, the size struct, then the compressed header map.
        return self.preambleSize + self.structSize + self.mapSize

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the raw header bytes into the TaskDef and node byte-range
        maps, caching the result on ``self.returnValue`` for use by
        `constructGraph`.
        """
        uncompressedHeaderMap = self.unpackHeader(rawHeader)
        if uncompressedHeaderMap is None:
            # Unreachable: this class's unpackHeader always returns a str;
            # the check exists only to satisfy type checkers.
            raise ValueError("This error is not possible because self.unpackHeader cannot return None,"
                             " but is done to satisfy type checkers")
        header = json.loads(uncompressedHeaderMap)
        returnValue = SimpleNamespace()
        returnValue.taskDefMap = header['TaskDefs']
        returnValue._buildId = header['GraphBuildID']
        # Nodes are stored in json as paired lists (json cannot key mappings
        # with numbers), so rebuild the mapping here.
        returnValue.map = dict(header['Nodes'])
        returnValue.metadata = header['Metadata']
        self.returnValue = returnValue
        return returnValue

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        # Header is lzma compressed json text.
        return lzma.decompress(rawHeader).decode()

    def constructGraph(self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes],
                       universe: DimensionUniverse):
        """Reconstruct a `QuantumGraph` containing the requested ``nodes``
        from their pickled byte ranges.
        """
        # need to import here to avoid cyclic imports
        from . import QuantumGraph
        quanta: DefaultDict[TaskDef, Set[Quantum]] = defaultdict(set)
        quantumToNodeId: Dict[Quantum, uuid.UUID] = {}
        loadedTaskDef = {}
        # loop over the nodes specified above
        for node in nodes:
            # Get the byte range to read from the map; stored ranges are
            # relative to the end of the header.
            start, stop = self.returnValue.map[node]['bytes']
            start += self.headerSize
            stop += self.headerSize

            # read the specified bytes; they are lzma compressed, so
            # decompress them first
            dump = lzma.decompress(_readBytes(start, stop))

            # reconstruct node; format 2 stores no node uuids, so a fresh
            # uuid is assigned (ids are therefore not stable across loads)
            qNode = pickle.loads(dump)
            object.__setattr__(qNode, 'nodeId', uuid.uuid4())

            # If this node's TaskDef has already been loaded attach it,
            # otherwise read in and cache the taskDef first
            nodeTask = qNode.taskDef
            if nodeTask not in loadedTaskDef:
                # Get the byte range corresponding to this taskDef
                start, stop = self.returnValue.taskDefMap[nodeTask]['bytes']
                start += self.headerSize
                stop += self.headerSize

                # TaskDef payloads are lzma compressed pickles as well
                taskDef = pickle.loads(lzma.decompress(_readBytes(start, stop)))
                loadedTaskDef[nodeTask] = taskDef
            # Explicitly work around the "frozen-ness" of nodes to attach the
            # loaded taskDef back onto the un-persisted node
            object.__setattr__(qNode, 'taskDef', loadedTaskDef[nodeTask])
            quanta[qNode.taskDef].add(qNode.quantum)

            # record the node for later processing
            quantumToNodeId[qNode.quantum] = qNode.nodeId

        # construct an empty new QuantumGraph object, and run the associated
        # creation method with the un-persisted data
        qGraph = object.__new__(QuantumGraph)
        qGraph._buildGraphs(quanta, _quantumToNodeId=quantumToNodeId, _buildId=self.returnValue._buildId,
                            metadata=self.returnValue.metadata)
        return qGraph

    def description(self) -> str:
        # Human readable description of the version 2 format.
        return Version2Description
# Human-readable description of save format 3, returned by
# DeserializerV3.description(). Key names match those read by
# DeserializerV3.readHeaderInfo (notably 'GraphBuildID'); payloads are json,
# not pickle, per DeserializerV3.constructGraph.
Version3Description = """
The save file starts with the first few bytes corresponding to the magic bytes
in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.

The next few bytes are a big endian unsigned long long.

The unsigned long long corresponds to the number of bytes of a mapping
of header information. This mapping is encoded into json and then lzma
compressed, meaning the operations must be performed in the opposite order to
deserialize.

The json encoded header mapping contains 5 fields: GraphBuildID, TaskDefs,
Nodes, Metadata, and DimensionRecords.

The `GraphBuildID` key corresponds with a string that is the unique id assigned
to this graph when it was created.

The `TaskDefs` key corresponds to a value which is a mapping of Task label to
task data. The task data is a mapping of key to value. The keys of this mapping
are `bytes`, `inputs`, and `outputs`.

The `TaskDefs` `bytes` key corresponds to a tuple of a byte range of the
start, stop bytes (indexed after all the header bytes). This byte range
corresponds to a lzma compressed json mapping. This mapping has keys of
`taskName`, corresponding to a fully qualified python class, `config` a
pex_config string that is used to configure the class, and `label` which
corresponds to a string that uniquely identifies the task within a given
execution pipeline.

The `TaskDefs` `inputs` key is associated with a list of tuples where each
tuple is a label of a task that is considered coming before a given task, and
the name of the dataset that is shared between the tasks (think node and edge
in a graph sense).

The `TaskDefs` `outputs` key is like inputs except the values in a list
correspond to all the output connections of a task.

The `Nodes` key is also a json mapping with keys corresponding to the UUIDs of
QuantumNodes. The values associated with these keys is another mapping with
the keys `bytes`, `inputs`, and `outputs`.

`Nodes` key `bytes` corresponds to a tuple of a byte range of the start, stop
bytes (indexed after all the header bytes). These bytes are a lzma compressed
json mapping which contains many sub elements, this mapping will be referred to
as the SerializedQuantumNode (related to the python class it corresponds to).

SerializedQuantumNodes have 3 keys, `quantum` corresponding to a json mapping
(described below) referred to as a SerializedQuantum, `taskLabel` a string
which corresponds to a label in the `TaskDefs` mapping, and `nodeId`.

A SerializedQuantum has many keys; taskName, dataId, datasetTypeMapping,
initInputs, inputs, outputs, dimensionRecords.

The `Nodes` mapping is like the `TaskDefs` mapping except it corresponds to
QuantumNodes instead of TaskDefs, and the keys of the mappings are string
representations of the UUIDs of the QuantumNodes.

The `Metadata` key is a mapping of strings to associated values. This metadata
may be anything that is important to be transported alongside the graph.

As stated above, each map contains byte ranges of the corresponding
datastructure. These bytes are also lzma compressed json, and should
be deserialized in a similar manner.
"""
@dataclass
class DeserializerV3(DeserializerBase):
    """Deserializer for the version 3 save format, in which the header,
    TaskDef, and QuantumNode payloads are all stored as lzma compressed json.

    `readHeaderInfo` must be called before `constructGraph`, as the latter
    relies on the maps cached on ``self.infoMappings``.
    """
    @classmethod
    def FMT_STRING(cls) -> str:
        # One big endian unsigned 64 bit integer: the byte size of the
        # compressed header info mapping.
        return ">Q"

    def __post_init__(self) -> None:
        # Decode the header-info size from the raw size bytes.
        self.infoSize: int
        self.infoSize, = struct.unpack(self.FMT_STRING(), self.sizeBytes)

    @property
    def headerSize(self) -> int:
        # Magic preamble, the size struct, then the compressed info mapping.
        return self.preambleSize + self.structSize + self.infoSize

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the raw header bytes into the byte-range maps and dimension
        records, caching the result on ``self.infoMappings`` for use by
        `constructGraph`.
        """
        uncompressedinfoMap = self.unpackHeader(rawHeader)
        assert uncompressedinfoMap is not None  # for python typing, this variant can't be None
        infoMap = json.loads(uncompressedinfoMap)
        infoMappings = SimpleNamespace()
        infoMappings.taskDefMap = infoMap['TaskDefs']
        infoMappings._buildId = infoMap['GraphBuildID']
        # Node map keys are stored as string uuids; turn them back into
        # uuid.UUID objects here.
        infoMappings.map = {uuid.UUID(k): v for k, v in infoMap['Nodes']}
        infoMappings.metadata = infoMap['Metadata']
        infoMappings.dimensionRecords = {}
        # Dimension records are stored once, keyed by integer, and shared by
        # every node that references them.
        for k, v in infoMap['DimensionRecords'].items():
            infoMappings.dimensionRecords[int(k)] = SerializedDimensionRecord(**v)
        self.infoMappings = infoMappings
        return infoMappings

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        # Header is lzma compressed json text.
        return lzma.decompress(rawHeader).decode()

    def constructGraph(self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes],
                       universe: DimensionUniverse):
        """Reconstruct a `QuantumGraph` containing the requested ``nodes``,
        rebuilding TaskDefs, dataset-type relations, and node connections.
        """
        # need to import here to avoid cyclic imports
        from . import QuantumGraph
        graph = nx.DiGraph()
        loadedTaskDef: Mapping[str, TaskDef] = {}
        # Mapping of node uuid to reconstructed QuantumNode
        container = {}
        # Tracks which TaskDefs consume/produce each dataset type name
        datasetDict = _DatasetTracker[DatasetTypeName, TaskDef](createInverse=True)
        taskToQuantumNode: DefaultDict[TaskDef, Set[QuantumNode]] = defaultdict(set)
        # Shared cache so each dimension record is reconstituted only once
        recontitutedDimensions: Dict[int, Tuple[str, DimensionRecord]] = {}

        for node in nodes:
            # Byte ranges are relative to the end of the header
            start, stop = self.infoMappings.map[node]['bytes']
            start, stop = start + self.headerSize, stop + self.headerSize
            # Read in the bytes corresponding to the node to load and
            # decompress it
            dump = json.loads(lzma.decompress(_readBytes(start, stop)))

            # Turn the json back into the pydantic model
            nodeDeserialized = SerializedQuantumNode.direct(**dump)
            # attach the dictionary of dimension records to the pydantic
            # model; these are stored separately because they are repeated
            # over and over, and this saves a lot of space and time.
            nodeDeserialized.quantum.dimensionRecords = self.infoMappings.dimensionRecords
            # get the label for the current task
            nodeTaskLabel = nodeDeserialized.taskLabel

            if nodeTaskLabel not in loadedTaskDef:
                # Get the byte ranges corresponding to this taskDef
                start, stop = self.infoMappings.taskDefMap[nodeTaskLabel]['bytes']
                start, stop = start + self.headerSize, stop + self.headerSize

                # bytes are compressed, so decompress them
                taskDefDump = json.loads(lzma.decompress(_readBytes(start, stop)))
                # import the task class and rebuild its configuration
                taskClass: PipelineTask = doImport(taskDefDump['taskName'])  # type: ignore
                config: Config = taskClass.ConfigClass()  # type: ignore
                config.loadFromStream(taskDefDump['config'])
                # Rebuild TaskDef
                recreatedTaskDef = TaskDef(taskName=taskDefDump['taskName'],
                                           taskClass=taskClass,
                                           config=config,
                                           label=taskDefDump['label'])
                loadedTaskDef[nodeTaskLabel] = recreatedTaskDef

                # rebuild the mappings that associate dataset type names with
                # TaskDefs
                for _, input in self.infoMappings.taskDefMap[nodeTaskLabel]['inputs']:
                    datasetDict.addConsumer(DatasetTypeName(input), recreatedTaskDef)

                # register each distinct output dataset type exactly once
                added = set()
                for outputConnection in self.infoMappings.taskDefMap[nodeTaskLabel]['outputs']:
                    typeName = outputConnection[1]
                    if typeName not in added:
                        added.add(typeName)
                        datasetDict.addProducer(DatasetTypeName(typeName), recreatedTaskDef)

            # reconstitute the node, passing in the dictionaries for the
            # loaded TaskDefs and dimension records. These are used to ensure
            # that each unique record is only loaded once
            node = QuantumNode.from_simple(nodeDeserialized, loadedTaskDef, universe, recontitutedDimensions)
            container[node.nodeId] = node
            taskToQuantumNode[loadedTaskDef[nodeTaskLabel]].add(node)

            # recreate the relations between each node from stored info
            graph.add_node(node)
            for id in self.infoMappings.map[node.nodeId]['inputs']:
                # uuid is stored as a string, turn it back into a uuid
                id = uuid.UUID(id)
                # if the id is not yet in the container, dont make a
                # connection; this is not an issue, because once it is, that
                # id will add the reverse connection
                if id in container:
                    graph.add_edge(container[id], node)
            for id in self.infoMappings.map[node.nodeId]['outputs']:
                # uuid is stored as a string, turn it back into a uuid
                id = uuid.UUID(id)
                # same deferral as above for output nodes not yet loaded
                if id in container:
                    graph.add_edge(node, container[id])

        # Assemble the QuantumGraph directly, bypassing __init__, since all
        # derived state has been rebuilt above
        newGraph = object.__new__(QuantumGraph)
        newGraph._metadata = self.infoMappings.metadata
        newGraph._buildId = self.infoMappings._buildId
        newGraph._datasetDict = datasetDict
        newGraph._nodeIdMap = container
        newGraph._count = len(nodes)
        newGraph._taskToQuantumNode = dict(taskToQuantumNode.items())
        newGraph._taskGraph = datasetDict.makeNetworkXGraph()
        newGraph._connectedQuanta = graph
        return newGraph

    def description(self) -> str:
        # Human readable description of the version 3 format.
        return Version3Description
# Registry mapping a save file's format version number to the deserializer
# class that understands that version (exported via __all__).
DESERIALIZER_MAP = {1: DeserializerV1, 2: DeserializerV2, 3: DeserializerV3}