Coverage for python/lsst/pipe/base/graph/_versionDeserializers.py: 32%
231 statements
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-30 10:31 +0000
« prev ^ index » next coverage.py v6.4.1, created at 2022-06-30 10:31 +0000
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("DESERIALIZER_MAP",)
25import json
26import lzma
27import pickle
28import struct
29import uuid
30from abc import ABC, abstractmethod
31from collections import defaultdict
32from dataclasses import dataclass
33from types import SimpleNamespace
34from typing import TYPE_CHECKING, Callable, ClassVar, DefaultDict, Dict, Optional, Set, Tuple, Type
36import networkx as nx
37from lsst.daf.butler import DimensionRecord, DimensionUniverse, Quantum, SerializedDimensionRecord
38from lsst.pex.config import Config
39from lsst.utils import doImportType
41from ..pipeline import TaskDef
42from ..pipelineTask import PipelineTask
43from ._implDetails import DatasetTypeName, _DatasetTracker
44from .quantumNode import QuantumNode, SerializedQuantumNode
46if TYPE_CHECKING: 46 ↛ 47line 46 didn't jump to line 47, because the condition on line 46 was never true
47 from .graph import QuantumGraph
class StructSizeDescriptor:
    """A class-level "property" that reports the size in bytes (per
    `struct.calcsize`) of the owning deserializer's format string.
    """

    def __get__(self, instance: Optional[DeserializerBase], owner: Type[DeserializerBase]) -> int:
        # The format string lives on the class, so only the owner matters;
        # the instance (which may be None for class-level access) is unused.
        fmt = owner.FMT_STRING()
        return struct.calcsize(fmt)
# MyPy doesn't seem to like the idea of an abstract dataclass. It seems to
# work, but maybe we're doing something that isn't really supported (or maybe
# I misunderstood the error message).
@dataclass # type: ignore
class DeserializerBase(ABC):
    """Base class for version-specific `QuantumGraph` save-file
    deserializers.

    Parameters
    ----------
    preambleSize : `int`
        The number of bytes in the file preamble (magic bytes) that precede
        the struct-packed size field(s).
    sizeBytes : `bytes`
        The raw bytes that, when unpacked with ``FMT_STRING``, give the
        size(s) of the header section(s) for this save format.
    """

    @classmethod
    @abstractmethod
    def FMT_STRING(cls) -> str:  # noqa: N805 # flake8 wants self
        """Return the `struct` format string describing this save format's
        size field(s).
        """
        raise NotImplementedError("Base class does not implement this method")

    # Class-level "property" equal to struct.calcsize(FMT_STRING());
    # attached to every subclass by __init_subclass__ below.
    structSize: ClassVar[StructSizeDescriptor]

    preambleSize: int
    sizeBytes: bytes

    def __init_subclass__(cls) -> None:
        # attach the size descriptor
        cls.structSize = StructSizeDescriptor()
        super().__init_subclass__()

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        """Transforms the raw bytes corresponding to the header of a save
        into a string of the header information.

        Parameters
        ----------
        rawHeader : `bytes`
            The bytes that are to be parsed into the header information. These
            are the bytes after the preamble and structSize number of bytes
            and before the headerSize bytes.

        Returns
        -------
        header : `str` or `None`
            The header as a string, or `None` if the save format has no
            header string implementation (such as save format 1 that is all
            pickle).
        """
        raise NotImplementedError("Base class does not implement this method")

    @property
    def headerSize(self) -> int:
        """Returns the number of bytes from the beginning of the file to the
        end of the metadata.
        """
        raise NotImplementedError("Base class does not implement this method")

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the supplied raw bytes into the header information and
        byte ranges of specific TaskDefs and QuantumNodes.

        Parameters
        ----------
        rawHeader : `bytes`
            The bytes that are to be parsed into the header information. These
            are the bytes after the preamble and structSize number of bytes
            and before the headerSize bytes.
        """
        raise NotImplementedError("Base class does not implement this method")

    def constructGraph(
        self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes], universe: DimensionUniverse
    ) -> QuantumGraph:
        """Constructs a graph from the deserialized information.

        Parameters
        ----------
        nodes : `set` of `uuid.UUID`
            The nodes to include in the graph.
        _readBytes : callable
            A callable that can be used to read bytes from the file handle.
            The callable will take two ints, start and stop, to use as the
            numerical bounds to read and returns a byte stream.
        universe : `~lsst.daf.butler.DimensionUniverse`
            The singleton of all dimensions known to the middleware registry.
        """
        raise NotImplementedError("Base class does not implement this method")

    def description(self) -> str:
        """Return the description of the serialized data format."""
        raise NotImplementedError("Base class does not implement this method")
# Human-readable description of save format version 1, returned by
# DeserializerV1.description().
Version1Description = """
The save file starts with the first few bytes corresponding to the magic bytes
in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.

The next few bytes are 2 big endian unsigned 64 bit integers.

The first unsigned 64 bit integer corresponds to the number of bytes of a
python mapping of TaskDef labels to the byte ranges in the save file where the
definition can be loaded.

The second unsigned 64 bit integer corresponds to the number of bytes of a
python mapping of QuantumGraph Node number to the byte ranges in the save file
where the node can be loaded. The byte range is indexed starting after
the `header` bytes of the magic bytes, size bytes, and bytes of the two
mappings.

Each of the above mappings are pickled and then lzma compressed, so to
deserialize the bytes, first lzma decompression must be performed and the
results passed to python pickle loader.

As stated above, each map contains byte ranges of the corresponding
datastructure. These bytes are also lzma compressed pickles, and should
be deserialized in a similar manner. The byte range is indexed starting after
the `header` bytes of the magic bytes, size bytes, and bytes of the two
mappings.

In addition to the TaskDef byte locations, the TaskDef map also contains
an additional key '__GraphBuildID'. The value associated with this is the
unique id assigned to the graph at its creation time.
"""
@dataclass
class DeserializerV1(DeserializerBase):
    """Deserializer for `QuantumGraph` save format version 1, in which the
    header maps and every TaskDef / QuantumNode payload are pickles.
    """

    @classmethod
    def FMT_STRING(cls) -> str:
        # Two big-endian unsigned 64 bit integers: the byte sizes of the
        # pickled TaskDef map and of the pickled node map.
        return ">QQ"

    def __post_init__(self) -> None:
        # Unpack the two header-map sizes from the raw size bytes.
        self.taskDefMapSize, self.nodeMapSize = struct.unpack(self.FMT_STRING(), self.sizeBytes)

    @property
    def headerSize(self) -> int:
        # Everything before the per-node / per-taskDef payload bytes.
        return self.preambleSize + self.structSize + self.taskDefMapSize + self.nodeMapSize

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the raw header bytes into the TaskDef and node byte-range
        maps.

        NOTE(review): the header maps are unpickled here without lzma
        decompression, while the per-node payloads in ``constructGraph`` are
        decompressed first — confirm against the version 1 writer.
        """
        returnValue = SimpleNamespace()
        # First taskDefMapSize bytes hold the TaskDef -> byte-range map
        returnValue.taskDefMap = pickle.loads(rawHeader[: self.taskDefMapSize])
        # The build id is stored in the same map under a reserved key
        returnValue._buildId = returnValue.taskDefMap["__GraphBuildID"]
        # The remaining bytes hold the node -> byte-range map
        returnValue.map = pickle.loads(rawHeader[self.taskDefMapSize :])
        # Version 1 saves carry no metadata
        returnValue.metadata = None
        self.returnValue = returnValue
        return returnValue

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        # Version 1 has no human-readable header string representation.
        return None

    def constructGraph(
        self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes], universe: DimensionUniverse
    ) -> QuantumGraph:
        """Reconstruct a `QuantumGraph` from the pickled node and taskDef
        payloads.

        NOTE: this unpickles data read from the file; only load graphs from
        trusted sources.
        """
        # need to import here to avoid cyclic imports
        from . import QuantumGraph

        quanta: DefaultDict[TaskDef, Set[Quantum]] = defaultdict(set)
        quantumToNodeId: Dict[Quantum, uuid.UUID] = {}
        loadedTaskDef = {}
        # loop over the nodes specified above
        for node in nodes:
            # Get the bytes to read from the map
            start, stop = self.returnValue.map[node]
            start += self.headerSize
            stop += self.headerSize

            # read the specified bytes, will be overloaded by subclasses
            # bytes are compressed, so decompress them
            dump = lzma.decompress(_readBytes(start, stop))

            # reconstruct node
            qNode = pickle.loads(dump)
            # v1 saves predate stable node ids, so assign a fresh uuid on load
            object.__setattr__(qNode, "nodeId", uuid.uuid4())

            # read the saved node, name. If it has been loaded, attach it, if
            # not read in the taskDef first, and then load it
            nodeTask = qNode.taskDef
            if nodeTask not in loadedTaskDef:
                # Get the byte ranges corresponding to this taskDef
                start, stop = self.returnValue.taskDefMap[nodeTask]
                start += self.headerSize
                stop += self.headerSize

                # load the taskDef, this method call will be overloaded by
                # subclasses.
                # bytes are compressed, so decompress them
                taskDef = pickle.loads(lzma.decompress(_readBytes(start, stop)))
                loadedTaskDef[nodeTask] = taskDef
            # Explicitly overload the "frozen-ness" of nodes to attach the
            # taskDef back into the un-persisted node
            object.__setattr__(qNode, "taskDef", loadedTaskDef[nodeTask])
            quanta[qNode.taskDef].add(qNode.quantum)

            # record the node for later processing
            quantumToNodeId[qNode.quantum] = qNode.nodeId

        # construct an empty new QuantumGraph object, and run the associated
        # creation method with the un-persisted data
        qGraph = object.__new__(QuantumGraph)
        qGraph._buildGraphs(
            quanta,
            _quantumToNodeId=quantumToNodeId,
            _buildId=self.returnValue._buildId,
            metadata=self.returnValue.metadata,
        )
        return qGraph

    def description(self) -> str:
        """Return the description of the serialized data format."""
        return Version1Description
# Human-readable description of save format version 2, returned by
# DeserializerV2.description().
Version2Description = """
The save file starts with the first few bytes corresponding to the magic bytes
in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.

The next few bytes are a big endian unsigned long long.

The unsigned long long corresponds to the number of bytes of a python mapping
of header information. This mapping is encoded into json and then lzma
compressed, meaning the operations must be performed in the opposite order to
deserialize.

The json encoded header mapping contains 4 fields: TaskDefs, GraphBuildId,
Nodes, and Metadata.

The `TaskDefs` key corresponds to a value which is a mapping of Task label to
task data. The task data is a mapping of key to value, where the only key is
`bytes` and it corresponds to a tuple of a byte range of the start, stop
bytes (indexed after all the header bytes).

The `GraphBuildId` corresponds with a string that is the unique id assigned to
this graph when it was created.

The `Nodes` key is like the `TaskDefs` key except it corresponds to
QuantumNodes instead of TaskDefs. Another important difference is that JSON
formatting does not allow using numbers as keys, and this mapping is keyed by
the node number. Thus it is stored in JSON as two equal length lists, the first
being the keys, and the second the values associated with those keys.

The `Metadata` key is a mapping of strings to associated values. This metadata
may be anything that is important to be transported alongside the graph.

As stated above, each map contains byte ranges of the corresponding
datastructure. These bytes are also lzma compressed pickles, and should
be deserialized in a similar manner.
"""
@dataclass
class DeserializerV2(DeserializerBase):
    """Deserializer for `QuantumGraph` save format version 2, which stores a
    single lzma compressed json header map; node and taskDef payloads remain
    lzma compressed pickles.
    """

    @classmethod
    def FMT_STRING(cls) -> str:
        # One big-endian unsigned 64 bit integer: the header map's byte size.
        return ">Q"

    def __post_init__(self) -> None:
        # Unpack the single header-map size from the raw size bytes.
        (self.mapSize,) = struct.unpack(self.FMT_STRING(), self.sizeBytes)

    @property
    def headerSize(self) -> int:
        # preamble + packed size field + compressed header map
        return self.preambleSize + self.structSize + self.mapSize

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the compressed json header into the TaskDef map, node map,
        build id, and metadata.
        """
        uncompressedHeaderMap = self.unpackHeader(rawHeader)
        if uncompressedHeaderMap is None:
            raise ValueError(
                "This error is not possible because self.unpackHeader cannot return None,"
                " but is done to satisfy type checkers"
            )
        header = json.loads(uncompressedHeaderMap)
        returnValue = SimpleNamespace()
        returnValue.taskDefMap = header["TaskDefs"]
        returnValue._buildId = header["GraphBuildID"]
        # Nodes are saved as paired (key, value) lists because json cannot
        # key mappings by number; dict() re-zips them into a mapping
        returnValue.map = dict(header["Nodes"])
        returnValue.metadata = header["Metadata"]
        self.returnValue = returnValue
        return returnValue

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        # Header is lzma compressed utf-8 encoded json
        return lzma.decompress(rawHeader).decode()

    def constructGraph(
        self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes], universe: DimensionUniverse
    ) -> QuantumGraph:
        """Reconstruct a `QuantumGraph` from the pickled node and taskDef
        payloads referenced by the json header.

        NOTE: this unpickles data read from the file; only load graphs from
        trusted sources.
        """
        # need to import here to avoid cyclic imports
        from . import QuantumGraph

        quanta: DefaultDict[TaskDef, Set[Quantum]] = defaultdict(set)
        quantumToNodeId: Dict[Quantum, uuid.UUID] = {}
        loadedTaskDef = {}
        # loop over the nodes specified above
        for node in nodes:
            # Get the bytes to read from the map
            start, stop = self.returnValue.map[node]["bytes"]
            start += self.headerSize
            stop += self.headerSize

            # read the specified bytes, will be overloaded by subclasses
            # bytes are compressed, so decompress them
            dump = lzma.decompress(_readBytes(start, stop))

            # reconstruct node
            qNode = pickle.loads(dump)
            # v2 saves predate stable node ids, so assign a fresh uuid on load
            object.__setattr__(qNode, "nodeId", uuid.uuid4())

            # read the saved node, name. If it has been loaded, attach it, if
            # not read in the taskDef first, and then load it
            nodeTask = qNode.taskDef
            if nodeTask not in loadedTaskDef:
                # Get the byte ranges corresponding to this taskDef
                start, stop = self.returnValue.taskDefMap[nodeTask]["bytes"]
                start += self.headerSize
                stop += self.headerSize

                # load the taskDef, this method call will be overloaded by
                # subclasses.
                # bytes are compressed, so decompress them
                taskDef = pickle.loads(lzma.decompress(_readBytes(start, stop)))
                loadedTaskDef[nodeTask] = taskDef
            # Explicitly overload the "frozen-ness" of nodes to attach the
            # taskDef back into the un-persisted node
            object.__setattr__(qNode, "taskDef", loadedTaskDef[nodeTask])
            quanta[qNode.taskDef].add(qNode.quantum)

            # record the node for later processing
            quantumToNodeId[qNode.quantum] = qNode.nodeId

        # construct an empty new QuantumGraph object, and run the associated
        # creation method with the un-persisted data
        qGraph = object.__new__(QuantumGraph)
        qGraph._buildGraphs(
            quanta,
            _quantumToNodeId=quantumToNodeId,
            _buildId=self.returnValue._buildId,
            metadata=self.returnValue.metadata,
        )
        return qGraph

    def description(self) -> str:
        """Return the description of the serialized data format."""
        return Version2Description
# Human-readable description of save format version 3, the current
# json-based format.
Version3Description = """
The save file starts with the first few bytes corresponding to the magic bytes
in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.

The next few bytes are a big endian unsigned long long.

The unsigned long long corresponds to the number of bytes of a mapping
of header information. This mapping is encoded into json and then lzma
compressed, meaning the operations must be performed in the opposite order to
deserialize.

The json encoded header mapping contains 5 fields: GraphBuildId, TaskDefs,
Nodes, Metadata, and DimensionRecords.

The `GraphBuildId` key corresponds with a string that is the unique id assigned
to this graph when it was created.

The `TaskDefs` key corresponds to a value which is a mapping of Task label to
task data. The task data is a mapping of key to value. The keys of this mapping
are `bytes`, `inputs`, and `outputs`.

The `TaskDefs` `bytes` key corresponds to a tuple of a byte range of the
start, stop bytes (indexed after all the header bytes). This byte range
corresponds to a lzma compressed json mapping. This mapping has keys of
`taskName`, corresponding to a fully qualified python class, `config` a
pex_config string that is used to configure the class, and `label` which
corresponds to a string that uniquely identifies the task within a given
execution pipeline.

The `TaskDefs` `inputs` key is associated with a list of tuples where each
tuple is a label of a task that is considered coming before a given task, and
the name of the dataset that is shared between the tasks (think node and edge
in a graph sense).

The `TaskDefs` `outputs` key is like inputs except the values in a list
correspond to all the output connections of a task.

The `Nodes` key is also a json mapping with keys corresponding to the UUIDs of
QuantumNodes. The values associated with these keys is another mapping with
the keys `bytes`, `inputs`, and `outputs`.

`Nodes` key `bytes` corresponds to a tuple of a byte range of the start, stop
bytes (indexed after all the header bytes). These bytes are a lzma compressed
json mapping which contains many sub elements, this mapping will be referred to
as the SerializedQuantumNode (related to the python class it corresponds to).

SerializedQuantumNodes have 3 keys, `quantum` corresponding to a json mapping
(described below) referred to as a SerializedQuantum, `taskLabel` a string
which corresponds to a label in the `TaskDefs` mapping, and `nodeId`.

A SerializedQuantum has many keys; taskName, dataId, datasetTypeMapping,
initInputs, inputs, outputs, dimensionRecords.

The `Nodes` mapping is like the `TaskDefs` mapping except it corresponds to
QuantumNodes instead of TaskDefs, and the keys of the mappings are string
representations of the UUIDs of the QuantumNodes.

The `Metadata` key is a mapping of strings to associated values. This metadata
may be anything that is important to be transported alongside the graph.

As stated above, each map contains byte ranges of the corresponding
datastructure. These bytes are also lzma compressed pickles, and should
be deserialized in a similar manner.
"""
@dataclass
class DeserializerV3(DeserializerBase):
    """Deserializer for `QuantumGraph` save format version 3, which is json
    based: the header and all node / taskDef payloads are lzma compressed
    json rather than pickles.
    """

    @classmethod
    def FMT_STRING(cls) -> str:
        # One big-endian unsigned 64 bit integer: the header map's byte size.
        return ">Q"

    def __post_init__(self) -> None:
        # Unpack the single header-map size from the raw size bytes.
        self.infoSize: int
        (self.infoSize,) = struct.unpack(self.FMT_STRING(), self.sizeBytes)

    @property
    def headerSize(self) -> int:
        # preamble + packed size field + compressed header map
        return self.preambleSize + self.structSize + self.infoSize

    def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
        """Parse the lzma-compressed json header into its component
        mappings: TaskDefs, build id, node map, metadata, and dimension
        records.
        """
        uncompressedinfoMap = self.unpackHeader(rawHeader)
        assert uncompressedinfoMap is not None  # for python typing, this variant can't be None
        infoMap = json.loads(uncompressedinfoMap)
        infoMappings = SimpleNamespace()
        infoMappings.taskDefMap = infoMap["TaskDefs"]
        infoMappings._buildId = infoMap["GraphBuildID"]
        # Nodes are stored as (key, value) pairs with string uuid keys
        infoMappings.map = {uuid.UUID(k): v for k, v in infoMap["Nodes"]}
        infoMappings.metadata = infoMap["Metadata"]
        infoMappings.dimensionRecords = {}
        # json keys must be strings, so convert the record indices back to int
        for k, v in infoMap["DimensionRecords"].items():
            infoMappings.dimensionRecords[int(k)] = SerializedDimensionRecord(**v)
        self.infoMappings = infoMappings
        return infoMappings

    def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
        # Header is lzma compressed utf-8 encoded json
        return lzma.decompress(rawHeader).decode()

    def constructGraph(
        self, nodes: set[uuid.UUID], _readBytes: Callable[[int, int], bytes], universe: DimensionUniverse
    ) -> QuantumGraph:
        """Reconstruct a `QuantumGraph` from the json node and taskDef
        payloads referenced by the header, rebuilding task definitions,
        dataset producer/consumer relations, and the node connection graph.
        """
        # need to import here to avoid cyclic imports
        from . import QuantumGraph

        graph = nx.DiGraph()
        loadedTaskDef: Dict[str, TaskDef] = {}
        container = {}
        datasetDict = _DatasetTracker[DatasetTypeName, TaskDef](createInverse=True)
        taskToQuantumNode: DefaultDict[TaskDef, Set[QuantumNode]] = defaultdict(set)
        recontitutedDimensions: Dict[int, Tuple[str, DimensionRecord]] = {}

        for node in nodes:
            start, stop = self.infoMappings.map[node]["bytes"]
            start, stop = start + self.headerSize, stop + self.headerSize
            # Read in the bytes corresponding to the node to load and
            # decompress it
            dump = json.loads(lzma.decompress(_readBytes(start, stop)))

            # Turn the json back into the pydantic model
            nodeDeserialized = SerializedQuantumNode.direct(**dump)
            # attach the dictionary of dimension records to the pydantic model
            # these are stored separately because they are repeated over and
            # over and this saves a lot of space and time.
            nodeDeserialized.quantum.dimensionRecords = self.infoMappings.dimensionRecords
            # get the label for the current task
            nodeTaskLabel = nodeDeserialized.taskLabel

            if nodeTaskLabel not in loadedTaskDef:
                # Get the byte ranges corresponding to this taskDef
                start, stop = self.infoMappings.taskDefMap[nodeTaskLabel]["bytes"]
                start, stop = start + self.headerSize, stop + self.headerSize

                # bytes are compressed, so decompress them
                taskDefDump = json.loads(lzma.decompress(_readBytes(start, stop)))
                # Import the task class and rebuild its configuration
                taskClass: Type[PipelineTask] = doImportType(taskDefDump["taskName"])
                config: Config = taskClass.ConfigClass()
                config.loadFromStream(taskDefDump["config"])
                # Rebuild TaskDef
                recreatedTaskDef = TaskDef(
                    taskName=taskDefDump["taskName"],
                    taskClass=taskClass,
                    config=config,
                    label=taskDefDump["label"],
                )
                loadedTaskDef[nodeTaskLabel] = recreatedTaskDef

                # rebuild the mappings that associate dataset type names with
                # TaskDefs
                for _, input in self.infoMappings.taskDefMap[nodeTaskLabel]["inputs"]:
                    datasetDict.addConsumer(DatasetTypeName(input), recreatedTaskDef)

                # a task may emit the same dataset type through more than one
                # connection; only register each type as produced once
                added = set()
                for outputConnection in self.infoMappings.taskDefMap[nodeTaskLabel]["outputs"]:
                    typeName = outputConnection[1]
                    if typeName not in added:
                        added.add(typeName)
                        datasetDict.addProducer(DatasetTypeName(typeName), recreatedTaskDef)

            # reconstitute the node, passing in the dictionaries for the
            # loaded TaskDefs and dimension records. These are used to ensure
            # that each unique record is only loaded once
            qnode = QuantumNode.from_simple(nodeDeserialized, loadedTaskDef, universe, recontitutedDimensions)
            container[qnode.nodeId] = qnode
            taskToQuantumNode[loadedTaskDef[nodeTaskLabel]].add(qnode)

            # recreate the relations between each node from stored info
            graph.add_node(qnode)
            for id in self.infoMappings.map[qnode.nodeId]["inputs"]:
                # uuid is stored as a string, turn it back into a uuid
                id = uuid.UUID(id)
                # if the id is not yet in the container, dont make a connection
                # this is not an issue, because once it is, that id will add
                # the reverse connection
                if id in container:
                    graph.add_edge(container[id], qnode)
            for id in self.infoMappings.map[qnode.nodeId]["outputs"]:
                # uuid is stored as a string, turn it back into a uuid
                id = uuid.UUID(id)
                # if the id is not yet in the container, dont make a connection
                # this is not an issue, because once it is, that id will add
                # the reverse connection
                if id in container:
                    graph.add_edge(qnode, container[id])

        # Assemble the QuantumGraph without running __init__; the graph
        # structures were fully built above
        newGraph = object.__new__(QuantumGraph)
        newGraph._metadata = self.infoMappings.metadata
        newGraph._buildId = self.infoMappings._buildId
        newGraph._datasetDict = datasetDict
        newGraph._nodeIdMap = container
        newGraph._count = len(nodes)
        newGraph._taskToQuantumNode = dict(taskToQuantumNode.items())
        newGraph._taskGraph = datasetDict.makeNetworkXGraph()
        newGraph._connectedQuanta = graph
        return newGraph

    def description(self) -> str:
        """Return the description of the serialized data format.

        Fix: unlike DeserializerV1/V2, this override was missing, so calling
        ``description()`` on a V3 deserializer raised ``NotImplementedError``
        from the base class even though ``Version3Description`` exists.
        """
        return Version3Description
# Registry mapping an on-disk save format version number to the deserializer
# implementation that understands it.
DESERIALIZER_MAP = {
    version: deserializer
    for version, deserializer in enumerate(
        (DeserializerV1, DeserializerV2, DeserializerV3), start=1
    )
}