Coverage for python/lsst/pipe/base/graph/_versionDeserializers.py: 30%
240 statements
coverage.py v6.4.2, created at 2022-08-04 09:17 +0000
1# This file is part of pipe_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23__all__ = ("DESERIALIZER_MAP",)
25import json
26import lzma
27import pickle
28import struct
29import uuid
30from abc import ABC, abstractmethod
31from collections import defaultdict
32from dataclasses import dataclass
33from types import SimpleNamespace
34from typing import TYPE_CHECKING, Callable, ClassVar, DefaultDict, Dict, Optional, Set, Tuple, Type
36import networkx as nx
37from lsst.daf.butler import (
38 DimensionConfig,
39 DimensionRecord,
40 DimensionUniverse,
41 Quantum,
42 SerializedDimensionRecord,
43)
44from lsst.utils import doImportType
46from ..config import PipelineTaskConfig
47from ..pipeline import TaskDef
48from ..pipelineTask import PipelineTask
49from ._implDetails import DatasetTypeName, _DatasetTracker
50from .quantumNode import QuantumNode, SerializedQuantumNode
52if TYPE_CHECKING:
53 from .graph import QuantumGraph
56class StructSizeDescriptor:
57 """This is basically a class level property. It exists to report the size
58 (number of bytes) of whatever the formatter string is for a deserializer
59 """
61 def __get__(self, inst: Optional[DeserializerBase], owner: Type[DeserializerBase]) -> int:
62 return struct.calcsize(owner.FMT_STRING())
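For orientation, the descriptor simply delegates to `struct.calcsize` on the subclass's format string. A minimal check (illustrative only, not part of this module):

import struct

# DeserializerV1 uses ">QQ": two big-endian unsigned 64-bit integers.
assert struct.calcsize(">QQ") == 16
# DeserializerV2 and DeserializerV3 use ">Q": one unsigned 64-bit integer.
assert struct.calcsize(">Q") == 8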
65# MyPy doesn't seem to like the idea of an abstract dataclass. It seems to
66# work, but maybe we're doing something that isn't really supported (or maybe
67# I misunderstood the error message).
68@dataclass # type: ignore
69class DeserializerBase(ABC):
70 @classmethod
71 @abstractmethod
72 def FMT_STRING(cls) -> str: # noqa: N805 # flake8 wants self
73 raise NotImplementedError("Base class does not implement this method")
75 structSize: ClassVar[StructSizeDescriptor]
77 preambleSize: int
78 sizeBytes: bytes
80 def __init_subclass__(cls) -> None:
81 # attach the size descriptor
82 cls.structSize = StructSizeDescriptor()
83 super().__init_subclass__()
85 def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
86 """Transforms the raw bytes corresponding to the header of a save into
87 a string of the header information. Returns none if the save format has
88 no header string implementation (such as save format 1 that is all
89 pickle)
91 Parameters
92 ----------
93 rawheader : bytes
94 The bytes that are to be parsed into the header information. These
95 are the bytes after the preamble and structsize number of bytes
96 and before the headerSize bytes
97 """
98 raise NotImplementedError("Base class does not implement this method")
100 @property
101 def headerSize(self) -> int:
102 """Returns the number of bytes from the beginning of the file to the
103 end of the metadata.
104 """
105 raise NotImplementedError("Base class does not implement this method")
107 def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
108 """Parse the supplied raw bytes into the header information and
109 byte ranges of specific TaskDefs and QuantumNodes
111 Parameters
112 ----------
113 rawheader : bytes
114 The bytes that are to be parsed into the header information. These
115 are the bytes after the preamble and structsize number of bytes
116 and before the headerSize bytes
117 """
118 raise NotImplementedError("Base class does not implement this method")
120 def constructGraph(
121 self,
122 nodes: set[uuid.UUID],
123 _readBytes: Callable[[int, int], bytes],
124 universe: Optional[DimensionUniverse] = None,
125 ) -> QuantumGraph:
126 """Constructs a graph from the deserialized information.
128 Parameters
129 ----------
130 nodes : `set` of `uuid.UUID`
131 The nodes to include in the graph
132 _readBytes : callable
133 A callable that can be used to read bytes from the file handle.
134 The callable will take two ints, start and stop, to use as the
135 numerical bounds to read and returns a byte stream.
136 universe : `~lsst.daf.butler.DimensionUniverse`
137 The singleton of all dimensions known to the middleware registry
138 """
139 raise NotImplementedError("Base class does not implement this method")
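For illustration, a `_readBytes` callable satisfying this contract could be built over an open binary file handle; the `makeReadBytes` helper below is hypothetical, not part of this module:

def makeReadBytes(fh):
    # Return a callable that reads the byte range between start and stop
    # from the supplied file handle.
    def _readBytes(start: int, stop: int) -> bytes:
        fh.seek(start)
        return fh.read(stop - start)

    return _readBytes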
141 def description(self) -> str:
142 """Return the description of the serialized data format"""
143 raise NotImplementedError("Base class does not implement this method")
146Version1Description = """
147The save file starts with the first few bytes corresponding to the magic bytes
148in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.
150The next few bytes are two big-endian unsigned 64-bit integers.
152The first unsigned 64-bit integer corresponds to the number of bytes of a
153python mapping of TaskDef labels to the byte ranges in the save file where the
154definition can be loaded.
156The second unsigned 64-bit integer corresponds to the number of bytes of a
157python mapping of QuantumGraph node number to the byte ranges in the save file
158where the node can be loaded. The byte range is indexed starting after
159the `header` bytes, i.e. after the magic bytes, the size bytes, and the
160bytes of the two mappings.
162Each of the above mappings is pickled and then lzma compressed, so to
163deserialize the bytes, lzma decompression must be performed first and the
164result passed to the python pickle loader.
166As stated above, each map contains byte ranges of the corresponding
167data structure. These bytes are also lzma-compressed pickles, and should
168be deserialized in a similar manner. The byte range is indexed starting after
169the `header` bytes, i.e. after the magic bytes, the size bytes, and the
170bytes of the two mappings.
172In addition to the TaskDef byte locations, the TaskDef map also contains
173an additional key '__GraphBuildID'. The value associated with this key is the
174unique id assigned to the graph at its creation time.
175"""
178@dataclass
179class DeserializerV1(DeserializerBase):
180 @classmethod
181 def FMT_STRING(cls) -> str:
182 return ">QQ"
184 def __post_init__(self) -> None:
185 self.taskDefMapSize, self.nodeMapSize = struct.unpack(self.FMT_STRING(), self.sizeBytes)
187 @property
188 def headerSize(self) -> int:
189 return self.preambleSize + self.structSize + self.taskDefMapSize + self.nodeMapSize
191 def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
192 returnValue = SimpleNamespace()
193 returnValue.taskDefMap = pickle.loads(rawHeader[: self.taskDefMapSize])
194 returnValue._buildId = returnValue.taskDefMap["__GraphBuildID"]
195 returnValue.map = pickle.loads(rawHeader[self.taskDefMapSize :])
196 returnValue.metadata = None
197 self.returnValue = returnValue
198 return returnValue
200 def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
201 return None
203 def constructGraph(
204 self,
205 nodes: set[uuid.UUID],
206 _readBytes: Callable[[int, int], bytes],
207 universe: Optional[DimensionUniverse] = None,
208 ) -> QuantumGraph:
209 # need to import here to avoid cyclic imports
210 from . import QuantumGraph
212 quanta: DefaultDict[TaskDef, Set[Quantum]] = defaultdict(set)
213 quantumToNodeId: Dict[Quantum, uuid.UUID] = {}
214 loadedTaskDef = {}
215 # loop over the nodes specified above
216 for node in nodes:
217 # Get the bytes to read from the map
218 start, stop = self.returnValue.map[node]
219 start += self.headerSize
220 stop += self.headerSize
222 # read the specified bytes via the supplied _readBytes callable;
223 # the bytes are compressed, so decompress them
224 dump = lzma.decompress(_readBytes(start, stop))
226 # reconstruct node
227 qNode = pickle.loads(dump)
228 object.__setattr__(qNode, "nodeId", uuid.uuid4())
230 # Get the saved task name for the node. If that taskDef has already
231 # been loaded, attach it; if not, read it in first, and then attach it
232 nodeTask = qNode.taskDef
233 if nodeTask not in loadedTaskDef:
234 # Get the byte ranges corresponding to this taskDef
235 start, stop = self.returnValue.taskDefMap[nodeTask]
236 start += self.headerSize
237 stop += self.headerSize
239 # load the taskDef for this node; the bytes are compressed,
240 # so decompress them before unpickling
242 taskDef = pickle.loads(lzma.decompress(_readBytes(start, stop)))
243 loadedTaskDef[nodeTask] = taskDef
244 # Explicitly overload the "frozen-ness" of nodes to attach the
245 # taskDef back into the un-persisted node
246 object.__setattr__(qNode, "taskDef", loadedTaskDef[nodeTask])
247 quanta[qNode.taskDef].add(qNode.quantum)
249 # record the node for later processing
250 quantumToNodeId[qNode.quantum] = qNode.nodeId
252 # construct an empty new QuantumGraph object, and run the associated
253 # creation method with the un-persisted data
254 qGraph = object.__new__(QuantumGraph)
255 qGraph._buildGraphs(
256 quanta,
257 _quantumToNodeId=quantumToNodeId,
258 _buildId=self.returnValue._buildId,
259 metadata=self.returnValue.metadata,
260 universe=universe,
261 )
262 return qGraph
264 def description(self) -> str:
265 return Version1Description
268Version2Description = """
269The save file starts with the first few bytes corresponding to the magic bytes
270in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.
272The next few bytes are a big-endian unsigned long long.
274The unsigned long long corresponds to the number of bytes of a python mapping
275of header information. This mapping is encoded into json and then lzma
276compressed, meaning the operations must be performed in the opposite order to
277deserialize.
279The json encoded header mapping contains 4 fields: TaskDefs, GraphBuildID,
280Nodes, and Metadata.
282The `TaskDefs` key corresponds to a value which is a mapping of Task label to
283task data. The task data is a mapping of key to value, where the only key is
284`bytes`; it corresponds to a tuple of the start and stop bytes of a byte
285range (indexed after all the header bytes).
287The `GraphBuildID` key corresponds to a string that is the unique id assigned
288to this graph when it was created.
290The `Nodes` key is like the `TaskDefs` key except it corresponds to
291QuantumNodes instead of TaskDefs. Another important difference is that JSON
292formatting does not allow using numbers as keys, and this mapping is keyed by
293the node number. Thus it is stored in JSON as two equal-length lists, the
294first being the keys, and the second the values associated with those keys.
296The `Metadata` key is a mapping of strings to associated values. This metadata
297may be anything that is important to transport alongside the graph.
299As stated above, each map contains byte ranges of the corresponding
300data structure. These bytes are also lzma-compressed pickles, and should
301be deserialized in a similar manner.
302"""
305@dataclass
306class DeserializerV2(DeserializerBase):
307 @classmethod
308 def FMT_STRING(cls) -> str:
309 return ">Q"
311 def __post_init__(self) -> None:
312 (self.mapSize,) = struct.unpack(self.FMT_STRING(), self.sizeBytes)
314 @property
315 def headerSize(self) -> int:
316 return self.preambleSize + self.structSize + self.mapSize
318 def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
319 uncompressedHeaderMap = self.unpackHeader(rawHeader)
320 if uncompressedHeaderMap is None:
321 raise ValueError(
322 "This error is not possible because self.unpackHeader cannot"
323 " return None; the check exists only to satisfy type checkers"
324 )
325 header = json.loads(uncompressedHeaderMap)
326 returnValue = SimpleNamespace()
327 returnValue.taskDefMap = header["TaskDefs"]
328 returnValue._buildId = header["GraphBuildID"]
329 returnValue.map = dict(header["Nodes"])
330 returnValue.metadata = header["Metadata"]
331 self.returnValue = returnValue
332 return returnValue
334 def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
335 return lzma.decompress(rawHeader).decode()
337 def constructGraph(
338 self,
339 nodes: set[uuid.UUID],
340 _readBytes: Callable[[int, int], bytes],
341 universe: Optional[DimensionUniverse] = None,
342 ) -> QuantumGraph:
343 # need to import here to avoid cyclic imports
344 from . import QuantumGraph
346 quanta: DefaultDict[TaskDef, Set[Quantum]] = defaultdict(set)
347 quantumToNodeId: Dict[Quantum, uuid.UUID] = {}
348 loadedTaskDef = {}
349 # loop over the nodes specified above
350 for node in nodes:
351 # Get the bytes to read from the map
352 start, stop = self.returnValue.map[node]["bytes"]
353 start += self.headerSize
354 stop += self.headerSize
356 # read the specified bytes via the supplied _readBytes callable;
357 # the bytes are compressed, so decompress them
358 dump = lzma.decompress(_readBytes(start, stop))
360 # reconstruct node
361 qNode = pickle.loads(dump)
362 object.__setattr__(qNode, "nodeId", uuid.uuid4())
364 # Get the saved task name for the node. If that taskDef has already
365 # been loaded, attach it; if not, read it in first, and then attach it
366 nodeTask = qNode.taskDef
367 if nodeTask not in loadedTaskDef:
368 # Get the byte ranges corresponding to this taskDef
369 start, stop = self.returnValue.taskDefMap[nodeTask]["bytes"]
370 start += self.headerSize
371 stop += self.headerSize
373 # load the taskDef for this node; the bytes are compressed,
374 # so decompress them before unpickling
376 taskDef = pickle.loads(lzma.decompress(_readBytes(start, stop)))
377 loadedTaskDef[nodeTask] = taskDef
378 # Explicitly overload the "frozen-ness" of nodes to attach the
379 # taskDef back into the un-persisted node
380 object.__setattr__(qNode, "taskDef", loadedTaskDef[nodeTask])
381 quanta[qNode.taskDef].add(qNode.quantum)
383 # record the node for later processing
384 quantumToNodeId[qNode.quantum] = qNode.nodeId
386 # construct an empty new QuantumGraph object, and run the associated
387 # creation method with the un-persisted data
388 qGraph = object.__new__(QuantumGraph)
389 qGraph._buildGraphs(
390 quanta,
391 _quantumToNodeId=quantumToNodeId,
392 _buildId=self.returnValue._buildId,
393 metadata=self.returnValue.metadata,
394 universe=universe,
395 )
396 return qGraph
398 def description(self) -> str:
399 return Version2Description
402Version3Description = """
403The save file starts with the first few bytes corresponding to the magic bytes
404in the QuantumGraph: `qgraph4\xf6\xe8\xa9`.
406The next few bytes are a big-endian unsigned long long.
408The unsigned long long corresponds to the number of bytes of a mapping
409of header information. This mapping is encoded into json and then lzma
410compressed, meaning the operations must be performed in the opposite order to
411deserialize.
413The json encoded header mapping contains 5 fields: GraphBuildID, TaskDefs,
414Nodes, Metadata, and DimensionRecords.
416The `GraphBuildID` key corresponds to a string that is the unique id assigned
417to this graph when it was created.
419The `TaskDefs` key corresponds to a value which is a mapping of Task label to
420task data. The task data is a mapping of key to value. The keys of this mapping
421are `bytes`, `inputs`, and `outputs`.
423The `TaskDefs` `bytes` key corresponds to a tuple of the start and stop bytes
424of a byte range (indexed after all the header bytes). This byte range
425corresponds to an lzma-compressed json mapping. This mapping has keys of
426`taskName`, corresponding to a fully qualified python class; `config`, a
427pex_config string that is used to configure the class; and `label`, which
428corresponds to a string that uniquely identifies the task within a given
429execution pipeline.
431The `TaskDefs` `inputs` key is associated with a list of tuples where each
432tuple contains the label of a task that comes before the given task, and
433the name of the dataset that is shared between the tasks (think node and edge
434in a graph sense).
436The `TaskDefs` `outputs` key is like `inputs`, except the values in the list
437correspond to all the output connections of the task.
439The `Nodes` key is also a json mapping with keys corresponding to the UUIDs of
440QuantumNodes. The value associated with each of these keys is another mapping
441with the keys `bytes`, `inputs`, and `outputs`.
443The `Nodes` `bytes` key corresponds to a tuple of the start and stop bytes of
444a byte range (indexed after all the header bytes). These bytes are an lzma
445compressed json mapping with many sub-elements; this mapping will be referred
446to as a SerializedQuantumNode (after the python class it corresponds to).
448SerializedQuantumNodes have 3 keys: `quantum`, corresponding to a json mapping
449(described below) referred to as a SerializedQuantum; `taskLabel`, a string
450which corresponds to a label in the `TaskDefs` mapping; and `nodeId`.
452A SerializedQuantum has many keys: taskName, dataId, datasetTypeMapping,
453initInputs, inputs, outputs, and dimensionRecords.
455The `Nodes` mapping is thus like the `TaskDefs` mapping except that it
456corresponds to QuantumNodes instead of TaskDefs, and that the keys of the
457mapping are string representations of the UUIDs of the QuantumNodes.
459The `Metadata` key is a mapping of strings to associated values. This metadata
460may be anything that is important to transport alongside the graph.
462As stated above, each map contains byte ranges of the corresponding
463data structure. In this version these bytes are lzma-compressed json, and
464should be deserialized in a similar manner.
465"""
468@dataclass
469class DeserializerV3(DeserializerBase):
470 @classmethod
471 def FMT_STRING(cls) -> str:
472 return ">Q"
474 def __post_init__(self) -> None:
475 self.infoSize: int
476 (self.infoSize,) = struct.unpack(self.FMT_STRING(), self.sizeBytes)
478 @property
479 def headerSize(self) -> int:
480 return self.preambleSize + self.structSize + self.infoSize
482 def readHeaderInfo(self, rawHeader: bytes) -> SimpleNamespace:
483 uncompressedinfoMap = self.unpackHeader(rawHeader)
484 assert uncompressedinfoMap is not None # for python typing, this variant can't be None
485 infoMap = json.loads(uncompressedinfoMap)
486 infoMappings = SimpleNamespace()
487 infoMappings.taskDefMap = infoMap["TaskDefs"]
488 infoMappings._buildId = infoMap["GraphBuildID"]
489 infoMappings.map = {uuid.UUID(k): v for k, v in infoMap["Nodes"]}
490 infoMappings.metadata = infoMap["Metadata"]
491 infoMappings.dimensionRecords = {}
492 for k, v in infoMap["DimensionRecords"].items():
493 infoMappings.dimensionRecords[int(k)] = SerializedDimensionRecord(**v)
494 # It is important that this be a get call, so that saved quantum
495 # graphs that do not have a saved universe are still supported
496 # without changing the save format
497 if (universeConfig := infoMap.get("universe")) is not None:
498 universe = DimensionUniverse(config=DimensionConfig(universeConfig))
499 else:
500 universe = DimensionUniverse()
501 infoMappings.universe = universe
502 self.infoMappings = infoMappings
503 return infoMappings
505 def unpackHeader(self, rawHeader: bytes) -> Optional[str]:
506 return lzma.decompress(rawHeader).decode()
508 def constructGraph(
509 self,
510 nodes: set[uuid.UUID],
511 _readBytes: Callable[[int, int], bytes],
512 universe: Optional[DimensionUniverse] = None,
513 ) -> QuantumGraph:
514 # need to import here to avoid cyclic imports
515 from . import QuantumGraph
517 graph = nx.DiGraph()
518 loadedTaskDef: Dict[str, TaskDef] = {}
519 container = {}
520 datasetDict = _DatasetTracker[DatasetTypeName, TaskDef](createInverse=True)
521 taskToQuantumNode: DefaultDict[TaskDef, Set[QuantumNode]] = defaultdict(set)
522 recontitutedDimensions: Dict[int, Tuple[str, DimensionRecord]] = {}
524 if universe is not None:
525 if not universe.isCompatibleWith(self.infoMappings.universe):
526 saved = self.infoMappings.universe
527 raise RuntimeError(
528 f"The saved dimension universe ({saved.namespace}@v{saved.version}) is not "
529 f"compatible with the supplied universe ({universe.namespace}@v{universe.version})."
530 )
531 else:
532 universe = self.infoMappings.universe
534 for node in nodes:
535 start, stop = self.infoMappings.map[node]["bytes"]
536 start, stop = start + self.headerSize, stop + self.headerSize
537 # Read in the bytes corresponding to the node to load and
538 # decompress it
539 dump = json.loads(lzma.decompress(_readBytes(start, stop)))
541 # Turn the json back into the pydantic model
542 nodeDeserialized = SerializedQuantumNode.direct(**dump)
543 # attach the dictionary of dimension records to the pydantic model;
544 # these are stored separately because they are referenced over and over
545 # and this saves a lot of space and time.
546 nodeDeserialized.quantum.dimensionRecords = self.infoMappings.dimensionRecords
547 # get the label for the current task
548 nodeTaskLabel = nodeDeserialized.taskLabel
550 if nodeTaskLabel not in loadedTaskDef:
551 # Get the byte ranges corresponding to this taskDef
552 start, stop = self.infoMappings.taskDefMap[nodeTaskLabel]["bytes"]
553 start, stop = start + self.headerSize, stop + self.headerSize
555 # bytes are compressed, so decompress them
556 taskDefDump = json.loads(lzma.decompress(_readBytes(start, stop)))
557 taskClass: Type[PipelineTask] = doImportType(taskDefDump["taskName"])
558 config: PipelineTaskConfig = taskClass.ConfigClass()
559 config.loadFromStream(taskDefDump["config"])
560 # Rebuild TaskDef
561 recreatedTaskDef = TaskDef(
562 taskName=taskDefDump["taskName"],
563 taskClass=taskClass,
564 config=config,
565 label=taskDefDump["label"],
566 )
567 loadedTaskDef[nodeTaskLabel] = recreatedTaskDef
569 # rebuild the mappings that associate dataset type names with
570 # TaskDefs
571 for _, input in self.infoMappings.taskDefMap[nodeTaskLabel]["inputs"]:
572 datasetDict.addConsumer(DatasetTypeName(input), recreatedTaskDef)
574 added = set()
575 for outputConnection in self.infoMappings.taskDefMap[nodeTaskLabel]["outputs"]:
576 typeName = outputConnection[1]
577 if typeName not in added:
578 added.add(typeName)
579 datasetDict.addProducer(DatasetTypeName(typeName), recreatedTaskDef)
581 # reconstitute the node, passing in the dictionaries for the
582 # loaded TaskDefs and dimension records. These are used to ensure
583 # that each unique record is only loaded once
584 qnode = QuantumNode.from_simple(nodeDeserialized, loadedTaskDef, universe, recontitutedDimensions)
585 container[qnode.nodeId] = qnode
586 taskToQuantumNode[loadedTaskDef[nodeTaskLabel]].add(qnode)
588 # recreate the relations between each node from stored info
589 graph.add_node(qnode)
590 for id in self.infoMappings.map[qnode.nodeId]["inputs"]:
591 # uuid is stored as a string, turn it back into a uuid
592 id = uuid.UUID(id)
593 # if the id is not yet in the container, don't make a connection;
594 # this is not an issue, because once it is, that id will add
595 # the reverse connection
596 if id in container:
597 graph.add_edge(container[id], qnode)
598 for id in self.infoMappings.map[qnode.nodeId]["outputs"]:
599 # uuid is stored as a string, turn it back into a uuid
600 id = uuid.UUID(id)
601 # if the id is not yet in the container, don't make a connection;
602 # this is not an issue, because once it is, that id will add
603 # the reverse connection
604 if id in container:
605 graph.add_edge(qnode, container[id])
607 newGraph = object.__new__(QuantumGraph)
608 newGraph._metadata = self.infoMappings.metadata
609 newGraph._buildId = self.infoMappings._buildId
610 newGraph._datasetDict = datasetDict
611 newGraph._nodeIdMap = container
612 newGraph._count = len(nodes)
613 newGraph._taskToQuantumNode = dict(taskToQuantumNode.items())
614 newGraph._taskGraph = datasetDict.makeNetworkXGraph()
615 newGraph._connectedQuanta = graph
616 return newGraph
619DESERIALIZER_MAP = {1: DeserializerV1, 2: DeserializerV2, 3: DeserializerV3}
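A hedged usage sketch for the map: a caller that has already read the save version, the preamble size, and the raw size bytes can dispatch like this (the `makeDeserializer` name is illustrative, not part of this module):

def makeDeserializer(saveVersion: int, preambleSize: int, sizeBytes: bytes) -> DeserializerBase:
    try:
        deserializerClass = DESERIALIZER_MAP[saveVersion]
    except KeyError:
        raise ValueError(f"No deserializer known for save version {saveVersion}") from None
    # Each deserializer is a dataclass taking (preambleSize, sizeBytes);
    # __post_init__ unpacks sizeBytes according to FMT_STRING.
    return deserializerClass(preambleSize, sizeBytes)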