Coverage for python/lsst/daf/butler/formatters/file.py: 28%
68 statements
« prev ^ index » next coverage.py v7.4.3, created at 2024-03-12 10:07 +0000
« prev ^ index » next coverage.py v7.4.3, created at 2024-03-12 10:07 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28"""Support for reading and writing files to a POSIX file system."""
30from __future__ import annotations
32__all__ = ("FileFormatter",)
34import dataclasses
35from abc import abstractmethod
36from typing import TYPE_CHECKING, Any
38from lsst.daf.butler import Formatter
40if TYPE_CHECKING:
41 from lsst.daf.butler import StorageClass
class FileFormatter(Formatter):
    """Interface for reading and writing files on a POSIX file system.

    Subclasses supply the actual serialization by implementing `_readFile`
    and `_writeFile`. Serialization to and from `bytes` is optional: it is
    only available if the subclass defines ``_fromBytes`` and/or
    ``_toBytes`` methods, which `fromBytes` and `toBytes` look up at call
    time.
    """

    extension: str | None = None
    """Default file extension to use for writing files. None means that no
    modifications will be made to the supplied file extension.
    (`str` or `None`)"""

    @abstractmethod
    def _readFile(self, path: str, pytype: type[Any] | None = None) -> Any:
        """Read a file from the path in the correct format.

        Parameters
        ----------
        path : `str`
            Path to use to open the file.
        pytype : `class`, optional
            Class to use to read the file.

        Returns
        -------
        data : `object`
            Data read from file. Returns `None` if the file can not be
            found at the given path.

        Raises
        ------
        Exception
            Some problem reading the file.
        """
        pass

    @abstractmethod
    def _writeFile(self, inMemoryDataset: Any) -> None:
        """Write the in memory dataset to file on disk.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to serialize.

        Raises
        ------
        Exception
            The file could not be written.
        """
        pass

    def _assembleDataset(self, data: Any, component: str | None = None) -> Any:
        """Assemble and coerce the dataset, or one of its components,
        into an appropriate python type and return it.

        Parameters
        ----------
        data : `dict` or `object`
            The composite (or a `dict` representation of it) that was read.
            Either it, or the requested component of it, is coerced to the
            python type of the read storage class of ``self.fileDescriptor``.
        component : `str`, optional
            Component to extract from ``data``. If `None` the full dataset
            is coerced and returned.

        Returns
        -------
        inMemoryDataset : `object`
            The requested data as a Python object. The type of object
            is controlled by the specific formatter.
        """
        fileDescriptor = self.fileDescriptor

        # Get the read and write storage classes.
        readStorageClass = fileDescriptor.readStorageClass
        writeStorageClass = fileDescriptor.storageClass

        if component is not None:
            # Requesting a component implies that we need to first ensure
            # that the composite is the correct python type. Lie to the
            # coercion routine since the read StorageClass is not relevant
            # if we want the original.
            data = self._coerceType(data, writeStorageClass, writeStorageClass)

            # Concrete composite written as a single file (we hope)
            # so try to get the component.
            try:
                data = fileDescriptor.storageClass.delegate().getComponent(data, component)
            except AttributeError:
                # Defer the complaint: a None result is passed through the
                # coercion below and left for the caller to interpret.
                data = None

            # Update the write storage class to match that of the component.
            # It should be safe to use the component storage class directly
            # since that should match what was returned from getComponent
            # (else we could create a temporary storage class guaranteed to
            # match the python type we have).
            writeStorageClass = writeStorageClass.allComponents()[component]

        # Coerce to the requested type.
        data = self._coerceType(data, writeStorageClass, readStorageClass)

        return data

    def _coerceBuiltinType(self, inMemoryDataset: Any, writeStorageClass: StorageClass) -> Any:
        """Coerce the supplied inMemoryDataset to the written python type if it
        is currently a built-in type.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to coerce to expected type.
        writeStorageClass : `StorageClass`
            Storage class used to serialize this data.

        Returns
        -------
        inMemoryDataset : `object`
            Object of expected type ``writeStorageClass.pytype``.

        Notes
        -----
        This method only modifies the supplied object if the object:

        * Is not already the required type.
        * Is not `None`.
        * Looks like a built-in type.

        It is intended to be used as a helper for file formats that do not
        store the original Python type information in serialized form and
        instead return built-in types such as `dict` and `list` that need
        to be converted to the required form. This happens before
        `StorageClass` converters trigger so that constructors can be
        called that can build the original type first before checking the
        requested Python type. This is important for Pydantic models where
        the internal structure of the model may not match the `dict` form
        in a scenario where the user has requested a `dict`.
        """
        if (
            inMemoryDataset is not None
            and not isinstance(inMemoryDataset, writeStorageClass.pytype)
            and type(inMemoryDataset).__module__ == "builtins"
        ):
            # Try different ways of converting to the required type.
            # Pydantic v1 uses parse_obj and some non-pydantic classes
            # use that convention. Pydantic v2 uses model_validate.
            for method_name in ("model_validate", "parse_obj"):
                if method := getattr(writeStorageClass.pytype, method_name, None):
                    return method(inMemoryDataset)
            if isinstance(inMemoryDataset, dict):
                if dataclasses.is_dataclass(writeStorageClass.pytype):
                    # Dataclasses accept key/value parameters.
                    inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
                elif writeStorageClass.isComposite():
                    # Assume that this type can be constructed
                    # using the registered assembler from a dict.
                    inMemoryDataset = writeStorageClass.delegate().assemble(
                        inMemoryDataset, pytype=writeStorageClass.pytype
                    )
                else:
                    # Unpack the dict and hope that works.
                    inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
            else:
                # Hope that we can pass the arguments in directly.
                inMemoryDataset = writeStorageClass.pytype(inMemoryDataset)

        return inMemoryDataset

    def _coerceType(
        self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass
    ) -> Any:
        """Coerce the supplied inMemoryDataset to the correct python type.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to coerce to expected type.
        writeStorageClass : `StorageClass`
            Storage class used to serialize this data.
        readStorageClass : `StorageClass`
            Storage class requested as the outcome.

        Returns
        -------
        inMemoryDataset : `object`
            Object of expected type ``readStorageClass.pytype``.
        """
        # First rebuild the written type from any built-in representation,
        # then let the read storage class convert to the requested type.
        inMemoryDataset = self._coerceBuiltinType(inMemoryDataset, writeStorageClass)
        return readStorageClass.coerce_type(inMemoryDataset)

    def read(self, component: str | None = None) -> Any:
        """Read data from a file.

        Parameters
        ----------
        component : `str`, optional
            Component to read from the file. Only used if the `StorageClass`
            for reading differed from the `StorageClass` used to write the
            file.

        Returns
        -------
        inMemoryDataset : `object`
            The requested data as a Python object. The type of object
            is controlled by the specific formatter.

        Raises
        ------
        ValueError
            No data could be obtained from the file and no component was
            requested. A `None` result for a requested component is
            returned to the caller rather than treated as an error.
        Exception
            Any exception raised by the subclass ``_readFile``
            implementation is propagated unchanged.
        """
        # Read the file naively
        path = self.fileDescriptor.location.path
        data = self._readFile(path, self.fileDescriptor.storageClass.pytype)

        # Assemble the requested dataset and potentially return only its
        # component coercing it to its appropriate pytype
        data = self._assembleDataset(data, component)

        # Special case components by allowing a formatter to return None
        # to indicate that the component was understood but is missing
        if data is None and component is None:
            raise ValueError(f"Unable to read data with URI {self.fileDescriptor.location.uri}")

        return data

    def fromBytes(self, serializedDataset: bytes, component: str | None = None) -> Any:
        """Read serialized data into a Dataset or its component.

        Parameters
        ----------
        serializedDataset : `bytes`
            Bytes object to unserialize.
        component : `str`, optional
            Component to read from the Dataset. Only used if the `StorageClass`
            for reading differed from the `StorageClass` used to write the
            file.

        Returns
        -------
        inMemoryDataset : `object`
            The requested data as a Python object. The type of object
            is controlled by the specific formatter.

        Raises
        ------
        NotImplementedError
            Formatter does not support reading from bytes (it defines no
            ``_fromBytes`` method).
        ValueError
            No data could be obtained from the bytes and no component was
            requested.
        """
        if not hasattr(self, "_fromBytes"):
            raise NotImplementedError("Type does not support reading from bytes.")

        data = self._fromBytes(serializedDataset, self.fileDescriptor.storageClass.pytype)

        # Assemble the requested dataset and potentially return only its
        # component coercing it to its appropriate pytype
        data = self._assembleDataset(data, component)

        # Special case components by allowing a formatter to return None
        # to indicate that the component was understood but is missing
        if data is None and component is None:
            nbytes = len(serializedDataset)
            s = "s" if nbytes != 1 else ""
            raise ValueError(
                f"Unable to unpersist {nbytes} byte{s} from URI {self.fileDescriptor.location.uri}"
            )

        return data

    def write(self, inMemoryDataset: Any) -> None:
        """Write a Python object to a file.

        The location held by ``self.fileDescriptor`` is updated in place
        with the formatter-preferred file extension before the subclass
        ``_writeFile`` implementation is called. Nothing is returned.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Python object to store.
        """
        fileDescriptor = self.fileDescriptor
        # Update the location with the formatter-preferred file extension
        fileDescriptor.location.updateExtension(self.extension)

        self._writeFile(inMemoryDataset)

    def toBytes(self, inMemoryDataset: Any) -> bytes:
        """Serialize the Dataset to bytes based on formatter.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to serialize.

        Returns
        -------
        serializedDataset : `bytes`
            Bytes representing the serialized dataset.

        Raises
        ------
        NotImplementedError
            Formatter does not support writing to bytes (it defines no
            ``_toBytes`` method).
        """
        if not hasattr(self, "_toBytes"):
            # The message previously said "reading from bytes", copied from
            # fromBytes(); this is the serialization direction.
            raise NotImplementedError("Type does not support writing to bytes.")

        return self._toBytes(inMemoryDataset)