Coverage for python / lsst / daf / butler / formatters / file.py: 0%

67 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-17 08:48 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Support for reading and writing files to a POSIX file system.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("FileFormatter",) 

33 

34import dataclasses 

35from abc import abstractmethod 

36from typing import TYPE_CHECKING, Any 

37 

38from lsst.daf.butler import Formatter, FormatterNotImplementedError 

39 

40if TYPE_CHECKING: 

41 from lsst.daf.butler import StorageClass 

42 

43 

class FileFormatter(Formatter):
    """Interface for reading and writing files on a POSIX file system.

    Subclasses must implement `_readFile` and `_writeFile`. They may
    optionally provide ``_fromBytes`` and ``_toBytes`` methods, which are
    looked up dynamically by `fromBytes` and `toBytes`.
    """

    @abstractmethod
    def _readFile(self, path: str, pytype: type[Any] | None = None) -> Any:
        """Read a file from the path in the correct format.

        Parameters
        ----------
        path : `str`
            Path to use to open the file.
        pytype : `class`, optional
            Class to use to read the file.

        Returns
        -------
        data : `object`
            Data read from file. Returns `None` if the file cannot be
            found at the given path.

        Raises
        ------
        Exception
            Some problem reading the file.
        """
        pass

    @abstractmethod
    def _writeFile(self, inMemoryDataset: Any) -> None:
        """Write the in memory dataset to file on disk.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to serialize.

        Raises
        ------
        Exception
            The file could not be written.
        """
        pass

    def _assembleDataset(self, data: Any, component: str | None = None) -> Any:
        """Assemble and coerce the dataset, or one of its components,
        into an appropriate python type and return it.

        Parameters
        ----------
        data : `dict` or `object`
            Composite, or a `dict` describing one, that needs to be coerced
            (in whole, or for the requested component) to the python type
            specified in ``self.fileDescriptor``.
        component : `str`, optional
            Component to read from the file. Only used if the `StorageClass`
            for reading differed from the `StorageClass` used to write the
            file.

        Returns
        -------
        inMemoryDataset : `object`
            The requested data as a Python object. The type of object
            is controlled by the specific formatter.
        """
        fileDescriptor = self.fileDescriptor

        # Get the read and write storage classes.
        readStorageClass = fileDescriptor.readStorageClass
        writeStorageClass = fileDescriptor.storageClass

        if component is not None:
            # Requesting a component implies that we need to first ensure
            # that the composite is the correct python type. Lie to the
            # coercion routine since the read StorageClass is not relevant
            # if we want the original.
            data = self._coerceType(data, writeStorageClass, writeStorageClass)

            # Concrete composite written as a single file (we hope)
            # so try to get the component.
            try:
                data = fileDescriptor.storageClass.delegate().getComponent(data, component)
            except AttributeError:
                # Defer the complaint: the component result becomes None
                # and is passed through the final coercion below.
                data = None

            # Update the write storage class to match that of the component.
            # It should be safe to use the component storage class directly
            # since that should match what was returned from getComponent
            # (else we could create a temporary storage class guaranteed to
            # match the python type we have).
            writeStorageClass = writeStorageClass.allComponents()[component]

        # Coerce to the requested type.
        data = self._coerceType(data, writeStorageClass, readStorageClass)

        return data

    def _coerceBuiltinType(self, inMemoryDataset: Any, writeStorageClass: StorageClass) -> Any:
        """Coerce the supplied inMemoryDataset to the written python type if it
        is currently a built-in type.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to coerce to expected type.
        writeStorageClass : `StorageClass`
            Storage class used to serialize this data.

        Returns
        -------
        inMemoryDataset : `object`
            Object of expected type ``writeStorageClass.pytype``.

        Notes
        -----
        This method only modifies the supplied object if the object is:

        * Not already the required type.
        * Not `None`.
        * Looks like a built-in type.

        It is intended to be used as a helper for file formats that do not
        store the original Python type information in serialized form and
        instead return built-in types such as `dict` and `list` that need
        to be converted to the required form. This happens before
        `StorageClass` converters trigger so that constructors can be
        called that can build the original type first before checking the
        requested Python type. This is important for Pydantic models where
        the internal structure of the model may not match the `dict` form
        in a scenario where the user has requested a `dict`.
        """
        if (
            inMemoryDataset is not None
            and not isinstance(inMemoryDataset, writeStorageClass.pytype)
            and type(inMemoryDataset).__module__ == "builtins"
        ):
            # Try different ways of converting to the required type.
            # Pydantic v1 uses parse_obj and some non-pydantic classes
            # use that convention. Pydantic v2 uses model_validate.
            # The first of these found on the target type wins.
            for method_name in ("model_validate", "parse_obj"):
                if method := getattr(writeStorageClass.pytype, method_name, None):
                    return method(inMemoryDataset)
            if isinstance(inMemoryDataset, dict):
                if dataclasses.is_dataclass(writeStorageClass.pytype):
                    # Dataclasses accept key/value parameters. This is
                    # checked before the composite case on purpose.
                    inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
                elif writeStorageClass.isComposite():
                    # Assume that this type can be constructed
                    # using the registered assembler from a dict.
                    inMemoryDataset = writeStorageClass.delegate().assemble(
                        inMemoryDataset, pytype=writeStorageClass.pytype
                    )
                else:
                    # Unpack the dict and hope that works.
                    inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
            else:
                # Hope that we can pass the arguments in directly.
                inMemoryDataset = writeStorageClass.pytype(inMemoryDataset)

        return inMemoryDataset

    def _coerceType(
        self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass
    ) -> Any:
        """Coerce the supplied inMemoryDataset to the correct python type.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to coerce to expected type.
        writeStorageClass : `StorageClass`
            Storage class used to serialize this data.
        readStorageClass : `StorageClass`
            Storage class requested as the outcome.

        Returns
        -------
        inMemoryDataset : `object`
            Object of expected type ``readStorageClass.pytype``.
        """
        # First rebuild the written type from any built-in representation,
        # then let the read storage class convert to the requested type.
        inMemoryDataset = self._coerceBuiltinType(inMemoryDataset, writeStorageClass)
        return readStorageClass.coerce_type(inMemoryDataset)

    def read(self, component: str | None = None) -> Any:
        """Read data from a file.

        Parameters
        ----------
        component : `str`, optional
            Component to read from the file. Only used if the `StorageClass`
            for reading differed from the `StorageClass` used to write the
            file.

        Returns
        -------
        inMemoryDataset : `object`
            The requested data as a Python object. The type of object
            is controlled by the specific formatter.

        Raises
        ------
        ValueError
            No data could be read and no component was requested.
        FormatterNotImplementedError
            Formatter does not implement a method to read from files.
        """
        # Read the file naively
        path = self.fileDescriptor.location.path
        data = self._readFile(path, self.fileDescriptor.storageClass.pytype)

        # Assemble the requested dataset and potentially return only its
        # component coercing it to its appropriate pytype
        data = self._assembleDataset(data, component)

        # Special case components by allowing a formatter to return None
        # to indicate that the component was understood but is missing
        if data is None and component is None:
            raise ValueError(f"Unable to read data with URI {self.fileDescriptor.location.uri}")

        return data

    def fromBytes(self, serializedDataset: bytes, component: str | None = None) -> Any:
        """Read serialized data into a Dataset or its component.

        Parameters
        ----------
        serializedDataset : `bytes`
            Bytes object to unserialize.
        component : `str`, optional
            Component to read from the Dataset. Only used if the `StorageClass`
            for reading differed from the `StorageClass` used to write the
            file.

        Returns
        -------
        inMemoryDataset : `object`
            The requested data as a Python object. The type of object
            is controlled by the specific formatter.

        Raises
        ------
        ValueError
            No data could be unpersisted and no component was requested.
        FormatterNotImplementedError
            Formatter does not support reading from bytes.
        """
        # ``_fromBytes`` is an optional hook that subclasses may define.
        if not hasattr(self, "_fromBytes"):
            raise FormatterNotImplementedError("Type does not support reading from bytes.")

        data = self._fromBytes(serializedDataset, self.fileDescriptor.storageClass.pytype)

        # Assemble the requested dataset and potentially return only its
        # component coercing it to its appropriate pytype
        data = self._assembleDataset(data, component)

        # Special case components by allowing a formatter to return None
        # to indicate that the component was understood but is missing
        if data is None and component is None:
            nbytes = len(serializedDataset)
            s = "s" if nbytes != 1 else ""
            raise ValueError(
                f"Unable to unpersist {nbytes} byte{s} from URI {self.fileDescriptor.location.uri}"
            )

        return data

    def write(self, inMemoryDataset: Any) -> None:
        """Write a Python object to a file.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Python object to store.

        Notes
        -----
        Nothing is returned. Before writing, the location held by
        ``self.fileDescriptor`` is updated in place to use the
        formatter-preferred file extension (``self.extension``).
        """
        fileDescriptor = self.fileDescriptor
        # Update the location with the formatter-preferred file extension
        fileDescriptor.location.updateExtension(self.extension)

        self._writeFile(inMemoryDataset)

    def toBytes(self, inMemoryDataset: Any) -> bytes:
        """Serialize the Dataset to bytes based on formatter.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to serialize.

        Returns
        -------
        serializedDataset : `bytes`
            Bytes representing the serialized dataset.

        Raises
        ------
        FormatterNotImplementedError
            Formatter does not support writing to bytes.
        """
        # ``_toBytes`` is an optional hook that subclasses may define.
        if not hasattr(self, "_toBytes"):
            raise FormatterNotImplementedError("Type does not support writing to bytes.")

        return self._toBytes(inMemoryDataset)