Coverage for python/lsst/daf/butler/formatters/file.py: 28%

68 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-27 09:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Support for reading and writing files to a POSIX file system.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("FileFormatter",) 

33 

34import dataclasses 

35from abc import abstractmethod 

36from typing import TYPE_CHECKING, Any 

37 

38from lsst.daf.butler import Formatter 

39 

40if TYPE_CHECKING: 

41 from lsst.daf.butler import StorageClass 

42 

43 

class FileFormatter(Formatter):
    """Interface for reading and writing files on a POSIX file system.

    Subclasses must implement `_readFile` and `_writeFile`. A subclass may
    additionally define ``_fromBytes(serializedDataset, pytype)`` and
    ``_toBytes(inMemoryDataset)`` methods; `fromBytes` and `toBytes` check
    for their presence and raise `NotImplementedError` when they are absent.
    """

    extension: str | None = None
    """Default file extension to use for writing files. None means that no
    modifications will be made to the supplied file extension. (`str`)"""

    @abstractmethod
    def _readFile(self, path: str, pytype: type[Any] | None = None) -> Any:
        """Read a file from the path in the correct format.

        Parameters
        ----------
        path : `str`
            Path to use to open the file.
        pytype : `class`, optional
            Class to use to read the file.

        Returns
        -------
        data : `object`
            Data read from file. Returns `None` if the file can not be
            found at the given path.

        Raises
        ------
        Exception
            Some problem reading the file.
        """

    @abstractmethod
    def _writeFile(self, inMemoryDataset: Any) -> None:
        """Write the in memory dataset to file on disk.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to serialize.

        Raises
        ------
        Exception
            The file could not be written.
        """

    def _assembleDataset(self, data: Any, component: str | None = None) -> Any:
        """Assemble and coerce the dataset, or one of its components,
        into the requested Python type and return it.

        Parameters
        ----------
        data : `dict` or `object`
            The composite, or a `dict` representing it, that (or a component
            of which) needs to be coerced to the Python type specified by
            ``self.fileDescriptor``.
        component : `str`, optional
            Name of the component to extract from the composite. If `None`
            the whole dataset is coerced and returned.

        Returns
        -------
        inMemoryDataset : `object`
            The requested data as returned by the read storage class's
            ``coerce_type``.
        """
        fileDescriptor = self.fileDescriptor

        # Get the read and write storage classes.
        readStorageClass = fileDescriptor.readStorageClass
        writeStorageClass = fileDescriptor.storageClass

        if component is not None:
            # Requesting a component implies that we need to first ensure
            # that the composite is the correct python type. Lie to the
            # coercion routine since the read StorageClass is not relevant
            # if we want the original.
            data = self._coerceType(data, writeStorageClass, writeStorageClass)

            # Concrete composite written as a single file (we hope)
            # so try to get the component.
            try:
                data = writeStorageClass.delegate().getComponent(data, component)
            except AttributeError:
                # Deliberately best-effort: defer the complaint to the
                # caller by handing back None for the component.
                # NOTE(review): this relies on the coercion below passing
                # None through unchanged -- confirm against
                # StorageClass.coerce_type.
                data = None

            # Update the write storage class to match that of the component.
            # It should be safe to use the component storage class directly
            # since that should match what was returned from getComponent
            # (else we could create a temporary storage class guaranteed to
            # match the python type we have).
            writeStorageClass = writeStorageClass.allComponents()[component]

        # Coerce to the requested type.
        return self._coerceType(data, writeStorageClass, readStorageClass)

    def _coerceBuiltinType(self, inMemoryDataset: Any, writeStorageClass: StorageClass) -> Any:
        """Coerce the supplied inMemoryDataset to the written python type if it
        is currently a built-in type.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to coerce to expected type.
        writeStorageClass : `StorageClass`
            Storage class used to serialize this data.

        Returns
        -------
        inMemoryDataset : `object`
            Object converted to ``writeStorageClass.pytype``, or the supplied
            object unchanged if none of the conditions listed in the notes
            apply.

        Notes
        -----
        This method only modifies the supplied object if the object is:

        * Not already the required type.
        * Not `None`.
        * Looks like a built-in type.

        It is intended to be used as a helper for file formats that do not
        store the original Python type information in serialized form and
        instead return built-in types such as `dict` and `list` that need
        to be converted to the required form. This happens before
        `StorageClass` converters trigger so that constructors can be
        called that can build the original type first before checking the
        requested Python type. This is important for Pydantic models where
        the internal structure of the model may not match the `dict` form
        in a scenario where the user has requested a `dict`.
        """
        if (
            inMemoryDataset is not None
            and not isinstance(inMemoryDataset, writeStorageClass.pytype)
            and type(inMemoryDataset).__module__ == "builtins"
        ):
            # Try different ways of converting to the required type.
            # Pydantic v1 uses parse_obj and some non-pydantic classes
            # use that convention. Pydantic v2 uses model_validate.
            # model_validate is tried first so that it wins when both exist.
            for method_name in ("model_validate", "parse_obj"):
                if method := getattr(writeStorageClass.pytype, method_name, None):
                    return method(inMemoryDataset)
            if isinstance(inMemoryDataset, dict):
                if dataclasses.is_dataclass(writeStorageClass.pytype):
                    # Dataclasses accept key/value parameters.
                    inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
                elif writeStorageClass.isComposite():
                    # Assume that this type can be constructed
                    # using the registered assembler from a dict.
                    inMemoryDataset = writeStorageClass.delegate().assemble(
                        inMemoryDataset, pytype=writeStorageClass.pytype
                    )
                else:
                    # Unpack the dict and hope that works.
                    inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset)
            else:
                # Hope that we can pass the arguments in directly.
                inMemoryDataset = writeStorageClass.pytype(inMemoryDataset)

        return inMemoryDataset

    def _coerceType(
        self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass
    ) -> Any:
        """Coerce the supplied inMemoryDataset to the correct python type.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to coerce to expected type.
        writeStorageClass : `StorageClass`
            Storage class used to serialize this data.
        readStorageClass : `StorageClass`
            Storage class requested as the outcome.

        Returns
        -------
        inMemoryDataset : `object`
            Object of expected type ``readStorageClass.pytype``.
        """
        # First rebuild the written type from any built-in representation,
        # then let the read storage class convert to the requested type.
        inMemoryDataset = self._coerceBuiltinType(inMemoryDataset, writeStorageClass)
        return readStorageClass.coerce_type(inMemoryDataset)

    def read(self, component: str | None = None) -> Any:
        """Read data from a file.

        The file to read, and the storage classes used to interpret it, are
        taken from ``self.fileDescriptor``.

        Parameters
        ----------
        component : `str`, optional
            Component to read from the file. Only used if the `StorageClass`
            for reading differed from the `StorageClass` used to write the
            file.

        Returns
        -------
        inMemoryDataset : `object`
            The requested data as a Python object. The type of object
            is controlled by the specific formatter.

        Raises
        ------
        ValueError
            No data could be read and no component was requested.
        Exception
            Any exception raised by `_readFile` is propagated unchanged.
        """
        fileDescriptor = self.fileDescriptor

        # Read the file naively
        path = fileDescriptor.location.path
        data = self._readFile(path, fileDescriptor.storageClass.pytype)

        # Assemble the requested dataset and potentially return only its
        # component coercing it to its appropriate pytype
        data = self._assembleDataset(data, component)

        # Special case components by allowing a formatter to return None
        # to indicate that the component was understood but is missing
        if data is None and component is None:
            raise ValueError(f"Unable to read data with URI {self.fileDescriptor.location.uri}")

        return data

    def fromBytes(self, serializedDataset: bytes, component: str | None = None) -> Any:
        """Read serialized data into a Dataset or its component.

        Parameters
        ----------
        serializedDataset : `bytes`
            Bytes object to unserialize.
        component : `str`, optional
            Component to read from the Dataset. Only used if the `StorageClass`
            for reading differed from the `StorageClass` used to write the
            file.

        Returns
        -------
        inMemoryDataset : `object`
            The requested data as a Python object. The type of object
            is controlled by the specific formatter.

        Raises
        ------
        NotImplementedError
            Formatter does not support reading from bytes.
        ValueError
            No data could be unpersisted and no component was requested.
        """
        if not hasattr(self, "_fromBytes"):
            raise NotImplementedError("Type does not support reading from bytes.")

        data = self._fromBytes(serializedDataset, self.fileDescriptor.storageClass.pytype)

        # Assemble the requested dataset and potentially return only its
        # component coercing it to its appropriate pytype
        data = self._assembleDataset(data, component)

        # Special case components by allowing a formatter to return None
        # to indicate that the component was understood but is missing
        if data is None and component is None:
            nbytes = len(serializedDataset)
            s = "s" if nbytes != 1 else ""
            raise ValueError(
                f"Unable to unpersist {nbytes} byte{s} from URI {self.fileDescriptor.location.uri}"
            )

        return data

    def write(self, inMemoryDataset: Any) -> None:
        """Write a Python object to a file.

        The location held by ``self.fileDescriptor`` is updated in place with
        the formatter's preferred file extension before the dataset is handed
        to `_writeFile`.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Python object to store.
        """
        fileDescriptor = self.fileDescriptor
        # Update the location with the formatter-preferred file extension
        fileDescriptor.location.updateExtension(self.extension)

        self._writeFile(inMemoryDataset)

    def toBytes(self, inMemoryDataset: Any) -> bytes:
        """Serialize the Dataset to bytes based on formatter.

        Parameters
        ----------
        inMemoryDataset : `object`
            Object to serialize.

        Returns
        -------
        serializedDataset : `bytes`
            Bytes representing the serialized dataset.

        Raises
        ------
        NotImplementedError
            Formatter does not support writing to bytes.
        """
        if not hasattr(self, "_toBytes"):
            raise NotImplementedError("Type does not support writing to bytes.")

        return self._toBytes(inMemoryDataset)