Coverage for python/lsst/daf/butler/formatters/file.py: 28%

68 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-08-12 09:20 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading and writing files to a POSIX file system.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("FileFormatter",) 

27 

28import dataclasses 

29from abc import abstractmethod 

30from typing import TYPE_CHECKING, Any 

31 

32from lsst.daf.butler import Formatter 

33 

34if TYPE_CHECKING: 

35 from lsst.daf.butler import StorageClass 

36 

37 

38class FileFormatter(Formatter): 

39 """Interface for reading and writing files on a POSIX file system.""" 

40 

41 extension: str | None = None 

42 """Default file extension to use for writing files. None means that no 

43 modifications will be made to the supplied file extension. (`str`)""" 

44 

45 @abstractmethod 

46 def _readFile(self, path: str, pytype: type[Any] | None = None) -> Any: 

47 """Read a file from the path in the correct format. 

48 

49 Parameters 

50 ---------- 

51 path : `str` 

52 Path to use to open the file. 

53 pytype : `class`, optional 

54 Class to use to read the file. 

55 

56 Returns 

57 ------- 

58 data : `object` 

59 Data read from file. Returns `None` if the file can not be 

60 found at the given path. 

61 

62 Raises 

63 ------ 

64 Exception 

65 Some problem reading the file. 

66 """ 

67 pass 

68 

69 @abstractmethod 

70 def _writeFile(self, inMemoryDataset: Any) -> None: 

71 """Write the in memory dataset to file on disk. 

72 

73 Parameters 

74 ---------- 

75 inMemoryDataset : `object` 

76 Object to serialize. 

77 

78 Raises 

79 ------ 

80 Exception 

81 The file could not be written. 

82 """ 

83 pass 

84 

85 def _assembleDataset(self, data: Any, component: str | None = None) -> Any: 

86 """Assembles and coerces the dataset, or one of its components, 

87 into an appropriate python type and returns it. 

88 

89 Parameters 

90 ---------- 

91 data : `dict` or `object` 

92 Composite or a dict that, or which component, needs to be 

93 coerced to the python type specified in "fileDescriptor" 

94 component : `str`, optional 

95 Component to read from the file. Only used if the `StorageClass` 

96 for reading differed from the `StorageClass` used to write the 

97 file. 

98 

99 Returns 

100 ------- 

101 inMemoryDataset : `object` 

102 The requested data as a Python object. The type of object 

103 is controlled by the specific formatter. 

104 """ 

105 fileDescriptor = self.fileDescriptor 

106 

107 # Get the read and write storage classes. 

108 readStorageClass = fileDescriptor.readStorageClass 

109 writeStorageClass = fileDescriptor.storageClass 

110 

111 if component is not None: 

112 # Requesting a component implies that we need to first ensure 

113 # that the composite is the correct python type. Lie to the 

114 # coercion routine since the read StorageClass is not relevant 

115 # if we want the original. 

116 data = self._coerceType(data, writeStorageClass, writeStorageClass) 

117 

118 # Concrete composite written as a single file (we hope) 

119 # so try to get the component. 

120 try: 

121 data = fileDescriptor.storageClass.delegate().getComponent(data, component) 

122 except AttributeError: 

123 # Defer the complaint 

124 data = None 

125 

126 # Update the write storage class to match that of the component. 

127 # It should be safe to use the component storage class directly 

128 # since that should match what was returned from getComponent 

129 # (else we could create a temporary storage class guaranteed to 

130 # match the python type we have). 

131 writeStorageClass = writeStorageClass.allComponents()[component] 

132 

133 # Coerce to the requested type. 

134 data = self._coerceType(data, writeStorageClass, readStorageClass) 

135 

136 return data 

137 

138 def _coerceBuiltinType(self, inMemoryDataset: Any, writeStorageClass: StorageClass) -> Any: 

139 """Coerce the supplied inMemoryDataset to the written python type if it 

140 is currently a built-in type. 

141 

142 Parameters 

143 ---------- 

144 inMemoryDataset : `object` 

145 Object to coerce to expected type. 

146 writeStorageClass : `StorageClass` 

147 Storage class used to serialize this data. 

148 

149 Returns 

150 ------- 

151 inMemoryDataset : `object` 

152 Object of expected type ``writeStorageClass.pytype``. 

153 

154 Notes 

155 ----- 

156 This method only modifies the supplied object if the object is: 

157 

158 * Not already the required type. 

159 * Not `None`. 

160 * Looks like a built-in type. 

161 

162 It is intended to be used as a helper for file formats that do not 

163 store the original Python type information in serialized form and 

164 instead return built-in types such as `dict` and `list` that need 

165 to be converted to the required form. This happens before 

166 `StorageClass` converters trigger so that constructors can be 

167 called that can build the original type first before checking the 

168 requested Python type. This is important for Pydantic models where 

169 the internal structure of the model may not match the `dict` form 

170 in a scenario where the user has requested a `dict`. 

171 """ 

172 if ( 

173 inMemoryDataset is not None 

174 and not isinstance(inMemoryDataset, writeStorageClass.pytype) 

175 and type(inMemoryDataset).__module__ == "builtins" 

176 ): 

177 # Try different ways of converting to the required type. 

178 # Pydantic v1 uses parse_obj and some non-pydantic classes 

179 # use that convention. Pydantic v2 uses model_validate. 

180 for method_name in ("model_validate", "parse_obj"): 

181 if method := getattr(writeStorageClass.pytype, method_name, None): 

182 return method(inMemoryDataset) 

183 if isinstance(inMemoryDataset, dict): 

184 if dataclasses.is_dataclass(writeStorageClass.pytype): 

185 # Dataclasses accept key/value parameters. 

186 inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset) 

187 elif writeStorageClass.isComposite(): 

188 # Assume that this type can be constructed 

189 # using the registered assembler from a dict. 

190 inMemoryDataset = writeStorageClass.delegate().assemble( 

191 inMemoryDataset, pytype=writeStorageClass.pytype 

192 ) 

193 else: 

194 # Unpack the dict and hope that works. 

195 inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset) 

196 else: 

197 # Hope that we can pass the arguments in directly. 

198 inMemoryDataset = writeStorageClass.pytype(inMemoryDataset) 

199 

200 return inMemoryDataset 

201 

202 def _coerceType( 

203 self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass 

204 ) -> Any: 

205 """Coerce the supplied inMemoryDataset to the correct python type. 

206 

207 Parameters 

208 ---------- 

209 inMemoryDataset : `object` 

210 Object to coerce to expected type. 

211 writeStorageClass : `StorageClass` 

212 Storage class used to serialize this data. 

213 readStorageClass : `StorageClass` 

214 Storage class requested as the outcome. 

215 

216 Returns 

217 ------- 

218 inMemoryDataset : `object` 

219 Object of expected type ``readStorageClass.pytype``. 

220 """ 

221 inMemoryDataset = self._coerceBuiltinType(inMemoryDataset, writeStorageClass) 

222 return readStorageClass.coerce_type(inMemoryDataset) 

223 

def read(self, component: str | None = None) -> Any:
    """Read a dataset, or one of its components, from a file.

    The file to read and the Python type to read it into are taken from
    ``self.fileDescriptor``.

    Parameters
    ----------
    component : `str`, optional
        Component to read from the file. Only used if the `StorageClass`
        for reading differed from the `StorageClass` used to write the
        file.

    Returns
    -------
    inMemoryDataset : `object`
        The requested data as a Python object. The type of object
        is controlled by the specific formatter. May be `None` when a
        component was requested.

    Raises
    ------
    ValueError
        Raised if no component was requested and no data could be
        obtained from the file.
    """
    descriptor = self.fileDescriptor

    # First a plain read of the file into the type it was written as.
    raw = self._readFile(descriptor.location.path, descriptor.storageClass.pytype)

    # Then build the requested dataset, possibly narrowing it down to a
    # single component, coerced to the appropriate Python type.
    result = self._assembleDataset(raw, component)

    # A `None` result is tolerated for component reads, where it signals
    # that the component was understood but is absent. For a full
    # dataset read it is an error.
    if component is None and result is None:
        raise ValueError(f"Unable to read data with URI {self.fileDescriptor.location.uri}")

    return result

265 

def fromBytes(self, serializedDataset: bytes, component: str | None = None) -> Any:
    """Deserialize bytes into a dataset or one of its components.

    Parameters
    ----------
    serializedDataset : `bytes`
        The serialized form that should be converted back to an object.
    component : `str`, optional
        Component to extract from the dataset. Only used if the
        `StorageClass` for reading differed from the `StorageClass` used
        to write the file.

    Returns
    -------
    inMemoryDataset : `object`
        The requested data as a Python object. The type of object
        is controlled by the specific formatter. May be `None` when a
        component was requested.

    Raises
    ------
    NotImplementedError
        Raised if this formatter has no ``_fromBytes`` attribute and so
        can not read from bytes.
    ValueError
        Raised if no component was requested and no data could be
        obtained from the bytes.
    """
    # Reading from bytes is optional; formatters opt in by defining
    # a _fromBytes method.
    if not hasattr(self, "_fromBytes"):
        raise NotImplementedError("Type does not support reading from bytes.")

    decoded = self._fromBytes(serializedDataset, self.fileDescriptor.storageClass.pytype)

    # Build the requested dataset, possibly narrowing it down to a single
    # component, coerced to the appropriate Python type.
    result = self._assembleDataset(decoded, component)

    # A `None` result is tolerated for component reads, where it signals
    # that the component was understood but is absent.
    if result is not None or component is not None:
        return result

    nbytes = len(serializedDataset)
    s = "" if nbytes == 1 else "s"
    raise ValueError(
        f"Unable to unpersist {nbytes} byte{s} from URI {self.fileDescriptor.location.uri}"
    )

308 

def write(self, inMemoryDataset: Any) -> None:
    """Store a Python object in a file.

    The location held by ``self.fileDescriptor`` is first updated with
    this formatter's ``extension``, after which the actual serialization
    is delegated to ``_writeFile``. Nothing is returned.

    Parameters
    ----------
    inMemoryDataset : `object`
        The Python object to store.
    """
    # Apply the formatter-preferred file extension to the location before
    # anything is written.
    self.fileDescriptor.location.updateExtension(self.extension)

    self._writeFile(inMemoryDataset)

327 

def toBytes(self, inMemoryDataset: Any) -> bytes:
    """Serialize the Dataset to bytes based on formatter.

    Parameters
    ----------
    inMemoryDataset : `object`
        Object to serialize.

    Returns
    -------
    serializedDataset : `bytes`
        Bytes representing the serialized dataset.

    Raises
    ------
    NotImplementedError
        Raised if this formatter has no ``_toBytes`` attribute and so
        does not support writing to bytes.
    """
    # Writing to bytes is optional; formatters opt in by defining
    # a _toBytes method.
    if not hasattr(self, "_toBytes"):
        # This is the serialization path, so the message describes
        # writing rather than reading.
        raise NotImplementedError("Type does not support writing to bytes.")

    return self._toBytes(inMemoryDataset)