Coverage for python/lsst/daf/butler/formatters/file.py: 25%

67 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-06-06 09:38 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24"""Support for reading and writing files to a POSIX file system.""" 

25 

26__all__ = ("FileFormatter",) 

27 

28import dataclasses 

29from abc import abstractmethod 

30from typing import TYPE_CHECKING, Any, Optional, Type 

31 

32from lsst.daf.butler import Formatter 

33 

34if TYPE_CHECKING: 

35 from lsst.daf.butler import StorageClass 

36 

37 

38class FileFormatter(Formatter): 

39 """Interface for reading and writing files on a POSIX file system.""" 

40 

41 extension: Optional[str] = None 

42 """Default file extension to use for writing files. None means that no 

43 modifications will be made to the supplied file extension. (`str`)""" 

44 

45 @abstractmethod 

46 def _readFile(self, path: str, pytype: Optional[Type[Any]] = None) -> Any: 

47 """Read a file from the path in the correct format. 

48 

49 Parameters 

50 ---------- 

51 path : `str` 

52 Path to use to open the file. 

53 pytype : `class`, optional 

54 Class to use to read the file. 

55 

56 Returns 

57 ------- 

58 data : `object` 

59 Data read from file. Returns `None` if the file cannot be

60 found at the given path.

61 

62 Raises 

63 ------ 

64 Exception 

65 Some problem reading the file. 

66 """ 

67 pass 

68 

69 @abstractmethod 

70 def _writeFile(self, inMemoryDataset: Any) -> None: 

71 """Write the in memory dataset to file on disk. 

72 

73 Parameters 

74 ---------- 

75 inMemoryDataset : `object` 

76 Object to serialize. 

77 

78 Raises 

79 ------ 

80 Exception 

81 The file could not be written. 

82 """ 

83 pass 

84 

85 def _assembleDataset(self, data: Any, component: Optional[str] = None) -> Any: 

86 """Assembles and coerces the dataset, or one of its components, 

87 into an appropriate python type and returns it. 

88 

89 Parameters 

90 ---------- 

91 data : `dict` or `object` 

92 Composite or dict that, or a component of which, needs to be

93 coerced to the Python type specified in ``fileDescriptor``.

94 component : `str`, optional 

95 Component to read from the file. Only used if the `StorageClass` 

96 for reading differed from the `StorageClass` used to write the 

97 file. 

98 

99 Returns 

100 ------- 

101 inMemoryDataset : `object` 

102 The requested data as a Python object. The type of object 

103 is controlled by the specific formatter. 

104 """ 

105 fileDescriptor = self.fileDescriptor 

106 

107 # Get the read and write storage classes. 

108 readStorageClass = fileDescriptor.readStorageClass 

109 writeStorageClass = fileDescriptor.storageClass 

110 

111 if component is not None: 

112 # Requesting a component implies that we need to first ensure 

113 # that the composite is the correct python type. Lie to the 

114 # coercion routine since the read StorageClass is not relevant 

115 # if we want the original. 

116 data = self._coerceType(data, writeStorageClass, writeStorageClass) 

117 

118 # Concrete composite written as a single file (we hope) 

119 # so try to get the component. 

120 try: 

121 data = fileDescriptor.storageClass.delegate().getComponent(data, component) 

122 except AttributeError: 

123 # Defer the complaint 

124 data = None 

125 

126 # Update the write storage class to match that of the component. 

127 # It should be safe to use the component storage class directly 

128 # since that should match what was returned from getComponent 

129 # (else we could create a temporary storage class guaranteed to 

130 # match the python type we have). 

131 writeStorageClass = writeStorageClass.allComponents()[component] 

132 

133 # Coerce to the requested type. 

134 data = self._coerceType(data, writeStorageClass, readStorageClass) 

135 

136 return data 

137 

138 def _coerceBuiltinType(self, inMemoryDataset: Any, writeStorageClass: StorageClass) -> Any: 

139 """Coerce the supplied inMemoryDataset to the written python type if it 

140 is currently a built-in type. 

141 

142 Parameters 

143 ---------- 

144 inMemoryDataset : `object` 

145 Object to coerce to expected type. 

146 writeStorageClass : `StorageClass` 

147 Storage class used to serialize this data. 

148 

149 Returns 

150 ------- 

151 inMemoryDataset : `object` 

152 Object of expected type ``writeStorageClass.pytype``. 

153 

154 Notes 

155 ----- 

156 This method only modifies the supplied object if the object:

157

158 * Is not already the required type.

159 * Is not `None`.

160 * Looks like a built-in type.

161 

162 It is intended to be used as a helper for file formats that do not 

163 store the original Python type information in serialized form and 

164 instead return built-in types such as `dict` and `list` that need 

165 to be converted to the required form. This happens before 

166 `StorageClass` converters trigger so that constructors can be 

167 called that can build the original type first before checking the 

168 requested Python type. This is important for Pydantic models where 

169 the internal structure of the model may not match the `dict` form 

170 in a scenario where the user has requested a `dict`. 

171 """ 

172 if ( 

173 inMemoryDataset is not None 

174 and not isinstance(inMemoryDataset, writeStorageClass.pytype) 

175 and type(inMemoryDataset).__module__ == "builtins" 

176 ): 

177 # Try different ways of converting to the required type. 

178 if hasattr(writeStorageClass.pytype, "parse_obj"): 

179 # This is for a Pydantic model. 

180 inMemoryDataset = writeStorageClass.pytype.parse_obj(inMemoryDataset) 

181 elif isinstance(inMemoryDataset, dict): 

182 if dataclasses.is_dataclass(writeStorageClass.pytype): 

183 # Dataclasses accept key/value parameters. 

184 inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset) 

185 elif writeStorageClass.isComposite(): 

186 # Assume that this type can be constructed 

187 # using the registered assembler from a dict. 

188 inMemoryDataset = writeStorageClass.delegate().assemble( 

189 inMemoryDataset, pytype=writeStorageClass.pytype 

190 ) 

191 else: 

192 # Unpack the dict and hope that works. 

193 inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset) 

194 else: 

195 # Hope that we can pass the arguments in directly. 

196 inMemoryDataset = writeStorageClass.pytype(inMemoryDataset) 

197 

198 return inMemoryDataset 

199 

200 def _coerceType( 

201 self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass 

202 ) -> Any: 

203 """Coerce the supplied inMemoryDataset to the correct python type. 

204 

205 Parameters 

206 ---------- 

207 inMemoryDataset : `object` 

208 Object to coerce to expected type. 

209 writeStorageClass : `StorageClass` 

210 Storage class used to serialize this data. 

211 readStorageClass : `StorageClass` 

212 Storage class requested as the outcome. 

213 

214 Returns 

215 ------- 

216 inMemoryDataset : `object` 

217 Object of expected type ``readStorageClass.pytype``. 

218 """ 

219 inMemoryDataset = self._coerceBuiltinType(inMemoryDataset, writeStorageClass) 

220 return readStorageClass.coerce_type(inMemoryDataset) 

221 

222 def read(self, component: Optional[str] = None) -> Any: 

223 """Read data from a file. 

224 

225 Parameters 

226 ---------- 

227 fileDescriptor : `FileDescriptor` 

228 Identifies the file to read, type to read it into and parameters 

229 to be used for reading. 

230 component : `str`, optional 

231 Component to read from the file. Only used if the `StorageClass` 

232 for reading differed from the `StorageClass` used to write the 

233 file. 

234 

235 Returns 

236 ------- 

237 inMemoryDataset : `object` 

238 The requested data as a Python object. The type of object 

239 is controlled by the specific formatter. 

240 

241 Raises 

242 ------ 

243 ValueError 

244 Component requested but this file does not seem to be a concrete 

245 composite. 

246 NotImplementedError 

247 Formatter does not implement a method to read from files. 

248 """ 

249 

250 # Read the file naively 

251 path = self.fileDescriptor.location.path 

252 data = self._readFile(path, self.fileDescriptor.storageClass.pytype) 

253 

254 # Assemble the requested dataset and potentially return only its 

255 # component coercing it to its appropriate pytype 

256 data = self._assembleDataset(data, component) 

257 

258 # Special case components by allowing a formatter to return None 

259 # to indicate that the component was understood but is missing 

260 if data is None and component is None: 

261 raise ValueError(f"Unable to read data with URI {self.fileDescriptor.location.uri}") 

262 

263 return data 

264 

265 def fromBytes(self, serializedDataset: bytes, component: Optional[str] = None) -> Any: 

266 """Reads serialized data into a Dataset or its component. 

267 

268 Parameters 

269 ---------- 

270 serializedDataset : `bytes` 

271 Bytes object to unserialize. 

272 component : `str`, optional 

273 Component to read from the Dataset. Only used if the `StorageClass` 

274 for reading differed from the `StorageClass` used to write the 

275 file. 

276 

277 Returns 

278 ------- 

279 inMemoryDataset : `object` 

280 The requested data as a Python object. The type of object 

281 is controlled by the specific formatter. 

282 

283 Raises 

284 ------ 

285 NotImplementedError 

286 Formatter does not support reading from bytes. 

287 """ 

288 if not hasattr(self, "_fromBytes"): 

289 raise NotImplementedError("Type does not support reading from bytes.") 

290 

291 data = self._fromBytes(serializedDataset, self.fileDescriptor.storageClass.pytype) 

292 

293 # Assemble the requested dataset and potentially return only its 

294 # component coercing it to its appropriate pytype 

295 data = self._assembleDataset(data, component) 

296 

297 # Special case components by allowing a formatter to return None 

298 # to indicate that the component was understood but is missing 

299 if data is None and component is None: 

300 nbytes = len(serializedDataset) 

301 s = "s" if nbytes != 1 else "" 

302 raise ValueError( 

303 f"Unable to unpersist {nbytes} byte{s} from URI {self.fileDescriptor.location.uri}" 

304 ) 

305 

306 return data 

307 

308 def write(self, inMemoryDataset: Any) -> None: 

309 """Write a Python object to a file. 

310 

311 Parameters 

312 ---------- 

313 inMemoryDataset : `object` 

314 The Python object to store. 

315 

316 Returns 

317 ------- 

318 path : `str` 

319 The path where the primary file is stored within the datastore. 

320 """ 

321 fileDescriptor = self.fileDescriptor 

322 # Update the location with the formatter-preferred file extension 

323 fileDescriptor.location.updateExtension(self.extension) 

324 

325 self._writeFile(inMemoryDataset) 

326 

327 def toBytes(self, inMemoryDataset: Any) -> bytes: 

328 """Serialize the Dataset to bytes based on formatter. 

329 

330 Parameters 

331 ---------- 

332 inMemoryDataset : `object` 

333 Object to serialize. 

334 

335 Returns 

336 ------- 

337 serializedDataset : `bytes` 

338 Bytes representing the serialized dataset. 

339 

340 Raises 

341 ------ 

342 NotImplementedError 

343 Formatter does not support writing to bytes.

344 """ 

345 if not hasattr(self, "_toBytes"): 

346 raise NotImplementedError("Type does not support reading from bytes.") 

347 

348 return self._toBytes(inMemoryDataset)