Coverage for python/lsst/daf/butler/formatters/file.py: 29%

67 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-21 09:55 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for reading and writing files to a POSIX file system.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("FileFormatter",) 

27 

28import dataclasses 

29from abc import abstractmethod 

30from typing import TYPE_CHECKING, Any 

31 

32from lsst.daf.butler import Formatter 

33 

34if TYPE_CHECKING: 

35 from lsst.daf.butler import StorageClass 

36 

37 

38class FileFormatter(Formatter): 

39 """Interface for reading and writing files on a POSIX file system.""" 

40 

41 extension: str | None = None 

42 """Default file extension to use for writing files. None means that no 

43 modifications will be made to the supplied file extension. (`str`)""" 

44 

45 @abstractmethod 

46 def _readFile(self, path: str, pytype: type[Any] | None = None) -> Any: 

47 """Read a file from the path in the correct format. 

48 

49 Parameters 

50 ---------- 

51 path : `str` 

52 Path to use to open the file. 

53 pytype : `class`, optional 

54 Class to use to read the file. 

55 

56 Returns 

57 ------- 

58 data : `object` 

59 Data read from file. Returns `None` if the file can not be 

60 found at the given path. 

61 

62 Raises 

63 ------ 

64 Exception 

65 Some problem reading the file. 

66 """ 

67 pass 

68 

69 @abstractmethod 

70 def _writeFile(self, inMemoryDataset: Any) -> None: 

71 """Write the in memory dataset to file on disk. 

72 

73 Parameters 

74 ---------- 

75 inMemoryDataset : `object` 

76 Object to serialize. 

77 

78 Raises 

79 ------ 

80 Exception 

81 The file could not be written. 

82 """ 

83 pass 

84 

85 def _assembleDataset(self, data: Any, component: str | None = None) -> Any: 

86 """Assembles and coerces the dataset, or one of its components, 

87 into an appropriate python type and returns it. 

88 

89 Parameters 

90 ---------- 

91 data : `dict` or `object` 

92 Composite or a dict that, or which component, needs to be 

93 coerced to the python type specified in "fileDescriptor" 

94 component : `str`, optional 

95 Component to read from the file. Only used if the `StorageClass` 

96 for reading differed from the `StorageClass` used to write the 

97 file. 

98 

99 Returns 

100 ------- 

101 inMemoryDataset : `object` 

102 The requested data as a Python object. The type of object 

103 is controlled by the specific formatter. 

104 """ 

105 fileDescriptor = self.fileDescriptor 

106 

107 # Get the read and write storage classes. 

108 readStorageClass = fileDescriptor.readStorageClass 

109 writeStorageClass = fileDescriptor.storageClass 

110 

111 if component is not None: 

112 # Requesting a component implies that we need to first ensure 

113 # that the composite is the correct python type. Lie to the 

114 # coercion routine since the read StorageClass is not relevant 

115 # if we want the original. 

116 data = self._coerceType(data, writeStorageClass, writeStorageClass) 

117 

118 # Concrete composite written as a single file (we hope) 

119 # so try to get the component. 

120 try: 

121 data = fileDescriptor.storageClass.delegate().getComponent(data, component) 

122 except AttributeError: 

123 # Defer the complaint 

124 data = None 

125 

126 # Update the write storage class to match that of the component. 

127 # It should be safe to use the component storage class directly 

128 # since that should match what was returned from getComponent 

129 # (else we could create a temporary storage class guaranteed to 

130 # match the python type we have). 

131 writeStorageClass = writeStorageClass.allComponents()[component] 

132 

133 # Coerce to the requested type. 

134 data = self._coerceType(data, writeStorageClass, readStorageClass) 

135 

136 return data 

137 

138 def _coerceBuiltinType(self, inMemoryDataset: Any, writeStorageClass: StorageClass) -> Any: 

139 """Coerce the supplied inMemoryDataset to the written python type if it 

140 is currently a built-in type. 

141 

142 Parameters 

143 ---------- 

144 inMemoryDataset : `object` 

145 Object to coerce to expected type. 

146 writeStorageClass : `StorageClass` 

147 Storage class used to serialize this data. 

148 

149 Returns 

150 ------- 

151 inMemoryDataset : `object` 

152 Object of expected type ``writeStorageClass.pytype``. 

153 

154 Notes 

155 ----- 

156 This method only modifies the supplied object if the object is: 

157 

158 * Not already the required type. 

159 * Not `None`. 

160 * Looks like a built-in type. 

161 

162 It is intended to be used as a helper for file formats that do not 

163 store the original Python type information in serialized form and 

164 instead return built-in types such as `dict` and `list` that need 

165 to be converted to the required form. This happens before 

166 `StorageClass` converters trigger so that constructors can be 

167 called that can build the original type first before checking the 

168 requested Python type. This is important for Pydantic models where 

169 the internal structure of the model may not match the `dict` form 

170 in a scenario where the user has requested a `dict`. 

171 """ 

172 if ( 

173 inMemoryDataset is not None 

174 and not isinstance(inMemoryDataset, writeStorageClass.pytype) 

175 and type(inMemoryDataset).__module__ == "builtins" 

176 ): 

177 # Try different ways of converting to the required type. 

178 if hasattr(writeStorageClass.pytype, "parse_obj"): 

179 # This is for a Pydantic model. 

180 inMemoryDataset = writeStorageClass.pytype.parse_obj(inMemoryDataset) 

181 elif isinstance(inMemoryDataset, dict): 

182 if dataclasses.is_dataclass(writeStorageClass.pytype): 

183 # Dataclasses accept key/value parameters. 

184 inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset) 

185 elif writeStorageClass.isComposite(): 

186 # Assume that this type can be constructed 

187 # using the registered assembler from a dict. 

188 inMemoryDataset = writeStorageClass.delegate().assemble( 

189 inMemoryDataset, pytype=writeStorageClass.pytype 

190 ) 

191 else: 

192 # Unpack the dict and hope that works. 

193 inMemoryDataset = writeStorageClass.pytype(**inMemoryDataset) 

194 else: 

195 # Hope that we can pass the arguments in directly. 

196 inMemoryDataset = writeStorageClass.pytype(inMemoryDataset) 

197 

198 return inMemoryDataset 

199 

200 def _coerceType( 

201 self, inMemoryDataset: Any, writeStorageClass: StorageClass, readStorageClass: StorageClass 

202 ) -> Any: 

203 """Coerce the supplied inMemoryDataset to the correct python type. 

204 

205 Parameters 

206 ---------- 

207 inMemoryDataset : `object` 

208 Object to coerce to expected type. 

209 writeStorageClass : `StorageClass` 

210 Storage class used to serialize this data. 

211 readStorageClass : `StorageClass` 

212 Storage class requested as the outcome. 

213 

214 Returns 

215 ------- 

216 inMemoryDataset : `object` 

217 Object of expected type ``readStorageClass.pytype``. 

218 """ 

219 inMemoryDataset = self._coerceBuiltinType(inMemoryDataset, writeStorageClass) 

220 return readStorageClass.coerce_type(inMemoryDataset) 

221 

def read(self, component: str | None = None) -> Any:
    """Read data from a file.

    The file to read is identified by ``self.fileDescriptor``: its
    location supplies the path, and the python type of its storage class
    is passed to ``_readFile``.

    Parameters
    ----------
    component : `str`, optional
        Component to read from the file. Only used if the `StorageClass`
        for reading differed from the `StorageClass` used to write the
        file.

    Returns
    -------
    inMemoryDataset : `object`
        The requested data as a Python object. The type of object
        is controlled by the specific formatter. Can be `None` when a
        component was requested.

    Raises
    ------
    ValueError
        Raised if no data could be obtained and no component was
        requested.

    Notes
    -----
    Exceptions raised by ``_readFile`` or while assembling the dataset are
    not caught here.
    """
    descriptor = self.fileDescriptor

    # Read the file naively.
    raw = self._readFile(descriptor.location.path, descriptor.storageClass.pytype)

    # Assemble the requested dataset, potentially reducing it to a single
    # component, and coerce it to the appropriate python type.
    result = self._assembleDataset(raw, component)

    # Components are special: a formatter may return None to indicate that
    # the component was understood but is missing. Only a missing whole
    # dataset is treated as an error.
    if result is None and component is None:
        raise ValueError(f"Unable to read data with URI {descriptor.location.uri}")

    return result

263 

def fromBytes(self, serializedDataset: bytes, component: str | None = None) -> Any:
    """Reconstruct a Dataset, or one of its components, from bytes.

    The work of decoding is delegated to a ``_fromBytes`` method, which a
    formatter provides if it supports this operation.

    Parameters
    ----------
    serializedDataset : `bytes`
        The serialized form of the dataset.
    component : `str`, optional
        Component to read from the Dataset. Only used if the `StorageClass`
        for reading differed from the `StorageClass` used to write the
        file.

    Returns
    -------
    inMemoryDataset : `object`
        The requested data as a Python object. The type of object
        is controlled by the specific formatter.

    Raises
    ------
    NotImplementedError
        Raised if the formatter has no ``_fromBytes`` method.
    ValueError
        Raised if nothing could be unpersisted and no component was
        requested.
    """
    if not hasattr(self, "_fromBytes"):
        raise NotImplementedError("Type does not support reading from bytes.")

    decoded = self._fromBytes(serializedDataset, self.fileDescriptor.storageClass.pytype)

    # Build the requested dataset, possibly reduced to one component, in
    # the appropriate python type.
    result = self._assembleDataset(decoded, component)

    # A None result is acceptable for a component, since a formatter may
    # use it to indicate that the component was understood but is missing.
    if result is not None or component is not None:
        return result

    nbytes = len(serializedDataset)
    plural = "" if nbytes == 1 else "s"
    raise ValueError(
        f"Unable to unpersist {nbytes} byte{plural} from URI {self.fileDescriptor.location.uri}"
    )

306 

def write(self, inMemoryDataset: Any) -> None:
    """Write a Python object to a file.

    Before writing, the location held by ``self.fileDescriptor`` is
    updated in place to use this formatter's ``extension``. The actual
    serialization is delegated to ``_writeFile``.

    Parameters
    ----------
    inMemoryDataset : `object`
        The Python object to store.

    Notes
    -----
    Nothing is returned. Exceptions raised by ``_writeFile`` are not
    caught here.
    """
    # Update the location with the formatter-preferred file extension.
    # This has to happen before the file is written.
    self.fileDescriptor.location.updateExtension(self.extension)

    self._writeFile(inMemoryDataset)

325 

def toBytes(self, inMemoryDataset: Any) -> bytes:
    """Serialize the Dataset to bytes based on formatter.

    The work of encoding is delegated to a ``_toBytes`` method, which a
    formatter provides if it supports this operation.

    Parameters
    ----------
    inMemoryDataset : `object`
        Object to serialize.

    Returns
    -------
    serializedDataset : `bytes`
        Bytes representing the serialized dataset.

    Raises
    ------
    NotImplementedError
        Raised if the formatter has no ``_toBytes`` method and so does
        not support writing to bytes.
    """
    if not hasattr(self, "_toBytes"):
        # This is the serialization direction, so the message must talk
        # about writing rather than reading.
        raise NotImplementedError("Type does not support writing to bytes.")

    return self._toBytes(inMemoryDataset)