# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _numpy_dtype_to_arrow_types,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
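
# A minimal sketch of how the multidimensional columns above behave (an
# illustrative aside, not part of the test suite; assumes numpy is
# importable):
#
#     data = _makeSimpleNumpyTable(include_multidim=True)
#     data["d2"].shape  # (5, 5, 10): five rows, each holding a 5x10 block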


def _makeSingleIndexDataFrame(include_masked=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
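
# Note on the masked columns above: pd.array(..., dtype=pd.Int64Dtype())
# creates a nullable integer column, so assigning None via df.loc masks the
# value without casting the column to float. A sketch (illustrative only,
# assuming pandas is importable):
#
#     df = pd.DataFrame({"m1": pd.array([1, 2, 3], dtype=pd.Int64Dtype())})
#     df.loc[1, "m1"] = None  # dtype stays Int64; the value becomes <NA>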


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
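
# Columns of the MultiIndex frame above are addressed with (filter, column)
# tuples, as the tests below do; for example (illustrative only):
#
#     df = _makeMultiIndexDataFrame()
#     df.loc[:, [("g", "a"), ("r", "c")]]  # select two (filter, column) pairs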


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table
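
# For reference, the masked columns above become ordinary astropy masked
# columns; a sketch of inspecting the mask (illustrative only, assuming
# astropy and numpy are importable):
#
#     tab = _makeSimpleAstropyTable(include_masked=True)
#     tab["m1"].mask  # array([False, True, False, False, False])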


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
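
# The parquet formatter conversion helpers imported above are exercised by
# the tests below; as a standalone sketch (illustrative only, assuming numpy
# and pyarrow are importable), a structured array round-trips through arrow:
#
#     data = _makeSimpleNumpyTable()
#     arrow_table = numpy_to_arrow(data)
#     assert np.all(arrow_to_numpy(arrow_table) == data)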


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        skip_units : `bool`
        has_bigendian : `bool`
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, forcing both to the same
                # (big-endian) byte order for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        has_bigendian : `bool`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, forcing both to the same
                # (big-endian) byte order for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


if __name__ == "__main__":
    unittest.main()