# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None
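# Each optional dependency is set to None when unavailable so that the
# @unittest.skipUnless decorators below can detect it and skip the affected
# tests.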

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))

def _makeSimpleNumpyTable():
    """Make a simple numpy table with random data.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5
    data = np.zeros(
        nrow,
        dtype=[
            ("index", "i4"),
            ("a", "f8"),
            ("b", "f8"),
            ("c", "f8"),
            ("ddd", "f8"),
            ("strcol", "U10"),
            ("bytecol", "a10"),
        ],
    )
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    return data
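# The helper above returns a 5-row structured array mixing integer, float,
# fixed-width unicode ("U10"), and bytes ("a10") columns, so the tests below
# exercise type handling for all of these.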

def _makeSingleIndexDataFrame():
    """Make a single index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `~pandas.Index`
        Index of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")
    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns

def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
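# For reference, the MultiIndex above pairs each filter with each column
# name, e.g. df["g"]["a"] selects the ("g", "a") column; the tests below
# address such columns with the {"filter": ..., "column": ...} parameter
# syntax.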

def _makeSimpleAstropyTable():
    """Make an astropy table for testing.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable()
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter
    return table

def _makeSimpleArrowTable():
    """Make an arrow table for testing.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable()
    return numpy_to_arrow(data)
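# A minimal round-trip sketch using the helpers above (assuming numpy and
# pyarrow are importable):
#
#     data = _makeSimpleNumpyTable()
#     arrow_table = numpy_to_arrow(data)
#     assert np.all(arrow_to_numpy(arrow_table) == data)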

@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
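    # The "columns", "rowcount", and "schema" reads above use dataset
    # components, which let callers inspect the stored table without reading
    # all of the data.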

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)
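        # DataFrame.to_parquet writes a plain parquet file without any
        # butler-specific metadata, so the ingest below exercises the
        # formatter's handling of pre-existing "legacy" files.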

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)
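    # Note that passing storageClass= to butler.get converts the stored
    # DataFrame to the requested in-memory type on read; the remaining tests
    # in this class exercise those conversions.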

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.
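# The InMemory*DelegateTestCase classes below subclass the formatter test
# cases so that the same tests also run against an in-memory datastore; the
# overridden methods cover cases where the in-memory behavior differs.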

@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)
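    # Together, the cases above show that ArrowAstropySchema equality checks
    # column names, units, descriptions, and formats.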

    def testAstropyParquet(self):
        """Test writing an astropy table to parquet directly (without
        additional metadata) and ensure that we can read it back with all
        the new functionality.
        """
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.
        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            If `True`, do not compare units, descriptions, or formats.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable()
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable()

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column, so we compare the
        # dataframe form instead.
        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))

@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)
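    # Several tests below put pandas/astropy/numpy objects into this
    # ArrowTable dataset type, relying on storage class conversion on put as
    # well as on get.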

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = [(name, pa.from_numpy_dtype(data.dtype[name].type)) for name in data.dtype.names]

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)
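        # tab1 has the full schema but zero rows; the conversions below
        # check that schema-only information survives each round trip.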

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))

@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

if __name__ == "__main__":
    unittest.main()