# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest
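
# Optional dependencies are imported defensively; the test classes and
# methods below use skipUnless decorators so that tests are skipped, rather
# than failing, when a dependency is not importable.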

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable():
    """Make a simple numpy table with random data.

    Returns
    -------
    numpyTable : `numpy.ndarray`
    """
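    # Mixed dtypes below: integer, float, fixed-width unicode ("U10"), and
    # bytes ("a10") columns exercise distinct type-handling paths in the
    # arrow conversions tested in this module.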

    nrow = 5
    data = np.zeros(
        nrow,
        dtype=[
            ("index", "i4"),
            ("a", "f8"),
            ("b", "f8"),
            ("c", "f8"),
            ("ddd", "f8"),
            ("strcol", "U10"),
            ("bytecol", "a10"),
        ],
    )
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    return data


def _makeSingleIndexDataFrame():
    """Make a single index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")
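    # df.columns on an indexed frame excludes the index itself, so append
    # the index names to build the full list of stored columns.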

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


def _makeSimpleAstropyTable():
    """Make an astropy table for testing.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable()
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter
    return table


def _makeSimpleArrowTable():
    """Make an arrow table for testing.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable()
    return numpy_to_arrow(data)
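

# A minimal sketch of the conversion API exercised below (assuming numpy and
# pyarrow are importable): a numpy table is expected to round-trip losslessly
# through arrow. This helper is illustrative only and is not called by the
# tests.
def _exampleNumpyArrowRoundTrip():
    data = _makeSimpleNumpyTable()
    arrow_table = numpy_to_arrow(data)
    data2 = arrow_to_numpy(arrow_table)
    # Column names, dtypes, and values should all be preserved.
    assert data2.dtype.names == data.dtype.names
    assert all(data2[name].dtype == data[name].dtype for name in data.dtype.names)
    assert np.all(data2 == data)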


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None
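        # The None should come back from the parquet round trip as a null
        # value rather than being coerced to an empty string.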

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)
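        # Ingest the externally written file so that it is read back through
        # ParquetFormatter, then write the same frame through the butler for
        # comparison.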

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.
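        # (A plausible explanation, not asserted here: the two-level column
        # index is flattened to string column names on write, so the astropy
        # view cannot reconstruct the original MultiIndex structure.)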

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.
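        # (Presumably the same column-name flattening as in the astropy
        # variant above applies here.)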


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        """Test writing an astropy table to parquet directly (without the
        additional metadata added by the butler) and ensure that we can
        read it back with all the new functionality.
        """
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)
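        # Written through astropy's own parquet writer, so the file lacks
        # the extra metadata that ParquetFormatter adds on put().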

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.
        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        skip_units : `bool`
            If `True`, do not compare per-column units/descriptions/formats.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable()
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable()

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column, so we compare the
        # dataframe forms instead.
        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable()

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = [(name, pa.from_numpy_dtype(data.dtype[name].type)) for name in data.dtype.names]

        schema = pa.schema(type_list)
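        # Build a zero-row table below: one empty list per column, paired
        # with the full schema so that the column types are preserved.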

        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


if __name__ == "__main__":
    unittest.main()