# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _numpy_dtype_to_arrow_types,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))
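
# Every test case below exercises the same butler round-trip pattern,
# sketched here for orientation (using the butler and dataset type that each
# case constructs in setUp):
#
#     butler.put(table, datasetType, dataId={})
#     full = butler.get(datasetType, dataId={})
#     subset = butler.get(datasetType, dataId={}, parameters={"columns": ["a"]})
#     schema = butler.get(datasetType.componentTypeName("schema"), dataId={})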


def _makeSimpleNumpyTable(include_multidim=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    return data
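
# With include_multidim=True the table above gains fixed-shape array columns;
# e.g. "d2" is declared as ("d2", "i8", (5, 10)), so with nrow=5 the full
# array has shape (5, 5, 10). These columns exercise the multi-dimensional
# metadata handling tested below via _append_numpy_multidim_metadata.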


def _makeSingleIndexDataFrame():
    """Make a single index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")
    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
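
# allColumns holds the data columns plus the "index" column that was moved
# into the frame's index; this matches what the DataFrame "columns" component
# returns for a single-index frame in the tests below.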


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
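
# The resulting frame has a two-level column index with entries such as
# ("g", "a") and ("r", "c"); the {"filter": ..., "column": ...} read
# parameters in the multi-index tests below select against these two levels.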


def _makeSimpleAstropyTable(include_multidim=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter
    return table


def _makeSimpleArrowTable(include_multidim=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim)
    return numpy_to_arrow(data)
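
# A minimal conversion sketch using the helpers imported above (each of
# these round-trips is asserted in detail by the test cases that follow):
#
#     arrow_tab = numpy_to_arrow(_makeSimpleNumpyTable())
#     numpy_tab = arrow_to_numpy(arrow_tab)  # recovers the structured dtype
#     astropy_tab = arrow_to_astropy(arrow_tab)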


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
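        # Note: "index" is carried as the frame's index rather than as a data
        # column, so requesting ["index", "a"] yields the same frame as
        # requesting just ["a"].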

        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")
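
        # Ingesting registers the externally written parquet file with the
        # butler (copying it into the datastore), after which it is read
        # through the same component/parameter machinery as a butler.put()
        # dataset.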

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck that doesn't really round-trip.
        # This test simply checks that it's readable, but it is definitely
        # not a recommended usage.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck that doesn't really round-trip.
        # This test simply checks that it's readable, but it is definitely
        # not a recommended usage.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        """Test writing an astropy table to parquet directly via astropy
        (without additional metadata) and ensure that we can read it back
        with all the new functionality.
        """
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.
        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip checking column units, descriptions, and formats.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

912 def testArrowNumpySchema(self): 

913 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

914 tab1_arrow = numpy_to_arrow(tab1) 

915 schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema) 

916 

917 self.assertIsInstance(schema.schema, np.dtype) 

918 self.assertEqual(repr(schema), repr(schema._dtype)) 

919 self.assertNotEqual(schema, "not_a_schema") 

920 self.assertEqual(schema, schema) 

921 

922 # Test inequality 

923 tab2 = tab1.copy() 

924 names = list(tab2.dtype.names) 

925 names[0] = "index2" 

926 tab2.dtype.names = names 

927 schema2 = ArrowNumpySchema(tab2.dtype) 

928 self.assertNotEqual(schema2, schema) 

929 

930 @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.") 

931 def testNumpyDictConversions(self): 

932 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

933 

934 # Verify that everything round-trips, including the schema. 

935 tab1_arrow = numpy_to_arrow(tab1) 

936 tab1_dict = arrow_to_numpy_dict(tab1_arrow) 

937 tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict) 

938 

939 self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema) 

940 self.assertEqual(tab1_arrow, tab1_dict_arrow) 

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))

1037 

1038@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.") 

1039class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase): 

1040 """Tests for InMemoryDatastore, using ArrowNumpyDelegate.""" 

1041 

1042 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1043 

1044 def testBadInput(self): 

1045 tab1 = _makeSimpleNumpyTable() 

1046 delegate = ArrowNumpyDelegate("ArrowNumpy") 

1047 

1048 with self.assertRaises(ValueError): 

1049 delegate.handleParameters(inMemoryDataset="not_a_numpy_table") 

1050 

1051 with self.assertRaises(NotImplementedError): 

1052 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]}) 

1053 

1054 with self.assertRaises(AttributeError): 

1055 delegate.getComponent(composite=tab1, componentName="nothing") 

1056 

1057 def testStorageClass(self): 

1058 tab1 = _makeSimpleNumpyTable() 

1059 

1060 factory = StorageClassFactory() 

1061 factory.addFromConfig(StorageClassConfig()) 

1062 

1063 storageClass = factory.findStorageClass(type(tab1), compare_types=False) 

1064 # Force the name lookup to do name matching. 

1065 storageClass._pytype = None 

1066 self.assertEqual(storageClass.name, "ArrowNumpy") 

1067 

1068 storageClass = factory.findStorageClass(type(tab1), compare_types=True) 

1069 # Force the name lookup to do name matching. 

1070 storageClass._pytype = None 

1071 self.assertEqual(storageClass.name, "ArrowNumpy") 


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )
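
        # (A plausible explanation, not verified here: an empty object column
        # gives arrow no values from which to infer a string type, so those
        # columns come back with a different type and are excluded above.)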

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

1161 def testEmptyArrowTableMultidim(self): 

1162 data = _makeSimpleNumpyTable(include_multidim=True) 

1163 type_list = _numpy_dtype_to_arrow_types(data.dtype) 

1164 

1165 md = {} 

1166 for name in data.dtype.names: 

1167 _append_numpy_multidim_metadata(md, name, data.dtype[name]) 

1168 

1169 schema = pa.schema(type_list, metadata=md) 

1170 arrays = [[]] * len(schema.names) 

1171 

1172 tab1 = pa.Table.from_arrays(arrays, schema=schema) 

1173 

1174 self.butler.put(tab1, self.datasetType, dataId={}) 

1175 tab2 = self.butler.get(self.datasetType, dataId={}) 

1176 self.assertEqual(tab2, tab1) 

1177 

1178 tab1_numpy = arrow_to_numpy(tab1) 

1179 self.assertEqual(len(tab1_numpy), 0) 

1180 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy) 

1181 self.assertEqual(tab1_numpy_arrow, tab1) 

1182 

1183 tab1_astropy = arrow_to_astropy(tab1) 

1184 self.assertEqual(len(tab1_astropy), 0) 

1185 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy) 

1186 self.assertEqual(tab1_astropy_arrow, tab1) 

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


if __name__ == "__main__":
    unittest.main()