# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _numpy_dtype_to_arrow_types,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    return data
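
# A minimal round-trip sketch for the helper above (the assertion mirrors the
# equality checks used in the tests below):
#
#     data = _makeSimpleNumpyTable()
#     arrow_tab = numpy_to_arrow(data)
#     assert np.all(arrow_to_numpy(arrow_tab) == data)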


def _makeSingleIndexDataFrame():
    """Make a single index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")
    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
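
# The second return value appends the index name to the data columns, so for
# the table built in _makeSimpleNumpyTable it looks like, e.g.,
# Index(['a', 'b', 'c', 'ddd', 'strcol', 'bytecol', 'index'], dtype='object').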


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
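
# The frame above has two column levels ("filter" and "column"), so a single
# top-level selection such as df.loc[:, ["g"]] returns the ("g", "a"),
# ("g", "b"), and ("g", "c") columns together, as exercised in the tests below.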


def _makeSimpleAstropyTable(include_multidim=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter
    return table


def _makeSimpleArrowTable(include_multidim=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim)
    return numpy_to_arrow(data)
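
# When include_multidim is True, the multi-dimensional columns appear to
# survive conversion because their shapes are recorded in the Arrow schema
# metadata (cf. _append_numpy_multidim_metadata, used directly in
# testEmptyArrowTableMultidim below).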


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})
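
    # The pattern above (component reads via componentTypeName() and column
    # subsetting via parameters={"columns": ...}) is repeated for each of the
    # storage classes tested below.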

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: a multi-index dataframe doesn't really
        # round-trip through astropy. This test simply checks that it's
        # readable, but the conversion is definitely not recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: a multi-index dataframe doesn't really
        # round-trip through numpy. This test simply checks that it's
        # readable, but the conversion is definitely not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")
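
# Note: the InMemory*DelegateTestCase classes reuse the formatter test bodies,
# but with an in-memory datastore the storage-class delegates (rather than
# ParquetFormatter) service the component and parameter requests.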


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities: a renamed column, a changed unit, a
        # changed description, and a changed format should each make the
        # schemas compare unequal.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        """Test writing an astropy table to parquet directly (without
        additional metadata) and ensure that we can read it back with all
        the new functionality.
        """
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because the conversion loses the units and gains a
        # bonus pandas _index_ column, so we just compare the dataframe forms.
        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip the comparison of units, descriptions, and formats.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality: renaming a column should make the schemas
        # compare unequal.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)
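
    # The intermediate dict form maps column names to numpy arrays, which
    # allows assembling the same Arrow table without first building a
    # structured array.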

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)
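
    # Note that the put above stores a pandas DataFrame through a dataset
    # type whose storage class is ArrowTable, relying on the butler's
    # storage-class conversion on write.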

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


if __name__ == "__main__":
    unittest.main()