# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    # A failed pandas import must null out pd (not np) so that the
    # "pd is not None" skip conditions below work as intended.
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _numpy_dtype_to_arrow_types,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    return data


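# A hedged, illustrative sketch (defined but never called by the tests):
# what the helper above produces and how it is meant to flow through the
# arrow conversion functions under test. The round-trip assertion is an
# assumption based on how numpy_to_arrow/arrow_to_numpy are exercised
# below, not a documented guarantee.
def _sketchNumpyRoundTrip():
    data = _makeSimpleNumpyTable(include_multidim=True)
    # Structured dtype: multidim fields are per-row blocks, so
    # data["d2"].shape is (5, 5, 10) -- nrow rows of (5, 10) blocks.
    arrow_table = numpy_to_arrow(data)
    # The multidim shapes travel in the arrow schema metadata.
    round_tripped = arrow_to_numpy(arrow_table)
    assert round_tripped.dtype.names == data.dtype.names
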

def _makeSingleIndexDataFrame(include_masked=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


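# Hedged illustration of the masked-column conventions the helper above
# relies on (not run by the tests; exact behaviour varies somewhat with
# the pandas version): pd.Int64Dtype() keeps an integer column nullable
# via pd.NA, while assigning None into a float32 column silently becomes
# NaN.
def _sketchMaskedPandasColumns():
    df, _ = _makeSingleIndexDataFrame(include_masked=True)
    # The row with index label 1 was set to None for every "m*" column.
    assert df["m1"].isna()[1] and df["m2"].isna()[1]
    # The nullable integer column keeps an integer dtype despite the NA.
    assert str(df["m1"].dtype) == "Int64"
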

def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


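# Hedged sketch of how the MultiIndex columns built above are addressed in
# the tests: selecting by the first ("filter") level keeps every column
# under that label, which is what the {"filter": "g"} read parameter maps
# onto. Illustration only; never called.
def _sketchMultiIndexSelection():
    df = _makeMultiIndexDataFrame()
    sub = df.loc[:, ["g"]]
    # All three second-level columns survive under the "g" filter.
    assert list(sub.columns.get_level_values("column")) == ["a", "b", "c"]
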

def _makeSimpleAstropyTable(include_multidim=False, include_masked=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


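# Hedged sketch of the masked columns created above: astropy stores them
# as numpy masked arrays, so the mask is explicit per element rather than
# encoded in a sentinel value. Illustration only; not run by the tests.
def _sketchMaskedAstropyColumns():
    table = _makeSimpleAstropyTable(include_masked=True)
    # Row 1 is masked in every "m*" column; row 0 is not.
    assert bool(table["m1"].mask[1]) and not bool(table["m1"].mask[0])
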

def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)


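# Hedged note on the helper above: because the arrow table is built via
# astropy_to_arrow, column attributes such as units are assumed to ride
# along as arrow schema metadata (the exact key layout is a formatter
# implementation detail, not asserted anywhere else). Illustration only.
def _sketchArrowSchemaMetadata():
    arrow_table = _makeSimpleArrowTable()
    assert arrow_table.schema.metadata is not None
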

@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but relying on it is
        # definitely not recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but relying on it is
        # definitely not recommended.


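# Note on the component reads exercised above and in the classes below:
# for a parquet-backed dataset type named "data", the derived component
# dataset types are "data.columns", "data.rowcount", and "data.schema"
# (composed by DatasetType.componentTypeName), and the formatter can serve
# them from the parquet file metadata without materializing the full
# table. A hedged summary of the observed pattern, not an API guarantee.
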

@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


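# The InMemory* test cases in this module reuse the ParquetFormatter cases
# wholesale: with an in-memory datastore there is no file, so parameter
# handling and component access go through the storage class delegate
# (handleParameters/getComponent) rather than ParquetFormatter, and tests
# that depend on an on-disk parquet file are overridden to no-ops. A
# hedged summary of the pattern, based on the overrides in the subclasses.
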

@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow round trip works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip the per-column unit/description/format checks.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check if two astropy tables have the same columns/values

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check if two numpy tables have the same columns/values

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


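# Hedged note on the empty-table tests above: a zero-row pandas DataFrame
# carries no values from which the fixed-width string ("U10") and byte
# ("a10") types can be recovered, so those columns lose their exact types
# on the pandas leg of the round trip. That is why testEmptyArrowTable
# only compares the numeric columns after the pandas conversion.
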

@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


if __name__ == "__main__":
    unittest.main()