# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

22"""Tests for ParquetFormatter. 

23 

24Tests in this module are disabled unless pandas and pyarrow are importable. 

25""" 

26 

27import os 

28import unittest 

29 

30try: 

31 import pyarrow as pa 

32except ImportError: 

33 pa = None 

34try: 

35 import astropy.table as atable 

36 from astropy import units 

37except ImportError: 

38 atable = None 

39try: 

40 import numpy as np 

41except ImportError: 

42 np = None 

43try: 

44 import pandas as pd 

45except ImportError: 

46 pd = None 

47 

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


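# A note on the masked columns above, offered as a hedged sketch of standard
# pandas semantics rather than package-specific behaviour: assigning None into
# a nullable-integer column yields pd.NA, while a plain float column falls
# back to NaN, e.g.
#
#     m1 = pd.array(np.arange(3), dtype=pd.Int64Dtype())  # Int64; masks as pd.NA
#     m2 = pd.Series(np.arange(3, dtype=np.float32))      # float32; masks as NaN
#
# Several tests below rely on this distinction when round-tripping masked data.

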
def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


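# For orientation: with the MultiIndex above, pandas selects columns either by
# full tuple, e.g. df.loc[:, [("g", "a")]], or by a whole level, e.g.
# df.loc[:, ["g"]]; the "columns" read parameters in the tests below map onto
# these selections.

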
def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)


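# A minimal round-trip sketch (illustrative only; this helper is not called by
# the test suite and assumes astropy and pyarrow are importable): the helpers
# above are exercised through conversions such as astropy -> arrow -> astropy,
# which are expected to preserve values exactly.
def _demoAstropyArrowRoundTrip():
    tab1 = _makeSimpleAstropyTable(include_multidim=True)
    arrow_tab = astropy_to_arrow(tab1)
    tab2 = arrow_to_astropy(arrow_tab)
    # Elementwise comparison, mirroring the equality checks used in the tests.
    assert np.all(tab1 == tab2)

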
@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.


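# The component reads exercised above all follow one pattern; a minimal sketch
# (assuming a butler with the "data" dataset type registered as in setUp):
#
#     df = butler.get("data", dataId={})              # the full dataframe
#     cols = butler.get("data.columns", dataId={})    # just the column index
#     nrow = butler.get("data.rowcount", dataId={})   # just the row count
#     schema = butler.get("data.schema", dataId={})   # just the schema

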
@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.
        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip the comparison of units/descriptions/formats?
        has_bigendian : `bool`
            Does one of the tables contain big-endian columns?
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the type matches, forcing both dtypes
                # to the same (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


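# A minimal illustration of the byte-order normalization used in the equality
# helper above (standard numpy behaviour): forcing both dtypes to a common
# byte order compares kind and item size while ignoring endianness, e.g.
#
#     np.dtype("<f8").newbyteorder(">") == np.dtype(">f8")  # True

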
@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

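    # Note on the column selections above: tab1[["a", "c"]] uses numpy
    # multi-field indexing, which yields a structured array restricted to the
    # named fields (a view of the original array in recent numpy versions).
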
    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            Does one of the tables contain big-endian columns?
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the type matches, forcing both dtypes
                # to the same (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

1412 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1413 def testWriteArrowTableReadAsSingleIndexDataFrame(self): 

1414 df1, allColumns = _makeSingleIndexDataFrame() 

1415 

1416 self.butler.put(df1, self.datasetType, dataId={}) 

1417 

1418 # Read back out as a dataframe. 

1419 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1420 self.assertTrue(df1.equals(df2)) 

1421 

1422 # Read back out as an arrow table, convert to dataframe. 

1423 tab3 = self.butler.get(self.datasetType, dataId={}) 

1424 df3 = arrow_to_pandas(tab3) 

1425 self.assertTrue(df1.equals(df3)) 

1426 

1427 # Check reading the columns. 

1428 columns = df2.reset_index().columns 

1429 columns2 = self.butler.get( 

1430 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1431 ) 

1432 # We check the set because pandas reorders the columns. 

1433 self.assertEqual(set(columns2.to_list()), set(columns.to_list())) 

1434 

1435 # Check reading the schema. 

1436 schema = DataFrameSchema(df1) 

1437 schema2 = self.butler.get( 

1438 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1439 ) 

1440 self.assertEqual(schema2, schema) 

1441 
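# A quick sketch of what the DataFrameSchema comparison above is doing: the
# wrapper captures the dataframe's column layout, and two wrappers built from
# identically-typed frames compare equal (the toy frame is illustrative):

import pandas as pd
from lsst.daf.butler.formatters.parquet import DataFrameSchema

df_a = pd.DataFrame({"x": [1.0], "y": [2]})
assert DataFrameSchema(df_a) == DataFrameSchema(df_a.copy())
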

1442 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1443 def testWriteArrowTableReadAsMultiIndexDataFrame(self): 

1444 df1 = _makeMultiIndexDataFrame() 

1445 

1446 self.butler.put(df1, self.datasetType, dataId={}) 

1447 

1448 # Read back out as a dataframe. 

1449 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1450 self.assertTrue(df1.equals(df2)) 

1451 

1452 # Read back out as an arrow table, convert to dataframe. 

1453 atab3 = self.butler.get(self.datasetType, dataId={}) 

1454 df3 = arrow_to_pandas(atab3) 

1455 self.assertTrue(df1.equals(df3)) 

1456 

1457 # Check reading the columns. 

1458 columns = df2.columns 

1459 columns2 = self.butler.get( 

1460 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1461 ) 

1462 self.assertTrue(columns2.equals(columns)) 

1463 

1464 # Check reading the schema. 

1465 schema = DataFrameSchema(df1) 

1466 schema2 = self.butler.get( 

1467 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1468 ) 

1469 self.assertEqual(schema2, schema) 

1470 

1471 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1472 def testWriteArrowTableReadAsAstropyTable(self): 

1473 tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True) 

1474 

1475 self.butler.put(tab1, self.datasetType, dataId={}) 

1476 

1477 # Read back out as an astropy table. 

1478 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1479 self._checkAstropyTableEquality(tab1, tab2) 

1480 

1481 # Read back out as an arrow table, convert to astropy table. 

1482 atab3 = self.butler.get(self.datasetType, dataId={}) 

1483 tab3 = arrow_to_astropy(atab3) 

1484 self._checkAstropyTableEquality(tab1, tab3) 

1485 

1486 # Check reading the columns. 

1487 columns = list(tab2.columns.keys()) 

1488 columns2 = self.butler.get( 

1489 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1490 ) 

1491 self.assertEqual(columns2, columns) 

1492 

1493 # Check reading the schema. 

1494 schema = ArrowAstropySchema(tab1) 

1495 schema2 = self.butler.get( 

1496 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema" 

1497 ) 

1498 self.assertEqual(schema2, schema) 

1499 

1500 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1501 def testWriteArrowTableReadAsNumpyTable(self): 

1502 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1503 

1504 self.butler.put(tab1, self.datasetType, dataId={}) 

1505 

1506 # Read back out as a numpy table. 

1507 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1508 self._checkNumpyTableEquality(tab1, tab2) 

1509 

1510 # Read back out as an arrow table, convert to numpy table. 

1511 atab3 = self.butler.get(self.datasetType, dataId={}) 

1512 tab3 = arrow_to_numpy(atab3) 

1513 self._checkNumpyTableEquality(tab1, tab3) 

1514 

1515 # Check reading the columns. 

1516 columns = list(tab2.dtype.names) 

1517 columns2 = self.butler.get( 

1518 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1519 ) 

1520 self.assertEqual(columns2, columns) 

1521 

1522 # Check reading the schema. 

1523 schema = ArrowNumpySchema(tab1.dtype) 

1524 schema2 = self.butler.get( 

1525 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema" 

1526 ) 

1527 self.assertEqual(schema2, schema) 

1528 

1529 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1530 def testWriteArrowTableReadAsNumpyDict(self): 

1531 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1532 

1533 self.butler.put(tab1, self.datasetType, dataId={}) 

1534 

1535 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict") 

1536 tab2_numpy = _numpy_dict_to_numpy(tab2) 

1537 self._checkNumpyTableEquality(tab1, tab2_numpy) 

1538 

1539 def _checkAstropyTableEquality(self, table1, table2): 

1540 """Check if two astropy tables have the same columns/values 

1541 

1542 Parameters 

1543 ---------- 

1544 table1 : `astropy.table.Table` 

1545 table2 : `astropy.table.Table` 

1546 """ 

1547 self.assertEqual(table1.dtype, table2.dtype) 

1548 for name in table1.columns: 

1549 self.assertEqual(table1[name].unit, table2[name].unit) 

1550 self.assertEqual(table1[name].description, table2[name].description) 

1551 self.assertEqual(table1[name].format, table2[name].format) 

1552 self.assertTrue(np.all(table1 == table2)) 

1553 

1554 def _checkNumpyTableEquality(self, table1, table2): 

1555 """Check if two numpy tables have the same columns/values 

1556 

1557 Parameters 

1558 ---------- 

1559 table1 : `numpy.ndarray` 

1560 table2 : `numpy.ndarray` 

1561 """ 

1562 self.assertEqual(table1.dtype.names, table2.dtype.names) 

1563 for name in table1.dtype.names: 

1564 self.assertEqual(table1.dtype[name], table2.dtype[name]) 

1565 self.assertTrue(np.all(table1 == table2)) 

1566 

1567 
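# The conversion helpers exercised throughout the class above compose into
# simple round trips. A minimal sketch, with a trivial structured array
# standing in for the richer test fixtures:

import numpy as np
from lsst.daf.butler.formatters.parquet import (
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_pandas,
    numpy_to_arrow,
)

data = np.zeros(3, dtype=[("a", "f8"), ("b", "i4")])
arrow = numpy_to_arrow(data)
assert np.all(arrow_to_numpy(arrow) == data)  # numpy -> arrow -> numpy
astropy_view = arrow_to_astropy(arrow)        # same data as an astropy Table
pandas_view = arrow_to_pandas(arrow)          # same data as a DataFrame
print(len(astropy_view), list(pandas_view.columns))
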

1568@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.") 

1569class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase): 

1570 """Tests for InMemoryDatastore, using ArrowTableDelegate.""" 

1571 

1572 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1573 

1574 def testBadInput(self): 

1575 tab1 = _makeSimpleArrowTable() 

1576 delegate = ArrowTableDelegate("ArrowTable") 

1577 

1578 with self.assertRaises(ValueError): 

1579 delegate.handleParameters(inMemoryDataset="not_an_arrow_table") 

1580 

1581 with self.assertRaises(NotImplementedError): 

1582 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]}) 

1583 

1584 with self.assertRaises(AttributeError): 

1585 delegate.getComponent(composite=tab1, componentName="nothing") 

1586 

1587 def testStorageClass(self): 

1588 tab1 = _makeSimpleArrowTable() 

1589 

1590 factory = StorageClassFactory() 

1591 factory.addFromConfig(StorageClassConfig()) 

1592 

1593 storageClass = factory.findStorageClass(type(tab1), compare_types=False) 

1594 # Force the name lookup to do name matching. 

1595 storageClass._pytype = None 

1596 self.assertEqual(storageClass.name, "ArrowTable") 

1597 

1598 storageClass = factory.findStorageClass(type(tab1), compare_types=True) 

1599 # Force the name lookup to do name matching. 

1600 storageClass._pytype = None 

1601 self.assertEqual(storageClass.name, "ArrowTable") 

1602 

1603 

1604@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.") 

1605@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.") 

1606class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase): 

1607 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store.""" 

1608 

1609 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1610 

1611 def setUp(self): 

1612 """Create a new butler root for each test.""" 

1613 self.root = makeTestTempDir(TESTDIR) 

1614 config = Config(self.configFile) 

1615 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run") 

1616 # No dimensions in the dataset type, so we don't have to worry about 

1617 # inserting dimension data or defining data IDs. 

1618 self.datasetType = DatasetType( 

1619 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions 

1620 ) 

1621 self.butler.registry.registerDatasetType(self.datasetType) 

1622 

1623 def tearDown(self): 

1624 removeTestTempDir(self.root) 

1625 

1626 def testNumpyDict(self): 

1627 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1628 dict1 = _numpy_to_numpy_dict(tab1) 

1629 

1630 self.butler.put(dict1, self.datasetType, dataId={}) 

1631 # Read the whole table. 

1632 dict2 = self.butler.get(self.datasetType, dataId={}) 

1633 self._checkNumpyDictEquality(dict1, dict2) 

1634 # Read the columns. 

1635 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1636 self.assertEqual(len(columns2), len(dict1.keys())) 

1637 for name in dict1: 

1638 self.assertIn(name, columns2) 

1639 # Read the rowcount. 

1640 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1641 self.assertEqual(rowcount, len(dict1["a"])) 

1642 # Read the schema. 

1643 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1644 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype)) 

1645 # Read just some columns a few different ways. 

1646 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1647 subdict = {key: dict1[key] for key in ["a", "c"]} 

1648 self._checkNumpyDictEquality(subdict, tab3) 

1649 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1650 subdict = {key: dict1[key] for key in ["a"]} 

1651 self._checkNumpyDictEquality(subdict, tab4) 

1652 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1653 subdict = {key: dict1[key] for key in ["index", "a"]} 

1654 self._checkNumpyDictEquality(subdict, tab5) 

1655 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1656 subdict = {key: dict1[key] for key in ["ddd"]} 

1657 self._checkNumpyDictEquality(subdict, tab6) 

1658 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1659 subdict = {key: dict1[key] for key in ["a"]} 

1660 self._checkNumpyDictEquality(subdict, tab7) 

1661 # Passing an unrecognized column should be a ValueError. 

1662 with self.assertRaises(ValueError): 

1663 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1664 
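# Conceptually, the private _numpy_to_numpy_dict helper used above just
# splits a structured array into a plain dict of column arrays. A hand-rolled
# equivalent (which mirrors, but is not, the actual helper):

import numpy as np

def structured_to_dict(arr):
    return {name: arr[name] for name in arr.dtype.names}

arr = np.zeros(2, dtype=[("index", "i8"), ("a", "f8")])
print(list(structured_to_dict(arr)))  # ['index', 'a']
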

1665 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.") 

1666 def testWriteNumpyDictReadAsArrowTable(self): 

1667 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1668 dict1 = _numpy_to_numpy_dict(tab1) 

1669 

1670 self.butler.put(dict1, self.datasetType, dataId={}) 

1671 

1672 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable") 

1673 

1674 tab2_dict = arrow_to_numpy_dict(tab2) 

1675 

1676 self._checkNumpyDictEquality(dict1, tab2_dict) 

1677 

1678 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1679 def testWriteNumpyDictReadAsDataFrame(self): 

1680 tab1 = _makeSimpleNumpyTable() 

1681 dict1 = _numpy_to_numpy_dict(tab1) 

1682 

1683 self.butler.put(dict1, self.datasetType, dataId={}) 

1684 

1685 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1686 

1687 # The column order of the dict may not be preserved, so we check 

1688 # column by column. We also compare in dataframe form because pandas 

1689 # changes the datatype of the string column. 

1690 tab1_df = pd.DataFrame(tab1) 

1691 

1692 self.assertEqual(set(tab1_df.columns), set(tab2.columns)) 

1693 for col in tab1_df.columns: 

1694 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values)) 

1695 
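# A tiny illustration of the dtype change mentioned above (the toy array is
# illustrative): pandas stores fixed-width numpy strings as generic object
# columns, so dtypes differ even when the values agree.

import numpy as np
import pandas as pd

arr = np.array([("x",), ("y",)], dtype=[("s", "U4")])
df = pd.DataFrame(arr)
print(arr.dtype["s"], df["s"].dtype)  # <U4 versus object
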

1696 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1697 def testWriteNumpyDictReadAsAstropyTable(self): 

1698 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1699 dict1 = _numpy_to_numpy_dict(tab1) 

1700 

1701 self.butler.put(dict1, self.datasetType, dataId={}) 

1702 

1703 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1704 tab2_dict = _astropy_to_numpy_dict(tab2) 

1705 

1706 self._checkNumpyDictEquality(dict1, tab2_dict) 

1707 

1708 def testWriteNumpyDictReadAsNumpyTable(self): 

1709 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1710 dict1 = _numpy_to_numpy_dict(tab1) 

1711 

1712 self.butler.put(dict1, self.datasetType, dataId={}) 

1713 

1714 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1715 tab2_dict = _numpy_to_numpy_dict(tab2) 

1716 

1717 self._checkNumpyDictEquality(dict1, tab2_dict) 

1718 

1719 def testWriteNumpyDictBad(self): 

1720 dict1 = {"a": 4, "b": np.zeros(1)} 

1721 with self.assertRaises(RuntimeError): 

1722 self.butler.put(dict1, self.datasetType, dataId={}) 

1723 

1724 dict2 = {"a": np.zeros(4), "b": np.zeros(5)} 

1725 with self.assertRaises(RuntimeError): 

1726 self.butler.put(dict2, self.datasetType, dataId={}) 

1727 

1728 dict3 = {"a": [0] * 5, "b": np.zeros(5)} 

1729 with self.assertRaises(RuntimeError): 

1730 self.butler.put(dict3, self.datasetType, dataId={}) 

1731 
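# A hypothetical standalone validator capturing the checks the puts above are
# expected to fail (the function name and the exact error wording are
# illustrative; the real checks live inside the butler's ArrowNumpyDict
# handling):

import numpy as np

def validate_numpy_dict(d):
    lengths = set()
    for name, col in d.items():
        if not isinstance(col, np.ndarray):
            raise RuntimeError(f"Column {name!r} is not a numpy array.")
        lengths.add(len(col))
    if len(lengths) > 1:
        raise RuntimeError("Columns have mismatched lengths.")

validate_numpy_dict({"a": np.zeros(4), "b": np.zeros(4)})  # passes silently
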

1732 def _checkNumpyDictEquality(self, dict1, dict2): 

1733 """Check if two numpy dicts have the same columns/values. 

1734 

1735 Parameters 

1736 ---------- 

1737 dict1 : `dict` [`str`, `np.ndarray`] 

1738 dict2 : `dict` [`str`, `np.ndarray`] 

1739 """ 

1740 self.assertEqual(set(dict1.keys()), set(dict2.keys())) 

1741 for name in dict1: 

1742 self.assertEqual(dict1[name].dtype, dict2[name].dtype) 

1743 self.assertTrue(np.all(dict1[name] == dict2[name])) 

1744 

1745 

1746@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.") 

1747@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.") 

1748class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase): 

1749 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate.""" 

1750 

1751 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1752 

1753 def testWriteNumpyDictBad(self): 

1754 # The sub-type checking is not done by the in-memory datastore. 

1755 pass 

1756 

1757 

1758@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.") 

1759@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.") 

1760class ComputeRowGroupSizeTestCase(unittest.TestCase): 

1761 """Tests for compute_row_group_size.""" 

1762 

1763 def testRowGroupSizeNoMetadata(self): 

1764 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1765 

1766 # We can't use the numpy_to_arrow convenience function because 

1767 # that adds metadata. 

1768 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype) 

1769 schema = pa.schema(type_list) 

1770 arrays = _numpy_style_arrays_to_arrow_arrays( 

1771 numpyTable.dtype, 

1772 len(numpyTable), 

1773 numpyTable, 

1774 schema, 

1775 ) 

1776 arrowTable = pa.Table.from_arrays(arrays, schema=schema) 

1777 

1778 row_group_size = compute_row_group_size(arrowTable.schema) 

1779 

1780 self.assertGreater(row_group_size, 1_000_000) 

1781 self.assertLess(row_group_size, 2_000_000) 

1782 

1783 def testRowGroupSizeWithMetadata(self): 

1784 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1785 

1786 arrowTable = numpy_to_arrow(numpyTable) 

1787 

1788 row_group_size = compute_row_group_size(arrowTable.schema) 

1789 

1790 self.assertGreater(row_group_size, 1_000_000) 

1791 self.assertLess(row_group_size, 2_000_000) 

1792 

1793 def testRowGroupSizeTinyTable(self): 

1794 numpyTable = np.zeros(1, dtype=[("a", np.bool_)]) 

1795 

1796 arrowTable = numpy_to_arrow(numpyTable) 

1797 

1798 row_group_size = compute_row_group_size(arrowTable.schema) 

1799 

1800 self.assertGreater(row_group_size, 1_000_000) 

1801 

1802 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.") 

1803 def testRowGroupSizeDataFrameWithLists(self): 

1804 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10}) 

1805 arrowTable = pandas_to_arrow(df) 

1806 row_group_size = compute_row_group_size(arrowTable.schema) 

1807 

1808 self.assertGreater(row_group_size, 1_000_000) 

1809 

1810 
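# A back-of-the-envelope version of the sizing rule these tests pin down
# (the 1 GB target and per-type byte widths are illustrative assumptions;
# the real compute_row_group_size also handles list columns and metadata,
# as the tests above exercise):

import pyarrow as pa

TARGET_BYTES = 1_000_000_000  # illustrative per-row-group byte target

def naive_row_group_size(schema):
    bytes_per_row = 0
    for field in schema:
        if pa.types.is_primitive(field.type):
            width = max(field.type.bit_width // 8, 1)
        else:
            width = 8  # crude guess for variable-width columns
        bytes_per_row += width
    return max(TARGET_BYTES // bytes_per_row, 1)

print(naive_row_group_size(pa.schema([("a", pa.float64()), ("b", pa.int32())])))
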

1811if __name__ == "__main__": 

1812 unittest.main()