
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are skipped unless the optional dependencies they
exercise (pyarrow, and where noted pandas, numpy, or astropy) are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _astropy_to_numpy_dict,
    _numpy_dict_to_numpy,
    _numpy_dtype_to_arrow_types,
    _numpy_style_arrays_to_arrow_arrays,
    _numpy_to_numpy_dict,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    compute_row_group_size,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
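
# A quick usage sketch for the helper above (illustrative only, not part of
# the test suite; the shapes follow the dtype definitions in the helper):
#
#   data = _makeSimpleNumpyTable(include_multidim=True)
#   data.dtype["d2"].shape  # -> (5, 10), the per-row shape of "d2"
#   data["d2"].shape        # -> (5, 5, 10), i.e. nrow=5 rows of 5x10 blocks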


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
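
# Usage sketch (illustrative; note that "allColumns" includes the "index"
# column, which the helper turns into the DataFrame index):
#
#   df, allColumns = _makeSingleIndexDataFrame()
#   df.index.name          # -> "index"
#   "index" in allColumns  # -> True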


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
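
# Column-selection sketch for the multi-index frame above (illustrative;
# these are standard pandas MultiIndex selections used by the tests below):
#
#   df = _makeMultiIndexDataFrame()
#   df.loc[:, [("g", "a")]]  # select filter "g", column "a"
#   df["r"]                  # all columns for filter "r"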


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
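
# Round-trip sketch (illustrative; astropy column metadata such as units
# travels in the arrow schema metadata, so for the column types used in this
# module the conversion below is expected to be lossless):
#
#   arrow_tab = _makeSimpleArrowTable()
#   astropy_tab = arrow_to_astropy(arrow_tab)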


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))
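
    # (Background sketch for the loop above: parquet list columns are read
    # back as numpy arrays, so an element written as [0, 0] returns as
    # np.array([0, 0]); the two compare equal element-wise, which is what
    # the np.all check relies on.)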

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip. This test simply
        # checks that the data are readable this way, which is definitely not
        # recommended practice.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)
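
    # (Background sketch for the check above: pandas stores strings in
    # generic "object" columns, so a fixed-width numpy "U10" column written
    # via pandas comes back as a plain arrow string rather than its original
    # width; string and binary fields are therefore excluded from the strict
    # type comparison.)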

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip. This test simply
        # checks that the data are readable this way, which is definitely not
        # recommended practice.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip. This test simply
        # checks that the data are readable this way, which is definitely not
        # recommended practice.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip comparison of units, descriptions, and formats.
        has_bigendian : `bool`
            Compare column types only, ignoring byte order.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, normalizing both to
                # big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))
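
# (Byte-order note for the equality helpers: dtype.newbyteorder(">") forces
# big-endian, so two dtypes that differ only in endianness compare equal
# after normalization, e.g.
# np.dtype("<f8").newbyteorder(">") == np.dtype(">f8") is True.)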


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            Compare column types only, ignoring byte order.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, normalizing both to
                # big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")

1509 def testWriteArrowTableReadAsNumpyDict(self): 

1510 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1511 

1512 self.butler.put(tab1, self.datasetType, dataId={}) 

1513 

1514 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict") 

1515 tab2_numpy = _numpy_dict_to_numpy(tab2) 

1516 self._checkNumpyTableEquality(tab1, tab2_numpy) 

1517 

1518 def _checkAstropyTableEquality(self, table1, table2): 

1519        """Check that two astropy tables have the same columns/values.

1520 

1521 Parameters 

1522 ---------- 

1523 table1 : `astropy.table.Table` 

1524 table2 : `astropy.table.Table` 

1525 """ 

1526 self.assertEqual(table1.dtype, table2.dtype) 

1527 for name in table1.columns: 

1528 self.assertEqual(table1[name].unit, table2[name].unit) 

1529 self.assertEqual(table1[name].description, table2[name].description) 

1530 self.assertEqual(table1[name].format, table2[name].format) 

1531 self.assertTrue(np.all(table1 == table2)) 

1532 

1533 def _checkNumpyTableEquality(self, table1, table2): 

1534        """Check that two numpy tables have the same columns/values.

1535 

1536 Parameters 

1537 ---------- 

1538 table1 : `numpy.ndarray` 

1539 table2 : `numpy.ndarray` 

1540 """ 

1541 self.assertEqual(table1.dtype.names, table2.dtype.names) 

1542 for name in table1.dtype.names: 

1543 self.assertEqual(table1.dtype[name], table2.dtype[name]) 

1544 self.assertTrue(np.all(table1 == table2)) 

1545 

1546 

1547@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.") 

1548class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase): 

1549 """Tests for InMemoryDatastore, using ArrowTableDelegate.""" 

1550 

1551 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1552 

1553 def testBadInput(self): 

1554 tab1 = _makeSimpleArrowTable() 

1555 delegate = ArrowTableDelegate("ArrowTable") 

1556 

1557 with self.assertRaises(ValueError): 

1558 delegate.handleParameters(inMemoryDataset="not_an_arrow_table") 

1559 

1560 with self.assertRaises(NotImplementedError): 

1561 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]}) 

1562 

1563 with self.assertRaises(AttributeError): 

1564 delegate.getComponent(composite=tab1, componentName="nothing") 

1565 

1566 def testStorageClass(self): 

1567 tab1 = _makeSimpleArrowTable() 

1568 

1569 factory = StorageClassFactory() 

1570 factory.addFromConfig(StorageClassConfig()) 

1571 
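        # findStorageClass should map pa.Table to the ArrowTable storage

        # class whether or not the lookup compares types directly.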

1572 storageClass = factory.findStorageClass(type(tab1), compare_types=False) 

1573        # Force the lookup to fall back to matching by name.

1574 storageClass._pytype = None 

1575 self.assertEqual(storageClass.name, "ArrowTable") 

1576 

1577 storageClass = factory.findStorageClass(type(tab1), compare_types=True) 

1578        # Force the lookup to fall back to matching by name.

1579 storageClass._pytype = None 

1580 self.assertEqual(storageClass.name, "ArrowTable") 

1581 

1582 

1583@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")

1584@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")

1585class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase): 

1586 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store.""" 

1587 

1588 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1589 

1590 def setUp(self): 

1591 """Create a new butler root for each test.""" 

1592 self.root = makeTestTempDir(TESTDIR) 

1593 config = Config(self.configFile) 

1594 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run") 

1595 # No dimensions in dataset type so we don't have to worry about 

1596 # inserting dimension data or defining data IDs. 

1597 self.datasetType = DatasetType( 

1598 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions 

1599 ) 

1600 self.butler.registry.registerDatasetType(self.datasetType) 

1601 

1602 def tearDown(self): 

1603 removeTestTempDir(self.root) 

1604 
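    # An "ArrowNumpyDict" is a plain dict mapping column name to a numpy

    # array, with all arrays the same length. A minimal example of the

    # layout (matching _makeSimpleNumpyTable's "index" and "a" columns):

    #

    #     {"index": np.zeros(5, dtype="i4"), "a": np.zeros(5)}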

1605 def testNumpyDict(self): 

1606 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1607 dict1 = _numpy_to_numpy_dict(tab1) 

1608 

1609 self.butler.put(dict1, self.datasetType, dataId={}) 

1610 # Read the whole table. 

1611 dict2 = self.butler.get(self.datasetType, dataId={}) 

1612 self._checkNumpyDictEquality(dict1, dict2) 

1613 # Read the columns. 

1614 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1615 self.assertEqual(len(columns2), len(dict1.keys())) 

1616        for name in dict1.keys():

1617 self.assertIn(name, columns2) 

1618 # Read the rowcount. 

1619 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1620 self.assertEqual(rowcount, len(dict1["a"])) 

1621 # Read the schema. 

1622 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1623 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype)) 

1624 # Read just some columns a few different ways. 

1625 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1626 subdict = {key: dict1[key] for key in ["a", "c"]} 

1627 self._checkNumpyDictEquality(subdict, tab3) 

1628 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1629 subdict = {key: dict1[key] for key in ["a"]} 

1630 self._checkNumpyDictEquality(subdict, tab4) 

1631 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1632 subdict = {key: dict1[key] for key in ["index", "a"]} 

1633 self._checkNumpyDictEquality(subdict, tab5) 

1634 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1635 subdict = {key: dict1[key] for key in ["ddd"]} 

1636 self._checkNumpyDictEquality(subdict, tab6) 
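
        # A repeated column name should be deduplicated on read.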

1637 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1638 subdict = {key: dict1[key] for key in ["a"]} 

1639 self._checkNumpyDictEquality(subdict, tab7) 

1640 # Passing an unrecognized column should be a ValueError. 

1641 with self.assertRaises(ValueError): 

1642 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1643 

1644 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.") 

1645 def testWriteNumpyDictReadAsArrowTable(self): 

1646 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1647 dict1 = _numpy_to_numpy_dict(tab1) 

1648 

1649 self.butler.put(dict1, self.datasetType, dataId={}) 

1650 

1651 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable") 

1652 

1653 tab2_dict = arrow_to_numpy_dict(tab2) 

1654 

1655 self._checkNumpyDictEquality(dict1, tab2_dict) 

1656 

1657 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1658 def testWriteNumpyDictReadAsDataFrame(self): 

1659 tab1 = _makeSimpleNumpyTable() 

1660 dict1 = _numpy_to_numpy_dict(tab1) 

1661 

1662 self.butler.put(dict1, self.datasetType, dataId={}) 

1663 

1664 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1665 

1666        # Dict key order is not guaranteed to survive the round trip, so

1667        # check column by column, and in dataframe form because pandas

1668        # changes the datatype of the string column.

1669 tab1_df = pd.DataFrame(tab1) 

1670 

1671 self.assertEqual(set(tab1_df.columns), set(tab2.columns)) 

1672 for col in tab1_df.columns: 

1673 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values)) 

1674 

1675 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1676 def testWriteNumpyDictReadAsAstropyTable(self): 

1677 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1678 dict1 = _numpy_to_numpy_dict(tab1) 

1679 

1680 self.butler.put(dict1, self.datasetType, dataId={}) 

1681 

1682 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1683 tab2_dict = _astropy_to_numpy_dict(tab2) 

1684 

1685 self._checkNumpyDictEquality(dict1, tab2_dict) 

1686 

1687 def testWriteNumpyDictReadAsNumpyTable(self): 

1688 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1689 dict1 = _numpy_to_numpy_dict(tab1) 

1690 

1691 self.butler.put(dict1, self.datasetType, dataId={}) 

1692 

1693 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1694 tab2_dict = _numpy_to_numpy_dict(tab2) 

1695 

1696 self._checkNumpyDictEquality(dict1, tab2_dict) 

1697 

1698 def testWriteNumpyDictBad(self): 
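
        # A scalar value in place of a column array should be rejected.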

1699        dict1 = {"a": 4, "b": np.zeros(1)}

1700 with self.assertRaises(RuntimeError): 

1701 self.butler.put(dict1, self.datasetType, dataId={}) 

1702 
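        # Columns of mismatched length should be rejected.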

1703 dict2 = {"a": np.zeros(4), "b": np.zeros(5)} 

1704 with self.assertRaises(RuntimeError): 

1705 self.butler.put(dict2, self.datasetType, dataId={}) 

1706 
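        # A plain list in place of a numpy array should be rejected.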

1707 dict3 = {"a": [0] * 5, "b": np.zeros(5)} 

1708 with self.assertRaises(RuntimeError): 

1709 self.butler.put(dict3, self.datasetType, dataId={}) 

1710 

1711 def _checkNumpyDictEquality(self, dict1, dict2): 

1712        """Check that two numpy dicts have the same columns/values.

1713 

1714 Parameters 

1715 ---------- 

1716 dict1 : `dict` [`str`, `np.ndarray`] 

1717 dict2 : `dict` [`str`, `np.ndarray`] 

1718 """ 

1719 self.assertEqual(set(dict1.keys()), set(dict2.keys())) 

1720 for name in dict1.keys(): 

1721 self.assertEqual(dict1[name].dtype, dict2[name].dtype) 

1722 self.assertTrue(np.all(dict1[name] == dict2[name])) 

1723 

1724 

1725@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")

1726@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")

1727class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase): 

1728 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate.""" 

1729 

1730 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1731 

1732 def testWriteNumpyDictBad(self): 

1733 # The sub-type checking is not done on in-memory datastore. 

1734 pass 

1735 

1736 
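# compute_row_group_size chooses how many rows go into each Parquet row

# group by targeting a fixed number of bytes per group. A minimal sketch

# of the idea, not the daf_butler implementation (the 1 GB target and the

# per-type byte estimates are assumptions):

#

#     def approx_row_group_size(schema: pa.Schema, target_bytes: int = 1_000_000_000) -> int:

#         bytes_per_row = 0

#         for field in schema:

#             try:

#                 bytes_per_row += field.type.bit_width // 8

#             except ValueError:

#                 # Variable-width type (string, binary, list).

#                 bytes_per_row += 10

#         return max(1, target_bytes // bytes_per_row)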

1737@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.") 

1738@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.") 

1739class ComputeRowGroupSizeTestCase(unittest.TestCase): 

1740 """Tests for compute_row_group_size.""" 

1741 

1742 def testRowGroupSizeNoMetadata(self): 

1743 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1744 

1745 # We can't use the numpy_to_arrow convenience function because 

1746 # that adds metadata. 

1747 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype) 

1748 schema = pa.schema(type_list) 

1749 arrays = _numpy_style_arrays_to_arrow_arrays( 

1750 numpyTable.dtype, 

1751 len(numpyTable), 

1752 numpyTable, 

1753 schema, 

1754 ) 

1755 arrowTable = pa.Table.from_arrays(arrays, schema=schema) 

1756 

1757 row_group_size = compute_row_group_size(arrowTable.schema) 

1758 

1759 self.assertGreater(row_group_size, 1_000_000) 

1760 self.assertLess(row_group_size, 2_000_000) 

1761 

1762 def testRowGroupSizeWithMetadata(self): 

1763 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1764 

1765 arrowTable = numpy_to_arrow(numpyTable) 

1766 

1767 row_group_size = compute_row_group_size(arrowTable.schema) 

1768 

1769 self.assertGreater(row_group_size, 1_000_000) 

1770 self.assertLess(row_group_size, 2_000_000) 

1771 

1772 def testRowGroupSizeTinyTable(self): 

1773 numpyTable = np.zeros(1, dtype=[("a", np.bool_)]) 

1774 

1775 arrowTable = numpy_to_arrow(numpyTable) 

1776 

1777 row_group_size = compute_row_group_size(arrowTable.schema) 

1778 

1779 self.assertGreater(row_group_size, 1_000_000) 

1780 

1781 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.") 

1782 def testRowGroupSizeDataFrameWithLists(self): 
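
        # List columns have no fixed byte width; the size estimate still

        # needs to produce a usable (large) row-group size for them.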

1783 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10}) 

1784 arrowTable = pandas_to_arrow(df) 

1785 row_group_size = compute_row_group_size(arrowTable.schema) 

1786 

1787 self.assertGreater(row_group_size, 1_000_000) 

1788 

1789 

1790if __name__ == "__main__": 

1791 unittest.main()