# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _astropy_to_numpy_dict,
    _numpy_dict_to_numpy,
    _numpy_dtype_to_arrow_types,
    _numpy_style_arrays_to_arrow_arrays,
    _numpy_to_numpy_dict,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    compute_row_group_size,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))
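
# A quick orientation to the storage classes exercised in this module:
# "DataFrame" corresponds to pandas.DataFrame, "ArrowAstropy" to
# astropy.table.Table, "ArrowNumpy" to a numpy structured ndarray,
# "ArrowNumpyDict" to a dict of column name -> numpy array, and "ArrowTable"
# to pyarrow.Table. The round-trip tests below convert between these forms
# with the arrow_to_* / *_to_arrow helpers imported above.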


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
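
# Note on the multi-dimensional columns: they are stored flattened, with the
# original cell shape recorded in the arrow schema metadata (see
# _append_numpy_multidim_metadata and testEmptyArrowTableMultidim below) so
# that it can be restored on read. A sketch of the round trip:
#
#     data = _makeSimpleNumpyTable(include_multidim=True)
#     arrow_tab = numpy_to_arrow(data)    # shapes recorded in metadata
#     back = arrow_to_numpy(arrow_tab)
#     assert back["d2"].shape == (5, 5, 10)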


def _makeSingleIndexDataFrame(include_masked=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
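
# A note on the masked columns above: each one exercises a different pandas
# missing-value mechanism. Assigning None into the pd.Int64Dtype() column
# stores a true <NA> (nullable extension type); in the float32 column it
# becomes NaN; and in the object-dtype string column it is stored as None
# itself. This is why the masked round-trip tests below compare notnull()
# masks first and then the unmasked values element by element.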


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
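
# For reference, the resulting frame has two column levels named "filter"
# and "column", so the tests can select by either level; e.g. (standard
# pandas behavior):
#
#     df["g"]                              # all columns under filter "g"
#     df[("r", "b")]                       # a single column, as a Series
#     df.loc[:, [("g", "a"), ("r", "c")]]  # explicit (filter, column) pairs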


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table
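
# A note on the masked columns above: numpy masked arrays carry an explicit
# boolean mask, which astropy keeps as masked columns and which naturally maps
# to arrow/parquet nulls on write (this is what the masked round-trip tests
# below rely on). A minimal sketch:
#
#     col = np.ma.masked_array(data=np.arange(3), mask=[False, True, False])
#     col.mask.sum()   # 1 element masked; stored as a null on write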


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run="testLegacyDataFrame")
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but relying on this
        # conversion is definitely not recommended.
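        # Roughly what happens here: the two-level pandas column index is
        # stored in parquet as flattened column names plus pandas-specific
        # metadata, which astropy does not interpret, so the result is a flat
        # table rather than a reconstructed MultiIndex.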

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but relying on this
        # conversion is definitely not recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but relying on this
        # conversion is definitely not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run="testAstropyParquet")
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # This astropy <-> arrow round trip works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        skip_units : `bool`
        has_bigendian : `bool`
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the type matches; force both sides to
                # little-endian for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder("<"), table2.dtype[name].newbyteorder("<"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))
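
        # A note on the byte-order juggling above: the on-disk format (via
        # arrow) is little-endian, so a table written with big-endian columns
        # comes back little-endian. Forcing both dtypes to a common byte
        # order with newbyteorder() compares the fundamental type while
        # ignoring endianness, e.g.:
        #
        #     np.dtype(">f8").newbyteorder("<") == np.dtype("<f8")  # True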


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)
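
        # The dict form is simply {column name: numpy array}; a rough sketch
        # of what the round trip above produces:
        #
        #     tab1_dict["a"]          # 1-d float64 array, length 5
        #     tab1_dict["d2"].shape   # (5, 5, 10): the multidim shape is
        #                             # preserved via schema metadata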

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        has_bigendian : `bool`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the type matches; force both sides to
                # little-endian for the comparison.
                self.assertEqual(table1.dtype[name].newbyteorder("<"), table2.dtype[name].newbyteorder("<"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)
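
        # Roughly why only a column subset is compared above: with zero rows
        # pandas has no values from which to infer a string dtype, so the
        # string/byte columns come back as generic object columns and their
        # original arrow types (string/binary) are not recovered.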

1339 

1340 def testEmptyArrowTableMultidim(self): 

1341 data = _makeSimpleNumpyTable(include_multidim=True) 

1342 type_list = _numpy_dtype_to_arrow_types(data.dtype) 

1343 

1344 md = {} 

1345 for name in data.dtype.names: 

1346 _append_numpy_multidim_metadata(md, name, data.dtype[name]) 

1347 

1348 schema = pa.schema(type_list, metadata=md) 

1349 arrays = [[]] * len(schema.names) 

1350 

1351 tab1 = pa.Table.from_arrays(arrays, schema=schema) 

1352 

1353 self.butler.put(tab1, self.datasetType, dataId={}) 

1354 tab2 = self.butler.get(self.datasetType, dataId={}) 

1355 self.assertEqual(tab2, tab1) 

1356 

1357 tab1_numpy = arrow_to_numpy(tab1) 

1358 self.assertEqual(len(tab1_numpy), 0) 

1359 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy) 

1360 self.assertEqual(tab1_numpy_arrow, tab1) 

1361 

1362 tab1_astropy = arrow_to_astropy(tab1) 

1363 self.assertEqual(len(tab1_astropy), 0) 

1364 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy) 

1365 self.assertEqual(tab1_astropy_arrow, tab1) 

1366 

1367 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1368 def testWriteArrowTableReadAsSingleIndexDataFrame(self): 

1369 df1, allColumns = _makeSingleIndexDataFrame() 

1370 

1371 self.butler.put(df1, self.datasetType, dataId={}) 

1372 

1373 # Read back out as a dataframe. 

1374 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1375 self.assertTrue(df1.equals(df2)) 

1376 

1377 # Read back out as an arrow table, convert to dataframe. 

1378 tab3 = self.butler.get(self.datasetType, dataId={}) 

1379 df3 = arrow_to_pandas(tab3) 

1380 self.assertTrue(df1.equals(df3)) 

1381 

1382 # Check reading the columns. 

1383 columns = df2.reset_index().columns 

1384 columns2 = self.butler.get( 

1385 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1386 ) 

1387 # We check the set because pandas reorders the columns. 

1388 self.assertEqual(set(columns2.to_list()), set(columns.to_list())) 

1389 

1390 # Check reading the schema. 

1391 schema = DataFrameSchema(df1) 

1392 schema2 = self.butler.get( 

1393 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1394 ) 

1395 self.assertEqual(schema2, schema) 

1396 

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
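        # _numpy_dict_to_numpy repacks the column dict into a structured
        # array; conceptually something like this sketch (assuming 1-d
        # columns of equal length; not the actual implementation):
        #
        #     dtype = np.dtype([(k, v.dtype) for k, v in tab2.items()])
        #     packed = np.zeros(len(next(iter(tab2.values()))), dtype=dtype)
        #     for k, v in tab2.items():
        #         packed[k] = v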

        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check that two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
        table2 : `astropy.table.Table`
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check that two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
        table2 : `numpy.ndarray`
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})
        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1.keys()))
        for name in dict1:
            self.assertIn(name, columns2)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)
        # Duplicate column requests should be de-duplicated.
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")
        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The order of the dict may get mixed up, so we need to check column
        # by column. We also need to do this in dataframe form because pandas
        # changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictBad(self):
        # A scalar is not a valid column.
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})

        # Mismatched column lengths are not valid.
        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})

        # A plain list is not a valid column.
        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check that two numpy dicts have the same columns/values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
        dict2 : `dict` [`str`, `np.ndarray`]
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1.keys():
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))


@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # The sub-type checking is not done by the in-memory datastore.
        pass


@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
class ComputeRowGroupSizeTestCase(unittest.TestCase):
    """Tests for compute_row_group_size."""

    def testRowGroupSizeNoMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        # We can't use the numpy_to_arrow convenience function because
        # that adds metadata.
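        # (The metadata in question is the per-column information that
        # numpy_to_arrow attaches, such as the multidimensional shapes
        # recorded by _append_numpy_multidim_metadata; this test wants a
        # schema with no metadata at all.)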

        type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
        schema = pa.schema(type_list)
        arrays = _numpy_style_arrays_to_arrow_arrays(
            numpyTable.dtype,
            len(numpyTable),
            numpyTable,
            schema,
        )
        arrowTable = pa.Table.from_arrays(arrays, schema=schema)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)
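        # (The bounds are loose sanity checks; compute_row_group_size
        # presumably targets a fixed byte budget per row group, so a wider
        # schema yields fewer rows per group, and this small schema lands
        # in the one-to-two-million row range.)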

    def testRowGroupSizeWithMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeTinyTable(self):
        numpyTable = np.zeros(1, dtype=[("a", np.bool_)])

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)


if __name__ == "__main__":
    unittest.main()