Coverage for tests/test_parquet.py: 17% of 955 statements

coverage.py v7.2.5, created at 2023-05-03 09:15 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

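# The optional dependencies are imported defensively: the names set to None
# here are exactly what the ``skipUnless`` decorators below check, so the
# module still imports cleanly (and the affected tests are skipped) when a
# dependency is missing.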
try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _astropy_to_numpy_dict,
    _numpy_dict_to_numpy,
    _numpy_dtype_to_arrow_types,
    _numpy_style_arrays_to_arrow_arrays,
    _numpy_to_numpy_dict,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    compute_row_group_size,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

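    # Note that d3 is filled from a Fortran-ordered array, which also
    # exercises conversion of non-C-contiguous input.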
    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

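    # Build the masked columns from pandas extension arrays so that setting
    # a row to None below marks those entries as missing.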
    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

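        # Ingest the externally written file so that it can be read back
        # through the butler with the ParquetFormatter.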
        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run="testLegacyDataFrame")
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck, it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck, it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck, it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
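        """Test ingesting a parquet file written directly by astropy and
        reading it back alongside a butler-written copy, including all the
        components.
        """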

        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run="testAstropyParquet")
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # This astropy <-> arrow works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip the comparison of units, descriptions, and formats?
        has_bigendian : `bool`
            Do the tables contain big-endian columns?
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the base type matches, normalizing both
                # sides to big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            Do the tables contain big-endian columns?
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the base type matches, normalizing both
                # sides to big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

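        # Zero-length arrays combined with an explicit schema give an empty
        # table that still carries the full column type information.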
        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

1503 ) 

1504 self.assertEqual(schema2, schema) 

1505 

1506 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1507 def testWriteArrowTableReadAsNumpyDict(self): 

1508 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1509 

1510 self.butler.put(tab1, self.datasetType, dataId={}) 

1511 

1512 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict") 

1513 tab2_numpy = _numpy_dict_to_numpy(tab2) 

1514 self._checkNumpyTableEquality(tab1, tab2_numpy) 

1515 

1516 def _checkAstropyTableEquality(self, table1, table2): 

1517 """Check if two astropy tables have the same columns/values 

1518 

1519 Parameters 

1520 ---------- 

1521 table1 : `astropy.table.Table` 

1522 table2 : `astropy.table.Table` 

1523 """ 

1524 self.assertEqual(table1.dtype, table2.dtype) 

1525 for name in table1.columns: 

1526 self.assertEqual(table1[name].unit, table2[name].unit) 

1527 self.assertEqual(table1[name].description, table2[name].description) 

1528 self.assertEqual(table1[name].format, table2[name].format) 

1529 self.assertTrue(np.all(table1 == table2)) 

1530 

1531 def _checkNumpyTableEquality(self, table1, table2): 

1532 """Check if two numpy tables have the same columns/values 

1533 

1534 Parameters 

1535 ---------- 

1536 table1 : `numpy.ndarray` 

1537 table2 : `numpy.ndarray` 

1538 """ 

1539 self.assertEqual(table1.dtype.names, table2.dtype.names) 

1540 for name in table1.dtype.names: 

1541 self.assertEqual(table1.dtype[name], table2.dtype[name]) 

1542 self.assertTrue(np.all(table1 == table2)) 

1543 

1544 

1545@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.") 

1546class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase): 

1547 """Tests for InMemoryDatastore, using ArrowTableDelegate.""" 

1548 

1549 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1550 

1551 def testBadInput(self): 

1552 tab1 = _makeSimpleArrowTable() 

1553 delegate = ArrowTableDelegate("ArrowTable") 

1554 

1555 with self.assertRaises(ValueError): 

1556 delegate.handleParameters(inMemoryDataset="not_an_arrow_table") 

1557 

1558 with self.assertRaises(NotImplementedError): 

1559 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]}) 

1560 

1561 with self.assertRaises(AttributeError): 

1562 delegate.getComponent(composite=tab1, componentName="nothing") 

1563 

1564 def testStorageClass(self): 

1565 tab1 = _makeSimpleArrowTable() 

1566 

1567 factory = StorageClassFactory() 

1568 factory.addFromConfig(StorageClassConfig()) 

1569 

1570 storageClass = factory.findStorageClass(type(tab1), compare_types=False) 

1571 # Force the name lookup to do name matching. 

1572 storageClass._pytype = None 

1573 self.assertEqual(storageClass.name, "ArrowTable") 

1574 

1575 storageClass = factory.findStorageClass(type(tab1), compare_types=True) 

1576 # Force the name lookup to do name matching. 

1577 storageClass._pytype = None 

1578 self.assertEqual(storageClass.name, "ArrowTable") 

1579 

1580 

1581@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.") 

1582@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.") 

1583class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase): 

1584 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store.""" 

1585 

1586 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1587 

1588 def setUp(self): 

1589 """Create a new butler root for each test.""" 

1590 self.root = makeTestTempDir(TESTDIR) 

1591 config = Config(self.configFile) 

1592 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run") 

1593 # No dimensions in dataset type so we don't have to worry about 

1594 # inserting dimension data or defining data IDs. 

1595 self.datasetType = DatasetType( 

1596 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.registry.dimensions 

1597 ) 

1598 self.butler.registry.registerDatasetType(self.datasetType) 

1599 

1600 def tearDown(self): 

1601 removeTestTempDir(self.root) 

1602 

1603 def testNumpyDict(self): 

1604 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1605 dict1 = _numpy_to_numpy_dict(tab1) 

1606 

1607 self.butler.put(dict1, self.datasetType, dataId={}) 

1608 # Read the whole table. 

1609 dict2 = self.butler.get(self.datasetType, dataId={}) 

1610 self._checkNumpyDictEquality(dict1, dict2) 

1611 # Read the columns. 

1612 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1613 self.assertEqual(len(columns2), len(dict1.keys())) 

1614 for name in dict1.keys(): 

1615 self.assertIn(name, columns2) 

1616 # Read the rowcount. 

1617 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1618 self.assertEqual(rowcount, len(dict1["a"])) 

1619 # Read the schema. 

1620 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1621 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype)) 

1622 # Read just some columns a few different ways. 

1623 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1624 subdict = {key: dict1[key] for key in ["a", "c"]} 

1625 self._checkNumpyDictEquality(subdict, tab3) 

1626 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1627 subdict = {key: dict1[key] for key in ["a"]} 

1628 self._checkNumpyDictEquality(subdict, tab4) 

1629 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1630 subdict = {key: dict1[key] for key in ["index", "a"]} 

1631 self._checkNumpyDictEquality(subdict, tab5) 

1632 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1633 subdict = {key: dict1[key] for key in ["ddd"]} 

1634 self._checkNumpyDictEquality(subdict, tab6) 

1635 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1636 subdict = {key: dict1[key] for key in ["a"]} 

1637 self._checkNumpyDictEquality(subdict, tab7) 

1638 # Passing an unrecognized column should be a ValueError. 

1639 with self.assertRaises(ValueError): 

1640 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1641 
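# Editor's note: a hedged sketch of the "ArrowNumpyDict" in-memory form
# exercised above: a plain dict mapping column name to numpy arrays of
# equal length. Values and the method name are illustrative only; dtypes
# mirror the simple table helper used throughout this module.
def _demoNumpyDictForm(self):
    return {
        "index": np.arange(5, dtype="i4"),
        "a": np.zeros(5, dtype="f8"),
        "strcol": np.full(5, "text", dtype="U10"),
    }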

1642 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.") 

1643 def testWriteNumpyDictReadAsArrowTable(self): 

1644 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1645 dict1 = _numpy_to_numpy_dict(tab1) 

1646 

1647 self.butler.put(dict1, self.datasetType, dataId={}) 

1648 

1649 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable") 

1650 

1651 tab2_dict = arrow_to_numpy_dict(tab2) 

1652 

1653 self._checkNumpyDictEquality(dict1, tab2_dict) 

1654 

1655 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1656 def testWriteNumpyDictReadAsDataFrame(self): 

1657 tab1 = _makeSimpleNumpyTable() 

1658 dict1 = _numpy_to_numpy_dict(tab1) 

1659 

1660 self.butler.put(dict1, self.datasetType, dataId={}) 

1661 

1662 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1663 

1664 # The order of the dict may get mixed up, so we need to check column 

1665 # by column. We also need to do this in dataframe form because pandas 

1666 # changes the datatype of the string column. 

1667 tab1_df = pd.DataFrame(tab1) 

1668 

1669 self.assertEqual(set(tab1_df.columns), set(tab2.columns)) 

1670 for col in tab1_df.columns: 

1671 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values)) 

1672 
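# Editor's note: a hedged sketch of the dtype change noted above.
# Fixed-width numpy unicode columns become dtype=object when wrapped in a
# DataFrame, which is why the comparison above is done value-by-value in
# dataframe form rather than on dtypes. The method name is hypothetical,
# for illustration only.
def _demoPandasStringDtype(self):
    df = pd.DataFrame({"strcol": np.full(3, "text", dtype="U10")})
    return df["strcol"].dtype  # object, not "<U10".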

1673 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1674 def testWriteNumpyDictReadAsAstropyTable(self): 

1675 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1676 dict1 = _numpy_to_numpy_dict(tab1) 

1677 

1678 self.butler.put(dict1, self.datasetType, dataId={}) 

1679 

1680 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1681 tab2_dict = _astropy_to_numpy_dict(tab2) 

1682 

1683 self._checkNumpyDictEquality(dict1, tab2_dict) 

1684 

1685 def testWriteNumpyDictReadAsNumpyTable(self): 

1686 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1687 dict1 = _numpy_to_numpy_dict(tab1) 

1688 

1689 self.butler.put(dict1, self.datasetType, dataId={}) 

1690 

1691 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1692 tab2_dict = _numpy_to_numpy_dict(tab2) 

1693 

1694 self._checkNumpyDictEquality(dict1, tab2_dict) 

1695 

1696 def testWriteNumpyDictBad(self): 

1697 dict1 = {"a": 4, "b": np.zeros(1)}  # "a" is a scalar, not an array. 

1698 with self.assertRaises(RuntimeError): 

1699 self.butler.put(dict1, self.datasetType, dataId={}) 

1700 

1701 dict2 = {"a": np.zeros(4), "b": np.zeros(5)}  # Mismatched column lengths. 

1702 with self.assertRaises(RuntimeError): 

1703 self.butler.put(dict2, self.datasetType, dataId={}) 

1704 

1705 dict3 = {"a": [0] * 5, "b": np.zeros(5)}  # "a" is a plain list, not a numpy array. 

1706 with self.assertRaises(RuntimeError): 

1707 self.butler.put(dict3, self.datasetType, dataId={}) 

1708 

1709 def _checkNumpyDictEquality(self, dict1, dict2): 

1710 """Check if two numpy dicts have the same columns/values. 

1711 

1712 Parameters 

1713 ---------- 

1714 dict1 : `dict` [`str`, `numpy.ndarray`] 

1715 dict2 : `dict` [`str`, `numpy.ndarray`] 

1716 """ 

1717 self.assertEqual(set(dict1.keys()), set(dict2.keys())) 

1718 for name in dict1.keys(): 

1719 self.assertEqual(dict1[name].dtype, dict2[name].dtype) 

1720 self.assertTrue(np.all(dict1[name] == dict2[name])) 

1721 

1722 

1723@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.") 

1724@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.") 

1725class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase): 

1726 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate.""" 

1727 

1728 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1729 

1730 def testWriteNumpyDictBad(self): 

1731 # The sub-type checking is not done by the in-memory datastore. 

1732 pass 

1733 

1734 

1735@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.") 

1736@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.") 

1737class ComputeRowGroupSizeTestCase(unittest.TestCase): 

1738 """Tests for compute_row_group_size.""" 

1739 

1740 def testRowGroupSizeNoMetadata(self): 

1741 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1742 

1743 # We can't use the numpy_to_arrow convenience function because 

1744 # it adds metadata. 

1745 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype) 

1746 schema = pa.schema(type_list) 

1747 arrays = _numpy_style_arrays_to_arrow_arrays( 

1748 numpyTable.dtype, 

1749 len(numpyTable), 

1750 numpyTable, 

1751 schema, 

1752 ) 

1753 arrowTable = pa.Table.from_arrays(arrays, schema=schema) 

1754 

1755 row_group_size = compute_row_group_size(arrowTable.schema) 

1756 

1757 self.assertGreater(row_group_size, 1_000_000) 

1758 self.assertLess(row_group_size, 2_000_000) 

1759 
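# Editor's note: a hedged sketch of how a row group size computed this way
# is typically consumed; whether ParquetFormatter passes it exactly like
# this is not shown in this file. The method name is hypothetical, for
# illustration only.
def _demoWriteWithRowGroupSize(self, table, path):
    import pyarrow.parquet as pq

    # write_table caps each row group at row_group_size rows.
    pq.write_table(table, path, row_group_size=compute_row_group_size(table.schema))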

1760 def testRowGroupSizeWithMetadata(self): 

1761 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1762 

1763 arrowTable = numpy_to_arrow(numpyTable) 

1764 

1765 row_group_size = compute_row_group_size(arrowTable.schema) 

1766 

1767 self.assertGreater(row_group_size, 1_000_000) 

1768 self.assertLess(row_group_size, 2_000_000) 

1769 

1770 def testRowGroupSizeTinyTable(self): 

1771 numpyTable = np.zeros(1, dtype=[("a", np.bool_)]) 

1772 

1773 arrowTable = numpy_to_arrow(numpyTable) 

1774 

1775 row_group_size = compute_row_group_size(arrowTable.schema) 

1776 

1777 self.assertGreater(row_group_size, 1_000_000) 

1778 

1779 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.") 

1780 def testRowGroupSizeDataFrameWithLists(self): 

1781 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10}) 

1782 arrowTable = pandas_to_arrow(df) 

1783 row_group_size = compute_row_group_size(arrowTable.schema) 

1784 

1785 self.assertGreater(row_group_size, 1_000_000) 

1786 

1787 

1788if __name__ == "__main__": 

1789 unittest.main()