# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest
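# The optional dependencies below are imported defensively: when an import
# fails, the corresponding module handle is set to None so that the
# ``unittest.skipUnless`` decorators later in this file can disable the tests
# that need it.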

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data

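# A minimal usage sketch for ``_makeSimpleNumpyTable`` (values are random, so
# only the shape and dtype are deterministic):
#
#     data = _makeSimpleNumpyTable(include_multidim=True)
#     assert len(data) == 5
#     assert data["d2"].shape == (5, 5, 10)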

def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df

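# ``_makeMultiIndexDataFrame`` yields a frame with a two-level column index,
# roughly laid out as:
#
#     filter     g              r
#     column     a    b    c    a    b    c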

def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["a"].description = "Description of column a"
    table["b"].unit = units.meter
    table["b"].description = "Description of column b"

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        # We set the masked columns with the underlying sentinel value
        # to be able to test them after serialization.
        arr = np.arange(nrow, dtype="i8")
        arr[mask] = -1
        table["m1"] = np.ma.masked_array(data=arr, mask=mask, fill_value=-1)
        arr = np.arange(nrow, dtype="f4")
        arr[mask] = np.nan
        table["m2"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        table["m3"] = np.arange(nrow, dtype="f4")
        table["m3"][mask] = np.nan
        arr = np.zeros(nrow, dtype=np.bool_)
        arr[mask] = True
        table["m4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=True)
        arr = np.arange(nrow, dtype="u4")
        arr[mask] = 0
        table["m5"] = np.ma.masked_array(data=arr, mask=mask, fill_value=0)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask, fill_value="")
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask, fill_value=b"")

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)

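# Note that ``_makeSimpleArrowTable`` converts via ``astropy_to_arrow``, so the
# units and descriptions set above presumably travel as Arrow schema metadata
# rather than as data columns.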

@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but the conversion is
        # definitely not recommended.
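        # (A pandas MultiIndex has no direct astropy analog, so the column
        # structure is presumably flattened rather than restored on read.)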

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but the conversion is
        # definitely not recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but the conversion is
        # definitely not recommended.

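# The in-memory datastore test cases below subclass the file-datastore test
# cases above, rerunning the same tests against a different butler
# configuration; tests that require files on disk are overridden as no-ops.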

@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

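    # ``testAstropyParquet`` below mirrors ``testLegacyDataFrame`` above: it
    # writes a parquet file with astropy's own writer (presumably without the
    # extra metadata the formatter writes) and ingests it, checking that the
    # formatter can still read it and its components.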

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip comparison of units, descriptions, and formats.
        has_bigendian : `bool`
            Allow for big-endian columns in either table.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, forcing both to the same
                # (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
                # We need to check masked/regular columns after filling.
                has_masked = False
                if isinstance(table1[name], atable.column.MaskedColumn):
                    c1 = table1[name].filled()
                    has_masked = True
                else:
                    c1 = np.array(table1[name])
                if isinstance(table2[name], atable.column.MaskedColumn):
                    c2 = table2[name].filled()
                    has_masked = True
                else:
                    c2 = np.array(table2[name])
                np.testing.assert_array_equal(c1, c2)
                # If we have a masked column then we test the underlying data.
                if has_masked:
                    np.testing.assert_array_equal(np.array(c1), np.array(c2))

@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            Allow for big-endian columns in either table.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, forcing both to the same
                # (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))

@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # We convert to numpy so we can use its testing framework, which
        # handles nan comparisons.
        self.assertEqual(tab1.schema, tab2.schema)
        tab1_np = arrow_to_numpy(tab1)
        tab2_np = arrow_to_numpy(tab2)
        for col in tab1.column_names:
            np.testing.assert_array_equal(tab2_np[col], tab1_np[col])
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

1435 

1436 def testEmptyArrowTableMultidim(self): 

1437 data = _makeSimpleNumpyTable(include_multidim=True) 

1438 type_list = _numpy_dtype_to_arrow_types(data.dtype) 

1439 
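# _append_numpy_multidim_metadata presumably records each multidim
# column's original shape in the schema metadata, so the reader can
# reshape the flattened Parquet list columns on the way back in.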

1440 md = {} 

1441 for name in data.dtype.names: 

1442 _append_numpy_multidim_metadata(md, name, data.dtype[name]) 

1443 

1444 schema = pa.schema(type_list, metadata=md) 

1445 arrays = [[]] * len(schema.names) 

1446 

1447 tab1 = pa.Table.from_arrays(arrays, schema=schema) 

1448 

1449 self.butler.put(tab1, self.datasetType, dataId={}) 

1450 tab2 = self.butler.get(self.datasetType, dataId={}) 

1451 self.assertEqual(tab2, tab1) 

1452 

1453 tab1_numpy = arrow_to_numpy(tab1) 

1454 self.assertEqual(len(tab1_numpy), 0) 

1455 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy) 

1456 self.assertEqual(tab1_numpy_arrow, tab1) 

1457 

1458 tab1_astropy = arrow_to_astropy(tab1) 

1459 self.assertEqual(len(tab1_astropy), 0) 

1460 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy) 

1461 self.assertEqual(tab1_astropy_arrow, tab1) 

1462 

1463 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1464 def testWriteArrowTableReadAsSingleIndexDataFrame(self): 

1465 df1, allColumns = _makeSingleIndexDataFrame() 

1466 

1467 self.butler.put(df1, self.datasetType, dataId={}) 

1468 

1469 # Read back out as a dataframe. 

1470 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1471 self.assertTrue(df1.equals(df2)) 

1472 

1473 # Read back out as an arrow table, convert to dataframe. 

1474 tab3 = self.butler.get(self.datasetType, dataId={}) 

1475 df3 = arrow_to_pandas(tab3) 

1476 self.assertTrue(df1.equals(df3)) 

1477 

1478 # Check reading the columns. 

1479 columns = df2.reset_index().columns 

1480 columns2 = self.butler.get( 

1481 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1482 ) 

1483 # We check the set because pandas reorders the columns. 

1484 self.assertEqual(set(columns2.to_list()), set(columns.to_list())) 

1485 

1486 # Check reading the schema. 

1487 schema = DataFrameSchema(df1) 

1488 schema2 = self.butler.get( 

1489 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1490 ) 

1491 self.assertEqual(schema2, schema) 

1492 

1493 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1494 def testWriteArrowTableReadAsMultiIndexDataFrame(self): 

1495 df1 = _makeMultiIndexDataFrame() 

1496 

1497 self.butler.put(df1, self.datasetType, dataId={}) 

1498 

1499 # Read back out as a dataframe. 

1500 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1501 self.assertTrue(df1.equals(df2)) 

1502 

1503 # Read back out as an arrow table, convert to dataframe. 

1504 atab3 = self.butler.get(self.datasetType, dataId={}) 

1505 df3 = arrow_to_pandas(atab3) 

1506 self.assertTrue(df1.equals(df3)) 

1507 

1508 # Check reading the columns. 

1509 columns = df2.columns 

1510 columns2 = self.butler.get( 

1511 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1512 ) 

1513 self.assertTrue(columns2.equals(columns)) 

1514 

1515 # Check reading the schema. 

1516 schema = DataFrameSchema(df1) 

1517 schema2 = self.butler.get( 

1518 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1519 ) 

1520 self.assertEqual(schema2, schema) 

1521 

1522 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1523 def testWriteArrowTableReadAsAstropyTable(self): 

1524 tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True) 

1525 

1526 self.butler.put(tab1, self.datasetType, dataId={}) 

1527 

1528 # Read back out as an astropy table. 

1529 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1530 self._checkAstropyTableEquality(tab1, tab2) 

1531 

1532 # Read back out as an arrow table, convert to astropy table. 

1533 atab3 = self.butler.get(self.datasetType, dataId={}) 

1534 tab3 = arrow_to_astropy(atab3) 

1535 self._checkAstropyTableEquality(tab1, tab3) 

1536 

1537 # Check reading the columns. 

1538 columns = list(tab2.columns.keys()) 

1539 columns2 = self.butler.get( 

1540 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1541 ) 

1542 self.assertEqual(columns2, columns) 

1543 

1544 # Check reading the schema. 

1545 schema = ArrowAstropySchema(tab1) 

1546 schema2 = self.butler.get( 

1547 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema" 

1548 ) 

1549 self.assertEqual(schema2, schema) 

1550 

1551 # Check the schema conversions and units. 

1552 arrow_schema = schema.to_arrow_schema() 
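# Arrow stores field metadata as bytes keyed by bytes, so the
# description/unit values have to be decoded before comparing them
# against the astropy schema.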

1553 for name in arrow_schema.names: 

1554 field_metadata = arrow_schema.field(name).metadata 

1555 if ( 

1556 b"description" in field_metadata 

1557 and (description := field_metadata[b"description"].decode("UTF-8")) != "" 

1558 ): 

1559 self.assertEqual(schema2.schema[name].description, description) 

1560 else: 

1561 self.assertIsNone(schema2.schema[name].description) 

1562 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "": 

1563 self.assertEqual(schema2.schema[name].unit, units.Unit(unit)) 

1564 

1565 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1566 def testWriteArrowTableReadAsNumpyTable(self): 

1567 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1568 

1569 self.butler.put(tab1, self.datasetType, dataId={}) 

1570 

1571 # Read back out as a numpy table. 

1572 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1573 self._checkNumpyTableEquality(tab1, tab2) 

1574 

1575 # Read back out as an arrow table, convert to numpy table. 

1576 atab3 = self.butler.get(self.datasetType, dataId={}) 

1577 tab3 = arrow_to_numpy(atab3) 

1578 self._checkNumpyTableEquality(tab1, tab3) 

1579 

1580 # Check reading the columns. 

1581 columns = list(tab2.dtype.names) 

1582 columns2 = self.butler.get( 

1583 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1584 ) 

1585 self.assertEqual(columns2, columns) 

1586 

1587 # Check reading the schema. 

1588 schema = ArrowNumpySchema(tab1.dtype) 

1589 schema2 = self.butler.get( 

1590 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema" 

1591 ) 

1592 self.assertEqual(schema2, schema) 

1593 

1594 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1595 def testWriteArrowTableReadAsNumpyDict(self): 

1596 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1597 

1598 self.butler.put(tab1, self.datasetType, dataId={}) 

1599 

1600 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict") 

1601 tab2_numpy = _numpy_dict_to_numpy(tab2) 

1602 self._checkNumpyTableEquality(tab1, tab2_numpy) 

1603 

1604 def _checkAstropyTableEquality(self, table1, table2): 

1605 """Check if two astropy tables have the same columns/values 

1606 

1607 Parameters 

1608 ---------- 

1609 table1 : `astropy.table.Table` 

1610 table2 : `astropy.table.Table` 

1611 """ 

1612 self.assertEqual(table1.dtype, table2.dtype) 

1613 for name in table1.columns: 

1614 self.assertEqual(table1[name].unit, table2[name].unit) 

1615 self.assertEqual(table1[name].description, table2[name].description) 

1616 self.assertEqual(table1[name].format, table2[name].format) 

1617 # We need to check masked/regular columns after filling. 

1618 has_masked = False 

1619 if isinstance(table1[name], atable.column.MaskedColumn): 

1620 c1 = table1[name].filled() 

1621 has_masked = True 

1622 else: 

1623 c1 = np.array(table1[name]) 

1624 if isinstance(table2[name], atable.column.MaskedColumn): 

1625 c2 = table2[name].filled() 

1626 has_masked = True 

1627 else: 

1628 c2 = np.array(table2[name]) 

1629 np.testing.assert_array_equal(c1, c2) 

1630 # If we have a masked column then we test the underlying data. 

1631 if has_masked: 

1632 np.testing.assert_array_equal(np.array(c1), np.array(c2)) 

1633 

1634 def _checkNumpyTableEquality(self, table1, table2): 

1635 """Check if two numpy tables have the same columns/values 

1636 

1637 Parameters 

1638 ---------- 

1639 table1 : `numpy.ndarray` 

1640 table2 : `numpy.ndarray` 

1641 """ 

1642 self.assertEqual(table1.dtype.names, table2.dtype.names) 

1643 for name in table1.dtype.names: 

1644 self.assertEqual(table1.dtype[name], table2.dtype[name]) 

1645 self.assertTrue(np.all(table1 == table2)) 

1646 

1647 

1648@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.") 

1649class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase): 

1650 """Tests for InMemoryDatastore, using ArrowTableDelegate.""" 

1651 

1652 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1653 

1654 def testBadInput(self): 

1655 tab1 = _makeSimpleArrowTable() 

1656 delegate = ArrowTableDelegate("ArrowTable") 

1657 

1658 with self.assertRaises(ValueError): 

1659 delegate.handleParameters(inMemoryDataset="not_an_arrow_table") 

1660 

1661 with self.assertRaises(NotImplementedError): 

1662 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]}) 

1663 

1664 with self.assertRaises(AttributeError): 

1665 delegate.getComponent(composite=tab1, componentName="nothing") 

1666 

1667 def testStorageClass(self): 

1668 tab1 = _makeSimpleArrowTable() 

1669 

1670 factory = StorageClassFactory() 

1671 factory.addFromConfig(StorageClassConfig()) 

1672 

1673 storageClass = factory.findStorageClass(type(tab1), compare_types=False) 

1674 # Force the name lookup to do name matching. 

1675 storageClass._pytype = None 

1676 self.assertEqual(storageClass.name, "ArrowTable") 

1677 

1678 storageClass = factory.findStorageClass(type(tab1), compare_types=True) 

1679 # Force the name lookup to do name matching. 

1680 storageClass._pytype = None 

1681 self.assertEqual(storageClass.name, "ArrowTable") 

1682 

1683 

1684@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.") 

1685@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.") 

1686class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase): 

1687 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store.""" 

1688 

1689 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1690 

1691 def setUp(self): 

1692 """Create a new butler root for each test.""" 

1693 self.root = makeTestTempDir(TESTDIR) 

1694 config = Config(self.configFile) 

1695 self.butler = Butler.from_config( 

1696 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run" 

1697 ) 

1698 # No dimensions in dataset type so we don't have to worry about 

1699 # inserting dimension data or defining data IDs. 

1700 self.datasetType = DatasetType( 

1701 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions 

1702 ) 

1703 self.butler.registry.registerDatasetType(self.datasetType) 

1704 

1705 def tearDown(self): 

1706 removeTestTempDir(self.root) 

1707 

1708 def testNumpyDict(self): 

1709 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1710 dict1 = _numpy_to_numpy_dict(tab1) 

1711 

1712 self.butler.put(dict1, self.datasetType, dataId={}) 

1713 # Read the whole table. 

1714 dict2 = self.butler.get(self.datasetType, dataId={}) 

1715 self._checkNumpyDictEquality(dict1, dict2) 

1716 # Read the columns. 

1717 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1718 self.assertEqual(len(columns2), len(dict1.keys())) 

1719 for name in dict1: 

1720 self.assertIn(name, columns2) 

1721 # Read the rowcount. 

1722 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1723 self.assertEqual(rowcount, len(dict1["a"])) 

1724 # Read the schema. 

1725 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1726 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype)) 

1727 # Read just some columns a few different ways. 

1728 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1729 subdict = {key: dict1[key] for key in ["a", "c"]} 

1730 self._checkNumpyDictEquality(subdict, tab3) 

1731 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1732 subdict = {key: dict1[key] for key in ["a"]} 

1733 self._checkNumpyDictEquality(subdict, tab4) 

1734 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1735 subdict = {key: dict1[key] for key in ["index", "a"]} 

1736 self._checkNumpyDictEquality(subdict, tab5) 

1737 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1738 subdict = {key: dict1[key] for key in ["ddd"]} 

1739 self._checkNumpyDictEquality(subdict, tab6) 

1740 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1741 subdict = {key: dict1[key] for key in ["a"]} 

1742 self._checkNumpyDictEquality(subdict, tab7) 

1743 # Passing an unrecognized column should be a ValueError. 

1744 with self.assertRaises(ValueError): 

1745 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1746 

1747 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.") 

1748 def testWriteNumpyDictReadAsArrowTable(self): 

1749 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1750 dict1 = _numpy_to_numpy_dict(tab1) 

1751 

1752 self.butler.put(dict1, self.datasetType, dataId={}) 

1753 

1754 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable") 

1755 

1756 tab2_dict = arrow_to_numpy_dict(tab2) 

1757 

1758 self._checkNumpyDictEquality(dict1, tab2_dict) 

1759 

1760 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1761 def testWriteNumpyDictReadAsDataFrame(self): 

1762 tab1 = _makeSimpleNumpyTable() 

1763 dict1 = _numpy_to_numpy_dict(tab1) 

1764 

1765 self.butler.put(dict1, self.datasetType, dataId={}) 

1766 

1767 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1768 

1769 # The order of the dict may get mixed up, so we need to check column 

1770 # by column. We also need to do this in dataframe form because pandas 

1771 # changes the datatype of the string column. 

1772 tab1_df = pd.DataFrame(tab1) 

1773 

1774 self.assertEqual(set(tab1_df.columns), set(tab2.columns)) 

1775 for col in tab1_df.columns: 

1776 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values)) 

1777 

1778 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1779 def testWriteNumpyDictReadAsAstropyTable(self): 

1780 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1781 dict1 = _numpy_to_numpy_dict(tab1) 

1782 

1783 self.butler.put(dict1, self.datasetType, dataId={}) 

1784 

1785 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1786 tab2_dict = _astropy_to_numpy_dict(tab2) 

1787 

1788 self._checkNumpyDictEquality(dict1, tab2_dict) 

1789 

1790 def testWriteNumpyDictReadAsNumpyTable(self): 

1791 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1792 dict1 = _numpy_to_numpy_dict(tab1) 

1793 

1794 self.butler.put(dict1, self.datasetType, dataId={}) 

1795 

1796 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1797 tab2_dict = _numpy_to_numpy_dict(tab2) 

1798 

1799 self._checkNumpyDictEquality(dict1, tab2_dict) 

1800 

1801 def testWriteNumpyDictBad(self): 
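# A scalar value is not a valid column array.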

1802 dict1 = {"a": 4, "b": np.zeros(1)} 

1803 with self.assertRaises(RuntimeError): 

1804 self.butler.put(dict1, self.datasetType, dataId={}) 

1805 
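# Columns of mismatched length cannot form a table.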

1806 dict2 = {"a": np.zeros(4), "b": np.zeros(5)} 

1807 with self.assertRaises(RuntimeError): 

1808 self.butler.put(dict2, self.datasetType, dataId={}) 

1809 
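# A plain Python list is not a numpy array column.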

1810 dict3 = {"a": [0] * 5, "b": np.zeros(5)} 

1811 with self.assertRaises(RuntimeError): 

1812 self.butler.put(dict3, self.datasetType, dataId={}) 

1813 

1814 def _checkNumpyDictEquality(self, dict1, dict2): 

1815 """Check if two numpy dicts have the same columns/values. 

1816 

1817 Parameters 

1818 ---------- 

1819 dict1 : `dict` [`str`, `np.ndarray`] 

1820 dict2 : `dict` [`str`, `np.ndarray`] 

1821 """ 

1822 self.assertEqual(set(dict1.keys()), set(dict2.keys())) 

1823 for name in dict1: 

1824 self.assertEqual(dict1[name].dtype, dict2[name].dtype) 

1825 self.assertTrue(np.all(dict1[name] == dict2[name])) 

1826 

1827 

1828@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.") 

1829@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.") 

1830class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase): 

1831 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate.""" 

1832 

1833 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1834 

1835 def testWriteNumpyDictBad(self): 

1836 # The sub-type checking is not done with the in-memory datastore. 

1837 pass 

1838 

1839 

1840@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.") 

1841class ParquetFormatterArrowSchemaTestCase(unittest.TestCase): 

1842 """Tests for ParquetFormatter, ArrowSchema, using local file datastore.""" 

1843 

1844 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1845 

1846 def setUp(self): 

1847 """Create a new butler root for each test.""" 

1848 self.root = makeTestTempDir(TESTDIR) 

1849 config = Config(self.configFile) 

1850 self.butler = Butler.from_config( 

1851 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run" 

1852 ) 

1853 # No dimensions in dataset type so we don't have to worry about 

1854 # inserting dimension data or defining data IDs. 

1855 self.datasetType = DatasetType( 

1856 "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions 

1857 ) 

1858 self.butler.registry.registerDatasetType(self.datasetType) 

1859 

1860 def tearDown(self): 

1861 removeTestTempDir(self.root) 

1862 

1863 def _makeTestSchema(self): 

1864 schema = pa.schema( 

1865 [ 

1866 pa.field( 

1867 "int32", 

1868 pa.int32(), 

1869 nullable=False, 

1870 metadata={ 

1871 "description": "32-bit integer", 

1872 "unit": "", 

1873 }, 

1874 ), 

1875 pa.field( 

1876 "int64", 

1877 pa.int64(), 

1878 nullable=False, 

1879 metadata={ 

1880 "description": "64-bit integer", 

1881 "unit": "", 

1882 }, 

1883 ), 

1884 pa.field( 

1885 "uint64", 

1886 pa.uint64(), 

1887 nullable=False, 

1888 metadata={ 

1889 "description": "64-bit unsigned integer", 

1890 "unit": "", 

1891 }, 

1892 ), 

1893 pa.field( 

1894 "float32", 

1895 pa.float32(), 

1896 nullable=False, 

1897 metadata={ 

1898 "description": "32-bit float", 

1899 "unit": "count", 

1900 }, 

1901 ), 

1902 pa.field( 

1903 "float64", 

1904 pa.float64(), 

1905 nullable=False, 

1906 metadata={ 

1907 "description": "64-bit float", 

1908 "unit": "nJy", 

1909 }, 

1910 ), 

1911 pa.field( 

1912 "fixed_size_list", 

1913 pa.list_(pa.float64(), list_size=10), 

1914 nullable=False, 

1915 metadata={ 

1916 "description": "Fixed size list of 64-bit floats.", 

1917 "unit": "nJy", 

1918 }, 

1919 ), 

1920 pa.field( 

1921 "variable_size_list", 

1922 pa.list_(pa.float64()), 

1923 nullable=False, 

1924 metadata={ 

1925 "description": "Variable size list of 64-bit floats.", 

1926 "unit": "nJy", 

1927 }, 

1928 ), 

1929 # One of these fields will have no description. 

1930 pa.field( 

1931 "string", 

1932 pa.string(), 

1933 nullable=False, 

1934 metadata={ 

1935 "unit": "", 

1936 }, 

1937 ), 

1938 # One of these fields will have no metadata. 

1939 pa.field( 

1940 "binary", 

1941 pa.binary(), 

1942 nullable=False, 

1943 ), 

1944 ] 

1945 ) 

1946 

1947 return schema 

1948 

1949 def testArrowSchema(self): 

1950 schema1 = self._makeTestSchema() 

1951 self.butler.put(schema1, self.datasetType, dataId={}) 

1952 

1953 schema2 = self.butler.get(self.datasetType, dataId={}) 

1954 self.assertEqual(schema2, schema1) 

1955 

1956 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.") 

1957 def testWriteArrowSchemaReadAsDataFrameSchema(self): 

1958 schema1 = self._makeTestSchema() 

1959 self.butler.put(schema1, self.datasetType, dataId={}) 

1960 

1961 df_schema1 = DataFrameSchema.from_arrow(schema1) 

1962 

1963 df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema") 

1964 self.assertEqual(df_schema2, df_schema1) 

1965 

1966 @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.") 

1967 def testWriteArrowSchemaReadAsArrowAstropySchema(self): 

1968 schema1 = self._makeTestSchema() 

1969 self.butler.put(schema1, self.datasetType, dataId={}) 

1970 

1971 ap_schema1 = ArrowAstropySchema.from_arrow(schema1) 

1972 

1973 ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema") 

1974 self.assertEqual(ap_schema2, ap_schema1) 

1975 

1976 # Confirm that the ap_schema2 has the unit/description we expect. 

1977 for name in schema1.names: 

1978 field_metadata = schema1.field(name).metadata 

1979 if field_metadata is None: 

1980 continue 

1981 if ( 

1982 b"description" in field_metadata 

1983 and (description := field_metadata[b"description"].decode("UTF-8")) != "" 

1984 ): 

1985 self.assertEqual(ap_schema2.schema[name].description, description) 

1986 else: 

1987 self.assertIsNone(ap_schema2.schema[name].description) 

1988 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "": 

1989 self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit)) 

1990 

1991 @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.") 

1992 def testWriteArrowSchemaReadAsArrowNumpySchema(self): 

1993 schema1 = self._makeTestSchema() 

1994 self.butler.put(schema1, self.datasetType, dataId={}) 

1995 

1996 np_schema1 = ArrowNumpySchema.from_arrow(schema1) 

1997 

1998 np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema") 

1999 self.assertEqual(np_schema2, np_schema1) 

2000 

2001 

2002@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.") 

2003class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase): 

2004 """Tests for InMemoryDatastore and ArrowSchema.""" 

2005 

2006 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

2007 

2008 

2009@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.") 

2010@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.") 

2011class ComputeRowGroupSizeTestCase(unittest.TestCase): 

2012 """Tests for compute_row_group_size.""" 

2013 
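# compute_row_group_size appears to target a fixed row-group size in
# bytes, dividing by an estimate of the bytes per row implied by the
# schema. A rough sketch of that idea (an assumption about the
# implementation; TARGET_BYTES and the per-type estimate below are
# illustrative, not the library's actual values):
#
#     per_row = sum(
#         t.bit_width // 8 if pa.types.is_primitive(t) else 8
#         for t in schema.types
#     )
#     n_rows = TARGET_BYTES // max(per_row, 1)
#
# The tests below therefore only bound the result instead of pinning
# an exact row count.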

2014 def testRowGroupSizeNoMetadata(self): 

2015 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

2016 

2017 # We can't use the numpy_to_arrow convenience function because 

2018 # that adds metadata. 

2019 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype) 

2020 schema = pa.schema(type_list) 

2021 arrays = _numpy_style_arrays_to_arrow_arrays( 

2022 numpyTable.dtype, 

2023 len(numpyTable), 

2024 numpyTable, 

2025 schema, 

2026 ) 

2027 arrowTable = pa.Table.from_arrays(arrays, schema=schema) 

2028 

2029 row_group_size = compute_row_group_size(arrowTable.schema) 

2030 

2031 self.assertGreater(row_group_size, 1_000_000) 

2032 self.assertLess(row_group_size, 2_000_000) 

2033 

2034 def testRowGroupSizeWithMetadata(self): 

2035 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

2036 

2037 arrowTable = numpy_to_arrow(numpyTable) 

2038 

2039 row_group_size = compute_row_group_size(arrowTable.schema) 

2040 

2041 self.assertGreater(row_group_size, 1_000_000) 

2042 self.assertLess(row_group_size, 2_000_000) 

2043 

2044 def testRowGroupSizeTinyTable(self): 

2045 numpyTable = np.zeros(1, dtype=[("a", np.bool_)]) 

2046 

2047 arrowTable = numpy_to_arrow(numpyTable) 

2048 

2049 row_group_size = compute_row_group_size(arrowTable.schema) 

2050 

2051 self.assertGreater(row_group_size, 1_000_000) 

2052 

2053 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.") 

2054 def testRowGroupSizeDataFrameWithLists(self): 

2055 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10}) 

2056 arrowTable = pandas_to_arrow(df) 

2057 row_group_size = compute_row_group_size(arrowTable.schema) 

2058 

2059 self.assertGreater(row_group_size, 1_000_000) 

2060 

2061 

2062if __name__ == "__main__": 

2063 unittest.main()