Coverage for tests/test_parquet.py: 23%

1039 statements (coverage.py v7.3.2, created at 2023-12-05 11:07 +0000)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest
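
# Optional dependencies are imported inside try/except guards; on failure the
# module handle is set to None so that the @unittest.skipUnless decorators
# below can disable just the affected tests. The pattern, in miniature:
#
#     try:
#         import pyarrow as pa
#     except ImportError:
#         pa = None
#
#     @unittest.skipUnless(pa is not None, "Cannot test ... without pyarrow.")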

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        # Store d3 in Fortran (column-major) order to exercise conversion of
        # non-C-contiguous arrays.
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
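
# A quick sketch of what the helper above produces (assuming numpy is
# importable):
#
#     data = _makeSimpleNumpyTable()
#     data.dtype.names  # ("index", "a", "b", "c", "ddd", "f", "strcol", "bytecol")
#     len(data)         # 5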


def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `~pandas.Index`
        Index of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        # Nullable extension dtypes and a string object column give pandas
        # several distinct flavors of "masked" column to round-trip.
        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
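
# Note on the masked columns above: ``m1`` uses the pandas nullable Int64
# extension dtype, so the entry blanked out with ``df.loc[1, ...] = None``
# reads back as ``pd.NA`` while the column stays integer-typed (assumed
# pandas >= 1.0 behavior); the masked-column tests below rely on this.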


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
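
# The two-level (filter, column) header built above is what the multi-index
# tests slice into, either directly, e.g. ``df.loc[:, [("g", "a")]]``, or
# through the butler, e.g. ``parameters={"columns": {"filter": "g"}}``.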


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units and column descriptions.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["a"].description = "Description of column a"
    table["b"].unit = units.meter
    table["b"].description = "Description of column b"

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table
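
# The units and descriptions attached above are carried in the file's schema
# metadata, which is what lets _checkAstropyTableEquality assert below that
# ``unit`` and ``description`` survive a put/get round trip.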


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
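
# Because the arrow table is built by converting the astropy helper, its
# schema embeds the same units/description metadata; the ArrowTable tests
# below can therefore compare stored and in-memory schemas directly.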


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions via the "columns" component,
        # which the formatter can serve without reading the full table.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Override the storage class on read to convert the stored DataFrame
        # to an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")
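
    # A hedged sketch of the lookup exercised above (storage class names come
    # from the daf_butler storage class config):
    #
    #     factory.findStorageClass(pd.DataFrame).name  # -> "DataFrame"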


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities: renaming a column, or changing a unit,
        # description, or format should each break equality.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        """Test ingesting a parquet file written natively by astropy and
        reading it back alongside one written by the butler.
        """
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip checking column units, descriptions, and formats.
        has_bigendian : `bool`
            Expect big-endian columns, and compare only base types.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the base type matches, by forcing both
                # dtypes to an explicit (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))
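
    # Byte-order note for the helper above: ``newbyteorder(">")`` forces both
    # dtypes to the same explicit byte order, so the assertion compares only
    # the underlying type and ignores any endianness change introduced by the
    # parquet round trip.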


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality: renaming a column should break equality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            Expect big-endian columns, and compare only base types.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the base type matches, by forcing both
                # dtypes to an explicit (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

        # Check the schema conversions and units.
        arrow_schema = schema.to_arrow_schema()
        for name in arrow_schema.names:

1519 field_metadata = arrow_schema.field(name).metadata 

1520 if ( 

1521 b"description" in field_metadata 

1522 and (description := field_metadata[b"description"].decode("UTF-8")) != "" 

1523 ): 

1524 self.assertEqual(schema2.schema[name].description, description) 

1525 else: 

1526 self.assertIsNone(schema2.schema[name].description) 

1527 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "": 

1528 self.assertEqual(schema2.schema[name].unit, units.Unit(unit)) 

1529 

1530 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1531 def testWriteArrowTableReadAsNumpyTable(self): 

1532 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1533 

1534 self.butler.put(tab1, self.datasetType, dataId={}) 

1535 

1536 # Read back out as a numpy table. 

1537 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1538 self._checkNumpyTableEquality(tab1, tab2) 

1539 

1540 # Read back out as an arrow table, convert to numpy table. 

1541 atab3 = self.butler.get(self.datasetType, dataId={}) 

1542 tab3 = arrow_to_numpy(atab3) 

1543 self._checkNumpyTableEquality(tab1, tab3) 

1544 

1545 # Check reading the columns. 

1546 columns = list(tab2.dtype.names) 

1547 columns2 = self.butler.get( 

1548 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1549 ) 

1550 self.assertEqual(columns2, columns) 

1551 

1552 # Check reading the schema. 

1553 schema = ArrowNumpySchema(tab1.dtype) 

1554 schema2 = self.butler.get( 

1555 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema" 

1556 ) 

1557 self.assertEqual(schema2, schema) 

1558 

1559 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1560 def testWriteArrowTableReadAsNumpyDict(self): 

1561 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1562 

1563 self.butler.put(tab1, self.datasetType, dataId={}) 

1564 

1565 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict") 

1566 tab2_numpy = _numpy_dict_to_numpy(tab2) 

1567 self._checkNumpyTableEquality(tab1, tab2_numpy) 

1568 

1569 def _checkAstropyTableEquality(self, table1, table2): 

1570 """Check if two astropy tables have the same columns/values 

1571 

1572 Parameters 

1573 ---------- 

1574 table1 : `astropy.table.Table` 

1575 table2 : `astropy.table.Table` 

1576 """ 

1577 self.assertEqual(table1.dtype, table2.dtype) 

1578 for name in table1.columns: 

1579 self.assertEqual(table1[name].unit, table2[name].unit) 

1580 self.assertEqual(table1[name].description, table2[name].description) 

1581 self.assertEqual(table1[name].format, table2[name].format) 

1582 self.assertTrue(np.all(table1 == table2)) 

1583 

1584 def _checkNumpyTableEquality(self, table1, table2): 

1585 """Check if two numpy tables have the same columns/values 

1586 

1587 Parameters 

1588 ---------- 

1589 table1 : `numpy.ndarray` 

1590 table2 : `numpy.ndarray` 

1591 """ 

1592 self.assertEqual(table1.dtype.names, table2.dtype.names) 

1593 for name in table1.dtype.names: 

1594 self.assertEqual(table1.dtype[name], table2.dtype[name]) 

1595 self.assertTrue(np.all(table1 == table2)) 

1596 

1597 

1598@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.") 

1599class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase): 

1600 """Tests for InMemoryDatastore, using ArrowTableDelegate.""" 

1601 

1602 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1603 

1604 def testBadInput(self): 

1605 tab1 = _makeSimpleArrowTable() 

1606 delegate = ArrowTableDelegate("ArrowTable") 

1607 

1608 with self.assertRaises(ValueError): 

1609 delegate.handleParameters(inMemoryDataset="not_an_arrow_table") 

1610 

1611 with self.assertRaises(NotImplementedError): 

1612 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]}) 

1613 

1614 with self.assertRaises(AttributeError): 

1615 delegate.getComponent(composite=tab1, componentName="nothing") 

1616 

1617 def testStorageClass(self): 

1618 tab1 = _makeSimpleArrowTable() 

1619 

1620 factory = StorageClassFactory() 

1621 factory.addFromConfig(StorageClassConfig()) 

1622 

1623 storageClass = factory.findStorageClass(type(tab1), compare_types=False) 

1624 # Force the name lookup to do name matching. 

1625 storageClass._pytype = None 

1626 self.assertEqual(storageClass.name, "ArrowTable") 

1627 

1628 storageClass = factory.findStorageClass(type(tab1), compare_types=True) 

1629 # Force the name lookup to do name matching. 

1630 storageClass._pytype = None 

1631 self.assertEqual(storageClass.name, "ArrowTable") 

1632 

1633 

1634@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.") 

1635@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.") 

1636class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase): 

1637 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store.""" 

1638 

1639 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1640 

1641 def setUp(self): 

1642 """Create a new butler root for each test.""" 

1643 self.root = makeTestTempDir(TESTDIR) 

1644 config = Config(self.configFile) 

1645 self.butler = Butler.from_config( 

1646 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run" 

1647 ) 

1648 # No dimensions in dataset type so we don't have to worry about 

1649 # inserting dimension data or defining data IDs. 

1650 self.datasetType = DatasetType( 

1651 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions 

1652 ) 

1653 self.butler.registry.registerDatasetType(self.datasetType) 

1654 

1655 def tearDown(self): 

1656 removeTestTempDir(self.root) 

1657 

1658 def testNumpyDict(self): 

1659 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1660 dict1 = _numpy_to_numpy_dict(tab1) 

1661 

1662 self.butler.put(dict1, self.datasetType, dataId={}) 

1663 # Read the whole table. 

1664 dict2 = self.butler.get(self.datasetType, dataId={}) 

1665 self._checkNumpyDictEquality(dict1, dict2) 

1666 # Read the columns. 

1667 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1668 self.assertEqual(len(columns2), len(dict1.keys())) 

1669 for name in dict1: 

1670 self.assertIn(name, columns2) 

1671 # Read the rowcount. 

1672 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1673 self.assertEqual(rowcount, len(dict1["a"])) 

1674 # Read the schema. 

1675 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1676 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype)) 

1677 # Read just some columns a few different ways. 

1678 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1679 subdict = {key: dict1[key] for key in ["a", "c"]} 

1680 self._checkNumpyDictEquality(subdict, tab3) 

1681 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1682 subdict = {key: dict1[key] for key in ["a"]} 

1683 self._checkNumpyDictEquality(subdict, tab4) 

1684 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1685 subdict = {key: dict1[key] for key in ["index", "a"]} 

1686 self._checkNumpyDictEquality(subdict, tab5) 

1687 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1688 subdict = {key: dict1[key] for key in ["ddd"]} 

1689 self._checkNumpyDictEquality(subdict, tab6) 

1690 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1691 subdict = {key: dict1[key] for key in ["a"]} 

1692 self._checkNumpyDictEquality(subdict, tab7) 

1693 # Passing an unrecognized column should raise a ValueError. 

1694 with self.assertRaises(ValueError): 

1695 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1696 

1697 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.") 

1698 def testWriteNumpyDictReadAsArrowTable(self): 

1699 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1700 dict1 = _numpy_to_numpy_dict(tab1) 

1701 

1702 self.butler.put(dict1, self.datasetType, dataId={}) 

1703 

1704 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable") 

1705 

1706 tab2_dict = arrow_to_numpy_dict(tab2) 

1707 

1708 self._checkNumpyDictEquality(dict1, tab2_dict) 

1709 

1710 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1711 def testWriteNumpyDictReadAsDataFrame(self): 

1712 tab1 = _makeSimpleNumpyTable() 

1713 dict1 = _numpy_to_numpy_dict(tab1) 

1714 

1715 self.butler.put(dict1, self.datasetType, dataId={}) 

1716 

1717 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1718 

1719 # The order of the dict may get mixed up, so we need to check column 

1720 # by column. We also need to do this in dataframe form because pandas 

1721 # changes the datatype of the string column. 

1722 tab1_df = pd.DataFrame(tab1) 

1723 

1724 self.assertEqual(set(tab1_df.columns), set(tab2.columns)) 

1725 for col in tab1_df.columns: 

1726 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values)) 
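        # A sketch of the dtype change itself, using a hypothetical
        # fixed-width column: numpy stores strings as fixed-width arrays,
        # while pandas falls back to generic objects, so the dtypes differ
        # even when the values compare equal.
        fixed = np.array(["ab", "cd"], dtype="U2")
        self.assertNotEqual(pd.Series(fixed).dtype, fixed.dtype)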

1727 

1728 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1729 def testWriteNumpyDictReadAsAstropyTable(self): 

1730 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1731 dict1 = _numpy_to_numpy_dict(tab1) 

1732 

1733 self.butler.put(dict1, self.datasetType, dataId={}) 

1734 

1735 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1736 tab2_dict = _astropy_to_numpy_dict(tab2) 

1737 

1738 self._checkNumpyDictEquality(dict1, tab2_dict) 

1739 

1740 def testWriteNumpyDictReadAsNumpyTable(self): 

1741 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1742 dict1 = _numpy_to_numpy_dict(tab1) 

1743 

1744 self.butler.put(dict1, self.datasetType, dataId={}) 

1745 

1746 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1747 tab2_dict = _numpy_to_numpy_dict(tab2) 

1748 

1749 self._checkNumpyDictEquality(dict1, tab2_dict) 

1750 

1751 def testWriteNumpyDictBad(self): 

1752 dict1 = {"a": 4, "b": np.zeros(1)}  # a scalar value is not an ndarray 

1753 with self.assertRaises(RuntimeError): 

1754 self.butler.put(dict1, self.datasetType, dataId={}) 

1755 

1756 dict2 = {"a": np.zeros(4), "b": np.zeros(5)}  # mismatched array lengths 

1757 with self.assertRaises(RuntimeError): 

1758 self.butler.put(dict2, self.datasetType, dataId={}) 

1759 

1760 dict3 = {"a": [0] * 5, "b": np.zeros(5)}  # a plain list is not an ndarray 

1761 with self.assertRaises(RuntimeError): 

1762 self.butler.put(dict3, self.datasetType, dataId={}) 
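        # For contrast, a minimal sketch of a well-formed numpy dict: every
        # value is an ndarray and all lengths agree. It is only constructed
        # here for illustration, not written to the butler.
        good_dict = {"a": np.zeros(5), "b": np.arange(5.0)}
        self.assertTrue(all(isinstance(arr, np.ndarray) for arr in good_dict.values()))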

1763 

1764 def _checkNumpyDictEquality(self, dict1, dict2): 

1765 """Check if two numpy dicts have the same columns/values. 

1766 

1767 Parameters 

1768 ---------- 

1769 dict1 : `dict` [`str`, `np.ndarray`] 

1770 dict2 : `dict` [`str`, `np.ndarray`] 

1771 """ 

1772 self.assertEqual(set(dict1.keys()), set(dict2.keys())) 

1773 for name in dict1: 

1774 self.assertEqual(dict1[name].dtype, dict2[name].dtype) 

1775 self.assertTrue(np.all(dict1[name] == dict2[name])) 

1776 

1777 

1778@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.") 

1779@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.") 

1780class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase): 

1781 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate.""" 

1782 

1783 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1784 

1785 def testWriteNumpyDictBad(self): 

1786 # The sub-type checking is not done by the in-memory datastore. 

1787 pass 

1788 

1789 

1790@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.") 

1791class ParquetFormatterArrowSchemaTestCase(unittest.TestCase): 

1792 """Tests for ParquetFormatter, ArrowSchema, using local file datastore.""" 

1793 

1794 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1795 

1796 def setUp(self): 

1797 """Create a new butler root for each test.""" 

1798 self.root = makeTestTempDir(TESTDIR) 

1799 config = Config(self.configFile) 

1800 self.butler = Butler.from_config( 

1801 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run" 

1802 ) 

1803 # No dimensions in dataset type so we don't have to worry about 

1804 # inserting dimension data or defining data IDs. 

1805 self.datasetType = DatasetType( 

1806 "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions 

1807 ) 

1808 self.butler.registry.registerDatasetType(self.datasetType) 

1809 

1810 def tearDown(self): 

1811 removeTestTempDir(self.root) 

1812 

1813 def _makeTestSchema(self): 

1814 schema = pa.schema( 

1815 [ 

1816 pa.field( 

1817 "int32", 

1818 pa.int32(), 

1819 nullable=False, 

1820 metadata={ 

1821 "description": "32-bit integer", 

1822 "unit": "", 

1823 }, 

1824 ), 

1825 pa.field( 

1826 "int64", 

1827 pa.int64(), 

1828 nullable=False, 

1829 metadata={ 

1830 "description": "64-bit integer", 

1831 "unit": "", 

1832 }, 

1833 ), 

1834 pa.field( 

1835 "uint64", 

1836 pa.uint64(), 

1837 nullable=False, 

1838 metadata={ 

1839 "description": "64-bit unsigned integer", 

1840 "unit": "", 

1841 }, 

1842 ), 

1843 pa.field( 

1844 "float32", 

1845 pa.float32(), 

1846 nullable=False, 

1847 metadata={ 

1848 "description": "32-bit float", 

1849 "unit": "count", 

1850 }, 

1851 ), 

1852 pa.field( 

1853 "float64", 

1854 pa.float64(), 

1855 nullable=False, 

1856 metadata={ 

1857 "description": "64-bit float", 

1858 "unit": "nJy", 

1859 }, 

1860 ), 

1861 pa.field( 

1862 "fixed_size_list", 

1863 pa.list_(pa.float64(), list_size=10), 

1864 nullable=False, 

1865 metadata={ 

1866 "description": "Fixed size list of 64-bit floats.", 

1867 "unit": "nJy", 

1868 }, 

1869 ), 

1870 pa.field( 

1871 "variable_size_list", 

1872 pa.list_(pa.float64()), 

1873 nullable=False, 

1874 metadata={ 

1875 "description": "Variable size list of 64-bit floats.", 

1876 "unit": "nJy", 

1877 }, 

1878 ), 

1879 # The following field has metadata but no description. 

1880 pa.field( 

1881 "string", 

1882 pa.string(), 

1883 nullable=False, 

1884 metadata={ 

1885 "unit": "", 

1886 }, 

1887 ), 

1888 # The following field has no metadata at all. 

1889 pa.field( 

1890 "binary", 

1891 pa.binary(), 

1892 nullable=False, 

1893 ), 

1894 ] 

1895 ) 

1896 

1897 return schema 

1898 

1899 def testArrowSchema(self): 

1900 schema1 = self._makeTestSchema() 

1901 self.butler.put(schema1, self.datasetType, dataId={}) 

1902 

1903 schema2 = self.butler.get(self.datasetType, dataId={}) 

1904 self.assertEqual(schema2, schema1) 

1905 

1906 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.") 

1907 def testWriteArrowSchemaReadAsDataFrameSchema(self): 

1908 schema1 = self._makeTestSchema() 

1909 self.butler.put(schema1, self.datasetType, dataId={}) 

1910 

1911 df_schema1 = DataFrameSchema.from_arrow(schema1) 

1912 

1913 df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema") 

1914 self.assertEqual(df_schema2, df_schema1) 

1915 

1916 @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.") 

1917 def testWriteArrowSchemaReadAsArrowAstropySchema(self): 

1918 schema1 = self._makeTestSchema() 

1919 self.butler.put(schema1, self.datasetType, dataId={}) 

1920 

1921 ap_schema1 = ArrowAstropySchema.from_arrow(schema1) 

1922 

1923 ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema") 

1924 self.assertEqual(ap_schema2, ap_schema1) 

1925 

1926 # Confirm that ap_schema2 has the unit/description we expect. 

1927 for name in schema1.names: 

1928 field_metadata = schema1.field(name).metadata 

1929 if field_metadata is None: 

1930 continue 

1931 if ( 

1932 b"description" in field_metadata 

1933 and (description := field_metadata[b"description"].decode("UTF-8")) != "" 

1934 ): 

1935 self.assertEqual(ap_schema2.schema[name].description, description) 

1936 else: 

1937 self.assertIsNone(ap_schema2.schema[name].description) 

1938 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "": 

1939 self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit)) 

1940 

1941 @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.") 

1942 def testWriteArrowSchemaReadAsArrowNumpySchema(self): 

1943 schema1 = self._makeTestSchema() 

1944 self.butler.put(schema1, self.datasetType, dataId={}) 

1945 

1946 np_schema1 = ArrowNumpySchema.from_arrow(schema1) 

1947 

1948 np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema") 

1949 self.assertEqual(np_schema2, np_schema1) 

1950 

1951 

1952@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.") 

1953class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase): 

1954 """Tests for InMemoryDatastore and ArrowSchema.""" 

1955 

1956 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1957 

1958 

1959@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.") 

1960@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.") 

1961class ComputeRowGroupSizeTestCase(unittest.TestCase): 

1962 """Tests for compute_row_group_size.""" 

1963 

1964 def testRowGroupSizeNoMetadata(self): 

1965 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1966 

1967 # We can't use the numpy_to_arrow convenience function because 

1968 # that adds metadata. 

1969 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype) 

1970 schema = pa.schema(type_list) 

1971 arrays = _numpy_style_arrays_to_arrow_arrays( 

1972 numpyTable.dtype, 

1973 len(numpyTable), 

1974 numpyTable, 

1975 schema, 

1976 ) 

1977 arrowTable = pa.Table.from_arrays(arrays, schema=schema) 

1978 

1979 row_group_size = compute_row_group_size(arrowTable.schema) 

1980 

1981 self.assertGreater(row_group_size, 1_000_000) 

1982 self.assertLess(row_group_size, 2_000_000) 

1983 

1984 def testRowGroupSizeWithMetadata(self): 

1985 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1986 

1987 arrowTable = numpy_to_arrow(numpyTable) 

1988 

1989 row_group_size = compute_row_group_size(arrowTable.schema) 

1990 

1991 self.assertGreater(row_group_size, 1_000_000) 

1992 self.assertLess(row_group_size, 2_000_000) 

1993 

1994 def testRowGroupSizeTinyTable(self): 

1995 numpyTable = np.zeros(1, dtype=[("a", np.bool_)]) 

1996 

1997 arrowTable = numpy_to_arrow(numpyTable) 

1998 

1999 row_group_size = compute_row_group_size(arrowTable.schema) 

2000 

2001 self.assertGreater(row_group_size, 1_000_000) 

2002 

2003 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.") 

2004 def testRowGroupSizeDataFrameWithLists(self): 

2005 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10}) 

2006 arrowTable = pandas_to_arrow(df) 

2007 row_group_size = compute_row_group_size(arrowTable.schema) 

2008 

2009 self.assertGreater(row_group_size, 1_000_000) 

2010 

2011 
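# A minimal sketch of how a row group size can be derived from a schema,
# assuming a fixed byte budget per row group. The budget and the per-type
# byte estimates below are illustrative assumptions, not the values that
# compute_row_group_size itself uses.
def _sketchRowGroupSize(schema, target_bytes=1_000_000_000):
    bytes_per_row = 0
    for name in schema.names:
        try:
            bytes_per_row += schema.field(name).type.bit_width // 8
        except ValueError:
            # Variable-width types (strings, variable-size lists) have no
            # fixed bit width; fall back to a rough per-value estimate.
            bytes_per_row += 8
    return max(target_bytes // max(bytes_per_row, 1), 1)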

2012if __name__ == "__main__": 

2013 unittest.main()