# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))
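
# The arrow <-> numpy conversion helpers imported above are exercised
# throughout these tests. As a quick orientation, a structured array
# round-trips like this (a minimal sketch, never called by the suite;
# it assumes numpy and pyarrow imported successfully):
def _exampleNumpyArrowRoundTrip():
    data = np.zeros(3, dtype=[("a", "f8"), ("b", "i4")])
    arrow_table = numpy_to_arrow(data)  # structured array -> pyarrow.Table
    round_tripped = arrow_to_numpy(arrow_table)  # and back again
    assert round_tripped.dtype == data.dtype
    assert np.all(round_tripped == data)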

def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data

def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `~pandas.Index`
        All of the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
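
# A note on ``allColumns`` above: the butler's DataFrame "columns" component
# includes the index column(s), hence the append of the index names. The
# pandas operation in isolation (a minimal sketch, never called by the suite):
def _exampleAllColumns():
    df = pd.DataFrame({"a": [1.0]}, index=pd.Index([0], name="index"))
    all_columns = df.columns.append(pd.Index(df.index.names))
    assert list(all_columns) == ["a", "index"]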

def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
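
# The tests below select from this MultiIndex frame both with
# (filter, column) tuples and with dict-style parameters. The tuple form in
# plain pandas (a minimal sketch, never called by the suite):
def _exampleMultiIndexSelection():
    df = _makeMultiIndexDataFrame()
    sub = df.loc[:, [("g", "a"), ("r", "c")]]  # two (filter, column) pairs
    assert sub.shape == (5, 2)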

def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["a"].description = "Description of column a"
    table["b"].unit = units.meter
    table["b"].description = "Description of column b"

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        # We set the masked columns with the underlying sentinel value
        # to be able to test after serialization.

        # Masked 64-bit integer.
        arr = np.arange(nrow, dtype="i8")
        arr[mask] = -1
        table["m_i8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=-1)
        # Masked 32-bit float.
        arr = np.arange(nrow, dtype="f4")
        arr[mask] = np.nan
        table["m_f4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        # Unmasked 32-bit float with NaNs.
        table["um_f4"] = arr
        # Masked 64-bit float.
        arr = np.arange(nrow, dtype="f8")
        arr[mask] = np.nan
        table["m_f8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        # Unmasked 64-bit float with NaNs.
        table["um_f8"] = arr
        # Masked boolean.
        arr = np.zeros(nrow, dtype=np.bool_)
        arr[mask] = True
        table["m_bool"] = np.ma.masked_array(data=arr, mask=mask, fill_value=True)
        # Masked 32-bit unsigned int.
        arr = np.arange(nrow, dtype="u4")
        arr[mask] = 0
        table["m_u4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=0)
        # Masked string.
        table["m_str"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask, fill_value="")
        # Masked bytes.
        table["m_byte"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask, fill_value=b"")

    return table
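
# The masked columns above store explicit sentinel values under the mask so
# the tests can compare ``filled()`` data after serialization. The numpy
# pattern in isolation (a minimal sketch, never called by the suite):
def _exampleMaskedSentinel():
    arr = np.arange(3, dtype="i8")
    arr[1] = -1  # sentinel stored under the mask
    masked = np.ma.masked_array(data=arr, mask=[False, True, False], fill_value=-1)
    assert np.all(masked.filled() == np.array([0, -1, 2]))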

def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)

@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))
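    # The list columns above come back with array-like cells after the arrow
    # round trip, which is why the comparison is element-wise. A minimal
    # sketch of that behavior using the module-level helpers (illustrative
    # only; not collected as a test):
    def _exampleListColumnRoundTrip(self):
        df = pd.DataFrame({"l1": [[0, 1]] * 3})
        df2 = arrow_to_pandas(pandas_to_arrow(df))
        # Each cell compares element-wise whether it is a list or an array.
        assert np.all(df2["l1"].values[0] == np.array([0, 1]))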

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))
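    # The pandas ambiguity referenced above, in a nutshell: nullable extension
    # dtypes keep their type in the presence of missing values, while plain
    # numpy-backed columns are promoted to float with NaN. A minimal sketch
    # (illustrative only; not collected as a test):
    def _exampleNullableColumns(self):
        s1 = pd.Series(pd.array([1, None], dtype=pd.Int64Dtype()))
        s2 = pd.Series([1, None])
        assert str(s1.dtype) == "Int64"  # stays integer; missing value is pd.NA
        assert s2.dtype == np.float64  # promoted; missing value is NaN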

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck; it doesn't really round-trip.
        # This test simply checks that it's readable, but definitely not
        # recommended.

@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip checking the per-column attributes (units, etc.).
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, forcing both sides to the
                # same byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
                # We need to check masked/regular columns after filling.
                has_masked = False
                if isinstance(table1[name], atable.column.MaskedColumn):
                    c1 = table1[name].filled()
                    has_masked = True
                else:
                    c1 = np.array(table1[name])
                if has_masked:
                    self.assertIsInstance(table2[name], atable.column.MaskedColumn)
                    c2 = table2[name].filled()
                else:
                    self.assertFalse(isinstance(table2[name], atable.column.MaskedColumn))
                    c2 = np.array(table2[name])
                np.testing.assert_array_equal(c1, c2)
                # If we have a masked column then we test the underlying data.
                if has_masked:
                    np.testing.assert_array_equal(np.array(c1), np.array(c2))
                    np.testing.assert_array_equal(table1[name].mask, table2[name].mask)

@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, forcing both sides to the
                # same byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))

@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # We convert to use the numpy testing framework to handle nan
        # comparisons.
        self.assertEqual(tab1.schema, tab2.schema)
        tab1_np = arrow_to_numpy(tab1)
        tab2_np = arrow_to_numpy(tab2)
        for col in tab1.column_names:
            np.testing.assert_array_equal(tab2_np[col], tab1_np[col])
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

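    # For reference, a hedged illustration of the mangling noted above
    # (generic pandas/pyarrow behavior, not specific to this formatter): an
    # empty object-dtype column carries no values from which to infer a
    # string type, so something like
    #
    #     pa.Table.from_pandas(pd.DataFrame({"s": pd.Series([], dtype=object)}))
    #
    # typically infers a null-typed column rather than pa.string().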
    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

        # Check the schema conversions and units.
        arrow_schema = schema.to_arrow_schema()
        for name in arrow_schema.names:
            field_metadata = arrow_schema.field(name).metadata
            if (
                b"description" in field_metadata
                and (description := field_metadata[b"description"].decode("UTF-8")) != ""
            ):
                self.assertEqual(schema2.schema[name].description, description)
            else:
                self.assertIsNone(schema2.schema[name].description)
            if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
                self.assertEqual(schema2.schema[name].unit, units.Unit(unit))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check that two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
            # We need to compare masked and regular columns after filling.
            has_masked = False
            if isinstance(table1[name], atable.column.MaskedColumn):
                c1 = table1[name].filled()
                has_masked = True
            else:
                c1 = np.array(table1[name])
            if has_masked:
                self.assertIsInstance(table2[name], atable.column.MaskedColumn)
                c2 = table2[name].filled()
            else:
                self.assertNotIsInstance(table2[name], atable.column.MaskedColumn)
                c2 = np.array(table2[name])
            np.testing.assert_array_equal(c1, c2)
            # For masked columns we also check the underlying (filled) data
            # and the mask itself.
            if has_masked:
                np.testing.assert_array_equal(np.array(c1), np.array(c2))
                np.testing.assert_array_equal(table1[name].mask, table2[name].mask)

    def _checkNumpyTableEquality(self, table1, table2):
        """Check that two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the lookup to fall back to name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the lookup to fall back to name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})
        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1.keys()))
        for name in dict1:
            self.assertIn(name, columns2)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)
        # Requesting an unrecognized column should raise a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The column order of the dict may get shuffled, so we need to check
        # column by column. We also do the comparison in dataframe form
        # because pandas changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))

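    # For reference (generic pandas/numpy behavior, not specific to this
    # API): pandas has no fixed-width unicode dtype, so a numpy "U" column
    # comes back as object dtype, e.g.
    #
    #     pd.DataFrame(np.zeros(2, dtype=[("s", "U3")]))["s"].dtype == np.dtype("O")
    #
    # which is why the comparison above is done in dataframe form.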
    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictBad(self):
        # A scalar value is not a valid column.
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})

        # Mismatched column lengths are not allowed.
        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})

        # A plain list is not a valid column.
        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check that two numpy dicts have the same columns/values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
            First dict to compare.
        dict2 : `dict` [`str`, `np.ndarray`]
            Second dict to compare.
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1:
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))


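# For reference, a minimal sketch of converting a structured array to a
# column dict and back (an illustration only; the private helpers
# _numpy_to_numpy_dict/_numpy_dict_to_numpy used above may differ in detail):
#
#     def to_dict(arr):
#         return {name: arr[name] for name in arr.dtype.names}
#
#     def from_dict(d):
#         n = len(next(iter(d.values())))
#         dtype = [(k, v.dtype, v.shape[1:]) for k, v in d.items()]
#         out = np.zeros(n, dtype=dtype)
#         for k, v in d.items():
#             out[k] = v
#         return out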
@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # Sub-type checking is not done by the in-memory datastore, so the
        # base-class test does not apply here.
        pass


@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.")
class ParquetFormatterArrowSchemaTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowSchema, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def _makeTestSchema(self):
        schema = pa.schema(
            [
                pa.field(
                    "int32",
                    pa.int32(),
                    nullable=False,
                    metadata={
                        "description": "32-bit integer",
                        "unit": "",
                    },
                ),
                pa.field(
                    "int64",
                    pa.int64(),
                    nullable=False,
                    metadata={
                        "description": "64-bit integer",
                        "unit": "",
                    },
                ),
                pa.field(
                    "uint64",
                    pa.uint64(),
                    nullable=False,
                    metadata={
                        "description": "64-bit unsigned integer",
                        "unit": "",
                    },
                ),
                pa.field(
                    "float32",
                    pa.float32(),
                    nullable=False,
                    metadata={
                        "description": "32-bit float",
                        "unit": "count",
                    },
                ),
                pa.field(
                    "float64",
                    pa.float64(),
                    nullable=False,
                    metadata={
                        "description": "64-bit float",
                        "unit": "nJy",
                    },
                ),
                pa.field(
                    "fixed_size_list",
                    pa.list_(pa.float64(), list_size=10),
                    nullable=False,
                    metadata={
                        "description": "Fixed size list of 64-bit floats.",
                        "unit": "nJy",
                    },
                ),
                pa.field(
                    "variable_size_list",
                    pa.list_(pa.float64()),
                    nullable=False,
                    metadata={
                        "description": "Variable size list of 64-bit floats.",
                        "unit": "nJy",
                    },
                ),
                # This field has a unit but no description.
                pa.field(
                    "string",
                    pa.string(),
                    nullable=False,
                    metadata={
                        "unit": "",
                    },
                ),
                # This field has no metadata at all.
                pa.field(
                    "binary",
                    pa.binary(),
                    nullable=False,
                ),
            ]
        )

        return schema

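    # Note (general pyarrow behavior): schema/field metadata is stored as
    # bytes, so keys and values written above as str round-trip as bytes,
    # e.g. for the schema returned by _makeTestSchema:
    #
    #     schema.field("float64").metadata[b"unit"] == b"nJy"
    #
    # which is why the tests below index metadata with b"..." keys.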
    def testArrowSchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        schema2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(schema2, schema1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.")
    def testWriteArrowSchemaReadAsDataFrameSchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        df_schema1 = DataFrameSchema.from_arrow(schema1)

        df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema")
        self.assertEqual(df_schema2, df_schema1)

    @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.")
    def testWriteArrowSchemaReadAsArrowAstropySchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        ap_schema1 = ArrowAstropySchema.from_arrow(schema1)

        ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema")
        self.assertEqual(ap_schema2, ap_schema1)

        # Confirm that ap_schema2 has the unit/description we expect.
        for name in schema1.names:
            field_metadata = schema1.field(name).metadata
            if field_metadata is None:
                continue
            if (
                b"description" in field_metadata
                and (description := field_metadata[b"description"].decode("UTF-8")) != ""
            ):
                self.assertEqual(ap_schema2.schema[name].description, description)
            else:
                self.assertIsNone(ap_schema2.schema[name].description)
            if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "":
                self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit))

    @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.")
    def testWriteArrowSchemaReadAsArrowNumpySchema(self):
        schema1 = self._makeTestSchema()
        self.butler.put(schema1, self.datasetType, dataId={})

        np_schema1 = ArrowNumpySchema.from_arrow(schema1)

        np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema")
        self.assertEqual(np_schema2, np_schema1)


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.")
class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase):
    """Tests for InMemoryDatastore and ArrowSchema."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")


@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.")
class ComputeRowGroupSizeTestCase(unittest.TestCase):
    """Tests for compute_row_group_size."""

    def testRowGroupSizeNoMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        # We can't use the numpy_to_arrow convenience function because
        # that adds metadata.
        type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype)
        schema = pa.schema(type_list)
        arrays = _numpy_style_arrays_to_arrow_arrays(
            numpyTable.dtype,
            len(numpyTable),
            numpyTable,
            schema,
        )
        arrowTable = pa.Table.from_arrays(arrays, schema=schema)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeWithMetadata(self):
        numpyTable = _makeSimpleNumpyTable(include_multidim=True)

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)
        self.assertLess(row_group_size, 2_000_000)

    def testRowGroupSizeTinyTable(self):
        numpyTable = np.zeros(1, dtype=[("a", np.bool_)])

        arrowTable = numpy_to_arrow(numpyTable)

        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)

    @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.")
    def testRowGroupSizeDataFrameWithLists(self):
        df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10})
        arrowTable = pandas_to_arrow(df)
        row_group_size = compute_row_group_size(arrowTable.schema)

        self.assertGreater(row_group_size, 1_000_000)


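# For reference, a hedged sketch of the kind of heuristic a row-group sizer
# can use (an assumption for illustration only; the real
# compute_row_group_size in lsst.daf.butler.formatters.parquet may differ in
# detail): estimate the bits per row from the schema's field widths and
# divide a fixed byte budget by the resulting row width.
#
#     def sketch_row_group_size(schema: pa.Schema, target_bytes: int = 1 << 30) -> int:
#         bits_per_row = 0
#         for field in schema:
#             try:
#                 bits_per_row += field.type.bit_width  # fixed-width types
#             except ValueError:
#                 bits_per_row += 128  # assumed average for variable-size types
#         return max(target_bytes // max(bits_per_row // 8, 1), 1)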
if __name__ == "__main__":
    unittest.main()