# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)

try:
    from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
except ImportError:
    atable = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
except ImportError:
    np = None
    pa = None
try:
    from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
except ImportError:
    pa = None
try:
    from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
except ImportError:
    pd = None
try:
    from lsst.daf.butler.formatters.parquet import (
        ArrowAstropySchema,
        ArrowNumpySchema,
        DataFrameSchema,
        ParquetFormatter,
        _append_numpy_multidim_metadata,
        _astropy_to_numpy_dict,
        _numpy_dict_to_numpy,
        _numpy_dtype_to_arrow_types,
        _numpy_style_arrays_to_arrow_arrays,
        _numpy_to_numpy_dict,
        arrow_to_astropy,
        arrow_to_numpy,
        arrow_to_numpy_dict,
        arrow_to_pandas,
        astropy_to_arrow,
        astropy_to_pandas,
        compute_row_group_size,
        numpy_dict_to_arrow,
        numpy_to_arrow,
        pandas_to_arrow,
        pandas_to_astropy,
    )
except ImportError:
    pa = None
    pd = None
    atable = None
    np = None
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


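# A minimal sketch (not exercised by the test suite) of the round trip that
# the tests below repeatedly perform, using the conversion helpers imported
# above. The function name is illustrative only.
def _example_numpy_round_trip():
    # Build a tiny structured array, convert it to a pyarrow.Table, and
    # convert it back. The tests assert that this kind of round trip
    # preserves dtypes and values.
    data = np.zeros(3, dtype=[("a", "f8"), ("b", "i4")])
    arrow_table = numpy_to_arrow(data)
    return arrow_to_numpy(arrow_table)

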

def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data


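# Usage sketch for the helper above (illustrative only; not called by the
# tests). The base table always carries the eight columns listed in
# ``dtype`` and five rows; the keyword flags append extra columns.
def _example_simple_table_shape():
    data = _makeSimpleNumpyTable()
    assert data.dtype.names == ("index", "a", "b", "c", "ddd", "f", "strcol", "bytecol")
    assert len(data) == 5
    return data

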

def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None
        df.loc[0, "m1"] = 1649900760361600113

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


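# A minimal sketch (not exercised by the test suite) of how the MultiIndex
# dataframe above is sliced; the tests below pass equivalent "columns"
# parameters to butler.get(). The function name is illustrative only.
def _example_multi_index_selection():
    df = _makeMultiIndexDataFrame()
    g_block = df.loc[:, ["g"]]     # every column under filter "g"
    r_a = df.loc[:, [("r", "a")]]  # a single (filter, column) pair
    return g_block, r_a

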

def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["a"].description = "Description of column a"
    table["b"].unit = units.meter
    table["b"].description = "Description of column b"

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        # We set the masked columns with the underlying sentinel value
        # to be able to test after serialization.

        # Masked 64-bit integer.
        arr = np.arange(nrow, dtype="i8")
        arr[mask] = -1
        arr[0] = 1649900760361600113
        table["m_i8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=-1)
        # Masked 32-bit float.
        arr = np.arange(nrow, dtype="f4")
        arr[mask] = np.nan
        table["m_f4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        # Unmasked 32-bit float with NaNs.
        table["um_f4"] = arr
        # Masked 64-bit float.
        arr = np.arange(nrow, dtype="f8")
        arr[mask] = np.nan
        table["m_f8"] = np.ma.masked_array(data=arr, mask=mask, fill_value=np.nan)
        # Unmasked 64-bit float with NaNs.
        table["um_f8"] = arr
        # Masked boolean.
        arr = np.zeros(nrow, dtype=np.bool_)
        arr[mask] = True
        table["m_bool"] = np.ma.masked_array(data=arr, mask=mask, fill_value=True)
        # Masked 32-bit unsigned int.
        arr = np.arange(nrow, dtype="u4")
        arr[mask] = 0
        table["m_u4"] = np.ma.masked_array(data=arr, mask=mask, fill_value=0)
        # Masked string.
        table["m_str"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask, fill_value="")
        # Masked bytes.
        table["m_byte"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask, fill_value=b"")

    return table


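# A minimal sketch (not exercised by the test suite) of the masked-column
# convention used above: the underlying data already holds the sentinel at
# each masked position, so ``filled()`` leaves the values unchanged and the
# mask can be compared directly after serialization.
def _example_masked_sentinel():
    arr = np.ma.masked_array(data=[1, -1, 3], mask=[False, True, False], fill_value=-1)
    assert list(arr.filled()) == [1, -1, 3]
    return arr

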

def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)



@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = astropy_to_pandas(tab2, index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip. This test
        # simply checks that it's readable, but it is definitely not
        # recommended.

    @unittest.skipUnless(atable is not None, "Cannot test writing as astropy without astropy.")
    def testWriteAstropyTableWithMaskedColsReadAsSingleIndexDataFrame(self):
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={})

        tab1_df = astropy_to_pandas(tab1)
        self.assertTrue(tab1_df.equals(tab2))

        tab2_astropy = pandas_to_astropy(tab2)
        for col in tab1.dtype.names:
            np.testing.assert_array_equal(tab2_astropy[col], tab1[col])
            if isinstance(tab1[col], atable.column.MaskedColumn):
                np.testing.assert_array_equal(tab2_astropy[col].mask, tab1[col].mask)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip. This test
        # simply checks that it's readable, but it is definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip. This test
        # simply checks that it's readable, but it is definitely not
        # recommended.



@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")



@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow round trip works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = astropy_to_pandas(tab1)

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(pd is not None, "Cannot test writing as a dataframe without pandas.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={})

        df1_tab = pandas_to_astropy(df1)

        self._checkAstropyTableEquality(df1_tab, tab2)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip the per-column checks of units and other attributes.
        has_bigendian : `bool`
            Whether the tables contain big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, forcing both to the
                # same byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
                # We need to check masked/regular columns after filling.
                has_masked = False
                if isinstance(table1[name], atable.column.MaskedColumn):
                    c1 = table1[name].filled()
                    has_masked = True
                else:
                    c1 = np.array(table1[name])
                if has_masked:
                    self.assertIsInstance(table2[name], atable.column.MaskedColumn)
                    c2 = table2[name].filled()
                else:
                    self.assertFalse(isinstance(table2[name], atable.column.MaskedColumn))
                    c2 = np.array(table2[name])
                np.testing.assert_array_equal(c1, c2)
                # If we have a masked column then we test the underlying data.
                if has_masked:
                    np.testing.assert_array_equal(np.array(c1), np.array(c2))
                    np.testing.assert_array_equal(table1[name].mask, table2[name].mask)



@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")



@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler.from_config(
            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
        )
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            Whether the tables contain big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, forcing both to the
                # same byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))



@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")



1389@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.") 

1390class ParquetFormatterArrowTableTestCase(unittest.TestCase): 

1391 """Tests for ParquetFormatter, ArrowTable, using local file datastore.""" 

1392 

1393 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1394 

1395 def setUp(self): 

1396 """Create a new butler root for each test.""" 

1397 self.root = makeTestTempDir(TESTDIR) 

1398 config = Config(self.configFile) 

1399 self.butler = Butler.from_config( 

1400 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run" 

1401 ) 

1402 # No dimensions in dataset type so we don't have to worry about 

1403 # inserting dimension data or defining data IDs. 

1404 self.datasetType = DatasetType( 

1405 "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions 

1406 ) 

1407 self.butler.registry.registerDatasetType(self.datasetType) 

1408 

1409 def tearDown(self): 

1410 removeTestTempDir(self.root) 

1411 

1412 def testArrowTable(self): 

1413 tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True) 

1414 

1415 self.butler.put(tab1, self.datasetType, dataId={}) 

1416 # Read the whole Table. 

1417 tab2 = self.butler.get(self.datasetType, dataId={}) 

1418 # We convert to use the numpy testing framework to handle nan 

1419 # comparisons. 

1420 self.assertEqual(tab1.schema, tab2.schema) 

1421 tab1_np = arrow_to_numpy(tab1) 

1422 tab2_np = arrow_to_numpy(tab2) 

1423 for col in tab1.column_names: 

1424 np.testing.assert_array_equal(tab2_np[col], tab1_np[col]) 

1425 # Read the columns. 

1426 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1427 self.assertEqual(len(columns2), len(tab1.schema.names)) 

1428 for i, name in enumerate(tab1.schema.names): 

1429 self.assertEqual(columns2[i], name) 

1430 # Read the rowcount. 

1431 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1432 self.assertEqual(rowcount, len(tab1)) 

1433 # Read the schema. 

1434 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1435 self.assertEqual(schema, tab1.schema) 

1436 # Read just some columns a few different ways. 

1437 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1438 self.assertEqual(tab3, tab1.select(("a", "c"))) 

1439 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1440 self.assertEqual(tab4, tab1.select(("a",))) 

1441 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1442 self.assertEqual(tab5, tab1.select(("index", "a"))) 

1443 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1444 self.assertEqual(tab6, tab1.select(("ddd",))) 

1445 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1446 self.assertEqual(tab7, tab1.select(("a",))) 

1447 # Passing an unrecognized column should be a ValueError. 

1448 with self.assertRaises(ValueError): 

1449 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1450 

1451 def testEmptyArrowTable(self): 

1452 data = _makeSimpleNumpyTable() 

1453 type_list = _numpy_dtype_to_arrow_types(data.dtype) 

1454 

1455 schema = pa.schema(type_list) 

1456 arrays = [[]] * len(schema.names) 

1457 

1458 tab1 = pa.Table.from_arrays(arrays, schema=schema) 

1459 

1460 self.butler.put(tab1, self.datasetType, dataId={}) 

1461 tab2 = self.butler.get(self.datasetType, dataId={}) 

1462 self.assertEqual(tab2, tab1) 

1463 

1464 tab1_numpy = arrow_to_numpy(tab1) 

1465 self.assertEqual(len(tab1_numpy), 0) 

1466 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy) 

1467 self.assertEqual(tab1_numpy_arrow, tab1) 

1468 

1469 tab1_pandas = arrow_to_pandas(tab1) 

1470 self.assertEqual(len(tab1_pandas), 0) 

1471 tab1_pandas_arrow = pandas_to_arrow(tab1_pandas) 

1472 # Unfortunately, string/byte columns get mangled when translated 

1473 # through empty pandas dataframes; see the sketch after this test. 

1474 self.assertEqual( 

1475 tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")), 

1476 tab1.select(("index", "a", "b", "c", "ddd")), 

1477 ) 

1478 

1479 tab1_astropy = arrow_to_astropy(tab1) 

1480 self.assertEqual(len(tab1_astropy), 0) 

1481 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy) 

1482 self.assertEqual(tab1_astropy_arrow, tab1) 

1483 
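A sketch of the mangling referenced above: with zero rows, pyarrow has no values from which to infer a type for a pandas object column, so string/byte columns typically come back null-typed, which is why only the numeric columns are compared after the pandas round trip:

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"s": np.array([], dtype=object)})
    tab = pa.Table.from_pandas(df)
    print(tab.schema.field("s").type)  # null, not string/binary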

1484 def testEmptyArrowTableMultidim(self): 

1485 data = _makeSimpleNumpyTable(include_multidim=True) 

1486 type_list = _numpy_dtype_to_arrow_types(data.dtype) 

1487 

1488 md = {} 

1489 for name in data.dtype.names: 

1490 _append_numpy_multidim_metadata(md, name, data.dtype[name]) 

1491 

1492 schema = pa.schema(type_list, metadata=md) 

1493 arrays = [[]] * len(schema.names) 

1494 

1495 tab1 = pa.Table.from_arrays(arrays, schema=schema) 

1496 

1497 self.butler.put(tab1, self.datasetType, dataId={}) 

1498 tab2 = self.butler.get(self.datasetType, dataId={}) 

1499 self.assertEqual(tab2, tab1) 

1500 

1501 tab1_numpy = arrow_to_numpy(tab1) 

1502 self.assertEqual(len(tab1_numpy), 0) 

1503 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy) 

1504 self.assertEqual(tab1_numpy_arrow, tab1) 

1505 

1506 tab1_astropy = arrow_to_astropy(tab1) 

1507 self.assertEqual(len(tab1_astropy), 0) 

1508 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy) 

1509 self.assertEqual(tab1_astropy_arrow, tab1) 

1510 
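The multidim round trip above relies on _append_numpy_multidim_metadata recording the original array shape in the schema metadata; on the arrow side each multidimensional column is stored roughly as a fixed-size list per row. A minimal sketch of that representation (the metadata key names are an internal detail and not assumed here):

    import numpy as np
    import pyarrow as pa

    # A (N, 2, 3) numpy column flattens to a fixed-size list of length 6.
    values = pa.array(np.zeros((4, 2, 3)).ravel())
    column = pa.FixedSizeListArray.from_arrays(values, 6)
    print(column.type)  # fixed_size_list<item: double>[6]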

1511 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1512 def testWriteArrowTableReadAsSingleIndexDataFrame(self): 

1513 df1, allColumns = _makeSingleIndexDataFrame() 

1514 

1515 self.butler.put(df1, self.datasetType, dataId={}) 

1516 

1517 # Read back out as a dataframe. 

1518 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1519 self.assertTrue(df1.equals(df2)) 

1520 

1521 # Read back out as an arrow table, convert to dataframe. 

1522 tab3 = self.butler.get(self.datasetType, dataId={}) 

1523 df3 = arrow_to_pandas(tab3) 

1524 self.assertTrue(df1.equals(df3)) 

1525 

1526 # Check reading the columns. 

1527 columns = df2.reset_index().columns 

1528 columns2 = self.butler.get( 

1529 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1530 ) 

1531 # We check the set because pandas reorders the columns; see the sketch after this test. 

1532 self.assertEqual(set(columns2.to_list()), set(columns.to_list())) 

1533 

1534 # Check reading the schema. 

1535 schema = DataFrameSchema(df1) 

1536 schema2 = self.butler.get( 

1537 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1538 ) 

1539 self.assertEqual(schema2, schema) 

1540 
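Behind the set comparison above: the single index rejoins the data columns on reset_index, so the columns component is compared as a set rather than as an ordered sequence. In isolation:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]}, index=pd.Index([10, 20], name="index"))
    print(list(df.reset_index().columns))  # ['index', 'a']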

1541 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1542 def testWriteArrowTableReadAsMultiIndexDataFrame(self): 

1543 df1 = _makeMultiIndexDataFrame() 

1544 

1545 self.butler.put(df1, self.datasetType, dataId={}) 

1546 

1547 # Read back out as a dataframe. 

1548 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1549 self.assertTrue(df1.equals(df2)) 

1550 

1551 # Read back out as an arrow table, convert to dataframe. 

1552 atab3 = self.butler.get(self.datasetType, dataId={}) 

1553 df3 = arrow_to_pandas(atab3) 

1554 self.assertTrue(df1.equals(df3)) 

1555 

1556 # Check reading the columns. 

1557 columns = df2.columns 

1558 columns2 = self.butler.get( 

1559 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1560 ) 

1561 self.assertTrue(columns2.equals(columns)) 

1562 

1563 # Check reading the schema. 

1564 schema = DataFrameSchema(df1) 

1565 schema2 = self.butler.get( 

1566 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1567 ) 

1568 self.assertEqual(schema2, schema) 

1569 

1570 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1571 def testWriteArrowTableReadAsAstropyTable(self): 

1572 tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True) 

1573 

1574 self.butler.put(tab1, self.datasetType, dataId={}) 

1575 

1576 # Read back out as an astropy table. 

1577 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1578 self._checkAstropyTableEquality(tab1, tab2) 

1579 

1580 # Read back out as an arrow table, convert to astropy table. 

1581 atab3 = self.butler.get(self.datasetType, dataId={}) 

1582 tab3 = arrow_to_astropy(atab3) 

1583 self._checkAstropyTableEquality(tab1, tab3) 

1584 

1585 # Check reading the columns. 

1586 columns = list(tab2.columns.keys()) 

1587 columns2 = self.butler.get( 

1588 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1589 ) 

1590 self.assertEqual(columns2, columns) 

1591 

1592 # Check reading the schema. 

1593 schema = ArrowAstropySchema(tab1) 

1594 schema2 = self.butler.get( 

1595 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema" 

1596 ) 

1597 self.assertEqual(schema2, schema) 

1598 

1599 # Check the schema conversions and units. 

1600 arrow_schema = schema.to_arrow_schema() 

1601 for name in arrow_schema.names: 

1602 field_metadata = arrow_schema.field(name).metadata 

1603 if ( 

1604 b"description" in field_metadata 

1605 and (description := field_metadata[b"description"].decode("UTF-8")) != "" 

1606 ): 

1607 self.assertEqual(schema2.schema[name].description, description) 

1608 else: 

1609 self.assertIsNone(schema2.schema[name].description) 

1610 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "": 

1611 self.assertEqual(schema2.schema[name].unit, units.Unit(unit)) 

1612 
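A compact illustration of where those unit/description strings live: astropy_to_arrow (imported at the top of this module) carries them as field-level metadata, under the same b"unit" and b"description" keys inspected above:

    import astropy.table as atable
    from astropy import units

    t = atable.Table({"flux": [1.0, 2.0]})
    t["flux"].unit = units.nJy
    t["flux"].description = "example flux column"
    arrow_tab = astropy_to_arrow(t)
    print(arrow_tab.schema.field("flux").metadata)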

1613 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1614 def testWriteArrowTableReadAsNumpyTable(self): 

1615 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1616 

1617 self.butler.put(tab1, self.datasetType, dataId={}) 

1618 

1619 # Read back out as a numpy table. 

1620 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1621 self._checkNumpyTableEquality(tab1, tab2) 

1622 

1623 # Read back out as an arrow table, convert to numpy table. 

1624 atab3 = self.butler.get(self.datasetType, dataId={}) 

1625 tab3 = arrow_to_numpy(atab3) 

1626 self._checkNumpyTableEquality(tab1, tab3) 

1627 

1628 # Check reading the columns. 

1629 columns = list(tab2.dtype.names) 

1630 columns2 = self.butler.get( 

1631 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1632 ) 

1633 self.assertEqual(columns2, columns) 

1634 

1635 # Check reading the schema. 

1636 schema = ArrowNumpySchema(tab1.dtype) 

1637 schema2 = self.butler.get( 

1638 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema" 

1639 ) 

1640 self.assertEqual(schema2, schema) 

1641 

1642 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1643 def testWriteArrowTableReadAsNumpyDict(self): 

1644 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1645 

1646 self.butler.put(tab1, self.datasetType, dataId={}) 

1647 

1648 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict") 

1649 tab2_numpy = _numpy_dict_to_numpy(tab2) 

1650 self._checkNumpyTableEquality(tab1, tab2_numpy) 

1651 

1652 def _checkAstropyTableEquality(self, table1, table2): 

1653 """Check if two astropy tables have the same columns/values 

1654 

1655 Parameters 

1656 ---------- 

1657 table1 : `astropy.table.Table` 

1658 table2 : `astropy.table.Table` 

1659 """ 

1660 self.assertEqual(table1.dtype, table2.dtype) 

1661 for name in table1.columns: 

1662 self.assertEqual(table1[name].unit, table2[name].unit) 

1663 self.assertEqual(table1[name].description, table2[name].description) 

1664 self.assertEqual(table1[name].format, table2[name].format) 

1665 # We need to check masked/regular columns after filling. 

1666 has_masked = False 

1667 if isinstance(table1[name], atable.column.MaskedColumn): 

1668 c1 = table1[name].filled() 

1669 has_masked = True 

1670 else: 

1671 c1 = np.array(table1[name]) 

1672 if has_masked: 

1673 self.assertIsInstance(table2[name], atable.column.MaskedColumn) 

1674 c2 = table2[name].filled() 

1675 else: 

1676 self.assertFalse(isinstance(table2[name], atable.column.MaskedColumn)) 

1677 c2 = np.array(table2[name]) 

1678 np.testing.assert_array_equal(c1, c2) 

1679 # If we have a masked column then we test the underlying data. 

1680 if has_masked: 

1681 np.testing.assert_array_equal(np.array(c1), np.array(c2)) 

1682 np.testing.assert_array_equal(table1[name].mask, table2[name].mask) 

1683 

1684 def _checkNumpyTableEquality(self, table1, table2): 

1685 """Check if two numpy tables have the same columns/values 

1686 

1687 Parameters 

1688 ---------- 

1689 table1 : `numpy.ndarray` 

1690 table2 : `numpy.ndarray` 

1691 """ 

1692 self.assertEqual(table1.dtype.names, table2.dtype.names) 

1693 for name in table1.dtype.names: 

1694 self.assertEqual(table1.dtype[name], table2.dtype[name]) 

1695 self.assertTrue(np.all(table1 == table2)) 

1696 

1697 

1698@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.") 

1699class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase): 

1700 """Tests for InMemoryDatastore, using ArrowTableDelegate.""" 

1701 

1702 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1703 

1704 def testBadInput(self): 

1705 tab1 = _makeSimpleArrowTable() 

1706 delegate = ArrowTableDelegate("ArrowTable") 

1707 

1708 with self.assertRaises(ValueError): 

1709 delegate.handleParameters(inMemoryDataset="not_an_arrow_table") 

1710 

1711 with self.assertRaises(NotImplementedError): 

1712 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]}) 

1713 

1714 with self.assertRaises(AttributeError): 

1715 delegate.getComponent(composite=tab1, componentName="nothing") 

1716 

1717 def testStorageClass(self): 

1718 tab1 = _makeSimpleArrowTable() 

1719 

1720 factory = StorageClassFactory() 

1721 factory.addFromConfig(StorageClassConfig()) 

1722 

1723 storageClass = factory.findStorageClass(type(tab1), compare_types=False) 

1724 # Force the name lookup to do name matching. 

1725 storageClass._pytype = None 

1726 self.assertEqual(storageClass.name, "ArrowTable") 

1727 

1728 storageClass = factory.findStorageClass(type(tab1), compare_types=True) 

1729 # Force the name lookup to do name matching. 

1730 storageClass._pytype = None 

1731 self.assertEqual(storageClass.name, "ArrowTable") 

1732 

1733 

1734@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.") 

1735@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.") 

1736class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase): 

1737 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store.""" 

1738 

1739 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1740 

1741 def setUp(self): 

1742 """Create a new butler root for each test.""" 

1743 self.root = makeTestTempDir(TESTDIR) 

1744 config = Config(self.configFile) 

1745 self.butler = Butler.from_config( 

1746 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run" 

1747 ) 

1748 # No dimensions in dataset type so we don't have to worry about 

1749 # inserting dimension data or defining data IDs. 

1750 self.datasetType = DatasetType( 

1751 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions 

1752 ) 

1753 self.butler.registry.registerDatasetType(self.datasetType) 

1754 

1755 def tearDown(self): 

1756 removeTestTempDir(self.root) 

1757 

1758 def testNumpyDict(self): 

1759 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1760 dict1 = _numpy_to_numpy_dict(tab1) 

1761 

1762 self.butler.put(dict1, self.datasetType, dataId={}) 

1763 # Read the whole table. 

1764 dict2 = self.butler.get(self.datasetType, dataId={}) 

1765 self._checkNumpyDictEquality(dict1, dict2) 

1766 # Read the columns. 

1767 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1768 self.assertEqual(len(columns2), len(dict1.keys())) 

1769 for name in dict1: 

1770 self.assertIn(name, columns2) 

1771 # Read the rowcount. 

1772 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1773 self.assertEqual(rowcount, len(dict1["a"])) 

1774 # Read the schema. 

1775 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1776 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype)) 

1777 # Read just some columns a few different ways. 

1778 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1779 subdict = {key: dict1[key] for key in ["a", "c"]} 

1780 self._checkNumpyDictEquality(subdict, tab3) 

1781 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1782 subdict = {key: dict1[key] for key in ["a"]} 

1783 self._checkNumpyDictEquality(subdict, tab4) 

1784 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1785 subdict = {key: dict1[key] for key in ["index", "a"]} 

1786 self._checkNumpyDictEquality(subdict, tab5) 

1787 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1788 subdict = {key: dict1[key] for key in ["ddd"]} 

1789 self._checkNumpyDictEquality(subdict, tab6) 

1790 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1791 subdict = {key: dict1[key] for key in ["a"]} 

1792 self._checkNumpyDictEquality(subdict, tab7) 

1793 # Passing an unrecognized column should be a ValueError. 

1794 with self.assertRaises(ValueError): 

1795 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1796 

1797 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.") 

1798 def testWriteNumpyDictReadAsArrowTable(self): 

1799 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1800 dict1 = _numpy_to_numpy_dict(tab1) 

1801 

1802 self.butler.put(dict1, self.datasetType, dataId={}) 

1803 

1804 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable") 

1805 

1806 tab2_dict = arrow_to_numpy_dict(tab2) 

1807 

1808 self._checkNumpyDictEquality(dict1, tab2_dict) 

1809 

1810 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1811 def testWriteNumpyDictReadAsDataFrame(self): 

1812 tab1 = _makeSimpleNumpyTable() 

1813 dict1 = _numpy_to_numpy_dict(tab1) 

1814 

1815 self.butler.put(dict1, self.datasetType, dataId={}) 

1816 

1817 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1818 

1819 # The column order of the dict is not guaranteed to be preserved, 

1820 # so we check column by column. We compare in dataframe form because 

1821 # pandas changes the datatype of the string column (sketch below). 

1822 tab1_df = pd.DataFrame(tab1) 

1823 

1824 self.assertEqual(set(tab1_df.columns), set(tab2.columns)) 

1825 for col in tab1_df.columns: 

1826 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values)) 

1827 
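The datatype change mentioned above, in isolation: pandas does not keep numpy fixed-width string dtypes inside a DataFrame, which is why the comparison is done in dataframe form on both sides:

    import numpy as np
    import pandas as pd

    arr = np.array(["x", "y"], dtype="U10")
    print(pd.DataFrame({"s": arr})["s"].dtype)  # object, not <U10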

1828 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1829 def testWriteNumpyDictReadAsAstropyTable(self): 

1830 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1831 dict1 = _numpy_to_numpy_dict(tab1) 

1832 

1833 self.butler.put(dict1, self.datasetType, dataId={}) 

1834 

1835 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1836 tab2_dict = _astropy_to_numpy_dict(tab2) 

1837 

1838 self._checkNumpyDictEquality(dict1, tab2_dict) 

1839 

1840 def testWriteNumpyDictReadAsNumpyTable(self): 

1841 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1842 dict1 = _numpy_to_numpy_dict(tab1) 

1843 

1844 self.butler.put(dict1, self.datasetType, dataId={}) 

1845 

1846 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1847 tab2_dict = _numpy_to_numpy_dict(tab2) 

1848 

1849 self._checkNumpyDictEquality(dict1, tab2_dict) 

1850 

1851 def testWriteNumpyDictBad(self): 

1852 dict1 = {"a": 4, "b": np.ndarray([1])} 

1853 with self.assertRaises(RuntimeError): 

1854 self.butler.put(dict1, self.datasetType, dataId={}) 

1855 

1856 dict2 = {"a": np.zeros(4), "b": np.zeros(5)} 

1857 with self.assertRaises(RuntimeError): 

1858 self.butler.put(dict2, self.datasetType, dataId={}) 

1859 

1860 dict3 = {"a": [0] * 5, "b": np.zeros(5)} 

1861 with self.assertRaises(RuntimeError): 

1862 self.butler.put(dict3, self.datasetType, dataId={}) 

1863 
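For reference against the three failing puts above, the accepted shape of an ArrowNumpyDict payload, inferred from those failure cases rather than from a documented contract, is string keys mapped to equal-length numpy arrays:

    import numpy as np

    good = {
        "a": np.zeros(5),                   # an ndarray, length 5
        "b": np.arange(5, dtype=np.int64),  # an ndarray, same length
    }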

1864 def _checkNumpyDictEquality(self, dict1, dict2): 

1865 """Check if two numpy dicts have the same columns/values. 

1866 

1867 Parameters 

1868 ---------- 

1869 dict1 : `dict` [`str`, `np.ndarray`] 

1870 dict2 : `dict` [`str`, `np.ndarray`] 

1871 """ 

1872 self.assertEqual(set(dict1.keys()), set(dict2.keys())) 

1873 for name in dict1: 

1874 self.assertEqual(dict1[name].dtype, dict2[name].dtype) 

1875 self.assertTrue(np.all(dict1[name] == dict2[name])) 

1876 

1877 

1878@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.") 

1879@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.") 

1880class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase): 

1881 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate.""" 

1882 

1883 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1884 

1885 def testWriteNumpyDictBad(self): 

1886 # The sub-type checking is not done by the in-memory datastore. 

1887 pass 

1888 

1889 

1890@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.") 

1891class ParquetFormatterArrowSchemaTestCase(unittest.TestCase): 

1892 """Tests for ParquetFormatter, ArrowSchema, using local file datastore.""" 

1893 

1894 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1895 

1896 def setUp(self): 

1897 """Create a new butler root for each test.""" 

1898 self.root = makeTestTempDir(TESTDIR) 

1899 config = Config(self.configFile) 

1900 self.butler = Butler.from_config( 

1901 Butler.makeRepo(self.root, config=config), writeable=True, run="test_run" 

1902 ) 

1903 # No dimensions in dataset type so we don't have to worry about 

1904 # inserting dimension data or defining data IDs. 

1905 self.datasetType = DatasetType( 

1906 "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions 

1907 ) 

1908 self.butler.registry.registerDatasetType(self.datasetType) 

1909 

1910 def tearDown(self): 

1911 removeTestTempDir(self.root) 

1912 

1913 def _makeTestSchema(self): 

1914 schema = pa.schema( 

1915 [ 

1916 pa.field( 

1917 "int32", 

1918 pa.int32(), 

1919 nullable=False, 

1920 metadata={ 

1921 "description": "32-bit integer", 

1922 "unit": "", 

1923 }, 

1924 ), 

1925 pa.field( 

1926 "int64", 

1927 pa.int64(), 

1928 nullable=False, 

1929 metadata={ 

1930 "description": "64-bit integer", 

1931 "unit": "", 

1932 }, 

1933 ), 

1934 pa.field( 

1935 "uint64", 

1936 pa.uint64(), 

1937 nullable=False, 

1938 metadata={ 

1939 "description": "64-bit unsigned integer", 

1940 "unit": "", 

1941 }, 

1942 ), 

1943 pa.field( 

1944 "float32", 

1945 pa.float32(), 

1946 nullable=False, 

1947 metadata={ 

1948 "description": "32-bit float", 

1949 "unit": "count", 

1950 }, 

1951 ), 

1952 pa.field( 

1953 "float64", 

1954 pa.float64(), 

1955 nullable=False, 

1956 metadata={ 

1957 "description": "64-bit float", 

1958 "unit": "nJy", 

1959 }, 

1960 ), 

1961 pa.field( 

1962 "fixed_size_list", 

1963 pa.list_(pa.float64(), list_size=10), 

1964 nullable=False, 

1965 metadata={ 

1966 "description": "Fixed size list of 64-bit floats.", 

1967 "unit": "nJy", 

1968 }, 

1969 ), 

1970 pa.field( 

1971 "variable_size_list", 

1972 pa.list_(pa.float64()), 

1973 nullable=False, 

1974 metadata={ 

1975 "description": "Variable size list of 64-bit floats.", 

1976 "unit": "nJy", 

1977 }, 

1978 ), 

1979 # This field has no description, only a unit. 

1980 pa.field( 

1981 "string", 

1982 pa.string(), 

1983 nullable=False, 

1984 metadata={ 

1985 "unit": "", 

1986 }, 

1987 ), 

1988 # This field has no metadata at all. 

1989 pa.field( 

1990 "binary", 

1991 pa.binary(), 

1992 nullable=False, 

1993 ), 

1994 ] 

1995 ) 

1996 

1997 return schema 

1998 

1999 def testArrowSchema(self): 

2000 schema1 = self._makeTestSchema() 

2001 self.butler.put(schema1, self.datasetType, dataId={}) 

2002 

2003 schema2 = self.butler.get(self.datasetType, dataId={}) 

2004 self.assertEqual(schema2, schema1) 

2005 

2006 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.") 

2007 def testWriteArrowSchemaReadAsDataFrameSchema(self): 

2008 schema1 = self._makeTestSchema() 

2009 self.butler.put(schema1, self.datasetType, dataId={}) 

2010 

2011 df_schema1 = DataFrameSchema.from_arrow(schema1) 

2012 

2013 df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema") 

2014 self.assertEqual(df_schema2, df_schema1) 

2015 

2016 @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.") 

2017 def testWriteArrowSchemaReadAsArrowAstropySchema(self): 

2018 schema1 = self._makeTestSchema() 

2019 self.butler.put(schema1, self.datasetType, dataId={}) 

2020 

2021 ap_schema1 = ArrowAstropySchema.from_arrow(schema1) 

2022 

2023 ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema") 

2024 self.assertEqual(ap_schema2, ap_schema1) 

2025 

2026 # Confirm that ap_schema2 has the unit/description we expect. 

2027 for name in schema1.names: 

2028 field_metadata = schema1.field(name).metadata 

2029 if field_metadata is None: 

2030 continue 

2031 if ( 

2032 b"description" in field_metadata 

2033 and (description := field_metadata[b"description"].decode("UTF-8")) != "" 

2034 ): 

2035 self.assertEqual(ap_schema2.schema[name].description, description) 

2036 else: 

2037 self.assertIsNone(ap_schema2.schema[name].description) 

2038 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "": 

2039 self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit)) 

2040 

2041 @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.") 

2042 def testWriteArrowSchemaReadAsArrowNumpySchema(self): 

2043 schema1 = self._makeTestSchema() 

2044 self.butler.put(schema1, self.datasetType, dataId={}) 

2045 

2046 np_schema1 = ArrowNumpySchema.from_arrow(schema1) 

2047 

2048 np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema") 

2049 self.assertEqual(np_schema2, np_schema1) 

2050 

2051 

2052@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.") 

2053class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase): 

2054 """Tests for InMemoryDatastore and ArrowSchema.""" 

2055 

2056 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

2057 

2058 

2059@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.") 

2060@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.") 

2061class ComputeRowGroupSizeTestCase(unittest.TestCase): 

2062 """Tests for compute_row_group_size.""" 

2063 

2064 def testRowGroupSizeNoMetadata(self): 

2065 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

2066 

2067 # We can't use the numpy_to_arrow convenience function because 

2068 # that adds metadata. 

2069 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype) 

2070 schema = pa.schema(type_list) 

2071 arrays = _numpy_style_arrays_to_arrow_arrays( 

2072 numpyTable.dtype, 

2073 len(numpyTable), 

2074 numpyTable, 

2075 schema, 

2076 ) 

2077 arrowTable = pa.Table.from_arrays(arrays, schema=schema) 

2078 

2079 row_group_size = compute_row_group_size(arrowTable.schema) 

2080 

2081 self.assertGreater(row_group_size, 1_000_000) 

2082 self.assertLess(row_group_size, 2_000_000) 

2083 

2084 def testRowGroupSizeWithMetadata(self): 

2085 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

2086 

2087 arrowTable = numpy_to_arrow(numpyTable) 

2088 

2089 row_group_size = compute_row_group_size(arrowTable.schema) 

2090 

2091 self.assertGreater(row_group_size, 1_000_000) 

2092 self.assertLess(row_group_size, 2_000_000) 

2093 

2094 def testRowGroupSizeTinyTable(self): 

2095 numpyTable = np.zeros(1, dtype=[("a", np.bool_)]) 

2096 

2097 arrowTable = numpy_to_arrow(numpyTable) 

2098 

2099 row_group_size = compute_row_group_size(arrowTable.schema) 

2100 

2101 self.assertGreater(row_group_size, 1_000_000) 

2102 

2103 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.") 

2104 def testRowGroupSizeDataFrameWithLists(self): 

2105 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10}) 

2106 arrowTable = pandas_to_arrow(df) 

2107 row_group_size = compute_row_group_size(arrowTable.schema) 

2108 

2109 self.assertGreater(row_group_size, 1_000_000) 

2110 
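A usage sketch for compute_row_group_size, which the tests above only bound from both sides: it takes an arrow schema and returns a row count suitable for a parquet writer (the write call is illustrative and not part of this test file):

    import pyarrow as pa
    import pyarrow.parquet as pq

    schema = pa.schema([pa.field("a", pa.float64()), pa.field("b", pa.int32())])
    size = compute_row_group_size(schema)
    # pq.write_table(table, "out.parq", row_group_size=size)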

2111 

2112if __name__ == "__main__": 

2113 unittest.main()