# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

28"""Tests for ParquetFormatter. 

29 

30Tests in this module are disabled unless pandas and pyarrow are importable. 

31""" 

32 

33import os 

34import unittest 

35 

36try: 

37 import pyarrow as pa 

38except ImportError: 

39 pa = None 

40try: 

41 import astropy.table as atable 

42 from astropy import units 

43except ImportError: 

44 atable = None 

45try: 

46 import numpy as np 

47except ImportError: 

48 np = None 

49try: 

50 import pandas as pd 

51except ImportError: 

52 pd = None 

53 

54from lsst.daf.butler import ( 

55 Butler, 

56 Config, 

57 DatasetRef, 

58 DatasetType, 

59 FileDataset, 

60 StorageClassConfig, 

61 StorageClassFactory, 

62) 

63 

64try: 

65 from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate 

66except ImportError: 

67 atable = None 

68 pa = None 

69try: 

70 from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate 

71except ImportError: 

72 np = None 

73 pa = None 

74try: 

75 from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate 

76except ImportError: 

77 pa = None 

78try: 

79 from lsst.daf.butler.delegates.dataframe import DataFrameDelegate 

80except ImportError: 

81 pd = None 

82try: 

83 from lsst.daf.butler.formatters.parquet import ( 

84 ArrowAstropySchema, 

85 ArrowNumpySchema, 

86 DataFrameSchema, 

87 ParquetFormatter, 

88 _append_numpy_multidim_metadata, 

89 _astropy_to_numpy_dict, 

90 _numpy_dict_to_numpy, 

91 _numpy_dtype_to_arrow_types, 

92 _numpy_style_arrays_to_arrow_arrays, 

93 _numpy_to_numpy_dict, 

94 arrow_to_astropy, 

95 arrow_to_numpy, 

96 arrow_to_numpy_dict, 

97 arrow_to_pandas, 

98 astropy_to_arrow, 

99 compute_row_group_size, 

100 numpy_dict_to_arrow, 

101 numpy_to_arrow, 

102 pandas_to_arrow, 

103 ) 

104except ImportError: 

105 pa = None 

106 pd = None 

107 atable = None 

108 np = None 

109from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir 

110 

111TESTDIR = os.path.abspath(os.path.dirname(__file__)) 

112 

113 

def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
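
# A minimal illustrative sketch, not part of the test suite (the helper name
# below is ours, not butler API): the structured arrays built above are
# expected to round-trip losslessly through Arrow via the parquet helpers
# imported at the top of this module.
def _exampleNumpyArrowRoundTrip():
    data = _makeSimpleNumpyTable(include_multidim=True)
    # Convert to an Arrow table and back again.
    arrow_table = numpy_to_arrow(data)
    data2 = arrow_to_numpy(arrow_table)
    # Field names, dtypes, and values should all survive the round trip.
    assert data2.dtype.names == data.dtype.names
    assert np.all(data2 == data)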

def _makeSingleIndexDataFrame(include_masked=False, include_lists=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.
    include_lists : `bool`
        Include list columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `pandas.Index`
        Index of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    if include_lists:
        nrow = len(df)

        df["l1"] = [[0, 0]] * nrow
        df["l2"] = [[0.0, 0.0]] * nrow
        df["l3"] = [[]] * nrow

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns
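
# A minimal illustrative sketch, not part of the test suite (the helper name
# is ours): a single-index dataframe is expected to come back from Arrow with
# its index restored from the pandas metadata, which is what the butler-based
# tests below verify.
def _exampleDataFrameArrowRoundTrip():
    df, _ = _makeSingleIndexDataFrame()
    arrow_table = pandas_to_arrow(df)
    df2 = arrow_to_pandas(arrow_table)
    # The "index" column travels as pandas metadata and is restored as
    # the index of the reconstructed dataframe.
    assert df2.equals(df)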

def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
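
# An illustrative sketch, not part of the test suite: the dict-style
# "columns" read parameter exercised in the multi-index tests below maps
# onto ordinary pandas MultiIndex selection, as in this equivalence.
def _exampleMultiIndexSelection():
    df = _makeMultiIndexDataFrame()
    # {"filter": "r", "column": ["a", "b"]} selects these column tuples:
    subset = df.loc[:, [("r", "a"), ("r", "b")]]
    assert list(subset.columns) == [("r", "a"), ("r", "b")]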

def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    table = atable.Table(data)
    # Add units and descriptions to a couple of columns.
    table["a"].unit = units.degree
    table["a"].description = "Description of column a"
    table["b"].unit = units.meter
    table["b"].description = "Description of column b"

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table
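
# An illustrative sketch, not part of the test suite (the helper name is
# ours): astropy <-> arrow conversion is expected to carry the units,
# descriptions, and masks attached above; the astropy test cases below
# check this in detail through the butler.
def _exampleAstropyArrowRoundTrip():
    table = _makeSimpleAstropyTable(include_masked=True)
    arrow_table = astropy_to_arrow(table)
    table2 = arrow_to_astropy(arrow_table)
    # Column metadata survives the conversion.
    assert table2["a"].unit == units.degree
    assert table2["a"].description == "Description of column a"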

def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)
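
# An illustrative sketch, not part of the test suite: an empty Arrow table
# with the same schema can be built directly with pyarrow, mirroring the
# pattern used by the empty-table tests further below.
def _exampleEmptyArrowTable():
    schema = _makeSimpleArrowTable().schema
    # Empty columns are cast to the schema's field types.
    empty = pa.Table.from_arrays([[]] * len(schema.names), schema=schema)
    assert len(empty) == 0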

@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testSingleIndexDataFrameWithLists(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_lists=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})

        # We need to check the list columns specially because they go
        # from lists to arrays.
        for col in ["l1", "l2", "l3"]:
            for i in range(len(df1)):
                self.assertTrue(np.all(df2[col].values[i] == df1[col].values[i]))

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None.
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None.
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip. This test simply
        # checks that it's readable, but this usage is definitely not
        # recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip. This test simply
        # checks that it's readable, but this usage is definitely not
        # recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip. This test simply
        # checks that it's readable, but this usage is definitely not
        # recommended.

@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.run = "test_run"
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities.
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, run=self.run)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow conversion works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because the column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip checking units, descriptions, and formats.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, normalizing both to
                # big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an InMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality.
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            One of the tables contains big-endian columns.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, normalizing both to
                # big-endian byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))

@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

1306@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.") 

1307class ParquetFormatterArrowTableTestCase(unittest.TestCase): 

1308 """Tests for ParquetFormatter, ArrowTable, using local file datastore.""" 

1309 

1310 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1311 

1312 def setUp(self): 

1313 """Create a new butler root for each test.""" 

1314 self.root = makeTestTempDir(TESTDIR) 

1315 config = Config(self.configFile) 

1316 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run") 

1317 # No dimensions in dataset type so we don't have to worry about 

1318 # inserting dimension data or defining data IDs. 

1319 self.datasetType = DatasetType( 

1320 "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.dimensions 

1321 ) 

1322 self.butler.registry.registerDatasetType(self.datasetType) 

1323 

1324 def tearDown(self): 

1325 removeTestTempDir(self.root) 

1326 

1327 def testArrowTable(self): 

1328 tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True) 

1329 

1330 self.butler.put(tab1, self.datasetType, dataId={}) 

1331 # Read the whole Table. 

1332 tab2 = self.butler.get(self.datasetType, dataId={}) 

1333 self.assertEqual(tab2, tab1) 

1334 # Read the columns. 

1335 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1336 self.assertEqual(len(columns2), len(tab1.schema.names)) 

1337 for i, name in enumerate(tab1.schema.names): 

1338 self.assertEqual(columns2[i], name) 

1339 # Read the rowcount. 

1340 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1341 self.assertEqual(rowcount, len(tab1)) 

1342 # Read the schema. 

1343 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1344 self.assertEqual(schema, tab1.schema) 

1345 # Read just some columns a few different ways. 

1346 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1347 self.assertEqual(tab3, tab1.select(("a", "c"))) 

1348 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1349 self.assertEqual(tab4, tab1.select(("a",))) 

1350 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1351 self.assertEqual(tab5, tab1.select(("index", "a"))) 

1352 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1353 self.assertEqual(tab6, tab1.select(("ddd",))) 

1354 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1355 self.assertEqual(tab7, tab1.select(("a",))) 

1356 # Passing an unrecognized column should be a ValueError. 

1357 with self.assertRaises(ValueError): 

1358 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1359 

1360 def testEmptyArrowTable(self): 

1361 data = _makeSimpleNumpyTable() 

1362 type_list = _numpy_dtype_to_arrow_types(data.dtype) 

1363 

1364 schema = pa.schema(type_list) 

1365 arrays = [[]] * len(schema.names) 

1366 

1367 tab1 = pa.Table.from_arrays(arrays, schema=schema) 

1368 

1369 self.butler.put(tab1, self.datasetType, dataId={}) 

1370 tab2 = self.butler.get(self.datasetType, dataId={}) 

1371 self.assertEqual(tab2, tab1) 

1372 

1373 tab1_numpy = arrow_to_numpy(tab1) 

1374 self.assertEqual(len(tab1_numpy), 0) 

1375 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy) 

1376 self.assertEqual(tab1_numpy_arrow, tab1) 

1377 

1378 tab1_pandas = arrow_to_pandas(tab1) 

1379 self.assertEqual(len(tab1_pandas), 0) 

1380 tab1_pandas_arrow = pandas_to_arrow(tab1_pandas) 

1381 # Unfortunately, string/byte columns get mangled when translated 

1382 # through empty pandas dataframes. 

1383 self.assertEqual( 

1384 tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")), 

1385 tab1.select(("index", "a", "b", "c", "ddd")), 

1386 ) 

1387 

1388 tab1_astropy = arrow_to_astropy(tab1) 

1389 self.assertEqual(len(tab1_astropy), 0) 

1390 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy) 

1391 self.assertEqual(tab1_astropy_arrow, tab1) 

1392 

1393 def testEmptyArrowTableMultidim(self): 

1394 data = _makeSimpleNumpyTable(include_multidim=True) 

1395 type_list = _numpy_dtype_to_arrow_types(data.dtype) 

1396 

1397 md = {} 

1398 for name in data.dtype.names: 

1399 _append_numpy_multidim_metadata(md, name, data.dtype[name]) 

1400 

1401 schema = pa.schema(type_list, metadata=md) 

1402 arrays = [[]] * len(schema.names) 

1403 

1404 tab1 = pa.Table.from_arrays(arrays, schema=schema) 

1405 

1406 self.butler.put(tab1, self.datasetType, dataId={}) 

1407 tab2 = self.butler.get(self.datasetType, dataId={}) 

1408 self.assertEqual(tab2, tab1) 

1409 

1410 tab1_numpy = arrow_to_numpy(tab1) 

1411 self.assertEqual(len(tab1_numpy), 0) 

1412 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy) 

1413 self.assertEqual(tab1_numpy_arrow, tab1) 

1414 

1415 tab1_astropy = arrow_to_astropy(tab1) 

1416 self.assertEqual(len(tab1_astropy), 0) 

1417 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy) 

1418 self.assertEqual(tab1_astropy_arrow, tab1) 

1419 

1420 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1421 def testWriteArrowTableReadAsSingleIndexDataFrame(self): 

1422 df1, allColumns = _makeSingleIndexDataFrame() 

1423 

1424 self.butler.put(df1, self.datasetType, dataId={}) 

1425 

1426 # Read back out as a dataframe. 

1427 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1428 self.assertTrue(df1.equals(df2)) 

1429 

1430 # Read back out as an arrow table, convert to dataframe. 

1431 tab3 = self.butler.get(self.datasetType, dataId={}) 

1432 df3 = arrow_to_pandas(tab3) 

1433 self.assertTrue(df1.equals(df3)) 

1434 

1435 # Check reading the columns. 

1436 columns = df2.reset_index().columns 

1437 columns2 = self.butler.get( 

1438 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1439 ) 

1440 # We check the set because pandas reorders the columns. 

1441 self.assertEqual(set(columns2.to_list()), set(columns.to_list())) 
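# For example (a self-contained sketch of the reordering):
#     pd.DataFrame({"b": [1], "a": [2]}, index=pd.Index([0], name="i"))
#     .reset_index().columns  ->  ["i", "b", "a"]; the index lands
# first, so only set comparison is stable here.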

1442 

1443 # Check reading the schema. 

1444 schema = DataFrameSchema(df1) 

1445 schema2 = self.butler.get( 

1446 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1447 ) 

1448 self.assertEqual(schema2, schema) 

1449 

1450 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1451 def testWriteArrowTableReadAsMultiIndexDataFrame(self): 

1452 df1 = _makeMultiIndexDataFrame() 

1453 

1454 self.butler.put(df1, self.datasetType, dataId={}) 

1455 

1456 # Read back out as a dataframe. 

1457 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1458 self.assertTrue(df1.equals(df2)) 

1459 

1460 # Read back out as an arrow table, convert to dataframe. 

1461 atab3 = self.butler.get(self.datasetType, dataId={}) 

1462 df3 = arrow_to_pandas(atab3) 

1463 self.assertTrue(df1.equals(df3)) 

1464 

1465 # Check reading the columns. 

1466 columns = df2.columns 

1467 columns2 = self.butler.get( 

1468 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1469 ) 

1470 self.assertTrue(columns2.equals(columns)) 

1471 

1472 # Check reading the schema. 

1473 schema = DataFrameSchema(df1) 

1474 schema2 = self.butler.get( 

1475 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1476 ) 

1477 self.assertEqual(schema2, schema) 

1478 

1479 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1480 def testWriteArrowTableReadAsAstropyTable(self): 

1481 tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True) 

1482 

1483 self.butler.put(tab1, self.datasetType, dataId={}) 

1484 

1485 # Read back out as an astropy table. 

1486 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1487 self._checkAstropyTableEquality(tab1, tab2) 

1488 

1489 # Read back out as an arrow table, convert to astropy table. 

1490 atab3 = self.butler.get(self.datasetType, dataId={}) 

1491 tab3 = arrow_to_astropy(atab3) 

1492 self._checkAstropyTableEquality(tab1, tab3) 

1493 

1494 # Check reading the columns. 

1495 columns = list(tab2.columns.keys()) 

1496 columns2 = self.butler.get( 

1497 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1498 ) 

1499 self.assertEqual(columns2, columns) 

1500 

1501 # Check reading the schema. 

1502 schema = ArrowAstropySchema(tab1) 

1503 schema2 = self.butler.get( 

1504 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema" 

1505 ) 

1506 self.assertEqual(schema2, schema) 

1507 

1508 # Check the schema conversions and units. 

1509 arrow_schema = schema.to_arrow_schema() 

1510 for name in arrow_schema.names: 

1511 field_metadata = arrow_schema.field(name).metadata 

1512 if ( 

1513 b"description" in field_metadata 

1514 and (description := field_metadata[b"description"].decode("UTF-8")) != "" 

1515 ): 

1516 self.assertEqual(schema2.schema[name].description, description) 

1517 else: 

1518 self.assertIsNone(schema2.schema[name].description) 

1519 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "": 

1520 self.assertEqual(schema2.schema[name].unit, units.Unit(unit)) 

1521 

1522 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1523 def testWriteArrowTableReadAsNumpyTable(self): 

1524 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1525 

1526 self.butler.put(tab1, self.datasetType, dataId={}) 

1527 

1528 # Read back out as a numpy table. 

1529 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1530 self._checkNumpyTableEquality(tab1, tab2) 

1531 

1532 # Read back out as an arrow table, convert to numpy table. 

1533 atab3 = self.butler.get(self.datasetType, dataId={}) 

1534 tab3 = arrow_to_numpy(atab3) 

1535 self._checkNumpyTableEquality(tab1, tab3) 

1536 

1537 # Check reading the columns. 

1538 columns = list(tab2.dtype.names) 

1539 columns2 = self.butler.get( 

1540 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1541 ) 

1542 self.assertEqual(columns2, columns) 

1543 

1544 # Check reading the schema. 

1545 schema = ArrowNumpySchema(tab1.dtype) 

1546 schema2 = self.butler.get( 

1547 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema" 

1548 ) 

1549 self.assertEqual(schema2, schema) 

1550 

1551 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1552 def testWriteArrowTableReadAsNumpyDict(self): 

1553 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1554 

1555 self.butler.put(tab1, self.datasetType, dataId={}) 

1556 

1557 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict") 

1558 tab2_numpy = _numpy_dict_to_numpy(tab2) 

1559 self._checkNumpyTableEquality(tab1, tab2_numpy) 

1560 

1561 def _checkAstropyTableEquality(self, table1, table2): 

1562 """Check if two astropy tables have the same columns/values 

1563 

1564 Parameters 

1565 ---------- 

1566 table1 : `astropy.table.Table` 

1567 table2 : `astropy.table.Table` 

1568 """ 

1569 self.assertEqual(table1.dtype, table2.dtype) 

1570 for name in table1.columns: 

1571 self.assertEqual(table1[name].unit, table2[name].unit) 

1572 self.assertEqual(table1[name].description, table2[name].description) 

1573 self.assertEqual(table1[name].format, table2[name].format) 

1574 self.assertTrue(np.all(table1 == table2)) 

1575 

1576 def _checkNumpyTableEquality(self, table1, table2): 

1577 """Check if two numpy tables have the same columns/values 

1578 

1579 Parameters 

1580 ---------- 

1581 table1 : `numpy.ndarray` 

1582 table2 : `numpy.ndarray` 

1583 """ 

1584 self.assertEqual(table1.dtype.names, table2.dtype.names) 

1585 for name in table1.dtype.names: 

1586 self.assertEqual(table1.dtype[name], table2.dtype[name]) 

1587 self.assertTrue(np.all(table1 == table2)) 

1588 

1589 

1590@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.") 

1591class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase): 

1592 """Tests for InMemoryDatastore, using ArrowTableDelegate.""" 

1593 

1594 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1595 

1596 def testBadInput(self): 

1597 tab1 = _makeSimpleArrowTable() 

1598 delegate = ArrowTableDelegate("ArrowTable") 

1599 

1600 with self.assertRaises(ValueError): 

1601 delegate.handleParameters(inMemoryDataset="not_an_arrow_table") 

1602 

1603 with self.assertRaises(NotImplementedError): 

1604 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]}) 

1605 

1606 with self.assertRaises(AttributeError): 

1607 delegate.getComponent(composite=tab1, componentName="nothing") 
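# For contrast, a flat list of column names is a valid parameter (a
# sketch; it assumes the simple test table has a column named "a",
# whereas the tuple above requests a hierarchical column that a plain
# arrow table cannot provide):
delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": ["a"]})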

1608 

1609 def testStorageClass(self): 

1610 tab1 = _makeSimpleArrowTable() 

1611 

1612 factory = StorageClassFactory() 

1613 factory.addFromConfig(StorageClassConfig()) 

1614 

1615 storageClass = factory.findStorageClass(type(tab1), compare_types=False) 

1616 # Force the name lookup to do name matching. 

1617 storageClass._pytype = None 

1618 self.assertEqual(storageClass.name, "ArrowTable") 

1619 

1620 storageClass = factory.findStorageClass(type(tab1), compare_types=True) 

1621 # Force the name lookup to do name matching. 

1622 storageClass._pytype = None 

1623 self.assertEqual(storageClass.name, "ArrowTable") 
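# (Clearing _pytype appears to force the factory to match on the type
# name string rather than the cached python type -- an inference from
# the comments above, not from the factory implementation.)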

1624 

1625 

1626@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.") 

1627@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.") 

1628class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase): 

1629 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store.""" 

1630 

1631 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1632 

1633 def setUp(self): 

1634 """Create a new butler root for each test.""" 

1635 self.root = makeTestTempDir(TESTDIR) 

1636 config = Config(self.configFile) 

1637 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run") 

1638 # No dimensions in dataset type so we don't have to worry about 

1639 # inserting dimension data or defining data IDs. 

1640 self.datasetType = DatasetType( 

1641 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.dimensions 

1642 ) 

1643 self.butler.registry.registerDatasetType(self.datasetType) 

1644 

1645 def tearDown(self): 

1646 removeTestTempDir(self.root) 

1647 

1648 def testNumpyDict(self): 

1649 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1650 dict1 = _numpy_to_numpy_dict(tab1) 

1651 

1652 self.butler.put(dict1, self.datasetType, dataId={}) 

1653 # Read the whole table. 

1654 dict2 = self.butler.get(self.datasetType, dataId={}) 

1655 self._checkNumpyDictEquality(dict1, dict2) 

1656 # Read the columns. 

1657 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1658 self.assertEqual(len(columns2), len(dict1.keys())) 

1659 for name in dict1: 

1660 self.assertIn(name, columns2) 

1661 # Read the rowcount. 

1662 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1663 self.assertEqual(rowcount, len(dict1["a"])) 

1664 # Read the schema. 

1665 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1666 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype)) 

1667 # Read just some columns a few different ways. 

1668 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1669 subdict = {key: dict1[key] for key in ["a", "c"]} 

1670 self._checkNumpyDictEquality(subdict, tab3) 

1671 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1672 subdict = {key: dict1[key] for key in ["a"]} 

1673 self._checkNumpyDictEquality(subdict, tab4) 

1674 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1675 subdict = {key: dict1[key] for key in ["index", "a"]} 

1676 self._checkNumpyDictEquality(subdict, tab5) 

1677 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1678 subdict = {key: dict1[key] for key in ["ddd"]} 

1679 self._checkNumpyDictEquality(subdict, tab6) 

1680 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1681 subdict = {key: dict1[key] for key in ["a"]} 

1682 self._checkNumpyDictEquality(subdict, tab7) 

1683 # Passing an unrecognized column should be a ValueError. 

1684 with self.assertRaises(ValueError): 

1685 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1686 

1687 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.") 

1688 def testWriteNumpyDictReadAsArrowTable(self): 

1689 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1690 dict1 = _numpy_to_numpy_dict(tab1) 

1691 

1692 self.butler.put(dict1, self.datasetType, dataId={}) 

1693 

1694 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable") 

1695 

1696 tab2_dict = arrow_to_numpy_dict(tab2) 

1697 

1698 self._checkNumpyDictEquality(dict1, tab2_dict) 

1699 

1700 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1701 def testWriteNumpyDictReadAsDataFrame(self): 

1702 tab1 = _makeSimpleNumpyTable() 

1703 dict1 = _numpy_to_numpy_dict(tab1) 

1704 

1705 self.butler.put(dict1, self.datasetType, dataId={}) 

1706 

1707 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1708 

1709 # The order of the dict may get mixed up, so we need to check column 

1710 # by column. We also need to do this in dataframe form because pandas 

1711 # changes the datatype of the string column. 

1712 tab1_df = pd.DataFrame(tab1) 

1713 

1714 self.assertEqual(set(tab1_df.columns), set(tab2.columns)) 

1715 for col in tab1_df.columns: 

1716 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values)) 
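# A sketch of the dtype change (assumes a fixed-width unicode column,
# as in the simple test table; the column name is illustrative):
arr_sketch = np.zeros(2, dtype=[("s", "U10")])
assert arr_sketch["s"].dtype.kind == "U"  # fixed-width in numpy...
assert pd.DataFrame(arr_sketch)["s"].dtype == object  # ...object in pandas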

1717 

1718 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1719 def testWriteNumpyDictReadAsAstropyTable(self): 

1720 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1721 dict1 = _numpy_to_numpy_dict(tab1) 

1722 

1723 self.butler.put(dict1, self.datasetType, dataId={}) 

1724 

1725 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1726 tab2_dict = _astropy_to_numpy_dict(tab2) 

1727 

1728 self._checkNumpyDictEquality(dict1, tab2_dict) 

1729 

1730 def testWriteNumpyDictReadAsNumpyTable(self): 

1731 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1732 dict1 = _numpy_to_numpy_dict(tab1) 

1733 

1734 self.butler.put(dict1, self.datasetType, dataId={}) 

1735 

1736 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1737 tab2_dict = _numpy_to_numpy_dict(tab2) 

1738 

1739 self._checkNumpyDictEquality(dict1, tab2_dict) 

1740 

1741 def testWriteNumpyDictBad(self): 
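# Each put below should fail validation: a scalar where an ndarray is
# expected, two arrays of mismatched length, and a plain python list
# mixed in with ndarrays.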

1742 dict1 = {"a": 4, "b": np.ndarray([1])} 

1743 with self.assertRaises(RuntimeError): 

1744 self.butler.put(dict1, self.datasetType, dataId={}) 

1745 

1746 dict2 = {"a": np.zeros(4), "b": np.zeros(5)} 

1747 with self.assertRaises(RuntimeError): 

1748 self.butler.put(dict2, self.datasetType, dataId={}) 

1749 

1750 dict3 = {"a": [0] * 5, "b": np.zeros(5)} 

1751 with self.assertRaises(RuntimeError): 

1752 self.butler.put(dict3, self.datasetType, dataId={}) 

1753 

1754 def _checkNumpyDictEquality(self, dict1, dict2): 

1755 """Check if two numpy dicts have the same columns/values. 

1756 

1757 Parameters 

1758 ---------- 

1759 dict1 : `dict` [`str`, `np.ndarray`] 

1760 dict2 : `dict` [`str`, `np.ndarray`] 

1761 """ 

1762 self.assertEqual(set(dict1.keys()), set(dict2.keys())) 

1763 for name in dict1: 

1764 self.assertEqual(dict1[name].dtype, dict2[name].dtype) 

1765 self.assertTrue(np.all(dict1[name] == dict2[name])) 

1766 

1767 

1768@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.") 

1769@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.") 

1770class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase): 

1771 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate.""" 

1772 

1773 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1774 

1775 def testWriteNumpyDictBad(self): 

1776 # Sub-type checking is not done by the in-memory datastore. 

1777 pass 

1778 

1779 

1780@unittest.skipUnless(pa is not None, "Cannot test ArrowSchema without pyarrow.") 

1781class ParquetFormatterArrowSchemaTestCase(unittest.TestCase): 

1782 """Tests for ParquetFormatter, ArrowSchema, using local file datastore.""" 

1783 

1784 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1785 

1786 def setUp(self): 

1787 """Create a new butler root for each test.""" 

1788 self.root = makeTestTempDir(TESTDIR) 

1789 config = Config(self.configFile) 

1790 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run") 

1791 # No dimensions in dataset type so we don't have to worry about 

1792 # inserting dimension data or defining data IDs. 

1793 self.datasetType = DatasetType( 

1794 "data", dimensions=(), storageClass="ArrowSchema", universe=self.butler.dimensions 

1795 ) 

1796 self.butler.registry.registerDatasetType(self.datasetType) 

1797 

1798 def tearDown(self): 

1799 removeTestTempDir(self.root) 

1800 

1801 def _makeTestSchema(self): 

1802 schema = pa.schema( 

1803 [ 

1804 pa.field( 

1805 "int32", 

1806 pa.int32(), 

1807 nullable=False, 

1808 metadata={ 

1809 "description": "32-bit integer", 

1810 "unit": "", 

1811 }, 

1812 ), 

1813 pa.field( 

1814 "int64", 

1815 pa.int64(), 

1816 nullable=False, 

1817 metadata={ 

1818 "description": "64-bit integer", 

1819 "unit": "", 

1820 }, 

1821 ), 

1822 pa.field( 

1823 "uint64", 

1824 pa.uint64(), 

1825 nullable=False, 

1826 metadata={ 

1827 "description": "64-bit unsigned integer", 

1828 "unit": "", 

1829 }, 

1830 ), 

1831 pa.field( 

1832 "float32", 

1833 pa.float32(), 

1834 nullable=False, 

1835 metadata={ 

1836 "description": "32-bit float", 

1837 "unit": "count", 

1838 }, 

1839 ), 

1840 pa.field( 

1841 "float64", 

1842 pa.float64(), 

1843 nullable=False, 

1844 metadata={ 

1845 "description": "64-bit float", 

1846 "unit": "nJy", 

1847 }, 

1848 ), 

1849 pa.field( 

1850 "fixed_size_list", 

1851 pa.list_(pa.float64(), list_size=10), 

1852 nullable=False, 

1853 metadata={ 

1854 "description": "Fixed size list of 64-bit floats.", 

1855 "unit": "nJy", 

1856 }, 

1857 ), 

1858 pa.field( 

1859 "variable_size_list", 

1860 pa.list_(pa.float64()), 

1861 nullable=False, 

1862 metadata={ 

1863 "description": "Variable size list of 64-bit floats.", 

1864 "unit": "nJy", 

1865 }, 

1866 ), 

1867 # One of these fields will have no description. 

1868 pa.field( 

1869 "string", 

1870 pa.string(), 

1871 nullable=False, 

1872 metadata={ 

1873 "unit": "", 

1874 }, 

1875 ), 

1876 # One of these fields will have no metadata. 

1877 pa.field( 

1878 "binary", 

1879 pa.binary(), 

1880 nullable=False, 

1881 ), 

1882 ] 

1883 ) 

1884 

1885 return schema 

1886 

1887 def testArrowSchema(self): 

1888 schema1 = self._makeTestSchema() 

1889 self.butler.put(schema1, self.datasetType, dataId={}) 

1890 

1891 schema2 = self.butler.get(self.datasetType, dataId={}) 

1892 self.assertEqual(schema2, schema1) 

1893 

1894 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe schema without pandas.") 

1895 def testWriteArrowSchemaReadAsDataFrameSchema(self): 

1896 schema1 = self._makeTestSchema() 

1897 self.butler.put(schema1, self.datasetType, dataId={}) 

1898 

1899 df_schema1 = DataFrameSchema.from_arrow(schema1) 

1900 

1901 df_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrameSchema") 

1902 self.assertEqual(df_schema2, df_schema1) 

1903 

1904 @unittest.skipUnless(atable is not None, "Cannot test reading as an astropy schema without astropy.") 

1905 def testWriteArrowSchemaReadAsArrowAstropySchema(self): 

1906 schema1 = self._makeTestSchema() 

1907 self.butler.put(schema1, self.datasetType, dataId={}) 

1908 

1909 ap_schema1 = ArrowAstropySchema.from_arrow(schema1) 

1910 

1911 ap_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropySchema") 

1912 self.assertEqual(ap_schema2, ap_schema1) 

1913 

1914 # Confirm that the ap_schema2 has the unit/description we expect. 

1915 for name in schema1.names: 

1916 field_metadata = schema1.field(name).metadata 

1917 if field_metadata is None: 

1918 continue 

1919 if ( 

1920 b"description" in field_metadata 

1921 and (description := field_metadata[b"description"].decode("UTF-8")) != "" 

1922 ): 

1923 self.assertEqual(ap_schema2.schema[name].description, description) 

1924 else: 

1925 self.assertIsNone(ap_schema2.schema[name].description) 

1926 if b"unit" in field_metadata and (unit := field_metadata[b"unit"].decode("UTF-8")) != "": 

1927 self.assertEqual(ap_schema2.schema[name].unit, units.Unit(unit)) 

1928 

1929 @unittest.skipUnless(np is not None, "Cannot test reading as a numpy schema without numpy.") 

1930 def testWriteArrowSchemaReadAsArrowNumpySchema(self): 

1931 schema1 = self._makeTestSchema() 

1932 self.butler.put(schema1, self.datasetType, dataId={}) 

1933 

1934 np_schema1 = ArrowNumpySchema.from_arrow(schema1) 

1935 

1936 np_schema2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpySchema") 

1937 self.assertEqual(np_schema2, np_schema1) 

1938 

1939 

1940@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowSchemaDelegate without pyarrow.") 

1941class InMemoryArrowSchemaDelegateTestCase(ParquetFormatterArrowSchemaTestCase): 

1942 """Tests for InMemoryDatastore and ArrowSchema.""" 

1943 

1944 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1945 

1946 

1947@unittest.skipUnless(np is not None, "Cannot test compute_row_group_size without numpy.") 

1948@unittest.skipUnless(pa is not None, "Cannot test compute_row_group_size without pyarrow.") 

1949class ComputeRowGroupSizeTestCase(unittest.TestCase): 

1950 """Tests for compute_row_group_size.""" 

1951 

1952 def testRowGroupSizeNoMetadata(self): 

1953 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1954 

1955 # We can't use the numpy_to_arrow convenience function because 

1956 # that adds metadata. 

1957 type_list = _numpy_dtype_to_arrow_types(numpyTable.dtype) 

1958 schema = pa.schema(type_list) 

1959 arrays = _numpy_style_arrays_to_arrow_arrays( 

1960 numpyTable.dtype, 

1961 len(numpyTable), 

1962 numpyTable, 

1963 schema, 

1964 ) 

1965 arrowTable = pa.Table.from_arrays(arrays, schema=schema) 

1966 

1967 row_group_size = compute_row_group_size(arrowTable.schema) 

1968 

1969 self.assertGreater(row_group_size, 1_000_000) 

1970 self.assertLess(row_group_size, 2_000_000) 
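# For reference, the convenience function does attach schema metadata
# -- that is the difference this test isolates (a sketch using the
# helpers already imported above):
assert pa.schema(_numpy_dtype_to_arrow_types(numpyTable.dtype)).metadata is None
assert numpy_to_arrow(numpyTable).schema.metadata  # non-empty dict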

1971 

1972 def testRowGroupSizeWithMetadata(self): 

1973 numpyTable = _makeSimpleNumpyTable(include_multidim=True) 

1974 

1975 arrowTable = numpy_to_arrow(numpyTable) 

1976 

1977 row_group_size = compute_row_group_size(arrowTable.schema) 

1978 

1979 self.assertGreater(row_group_size, 1_000_000) 

1980 self.assertLess(row_group_size, 2_000_000) 

1981 

1982 def testRowGroupSizeTinyTable(self): 

1983 numpyTable = np.zeros(1, dtype=[("a", np.bool_)]) 

1984 

1985 arrowTable = numpy_to_arrow(numpyTable) 

1986 

1987 row_group_size = compute_row_group_size(arrowTable.schema) 

1988 

1989 self.assertGreater(row_group_size, 1_000_000) 
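# (Read together, these bounds suggest the function works from a fixed
# per-row-group byte budget, so a one-byte row yields a very large row
# count -- an inference from the assertions, not the implementation.)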

1990 

1991 @unittest.skipUnless(pd is not None, "Cannot run testRowGroupSizeDataFrameWithLists without pandas.") 

1992 def testRowGroupSizeDataFrameWithLists(self): 

1993 df = pd.DataFrame({"a": np.zeros(10), "b": [[0, 0]] * 10, "c": [[0.0, 0.0]] * 10, "d": [[]] * 10}) 

1994 arrowTable = pandas_to_arrow(df) 

1995 row_group_size = compute_row_group_size(arrowTable.schema) 

1996 

1997 self.assertGreater(row_group_size, 1_000_000) 

1998 

1999 

2000if __name__ == "__main__": 

2001 unittest.main()