# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    pd = None

from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _astropy_to_numpy_dict,
    _numpy_dict_to_numpy,
    _numpy_dtype_to_arrow_types,
    _numpy_to_numpy_dict,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data
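

# A sketch of typical use (illustrative, not executed at import time; assumes
# numpy is importable). With include_multidim=True the structured array gains
# columns with per-row shapes (5,), (5, 10), and (5, 10):
#
#     data = _makeSimpleNumpyTable(include_multidim=True)
#     data["d2"].shape  # -> (5, 5, 10)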


def _makeSingleIndexDataFrame(include_masked=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df
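

# For reference (an illustrative sketch, not executed): the frame built above
# has two-level column labels, so the column-subset tests below select with
# (filter, column) tuples:
#
#     df = _makeMultiIndexDataFrame()
#     sub = df.loc[:, [("r", "a")]]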


def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)


@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
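        # For example (an illustrative sketch, not asserted here): a nullable
        # integer column such as pd.array([1, None], dtype=pd.Int64Dtype())
        # keeps an integer dtype, a plain float column represents missing
        # values as NaN, and a masked string column typically comes back as
        # dtype "object" with None entries.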

        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable; reading it this way is
        # definitely not recommended.
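        # (Presumably the MultiIndex labels get flattened into plain string
        # column names on conversion, one per (filter, column) pair.)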

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable; reading it this way is
        # definitely not recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck: it doesn't really round-trip.
        # This test simply checks that it's readable; reading it this way is
        # definitely not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        delegate = DataFrameDelegate("DataFrame")

        # Read the whole DataFrame.
        df2 = delegate.handleParameters(inMemoryDataset=df1)
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = delegate.getComponent(composite=df1, componentName="columns")
        self.assertTrue(df1.columns.equals(columns2))

        # Read just some columns a few different ways.
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(inMemoryDataset=df1, parameters={"columns": {"filter": "g"}})
        self.assertIn("only supports string column names", str(cm.exception))
        with self.assertRaises(NotImplementedError) as cm:
            delegate.handleParameters(
                inMemoryDataset=df1, parameters={"columns": {"filter": ["r"], "column": "a"}}
            )
        self.assertIn("only supports string column names", str(cm.exception))

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow round trip works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip the comparison of units, descriptions, and formats.
        has_bigendian : `bool`
            Expect that one table contains big-endian columns, so only the
            column types are compared.
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, forcing both dtypes to the
                # same byte order before comparing.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            Expect that one table contains big-endian columns, so only the
            column types are compared.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, forcing both dtypes to the
                # same byte order before comparing.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
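        # (Presumably because pandas stores strings in object-dtype columns:
        # an empty object column has no values to infer an element type from,
        # so the round trip cannot recover whether the column held strings
        # or bytes.)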

        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    def testEmptyArrowTableMultidim(self):
        data = _makeSimpleNumpyTable(include_multidim=True)
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        md = {}
        for name in data.dtype.names:
            _append_numpy_multidim_metadata(md, name, data.dtype[name])

        schema = pa.schema(type_list, metadata=md)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        tab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(tab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.reset_index().columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2.to_list()), set(columns.to_list()))

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteArrowTableReadAsMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        # Read back out as a dataframe.
        df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")
        self.assertTrue(df1.equals(df2))

        # Read back out as an arrow table, convert to dataframe.
        atab3 = self.butler.get(self.datasetType, dataId={})
        df3 = arrow_to_pandas(atab3)
        self.assertTrue(df1.equals(df3))

        # Check reading the columns.
        columns = df2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns2.equals(columns))
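        # (Illustrative note: unlike the single-index case above, the
        # MultiIndex column structure round-trips intact, so a full
        # Index.equals() comparison, order included, is safe here.)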

        # Check reading the schema.
        schema = DataFrameSchema(df1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteArrowTableReadAsAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as an astropy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        self._checkAstropyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to astropy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_astropy(atab3)
        self._checkAstropyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab1)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        # Read back out as a numpy table.
        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        self._checkNumpyTableEquality(tab1, tab2)

        # Read back out as an arrow table, convert to numpy table.
        atab3 = self.butler.get(self.datasetType, dataId={})
        tab3 = arrow_to_numpy(atab3)
        self._checkNumpyTableEquality(tab1, tab3)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab1.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteArrowTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)
        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkAstropyTableEquality(self, table1, table2):
        """Check that two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        """
        self.assertEqual(table1.dtype, table2.dtype)
        for name in table1.columns:
            self.assertEqual(table1[name].unit, table2[name].unit)
            self.assertEqual(table1[name].description, table2[name].description)
            self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))

    def _checkNumpyTableEquality(self, table1, table2):
        """Check that two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            self.assertEqual(table1.dtype[name], table2.dtype[name])
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.")
class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase):
    """Tests for InMemoryDatastore, using ArrowTableDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleArrowTable()
        delegate = ArrowTableDelegate("ArrowTable")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_arrow_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})
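        # (Illustrative note: a tuple column key is the kind of thing a
        # pandas MultiIndex would produce; a plain arrow table has no
        # multi-level columns, hence NotImplementedError rather than a
        # simple ValueError.)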

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleArrowTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowTable")
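        # (Illustrative note: clearing the private _pytype cache
        # presumably forces the name comparison to proceed without a
        # stored python type, so both the compare_types=False and
        # compare_types=True lookup paths are exercised above.)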


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.")
class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpyDict, using local file store."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in the dataset type, so we don't have to worry
        # about inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)
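        # (Illustrative aside, not part of the original setup: a
        # dimensioned dataset type would instead be declared with
        # something like dimensions=("instrument", "visit"), and every
        # put/get would then need a matching dataId such as
        # {"instrument": "X", "visit": 42}, with the corresponding
        # dimension records registered beforehand.)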

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})
        # Read the whole table.
        dict2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyDictEquality(dict1, dict2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(dict1.keys()))
        for name in dict1.keys():
            self.assertIn(name, columns2)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(dict1["a"]))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        subdict = {key: dict1[key] for key in ["a", "c"]}
        self._checkNumpyDictEquality(subdict, tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        subdict = {key: dict1[key] for key in ["index", "a"]}
        self._checkNumpyDictEquality(subdict, tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        subdict = {key: dict1[key] for key in ["ddd"]}
        self._checkNumpyDictEquality(subdict, tab6)
        # A duplicated column request should be de-duplicated.
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        subdict = {key: dict1[key] for key in ["a"]}
        self._checkNumpyDictEquality(subdict, tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyDictReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_dict = arrow_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyDictReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # The order of the dict may get mixed up, so we need to check column
        # by column. We also need to do this in dataframe form because pandas
        # changes the datatype of the string column.
        tab1_df = pd.DataFrame(tab1)

        self.assertEqual(set(tab1_df.columns), set(tab2.columns))
        for col in tab1_df.columns:
            self.assertTrue(np.all(tab1_df[col].values == tab2[col].values))
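        # (Illustrative note: in dataframe form the fixed-width "U10"
        # string column typically surfaces as a generic object dtype, so
        # a dtype-level comparison would fail even though the values
        # compared element-wise above are identical.)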

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyDictReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_dict = _astropy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictReadAsNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        dict1 = _numpy_to_numpy_dict(tab1)

        self.butler.put(dict1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")
        tab2_dict = _numpy_to_numpy_dict(tab2)

        self._checkNumpyDictEquality(dict1, tab2_dict)

    def testWriteNumpyDictBad(self):
        # A scalar value instead of an array should be rejected.
        dict1 = {"a": 4, "b": np.ndarray([1])}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict1, self.datasetType, dataId={})

        # Mismatched array lengths should be rejected.
        dict2 = {"a": np.zeros(4), "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict2, self.datasetType, dataId={})

        # A plain list instead of a numpy array should be rejected.
        dict3 = {"a": [0] * 5, "b": np.zeros(5)}
        with self.assertRaises(RuntimeError):
            self.butler.put(dict3, self.datasetType, dataId={})

    def _checkNumpyDictEquality(self, dict1, dict2):
        """Check that two numpy dicts have the same columns/values.

        Parameters
        ----------
        dict1 : `dict` [`str`, `np.ndarray`]
            First dict to compare.
        dict2 : `dict` [`str`, `np.ndarray`]
            Second dict to compare.
        """
        self.assertEqual(set(dict1.keys()), set(dict2.keys()))
        for name in dict1.keys():
            self.assertEqual(dict1[name].dtype, dict2[name].dtype)
            self.assertTrue(np.all(dict1[name] == dict2[name]))


@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.")
class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteNumpyDictBad(self):
        # The sub-type checking is not done on the in-memory datastore,
        # so skip the parent class's bad-input test.
        pass


if __name__ == "__main__":
    unittest.main()