# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests for ParquetFormatter.

Tests in this module are disabled unless pandas and pyarrow are importable.
"""

import os
import unittest

try:
    import pyarrow as pa
except ImportError:
    pa = None
try:
    import astropy.table as atable
    from astropy import units
except ImportError:
    atable = None
try:
    import numpy as np
except ImportError:
    np = None
try:
    import pandas as pd
except ImportError:
    # Must clear pd (not np) here, so that a missing pandas does not
    # clobber the numpy import above.
    pd = None
from lsst.daf.butler import (
    Butler,
    Config,
    DatasetRef,
    DatasetType,
    FileDataset,
    StorageClassConfig,
    StorageClassFactory,
)
from lsst.daf.butler.delegates.arrowastropy import ArrowAstropyDelegate
from lsst.daf.butler.delegates.arrownumpy import ArrowNumpyDelegate
from lsst.daf.butler.delegates.arrowtable import ArrowTableDelegate
from lsst.daf.butler.delegates.dataframe import DataFrameDelegate
from lsst.daf.butler.formatters.parquet import (
    ArrowAstropySchema,
    ArrowNumpySchema,
    DataFrameSchema,
    ParquetFormatter,
    _append_numpy_multidim_metadata,
    _astropy_to_numpy_dict,
    _numpy_dict_to_numpy,
    _numpy_dtype_to_arrow_types,
    _numpy_to_numpy_dict,
    arrow_to_astropy,
    arrow_to_numpy,
    arrow_to_numpy_dict,
    arrow_to_pandas,
    astropy_to_arrow,
    numpy_dict_to_arrow,
    numpy_to_arrow,
    pandas_to_arrow,
)
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir

TESTDIR = os.path.abspath(os.path.dirname(__file__))


def _makeSimpleNumpyTable(include_multidim=False, include_bigendian=False):
    """Make a simple numpy table with random data.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    numpyTable : `numpy.ndarray`
        The test table.
    """
    nrow = 5

    dtype = [
        ("index", "i4"),
        ("a", "f8"),
        ("b", "f8"),
        ("c", "f8"),
        ("ddd", "f8"),
        ("f", "i8"),
        ("strcol", "U10"),
        ("bytecol", "a10"),
    ]

    if include_multidim:
        dtype.extend(
            [
                ("d1", "f4", (5,)),
                ("d2", "i8", (5, 10)),
                ("d3", "f8", (5, 10)),
            ]
        )

    if include_bigendian:
        dtype.extend([("a_bigendian", ">f8"), ("f_bigendian", ">i8")])

    data = np.zeros(nrow, dtype=dtype)
    data["index"][:] = np.arange(nrow)
    data["a"] = np.random.randn(nrow)
    data["b"] = np.random.randn(nrow)
    data["c"] = np.random.randn(nrow)
    data["ddd"] = np.random.randn(nrow)
    data["f"] = np.arange(nrow) * 10
    data["strcol"][:] = "teststring"
    data["bytecol"][:] = "teststring"

    if include_multidim:
        data["d1"] = np.random.randn(data["d1"].size).reshape(data["d1"].shape)
        data["d2"] = np.arange(data["d2"].size).reshape(data["d2"].shape)
        data["d3"] = np.asfortranarray(np.random.randn(data["d3"].size).reshape(data["d3"].shape))

    if include_bigendian:
        data["a_bigendian"][:] = data["a"]
        data["f_bigendian"][:] = data["f"]

    return data


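# A minimal round-trip sketch (not itself a test) of how the helper above
# feeds the conversion functions imported from
# lsst.daf.butler.formatters.parquet:
#
#     data = _makeSimpleNumpyTable(include_multidim=True)
#     arrow_tab = numpy_to_arrow(data)
#     roundtripped = arrow_to_numpy(arrow_tab)
#
# The test cases below exercise the same conversions through butler.put/get.

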
def _makeSingleIndexDataFrame(include_masked=False):
    """Make a single index data frame for testing.

    Parameters
    ----------
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    allColumns : `list` [`str`]
        List of all the columns (including index columns).
    """
    data = _makeSimpleNumpyTable()
    df = pd.DataFrame(data)
    df = df.set_index("index")

    if include_masked:
        nrow = len(df)

        df["m1"] = pd.array(np.arange(nrow), dtype=pd.Int64Dtype())
        df["m2"] = pd.array(np.arange(nrow), dtype=np.float32)
        df["mstrcol"] = pd.array(np.array(["text"] * nrow))
        df.loc[1, ["m1", "m2", "mstrcol"]] = None

    allColumns = df.columns.append(pd.Index(df.index.names))

    return df, allColumns


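# Note on the masked columns above: "m1" uses the pandas nullable Int64
# extension dtype (missing values become pd.NA), "m2" is a float32 column
# (missing values become NaN), and "mstrcol" is an inferred string/object
# array (missing values become None/NA), so the round-trip tests cover
# several distinct missing-value representations.

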
def _makeMultiIndexDataFrame():
    """Make a multi-index data frame for testing.

    Returns
    -------
    dataFrame : `~pandas.DataFrame`
        The test dataframe.
    """
    columns = pd.MultiIndex.from_tuples(
        [
            ("g", "a"),
            ("g", "b"),
            ("g", "c"),
            ("r", "a"),
            ("r", "b"),
            ("r", "c"),
        ],
        names=["filter", "column"],
    )
    df = pd.DataFrame(np.random.randn(5, 6), index=np.arange(5, dtype=int), columns=columns)

    return df


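# For reference, the frame built above has two column levels named "filter"
# and "column", so the tests can select columns either by full tuple or by
# level, e.g.:
#
#     df.loc[:, [("r", "a")]]  # one column, by full tuple
#     df.loc[:, ["g"]]         # all columns in the "g" filter level
#

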
def _makeSimpleAstropyTable(include_multidim=False, include_masked=False, include_bigendian=False):
    """Make an astropy table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.
    include_bigendian : `bool`
        Include big-endian columns.

    Returns
    -------
    astropyTable : `astropy.table.Table`
        The test table.
    """
    data = _makeSimpleNumpyTable(include_multidim=include_multidim, include_bigendian=include_bigendian)
    # Add a couple of units.
    table = atable.Table(data)
    table["a"].unit = units.degree
    table["b"].unit = units.meter

    # Add some masked columns.
    if include_masked:
        nrow = len(table)
        mask = np.zeros(nrow, dtype=bool)
        mask[1] = True
        table["m1"] = np.ma.masked_array(data=np.arange(nrow, dtype="i8"), mask=mask)
        table["m2"] = np.ma.masked_array(data=np.arange(nrow, dtype="f4"), mask=mask)
        table["mstrcol"] = np.ma.masked_array(data=np.array(["text"] * nrow), mask=mask)
        table["mbytecol"] = np.ma.masked_array(data=np.array([b"bytes"] * nrow), mask=mask)

    return table


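# The units attached to "a" and "b" let the tests verify that unit metadata
# survives the arrow round trip; a sketch of what is being checked:
#
#     tab = _makeSimpleAstropyTable()
#     tab2 = arrow_to_astropy(astropy_to_arrow(tab))
#     assert tab2["a"].unit == units.degree
#

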
def _makeSimpleArrowTable(include_multidim=False, include_masked=False):
    """Make an arrow table for testing.

    Parameters
    ----------
    include_multidim : `bool`
        Include multi-dimensional columns.
    include_masked : `bool`
        Include masked columns.

    Returns
    -------
    arrowTable : `pyarrow.Table`
        The test table.
    """
    data = _makeSimpleAstropyTable(include_multidim=include_multidim, include_masked=include_masked)
    return astropy_to_arrow(data)


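# All of the test classes below follow the same basic pattern (a sketch, with
# a hypothetical repository root):
#
#     butler = Butler(Butler.makeRepo("/some/root"), writeable=True, run="test_run")
#     butler.put(table, datasetType, dataId={})
#     table2 = butler.get(datasetType, dataId={})
#
# plus reads of the "columns", "rowcount", and "schema" components, and
# cross-format reads via butler.get(..., storageClass=...).

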
@unittest.skipUnless(pd is not None, "Cannot test ParquetFormatterDataFrame without pandas.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterDataFrame without pyarrow.")
class ParquetFormatterDataFrameTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, DataFrame, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="DataFrame", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testSingleIndexDataFrame(self):
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(allColumns.equals(columns2))
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertTrue(df1.loc[:, ["a", "c"]].equals(df3))
        df4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertTrue(df1.loc[:, ["a"]].equals(df4))
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df5))
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertTrue(df1.loc[:, ["ddd"]].equals(df6))
        df7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertTrue(df1.loc[:, ["a"]].equals(df7))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testMultiIndexDataFrame(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))
        # Read just the column descriptions.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertTrue(df1.columns.equals(columns2))
        self.assertEqual(columns2.names, df1.columns.names)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(df1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, DataFrameSchema(df1))
        # Read just some columns a few different ways.
        df3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": {"filter": "g"}})
        self.assertTrue(df1.loc[:, ["g"]].equals(df3))
        df4 = self.butler.get(
            self.datasetType, dataId={}, parameters={"columns": {"filter": ["r"], "column": "a"}}
        )
        self.assertTrue(df1.loc[:, [("r", "a")]].equals(df4))
        column_list = [("g", "a"), ("r", "c")]
        df5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_list})
        self.assertTrue(df1.loc[:, column_list].equals(df5))
        column_dict = {"filter": "r", "column": ["a", "b"]}
        df6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": column_dict})
        self.assertTrue(df1.loc[:, [("r", "a"), ("r", "b")]].equals(df6))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["d"]})

    def testSingleIndexDataFrameEmptyString(self):
        """Test persisting a single index dataframe with empty strings."""
        df1, _ = _makeSingleIndexDataFrame()

        # Set one of the strings to None
        df1.at[1, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testSingleIndexDataFrameAllEmptyStrings(self):
        """Test persisting a single index dataframe with an empty string
        column.
        """
        df1, _ = _makeSingleIndexDataFrame()

        # Set all of the strings to None
        df1.loc[0:, "strcol"] = None

        self.butler.put(df1, self.datasetType, dataId={})
        # Read the whole DataFrame.
        df2 = self.butler.get(self.datasetType, dataId={})
        self.assertTrue(df1.equals(df2))

    def testLegacyDataFrame(self):
        """Test writing a dataframe to parquet via pandas (without additional
        metadata) and ensure that we can read it back with all the new
        functionality.
        """
        df1, allColumns = _makeSingleIndexDataFrame()

        fname = os.path.join(self.root, "test_dataframe.parq")
        df1.to_parquet(fname)

        legacy_type = DatasetType(
            "legacy_dataframe",
            dimensions=(),
            storageClass="DataFrame",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(legacy_type)

        data_id = {}
        ref = DatasetRef(legacy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(df1, self.datasetType, dataId={})

        df2a = self.butler.get(self.datasetType, dataId={})
        df2b = self.butler.get("legacy_dataframe", dataId={})
        self.assertTrue(df2a.equals(df2b))

        df3a = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a"]})
        df3b = self.butler.get("legacy_dataframe", dataId={}, parameters={"columns": ["a"]})
        self.assertTrue(df3a.equals(df3b))

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("legacy_dataframe.columns", dataId={})
        self.assertTrue(columns2a.equals(columns2b))

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("legacy_dataframe.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("legacy_dataframe.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    def testDataFrameSchema(self):
        tab1 = _makeSimpleArrowTable()

        schema = DataFrameSchema.from_arrow(tab1.schema)

        self.assertIsInstance(schema.schema, pd.DataFrame)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        tab2 = _makeMultiIndexDataFrame()
        schema2 = DataFrameSchema(tab2)

        self.assertNotEqual(schema, schema2)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameReadAsAstropyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        tab2_df = tab2.to_pandas(index="index")
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        # The string types are objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema2.schema.columns), len(schema.schema.columns))
        for name in schema.schema.columns:
            self.assertIn(name, schema2.schema.columns)
            if schema2.schema[name].dtype != np.dtype("O"):
                self.assertEqual(schema2.schema[name].dtype, schema.schema[name].dtype)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteSingleIndexDataFrameWithMaskedColsReadAsAstropyTable(self):
        # We need to special-case the write-as-pandas read-as-astropy code
        # with masks because pandas has multiple ways to use masked columns.
        # (The string column mask handling in particular is frustratingly
        # inconsistent.)
        df1, allColumns = _makeSingleIndexDataFrame(include_masked=True)

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_df = tab2.to_pandas(index="index")

        self.assertTrue(df1.columns.equals(tab2_df.columns))
        for name in tab2_df.columns:
            col1 = df1[name]
            col2 = tab2_df[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

        # This is an odd duck that doesn't really round-trip.
        # This test simply checks that the data is readable; using it this
        # way is definitely not recommended.

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteSingleIndexDataFrameReadAsArrowTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.schema.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns), set(columns2))

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        # These will not have the same metadata, nor will the string column
        # information be maintained.
        self.assertEqual(len(schema.names), len(schema2.names))
        for name in schema.names:
            if schema.field(name).type not in (pa.string(), pa.binary()):
                self.assertEqual(schema.field(name).type, schema2.field(name).type)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteMultiIndexDataFrameReadAsArrowTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_df = arrow_to_pandas(tab2)
        self.assertTrue(df1.equals(tab2_df))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyTable(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        self.assertTrue(df1.equals(tab2_df))

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        # We check the set because pandas reorders the columns.
        self.assertEqual(set(columns2), set(columns))

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        # The string types will be objectified by pandas, and the order
        # will be changed because of pandas indexing.
        self.assertEqual(len(schema.schema.names), len(schema2.schema.names))
        for name in schema.schema.names:
            self.assertIn(name, schema2.schema.names)
            self.assertEqual(schema2.schema[name].type, schema.schema[name].type)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is an odd duck that doesn't really round-trip.
        # This test simply checks that the data is readable; using it this
        # way is definitely not recommended.

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteSingleIndexDataFrameReadAsNumpyDict(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        tab2_df = pd.DataFrame.from_records(tab2, index=["index"])
        # The column order is not maintained.
        self.assertEqual(set(df1.columns), set(tab2_df.columns))
        for col in df1.columns:
            self.assertTrue(np.all(df1[col].values == tab2_df[col].values))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy dict without numpy.")
    def testWriteMultiIndexDataFrameReadAsNumpyDict(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is an odd duck that doesn't really round-trip.
        # This test simply checks that the data is readable; using it this
        # way is definitely not recommended.


@unittest.skipUnless(pd is not None, "Cannot test InMemoryDataFrameDelegate without pandas.")
class InMemoryDataFrameDelegateTestCase(ParquetFormatterDataFrameTestCase):
    """Tests for InMemoryDatastore, using DataFrameDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testWriteMultiIndexDataFrameReadAsAstropyTable(self):
        df1 = _makeMultiIndexDataFrame()

        self.butler.put(df1, self.datasetType, dataId={})

        with self.assertRaises(ValueError):
            _ = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")

    def testLegacyDataFrame(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        df1, _ = _makeSingleIndexDataFrame()
        delegate = DataFrameDelegate("DataFrame")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_dataframe")

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=df1, componentName="nothing")

    def testStorageClass(self):
        df1, allColumns = _makeSingleIndexDataFrame()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(df1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")

        storageClass = factory.findStorageClass(type(df1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "DataFrame")


@unittest.skipUnless(atable is not None, "Cannot test ParquetFormatterArrowAstropy without astropy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowAstropy without pyarrow.")
class ParquetFormatterArrowAstropyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowAstropy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowAstropy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testAstropyTable(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowAstropySchema(tab1))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkAstropyTableEquality(tab1[("a", "c")], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkAstropyTableEquality(tab1[("a",)], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkAstropyTableEquality(tab1[("index", "a")], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkAstropyTableEquality(tab1[("ddd",)], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkAstropyTableEquality(tab1[("a",)], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testAstropyTableBigEndian(self):
        tab1 = _makeSimpleAstropyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkAstropyTableEquality(tab1, tab2, has_bigendian=True)

    def testAstropyTableWithMetadata(self):
        tab1 = _makeSimpleAstropyTable(include_multidim=True)

        meta = {
            "meta_a": 5,
            "meta_b": 10.0,
            "meta_c": [1, 2, 3],
            "meta_d": True,
            "meta_e": "string",
        }

        tab1.meta.update(meta)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        # This will check that the metadata is equivalent as well.
        self._checkAstropyTableEquality(tab1, tab2)

    def testArrowAstropySchema(self):
        tab1 = _makeSimpleAstropyTable()
        tab1_arrow = astropy_to_arrow(tab1)
        schema = ArrowAstropySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, atable.Table)
        self.assertEqual(repr(schema), repr(schema._schema))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test various inequalities
        tab2 = tab1.copy()
        tab2.rename_column("index", "index2")
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].unit = units.micron
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].description = "Index column"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

        tab2 = tab1.copy()
        tab2["index"].format = "%05d"
        schema2 = ArrowAstropySchema(tab2)
        self.assertNotEqual(schema2, schema)

    def testAstropyParquet(self):
        tab1 = _makeSimpleAstropyTable()

        fname = os.path.join(self.root, "test_astropy.parq")
        tab1.write(fname)

        astropy_type = DatasetType(
            "astropy_parquet",
            dimensions=(),
            storageClass="ArrowAstropy",
            universe=self.butler.registry.dimensions,
        )
        self.butler.registry.registerDatasetType(astropy_type)

        data_id = {}
        ref = DatasetRef(astropy_type, data_id, id=None)
        dataset = FileDataset(path=fname, refs=[ref], formatter=ParquetFormatter)

        self.butler.ingest(dataset, transfer="copy")

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2a = self.butler.get(self.datasetType, dataId={})
        tab2b = self.butler.get("astropy_parquet", dataId={})
        self._checkAstropyTableEquality(tab2a, tab2b)

        columns2a = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        columns2b = self.butler.get("astropy_parquet.columns", dataId={})
        self.assertEqual(len(columns2b), len(columns2a))
        for i, name in enumerate(columns2a):
            self.assertEqual(columns2b[i], name)

        rowcount2a = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        rowcount2b = self.butler.get("astropy_parquet.rowcount", dataId={})
        self.assertEqual(rowcount2a, rowcount2b)

        schema2a = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        schema2b = self.butler.get("astropy_parquet.schema", dataId={})
        self.assertEqual(schema2a, schema2b)

    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteAstropyReadAsArrowTable(self):
        # The astropy <-> arrow round trip works fine with masked columns.
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_astropy = arrow_to_astropy(tab2)
        self._checkAstropyTableEquality(tab1, tab2_astropy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )

        self.assertEqual(schema, schema2)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyReadAsDataFrame(self):
        tab1 = _makeSimpleAstropyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # This is tricky because it loses the units and gains a bonus pandas
        # _index_ column, so we just test the dataframe form.

        tab1_df = tab1.to_pandas()
        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteAstropyWithMaskedColsReadAsDataFrame(self):
        # We need to special-case the write-as-astropy read-as-pandas code
        # with masks because pandas has multiple ways to use masked columns.
        # (When writing an astropy table with masked columns we get an object
        # column back, but each unmasked element has the correct type.)
        tab1 = _makeSimpleAstropyTable(include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        tab1_df = tab1.to_pandas()

        self.assertTrue(tab1_df.columns.equals(tab2.columns))
        for name in tab2.columns:
            col1 = tab1_df[name]
            col2 = tab2[name]

            if col1.hasnans:
                notNull = col1.notnull()
                self.assertTrue(notNull.equals(col2.notnull()))
                # Need to check value-by-value because column may
                # be made of objects, depending on what pandas decides.
                for index in notNull.values.nonzero()[0]:
                    self.assertEqual(col1[index], col2[index])
            else:
                self.assertTrue(col1.equals(col2))

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyTable(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

        # Check reading the columns.
        columns = list(tab2.dtype.names)
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowNumpySchema(tab2.dtype)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.")
    def testWriteAstropyReadAsNumpyDict(self):
        tab1 = _makeSimpleAstropyTable()
        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")

        # This is tricky because it loses the units.
        tab2_astropy = atable.Table(tab2)

        self._checkAstropyTableEquality(tab1, tab2_astropy, skip_units=True)

    def _checkAstropyTableEquality(self, table1, table2, skip_units=False, has_bigendian=False):
        """Check if two astropy tables have the same columns/values.

        Parameters
        ----------
        table1 : `astropy.table.Table`
            First table to compare.
        table2 : `astropy.table.Table`
            Second table to compare.
        skip_units : `bool`
            Skip checking the units and other column metadata?
        has_bigendian : `bool`
            Does one of the tables contain big-endian columns?
        """
        if not has_bigendian:
            self.assertEqual(table1.dtype, table2.dtype)
        else:
            for name in table1.dtype.names:
                # Only check that the types match, after normalizing both
                # sides to the same (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))

        self.assertEqual(table1.meta, table2.meta)
        if not skip_units:
            for name in table1.columns:
                self.assertEqual(table1[name].unit, table2[name].unit)
                self.assertEqual(table1[name].description, table2[name].description)
                self.assertEqual(table1[name].format, table2[name].format)
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(atable is not None, "Cannot test InMemoryArrowAstropyDelegate without astropy.")
class InMemoryArrowAstropyDelegateTestCase(ParquetFormatterArrowAstropyTestCase):
    """Tests for InMemoryDatastore, using ArrowAstropyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testAstropyParquet(self):
        # This test does not work with an inMemoryDatastore.
        pass

    def testBadInput(self):
        tab1 = _makeSimpleAstropyTable()
        delegate = ArrowAstropyDelegate("ArrowAstropy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_an_astropy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")


@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpy without numpy.")
@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpy without pyarrow.")
class ParquetFormatterArrowNumpyTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowNumpy, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowNumpy", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testNumpyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.dtype.names))
        for i, name in enumerate(tab1.dtype.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, ArrowNumpySchema(tab1.dtype))
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self._checkNumpyTableEquality(tab1[["a", "c"]], tab3)
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self._checkNumpyTableEquality(tab1[["a"]], tab4)
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self._checkNumpyTableEquality(tab1[["index", "a"]], tab5)
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self._checkNumpyTableEquality(tab1[["ddd"]], tab6)
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self._checkNumpyTableEquality(tab1[["a"]], tab7)
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testNumpyTableBigEndian(self):
        tab1 = _makeSimpleNumpyTable(include_bigendian=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self._checkNumpyTableEquality(tab1, tab2, has_bigendian=True)

    def testArrowNumpySchema(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)
        tab1_arrow = numpy_to_arrow(tab1)
        schema = ArrowNumpySchema.from_arrow(tab1_arrow.schema)

        self.assertIsInstance(schema.schema, np.dtype)
        self.assertEqual(repr(schema), repr(schema._dtype))
        self.assertNotEqual(schema, "not_a_schema")
        self.assertEqual(schema, schema)

        # Test inequality
        tab2 = tab1.copy()
        names = list(tab2.dtype.names)
        names[0] = "index2"
        tab2.dtype.names = names
        schema2 = ArrowNumpySchema(tab2.dtype)
        self.assertNotEqual(schema2, schema)

    @unittest.skipUnless(pa is not None, "Cannot test arrow conversions without pyarrow.")
    def testNumpyDictConversions(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        # Verify that everything round-trips, including the schema.
        tab1_arrow = numpy_to_arrow(tab1)
        tab1_dict = arrow_to_numpy_dict(tab1_arrow)
        tab1_dict_arrow = numpy_dict_to_arrow(tab1_dict)

        self.assertEqual(tab1_arrow.schema, tab1_dict_arrow.schema)
        self.assertEqual(tab1_arrow, tab1_dict_arrow)

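    # The "numpy dict" representation used above is a plain mapping of
    # {column_name: numpy.ndarray}; the round trip asserted by the test is
    # equivalent to this sketch:
    #
    #     tab_dict = arrow_to_numpy_dict(numpy_to_arrow(tab1))
    #     assert numpy_dict_to_arrow(tab_dict) == numpy_to_arrow(tab1)
    #
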
    @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.")
    def testWriteNumpyTableReadAsArrowTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable")

        tab2_numpy = arrow_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = tab2.schema.names
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = tab2.schema
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowSchema"
        )
        self.assertEqual(schema2, schema)

    @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.")
    def testWriteNumpyTableReadAsDataFrame(self):
        tab1 = _makeSimpleNumpyTable()

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame")

        # Converting this back to numpy gets confused with the index column
        # and changes the datatype of the string column.

        tab1_df = pd.DataFrame(tab1)

        self.assertTrue(tab1_df.equals(tab2))

        # Check reading the columns.
        columns = tab2.columns
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex"
        )
        self.assertTrue(columns.equals(columns2))

        # Check reading the schema.
        schema = DataFrameSchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema"
        )

        self.assertEqual(schema2, schema)

    @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.")
    def testWriteNumpyTableReadAsAstropyTable(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy")
        tab2_numpy = tab2.as_array()

        self._checkNumpyTableEquality(tab1, tab2_numpy)

        # Check reading the columns.
        columns = list(tab2.columns.keys())
        columns2 = self.butler.get(
            self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList"
        )
        self.assertEqual(columns2, columns)

        # Check reading the schema.
        schema = ArrowAstropySchema(tab2)
        schema2 = self.butler.get(
            self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema"
        )

        self.assertEqual(schema2, schema)

    def testWriteNumpyTableReadAsNumpyDict(self):
        tab1 = _makeSimpleNumpyTable(include_multidim=True)

        self.butler.put(tab1, self.datasetType, dataId={})

        tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict")
        tab2_numpy = _numpy_dict_to_numpy(tab2)

        self._checkNumpyTableEquality(tab1, tab2_numpy)

    def _checkNumpyTableEquality(self, table1, table2, has_bigendian=False):
        """Check if two numpy tables have the same columns/values.

        Parameters
        ----------
        table1 : `numpy.ndarray`
            First table to compare.
        table2 : `numpy.ndarray`
            Second table to compare.
        has_bigendian : `bool`
            Does one of the tables contain big-endian columns?
        """
        self.assertEqual(table1.dtype.names, table2.dtype.names)
        for name in table1.dtype.names:
            if not has_bigendian:
                self.assertEqual(table1.dtype[name], table2.dtype[name])
            else:
                # Only check that the types match, after normalizing both
                # sides to the same (big-endian) byte order.
                self.assertEqual(table1.dtype[name].newbyteorder(">"), table2.dtype[name].newbyteorder(">"))
        self.assertTrue(np.all(table1 == table2))


@unittest.skipUnless(np is not None, "Cannot test InMemoryArrowNumpyDelegate without numpy.")
class InMemoryArrowNumpyDelegateTestCase(ParquetFormatterArrowNumpyTestCase):
    """Tests for InMemoryDatastore, using ArrowNumpyDelegate."""

    configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml")

    def testBadInput(self):
        tab1 = _makeSimpleNumpyTable()
        delegate = ArrowNumpyDelegate("ArrowNumpy")

        with self.assertRaises(ValueError):
            delegate.handleParameters(inMemoryDataset="not_a_numpy_table")

        with self.assertRaises(NotImplementedError):
            delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]})

        with self.assertRaises(AttributeError):
            delegate.getComponent(composite=tab1, componentName="nothing")

    def testStorageClass(self):
        tab1 = _makeSimpleNumpyTable()

        factory = StorageClassFactory()
        factory.addFromConfig(StorageClassConfig())

        storageClass = factory.findStorageClass(type(tab1), compare_types=False)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")

        storageClass = factory.findStorageClass(type(tab1), compare_types=True)
        # Force the name lookup to do name matching.
        storageClass._pytype = None
        self.assertEqual(storageClass.name, "ArrowNumpy")


@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowTable without pyarrow.")
class ParquetFormatterArrowTableTestCase(unittest.TestCase):
    """Tests for ParquetFormatter, ArrowTable, using local file datastore."""

    configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

    def setUp(self):
        """Create a new butler root for each test."""
        self.root = makeTestTempDir(TESTDIR)
        config = Config(self.configFile)
        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
        # No dimensions in dataset type so we don't have to worry about
        # inserting dimension data or defining data IDs.
        self.datasetType = DatasetType(
            "data", dimensions=(), storageClass="ArrowTable", universe=self.butler.registry.dimensions
        )
        self.butler.registry.registerDatasetType(self.datasetType)

    def tearDown(self):
        removeTestTempDir(self.root)

    def testArrowTable(self):
        tab1 = _makeSimpleArrowTable(include_multidim=True, include_masked=True)

        self.butler.put(tab1, self.datasetType, dataId={})
        # Read the whole Table.
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)
        # Read the columns.
        columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={})
        self.assertEqual(len(columns2), len(tab1.schema.names))
        for i, name in enumerate(tab1.schema.names):
            self.assertEqual(columns2[i], name)
        # Read the rowcount.
        rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={})
        self.assertEqual(rowcount, len(tab1))
        # Read the schema.
        schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={})
        self.assertEqual(schema, tab1.schema)
        # Read just some columns a few different ways.
        tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]})
        self.assertEqual(tab3, tab1.select(("a", "c")))
        tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"})
        self.assertEqual(tab4, tab1.select(("a",)))
        tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]})
        self.assertEqual(tab5, tab1.select(("index", "a")))
        tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"})
        self.assertEqual(tab6, tab1.select(("ddd",)))
        tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]})
        self.assertEqual(tab7, tab1.select(("a",)))
        # Passing an unrecognized column should be a ValueError.
        with self.assertRaises(ValueError):
            self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]})

    def testEmptyArrowTable(self):
        data = _makeSimpleNumpyTable()
        type_list = _numpy_dtype_to_arrow_types(data.dtype)

        schema = pa.schema(type_list)
        arrays = [[]] * len(schema.names)

        tab1 = pa.Table.from_arrays(arrays, schema=schema)

        self.butler.put(tab1, self.datasetType, dataId={})
        tab2 = self.butler.get(self.datasetType, dataId={})
        self.assertEqual(tab2, tab1)

        tab1_numpy = arrow_to_numpy(tab1)
        self.assertEqual(len(tab1_numpy), 0)
        tab1_numpy_arrow = numpy_to_arrow(tab1_numpy)
        self.assertEqual(tab1_numpy_arrow, tab1)

        tab1_pandas = arrow_to_pandas(tab1)
        self.assertEqual(len(tab1_pandas), 0)
        tab1_pandas_arrow = pandas_to_arrow(tab1_pandas)
        # Unfortunately, string/byte columns get mangled when translated
        # through empty pandas dataframes.
        self.assertEqual(
            tab1_pandas_arrow.select(("index", "a", "b", "c", "ddd")),
            tab1.select(("index", "a", "b", "c", "ddd")),
        )

        tab1_astropy = arrow_to_astropy(tab1)
        self.assertEqual(len(tab1_astropy), 0)
        tab1_astropy_arrow = astropy_to_arrow(tab1_astropy)
        self.assertEqual(tab1_astropy_arrow, tab1)

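    # Building an empty table directly from a schema, as above, follows the
    # standard pyarrow pattern; for a single column it looks like:
    #
    #     schema = pa.schema([("x", pa.float64())])
    #     empty = pa.Table.from_arrays([[]], schema=schema)
    #
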
1338 def testEmptyArrowTableMultidim(self): 

1339 data = _makeSimpleNumpyTable(include_multidim=True) 

1340 type_list = _numpy_dtype_to_arrow_types(data.dtype) 

1341 

1342 md = {} 

1343 for name in data.dtype.names: 

1344 _append_numpy_multidim_metadata(md, name, data.dtype[name]) 

1345 

1346 schema = pa.schema(type_list, metadata=md) 

1347 arrays = [[]] * len(schema.names) 

1348 

1349 tab1 = pa.Table.from_arrays(arrays, schema=schema) 

1350 

1351 self.butler.put(tab1, self.datasetType, dataId={}) 

1352 tab2 = self.butler.get(self.datasetType, dataId={}) 

1353 self.assertEqual(tab2, tab1) 

1354 

1355 tab1_numpy = arrow_to_numpy(tab1) 

1356 self.assertEqual(len(tab1_numpy), 0) 

1357 tab1_numpy_arrow = numpy_to_arrow(tab1_numpy) 

1358 self.assertEqual(tab1_numpy_arrow, tab1) 

1359 

1360 tab1_astropy = arrow_to_astropy(tab1) 

1361 self.assertEqual(len(tab1_astropy), 0) 

1362 tab1_astropy_arrow = astropy_to_arrow(tab1_astropy) 

1363 self.assertEqual(tab1_astropy_arrow, tab1) 

1364 

1365 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1366 def testWriteArrowTableReadAsSingleIndexDataFrame(self): 

1367 df1, allColumns = _makeSingleIndexDataFrame() 

1368 

1369 self.butler.put(df1, self.datasetType, dataId={}) 

1370 

1371 # Read back out as a dataframe. 

1372 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1373 self.assertTrue(df1.equals(df2)) 

1374 

1375 # Read back out as an arrow table, convert to dataframe. 

1376 tab3 = self.butler.get(self.datasetType, dataId={}) 

1377 df3 = arrow_to_pandas(tab3) 

1378 self.assertTrue(df1.equals(df3)) 

1379 

1380 # Check reading the columns. 

1381 columns = df2.reset_index().columns 

1382 columns2 = self.butler.get( 

1383 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1384 ) 

1385 # We check the set because pandas reorders the columns. 

1386 self.assertEqual(set(columns2.to_list()), set(columns.to_list())) 

1387 

1388 # Check reading the schema. 

1389 schema = DataFrameSchema(df1) 

1390 schema2 = self.butler.get( 

1391 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1392 ) 

1393 self.assertEqual(schema2, schema) 

1394 

1395 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1396 def testWriteArrowTableReadAsMultiIndexDataFrame(self): 

1397 df1 = _makeMultiIndexDataFrame() 

1398 

1399 self.butler.put(df1, self.datasetType, dataId={}) 

1400 

1401 # Read back out as a dataframe. 

1402 df2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1403 self.assertTrue(df1.equals(df2)) 

1404 

1405 # Read back out as an arrow table, convert to dataframe. 

1406 atab3 = self.butler.get(self.datasetType, dataId={}) 

1407 df3 = arrow_to_pandas(atab3) 

1408 self.assertTrue(df1.equals(df3)) 

1409 

1410 # Check reading the columns. 

1411 columns = df2.columns 

1412 columns2 = self.butler.get( 

1413 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="DataFrameIndex" 

1414 ) 

1415 self.assertTrue(columns2.equals(columns)) 

1416 

1417 # Check reading the schema. 

1418 schema = DataFrameSchema(df1) 

1419 schema2 = self.butler.get( 

1420 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="DataFrameSchema" 

1421 ) 

1422 self.assertEqual(schema2, schema) 

1423 

1424 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1425 def testWriteArrowTableReadAsAstropyTable(self): 

1426 tab1 = _makeSimpleAstropyTable(include_multidim=True, include_masked=True) 

1427 

1428 self.butler.put(tab1, self.datasetType, dataId={}) 

1429 

1430 # Read back out as an astropy table. 

1431 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1432 self._checkAstropyTableEquality(tab1, tab2) 

1433 

1434 # Read back out as an arrow table, convert to astropy table. 

1435 atab3 = self.butler.get(self.datasetType, dataId={}) 

1436 tab3 = arrow_to_astropy(atab3) 

1437 self._checkAstropyTableEquality(tab1, tab3) 

1438 

1439 # Check reading the columns. 

1440 columns = list(tab2.columns.keys()) 

1441 columns2 = self.butler.get( 

1442 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1443 ) 

1444 self.assertEqual(columns2, columns) 

1445 

1446 # Check reading the schema. 

1447 schema = ArrowAstropySchema(tab1) 

1448 schema2 = self.butler.get( 

1449 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowAstropySchema" 

1450 ) 

1451 self.assertEqual(schema2, schema) 

1452 
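    # Sketch matching the checks in _checkAstropyTableEquality below: column
    # units are expected to survive astropy_to_arrow/arrow_to_astropy. The
    # column name and unit here are illustrative.
    def _exampleAstropyUnitRoundTrip(self):
        t = atable.Table({"flux": np.array([1.0, 2.0])})
        t["flux"].unit = units.nJy
        t2 = arrow_to_astropy(astropy_to_arrow(t))
        self.assertEqual(t2["flux"].unit, t["flux"].unit)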

1453 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1454 def testWriteArrowTableReadAsNumpyTable(self): 

1455 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1456 

1457 self.butler.put(tab1, self.datasetType, dataId={}) 

1458 

1459 # Read back out as a numpy table. 

1460 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1461 self._checkNumpyTableEquality(tab1, tab2) 

1462 

1463 # Read back out as an arrow table, convert to numpy table. 

1464 atab3 = self.butler.get(self.datasetType, dataId={}) 

1465 tab3 = arrow_to_numpy(atab3) 

1466 self._checkNumpyTableEquality(tab1, tab3) 

1467 

1468 # Check reading the columns. 

1469 columns = list(tab2.dtype.names) 

1470 columns2 = self.butler.get( 

1471 self.datasetType.componentTypeName("columns"), dataId={}, storageClass="ArrowColumnList" 

1472 ) 

1473 self.assertEqual(columns2, columns) 

1474 

1475 # Check reading the schema. 

1476 schema = ArrowNumpySchema(tab1.dtype) 

1477 schema2 = self.butler.get( 

1478 self.datasetType.componentTypeName("schema"), dataId={}, storageClass="ArrowNumpySchema" 

1479 ) 

1480 self.assertEqual(schema2, schema) 

1481 
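    # Sketch (illustrative, simple dtype assumed): numpy_to_arrow and
    # arrow_to_numpy round-trip a structured array's field names, which is
    # what the column and schema component reads above depend on.
    def _exampleNumpyDtypeRoundTrip(self):
        arr = np.zeros(3, dtype=[("a", "f8"), ("b", "i4")])
        back = arrow_to_numpy(numpy_to_arrow(arr))
        self.assertEqual(back.dtype.names, arr.dtype.names)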

1482 @unittest.skipUnless(np is not None, "Cannot test reading as numpy without numpy.") 

1483 def testWriteArrowTableReadAsNumpyDict(self): 

1484 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1485 

1486 self.butler.put(tab1, self.datasetType, dataId={}) 

1487 

1488 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpyDict") 

1489 tab2_numpy = _numpy_dict_to_numpy(tab2) 

1490 self._checkNumpyTableEquality(tab1, tab2_numpy) 

1491 
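    # Sketch: the private helpers _numpy_to_numpy_dict and
    # _numpy_dict_to_numpy used above are treated as inverses for a simple
    # structured array (an assumption these tests lean on throughout).
    def _exampleDictConversionInverse(self):
        arr = np.zeros(3, dtype=[("a", "f8"), ("b", "i4")])
        back = _numpy_dict_to_numpy(_numpy_to_numpy_dict(arr))
        self._checkNumpyTableEquality(arr, back)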

1492 def _checkAstropyTableEquality(self, table1, table2): 

1493 """Check if two astropy tables have the same columns/values 

1494 

1495 Parameters 

1496 ---------- 

1497 table1 : `astropy.table.Table` 

1498 table2 : `astropy.table.Table` 

1499 """ 

1500 self.assertEqual(table1.dtype, table2.dtype) 

1501 for name in table1.columns: 

1502 self.assertEqual(table1[name].unit, table2[name].unit) 

1503 self.assertEqual(table1[name].description, table2[name].description) 

1504 self.assertEqual(table1[name].format, table2[name].format) 

1505 self.assertTrue(np.all(table1 == table2)) 

1506 

1507 def _checkNumpyTableEquality(self, table1, table2): 

1508 """Check if two numpy tables have the same columns/values 

1509 

1510 Parameters 

1511 ---------- 

1512 table1 : `numpy.ndarray` 

1513 table2 : `numpy.ndarray` 

1514 """ 

1515 self.assertEqual(table1.dtype.names, table2.dtype.names) 

1516 for name in table1.dtype.names: 

1517 self.assertEqual(table1.dtype[name], table2.dtype[name]) 

1518 self.assertTrue(np.all(table1 == table2)) 

1519 

1520 

1521@unittest.skipUnless(pa is not None, "Cannot test InMemoryArrowTableDelegate without pyarrow.") 

1522class InMemoryArrowTableDelegateTestCase(ParquetFormatterArrowTableTestCase): 

1523 """Tests for InMemoryDatastore, using ArrowTableDelegate.""" 

1524 

1525 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1526 

1527 def testBadInput(self): 

1528 tab1 = _makeSimpleArrowTable() 

1529 delegate = ArrowTableDelegate("ArrowTable") 

1530 

1531 with self.assertRaises(ValueError): 

1532 delegate.handleParameters(inMemoryDataset="not_an_arrow_table") 

1533 

1534 with self.assertRaises(NotImplementedError): 

1535 delegate.handleParameters(inMemoryDataset=tab1, parameters={"columns": [("a", "b")]}) 

1536 

1537 with self.assertRaises(AttributeError): 

1538 delegate.getComponent(composite=tab1, componentName="nothing") 

1539 

1540 def testStorageClass(self): 

1541 tab1 = _makeSimpleArrowTable() 

1542 

1543 factory = StorageClassFactory() 

1544 factory.addFromConfig(StorageClassConfig()) 

1545 

1546 storageClass = factory.findStorageClass(type(tab1), compare_types=False) 

1547 # Force the lookup to match by name. 

1548 storageClass._pytype = None 

1549 self.assertEqual(storageClass.name, "ArrowTable") 

1550 

1551 storageClass = factory.findStorageClass(type(tab1), compare_types=True) 

1552 # Force the lookup to match by name. 

1553 storageClass._pytype = None 

1554 self.assertEqual(storageClass.name, "ArrowTable") 

1555 

1556 

1557@unittest.skipUnless(np is not None, "Cannot test ParquetFormatterArrowNumpyDict without numpy.") 

1558@unittest.skipUnless(pa is not None, "Cannot test ParquetFormatterArrowNumpyDict without pyarrow.") 

1559class ParquetFormatterArrowNumpyDictTestCase(unittest.TestCase): 

1560 """Tests for ParquetFormatter, ArrowNumpyDict, using local file store.""" 

1561 

1562 configFile = os.path.join(TESTDIR, "config/basic/butler.yaml") 

1563 

1564 def setUp(self): 

1565 """Create a new butler root for each test.""" 

1566 self.root = makeTestTempDir(TESTDIR) 

1567 config = Config(self.configFile) 

1568 self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run") 

1569 # No dimensions in dataset type so we don't have to worry about 

1570 # inserting dimension data or defining data IDs. 

1571 self.datasetType = DatasetType( 

1572 "data", dimensions=(), storageClass="ArrowNumpyDict", universe=self.butler.registry.dimensions 

1573 ) 

1574 self.butler.registry.registerDatasetType(self.datasetType) 

1575 

1576 def tearDown(self): 

1577 removeTestTempDir(self.root) 

1578 

1579 def testNumpyDict(self): 

1580 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1581 dict1 = _numpy_to_numpy_dict(tab1) 

1582 

1583 self.butler.put(dict1, self.datasetType, dataId={}) 

1584 # Read the whole table. 

1585 dict2 = self.butler.get(self.datasetType, dataId={}) 

1586 self._checkNumpyDictEquality(dict1, dict2) 

1587 # Read the columns. 

1588 columns2 = self.butler.get(self.datasetType.componentTypeName("columns"), dataId={}) 

1589 self.assertEqual(len(columns2), len(dict1)) 

1590 for name in dict1.keys(): 

1591 self.assertIn(name, columns2) 

1592 # Read the rowcount. 

1593 rowcount = self.butler.get(self.datasetType.componentTypeName("rowcount"), dataId={}) 

1594 self.assertEqual(rowcount, len(dict1["a"])) 

1595 # Read the schema. 

1596 schema = self.butler.get(self.datasetType.componentTypeName("schema"), dataId={}) 

1597 self.assertEqual(schema, ArrowNumpySchema(tab1.dtype)) 

1598 # Read just some columns a few different ways. 

1599 tab3 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "c"]}) 

1600 subdict = {key: dict1[key] for key in ["a", "c"]} 

1601 self._checkNumpyDictEquality(subdict, tab3) 

1602 tab4 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "a"}) 

1603 subdict = {key: dict1[key] for key in ["a"]} 

1604 self._checkNumpyDictEquality(subdict, tab4) 

1605 tab5 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["index", "a"]}) 

1606 subdict = {key: dict1[key] for key in ["index", "a"]} 

1607 self._checkNumpyDictEquality(subdict, tab5) 

1608 tab6 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": "ddd"}) 

1609 subdict = {key: dict1[key] for key in ["ddd"]} 

1610 self._checkNumpyDictEquality(subdict, tab6) 

1611 tab7 = self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["a", "a"]}) 

1612 subdict = {key: dict1[key] for key in ["a"]} 

1613 self._checkNumpyDictEquality(subdict, tab7) 

1614 # Passing an unrecognized column should be a ValueError. 

1615 with self.assertRaises(ValueError): 

1616 self.butler.get(self.datasetType, dataId={}, parameters={"columns": ["e"]}) 

1617 

1618 @unittest.skipUnless(pa is not None, "Cannot test reading as arrow without pyarrow.") 

1619 def testWriteNumpyDictReadAsArrowTable(self): 

1620 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1621 dict1 = _numpy_to_numpy_dict(tab1) 

1622 

1623 self.butler.put(dict1, self.datasetType, dataId={}) 

1624 

1625 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowTable") 

1626 

1627 tab2_dict = arrow_to_numpy_dict(tab2) 

1628 

1629 self._checkNumpyDictEquality(dict1, tab2_dict) 

1630 

1631 @unittest.skipUnless(pd is not None, "Cannot test reading as a dataframe without pandas.") 

1632 def testWriteNumpyDictReadAsDataFrame(self): 

1633 tab1 = _makeSimpleNumpyTable() 

1634 dict1 = _numpy_to_numpy_dict(tab1) 

1635 

1636 self.butler.put(dict1, self.datasetType, dataId={}) 

1637 

1638 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="DataFrame") 

1639 

1640 # The column order of the dict may not be preserved, so check column 

1641 # by column. This must be done in dataframe form because pandas 

1642 # converts the fixed-width string column to object dtype. 

1643 tab1_df = pd.DataFrame(tab1) 

1644 

1645 self.assertEqual(set(tab1_df.columns), set(tab2.columns)) 

1646 for col in tab1_df.columns: 

1647 self.assertTrue(np.all(tab1_df[col].values == tab2[col].values)) 

1648 
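    # Sketch of the pandas caveat noted above: building a DataFrame from a
    # structured array converts a fixed-width unicode column to object dtype,
    # so only value-level comparisons are meaningful there.
    def _exampleStringDtypeCaveat(self):
        arr = np.zeros(2, dtype=[("s", "U4")])
        self.assertEqual(pd.DataFrame(arr)["s"].dtype, np.dtype("O"))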

1649 @unittest.skipUnless(atable is not None, "Cannot test reading as astropy without astropy.") 

1650 def testWriteNumpyDictReadAsAstropyTable(self): 

1651 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1652 dict1 = _numpy_to_numpy_dict(tab1) 

1653 

1654 self.butler.put(dict1, self.datasetType, dataId={}) 

1655 

1656 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowAstropy") 

1657 tab2_dict = _astropy_to_numpy_dict(tab2) 

1658 

1659 self._checkNumpyDictEquality(dict1, tab2_dict) 

1660 

1661 def testWriteNumpyDictReadAsNumpyTable(self): 

1662 tab1 = _makeSimpleNumpyTable(include_multidim=True) 

1663 dict1 = _numpy_to_numpy_dict(tab1) 

1664 

1665 self.butler.put(dict1, self.datasetType, dataId={}) 

1666 

1667 tab2 = self.butler.get(self.datasetType, dataId={}, storageClass="ArrowNumpy") 

1668 tab2_dict = _numpy_to_numpy_dict(tab2) 

1669 

1670 self._checkNumpyDictEquality(dict1, tab2_dict) 

1671 

1672 def testWriteNumpyDictBad(self): 

1673 dict1 = {"a": 4, "b": np.ndarray([1])}  # Value for "a" is not an ndarray. 

1674 with self.assertRaises(RuntimeError): 

1675 self.butler.put(dict1, self.datasetType, dataId={}) 

1676 

1677 dict2 = {"a": np.zeros(4), "b": np.zeros(5)}  # Mismatched column lengths. 

1678 with self.assertRaises(RuntimeError): 

1679 self.butler.put(dict2, self.datasetType, dataId={}) 

1680 

1681 dict3 = {"a": [0] * 5, "b": np.zeros(5)}  # Value for "a" is a list, not an ndarray. 

1682 with self.assertRaises(RuntimeError): 

1683 self.butler.put(dict3, self.datasetType, dataId={}) 

1684 
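    # For contrast, a sketch of a payload the formatter should accept
    # (assumption: equal-length numpy arrays keyed by column name):
    def _exampleValidNumpyDict(self):
        good = {"a": np.zeros(3), "b": np.arange(3)}
        self.assertEqual(numpy_dict_to_arrow(good).num_rows, 3)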

1685 def _checkNumpyDictEquality(self, dict1, dict2): 

1686 """Check if two numpy dicts have the same columns/values. 

1687 

1688 Parameters 

1689 ---------- 

1690 dict1 : `dict` [`str`, `np.ndarray`] 

1691 dict2 : `dict` [`str`, `np.ndarray`] 

1692 """ 

1693 self.assertEqual(set(dict1.keys()), set(dict2.keys())) 

1694 for name in dict1.keys(): 

1695 self.assertEqual(dict1[name].dtype, dict2[name].dtype) 

1696 self.assertTrue(np.all(dict1[name] == dict2[name])) 

1697 

1698 

1699@unittest.skipUnless(np is not None, "Cannot test InMemoryNumpyDictDelegate without numpy.") 

1700@unittest.skipUnless(pa is not None, "Cannot test InMemoryNumpyDictDelegate without pyarrow.") 

1701class InMemoryNumpyDictDelegateTestCase(ParquetFormatterArrowNumpyDictTestCase): 

1702 """Tests for InMemoryDatastore, using ArrowNumpyDictDelegate.""" 

1703 

1704 configFile = os.path.join(TESTDIR, "config/basic/butler-inmemory.yaml") 

1705 

1706 def testWriteNumpyDictBad(self): 

1707 # The sub-type checking is not done on the in-memory datastore. 

1708 pass 

1709 

1710 

1711if __name__ == "__main__": 

1712 unittest.main()