1"""Instance Catalog""" 

2from __future__ import print_function 

3from builtins import zip 

4from builtins import str 

5from builtins import range 

6from builtins import object 

7import warnings 

8import numpy as np 

9import inspect 

10import re 

11import copy 

12from collections import OrderedDict 

13from lsst.sims.utils import defaultSpecMap 

14from lsst.sims.utils import ObservationMetaData 

15from future.utils import with_metaclass 

16 

17__all__ = ["InstanceCatalog"] 

18 

19 

20class InstanceCatalogMeta(type): 

21 """Meta class for registering instance catalogs. 

22 

23 When any new type of instance catalog class is created, this registers it 

24 in a `registry` class attribute, available to all derived instance 

25 catalogs. 

26 """ 

27 @staticmethod 

28 def convert_to_underscores(name): 

29 """convert, e.g. CatalogName to catalog_name""" 

30 s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) 

31 return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() 

32 

33 def __new__(cls, name, bases, dct): 

34 # check if attribute catalog_type is specified. 

35 # If not, create a default 

36 if 'registry' in dct: 36 ↛ 37line 36 didn't jump to line 37, because the condition on line 36 was never true

37 warnings.warn("registry class attribute should not be " 

38 "over-ridden in InstanceCatalog classes. " 

39 "Proceed with caution") 

40 if 'catalog_type' not in dct: 

41 dct['catalog_type'] = cls.convert_to_underscores(name) 

42 

43 dct['_cached_columns'] = {} 

44 dct['_compound_columns'] = {} 

45 dct['_compound_column_names'] = {} 

46 

47 return super(InstanceCatalogMeta, cls).__new__(cls, name, bases, dct) 

48 

49 def __init__(cls, name, bases, dct): 

50 # check if 'registry' is specified. 

51 # if not, then this is the base class: add the registry 

52 if not hasattr(cls, 'registry'): 

53 cls.registry = {} 

54 

55 # add this class to the registry 

56 if cls.catalog_type in cls.registry: 56 ↛ 57line 56 didn't jump to line 57, because the condition on line 56 was never true

57 raise ValueError("Catalog Type %s is duplicated" 

58 % cls.catalog_type) 

59 cls.registry[cls.catalog_type] = cls 

60 

61 # add methods for default columns 

62 for default in cls.default_columns: 

63 setattr(cls, 'default_%s'%(default[0]), 63 ↛ exitline 63 didn't jump to the function exit

64 lambda self, value=default[1], type=default[2]: 

65 np.array([value for i in range(len(self._current_chunk))], dtype=type)) 

66 

67 # store compound columns and check for collisions 

68 # 

69 # We create a forward and backward mapping. 

70 # The dictionary cls._compound_columns maps the compound column 

71 # name to the multiple individual columns it represents. 

72 # The dictionary cls._compound_column_names maps the individual 

73 # column names to the compound column that contains them 

74 for key in dir(cls): 

75 if not key.startswith('get_'): 

76 continue 

77 compound_getter = getattr(cls, key) 

78 if not hasattr(compound_getter, '_compound_column'): 

79 continue 

80 

81 for col in compound_getter._colnames: 

82 try: 

83 getter = 'get_'+col 

84 except TypeError: 

85 raise ValueError("column names in compound " 

86 "decorator must be strings") 

87 

88 if hasattr(cls, getter): 88 ↛ 89line 88 didn't jump to line 89, because the condition on line 88 was never true

89 raise ValueError("column name '%s' in compound getter " 

90 "'%s' conflicts with getter '%s'" 

91 % (col, key, getter)) 

92 

93 elif col in cls._compound_column_names: 93 ↛ 94line 93 didn't jump to line 94, because the condition on line 93 was never true

94 raise ValueError("duplicate compound column name: '%s'" 

95 % col) 

96 

97 else: 

98 cls._compound_column_names[col] = key 

99 cls._compound_columns[key] = compound_getter._colnames 

100 

101 return super(InstanceCatalogMeta, cls).__init__(name, bases, dct) 

102 

103 
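
# A sketch of the compound-column bookkeeping performed above.  The decorator
# that sets `_compound_column` and `_colnames` on a getter is defined outside
# this module; the subclass, decorator spelling, and column names below are
# hypothetical:
#
#     class MyCatalog(InstanceCatalog):
#         @compound('ra_corr', 'dec_corr')      # assumed decorator
#         def get_correctedCoords(self):
#             ...
#
# For such a class the metaclass records a forward and a backward mapping:
#
#     cls._compound_columns['get_correctedCoords'] = ('ra_corr', 'dec_corr')
#     cls._compound_column_names['ra_corr'] = 'get_correctedCoords'
#     cls._compound_column_names['dec_corr'] = 'get_correctedCoords'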

class _MimicRecordArray(object):
    """An object used for introspection of the database columns.

    This mimics a numpy record array, but when a column is referenced,
    it logs the reference and returns an empty array.
    """
    def __init__(self):
        self.referenced_columns = set()

    def __getitem__(self, column):
        self.referenced_columns.add(column)
        return np.empty(0)

    def __len__(self):
        return 0
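
# How this stand-in gets used (see db_required_columns below): install it as
# the current chunk, evaluate every requested column, and harvest the set of
# database columns that were touched.  A minimal sketch:
#
#     mimic = _MimicRecordArray()
#     mimic['raJ2000']              # logs the reference, returns np.empty(0)
#     mimic.referenced_columns      # -> {'raJ2000'}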

class InstanceCatalog(with_metaclass(InstanceCatalogMeta, object)):
    """ Base class for instance catalogs generated by simulations.

    Instance catalogs include a dictionary of numpy arrays which contains
    core data. Additional arrays can be appended as ancillary data.

    Catalog types and Object types are defined in the CatalogDescription class
    catalogType = TRIM, SCIENCE, PHOTCAL, DIASOURCE, MISC, INVALID
    objectType = Point, Moving, Sersic, Image, Artefact, MISC
    catalogTable is the name of the database table queried
    dataArray is a dictionary of numpy arrays of data
    """

    # These are the class attributes to be specified in any derived class:
    catalog_type = 'instance_catalog'
    column_outputs = None
    specFileMap = defaultSpecMap
    default_columns = []
    cannot_be_null = None  # a list of columns which, if null, cause a row not to be written by write_catalog()
    # Note: these columns will be filtered on even if they are not included in column_outputs

    default_formats = {'S': '%s', 'f': '%.4f', 'i': '%i'}
    override_formats = {}
    transformations = {}
    delimiter = ", "
    comment_char = "#"
    endline = "\n"
    _pre_screen = False  # if True, write_catalog() will check database query results against
    # cannot_be_null before calculating getter columns

    @classmethod
    def new_catalog(cls, catalog_type, *args, **kwargs):
        """Return a new catalog of the given catalog type"""
        if catalog_type in cls.registry:
            return cls.registry[catalog_type](*args, **kwargs)
        elif inspect.isclass(catalog_type) and issubclass(catalog_type, InstanceCatalog):
            return catalog_type(*args, **kwargs)
        else:
            raise ValueError("Unrecognized catalog_type: %s"
                             % str(catalog_type))
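
    # new_catalog accepts either a registered catalog_type string or an
    # InstanceCatalog subclass.  Because the metaclass auto-registers every
    # subclass (deriving catalog_type via convert_to_underscores when it is
    # not given explicitly), a hypothetical subclass can be instantiated by
    # name alone:
    #
    #     class MyStarCatalog(InstanceCatalog):
    #         ...   # registered automatically as 'my_star_catalog'
    #
    #     cat = InstanceCatalog.new_catalog('my_star_catalog', db_obj)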


    @classmethod
    def is_compound_column(cls, column_name):
        """Return true if the given column name is a compound column"""
        getfunc = "get_%s" % column_name
        if hasattr(cls, getfunc):
            if hasattr(getattr(cls, getfunc), '_compound_column'):
                return True
        return False

    def iter_column_names(self):
        """Iterate the column names, expanding any compound columns"""

        for column in self._column_outputs:
            if self.is_compound_column(column):
                for col in getattr(getattr(self, "get_" + column), '_colnames'):
                    yield col
            else:
                yield column

    def __init__(self, db_obj, obs_metadata=None, column_outputs=None,
                 constraint=None, specFileMap=None, cannot_be_null=None):

        """
        @param [in] db_obj is an instantiation of the CatalogDBObject class,
        which provides a connection to a specific database table

        see sims_catalogs_generation/python/lsst/sims/catalogs/generation/db/dbConnection.py

        @param [in] obs_metadata is an instantiation of the ObservationMetaData class
        characterizing a specific telescope observation

        see sims_catalogs_generation/python/lsst/sims/catalogs/generation/db/ObservationMetaData.py

        @param [in] column_outputs is a list of column names to be output
        in the catalog. This is optional and will be appended to the list
        of column_outputs defined in the class definition.

        @param [in] cannot_be_null is a list of column names indicating columns
        which cannot have the values Null, None, or NaN. Rows running afoul
        of this criterion will not be written by the write_catalog() method
        (though they may appear in the iterator returned by iter_catalog()).
        Note: these columns will be filtered on, even if they do not appear in
        column_outputs.

        @param [in] constraint is an optional SQL constraint to be applied to the
        database query

        @param [in] specFileMap is an instantiation of the SpecMap class

        (defined in sims_catalogs_measures/python/sims/catalogs/measures/instance/fileMaps.py)

        that maps database entries for SED names to actual file paths. If set to None,
        the class definition of InstanceCatalog ensures that it will be set to
        defaultSpecMap, which is the correct mapping for the LSST sims_sed_library
        """

        self.verbose = db_obj.verbose

        self.db_obj = db_obj
        self._current_chunk = None

        # this dict will contain information telling the user where the columns in
        # the catalog come from
        self._column_origins = {}

        if obs_metadata is not None:
            if not isinstance(obs_metadata, ObservationMetaData):
                raise ValueError("You passed InstanceCatalog something that was not ObservationMetaData")

            self.obs_metadata = copy.deepcopy(obs_metadata)
        else:
            self.obs_metadata = ObservationMetaData()

        if self.column_outputs is not None:
            self._column_outputs = copy.deepcopy(self.column_outputs)

        if column_outputs is not None:
            if self.column_outputs is None:
                self._column_outputs = copy.deepcopy(column_outputs)
            else:
                for col in column_outputs:
                    if col not in self._column_outputs:
                        self._column_outputs.append(col)

        # Because cannot_be_null can both be declared at class definition
        # and at instantiation, we need to be able to combine the two inputs
        # into something the InstanceCatalog will actually use to filter
        # rows. self._cannot_be_null is a member variable that contains
        # the contents both of self.cannot_be_null (set at class definition)
        # and the cannot_be_null kwarg passed to __init__(). self._cannot_be_null
        # is what the catalog actually uses in self._filter_chunk
        self._cannot_be_null = None
        if self.cannot_be_null is not None:
            self._cannot_be_null = copy.deepcopy(self.cannot_be_null)

        if cannot_be_null is not None:
            if self.cannot_be_null is None:
                self._cannot_be_null = copy.deepcopy(cannot_be_null)
            else:
                for col in cannot_be_null:
                    if col not in self._cannot_be_null:
                        self._cannot_be_null.append(col)

        self._actually_calculated_columns = []  # a list of all the columns referenced by self.column_by_name
        self.constraint = constraint

        if specFileMap is not None:
            self.specFileMap = specFileMap

        self.refIdCol = self.db_obj.getIdColKey()

        self._column_cache = {}

        # self._column_origins_switch tells column_by_name to log where it is getting
        # the columns in self._column_origins (we only want to do that once)
        self._column_origins_switch = True

        # now we will create and populate a list containing the names of
        # all of the columns which this InstanceCatalog can return.
        # Note: this needs to happen before self._check_requirements()
        # is called in case any getters depend on the contents of
        # _all_available_columns. That way, self._check_requirements()
        # can verify that the getter will run the way it is actually
        # being called.
        self._all_available_columns = []

        for name in self.db_obj.columnMap.keys():
            if name not in self._all_available_columns:
                self._all_available_columns.append(name)

        for name in self._compound_column_names:
            if name not in self._all_available_columns:
                self._all_available_columns.append(name)

        for name in self._compound_columns:
            if name not in self._all_available_columns:
                self._all_available_columns.append(name)

        for name in dir(self):
            if name[:4] == 'get_':
                columnName = name[4:]
                if columnName not in self._all_available_columns:
                    self._all_available_columns.append(columnName)
            elif name[:8] == 'default_':
                columnName = name[8:]
                if columnName not in self._all_available_columns:
                    self._all_available_columns.append(columnName)

        if not hasattr(self, '_column_outputs'):
            self._column_outputs = []

            # a compound column is expanded into its individual sub-columns
            # when output; leave the compound name itself out so those
            # columns do not get listed twice in the catalog
            for name in self._all_available_columns:
                if name not in self._compound_columns:
                    self._column_outputs.append(name)

        self._check_requirements()

    def _set_current_chunk(self, chunk, column_cache=None):
        """Set the current chunk and clear the column cache"""
        self._current_chunk = chunk
        if column_cache is None:
            self._column_cache = {}
        else:
            self._column_cache = column_cache

    def _delete_current_chunk(self):
        """
        Set the column cache and _current_chunk to None.
        This is just going to be called by the
        CompoundInstanceCatalog._write_compound method to try to control
        memory bloat as multiple copies of the returned database query
        accumulate in the different InstanceCatalogs being written.
        """
        self._column_cache = {}
        self._current_chunk = None

    def db_required_columns(self):
        """Get the list of columns required to be in the database object."""
        saved_cache = self._cached_columns
        saved_chunk = self._current_chunk
        self._set_current_chunk(_MimicRecordArray())

        for col_name in self.iter_column_names():
            # just call the column: this will log queries to the database.
            self.column_by_name(col_name)

        # now do the same thing for columns specified in _cannot_be_null
        # (in case the catalog is filtered on columns that are not meant
        # to be written to the catalog)
        if self._cannot_be_null is not None:
            for col_name in self._cannot_be_null:
                self.column_by_name(col_name)

        db_required_columns = list(self._current_chunk.referenced_columns)

        default_columns_set = set(el[0] for el in self.default_columns)
        required_columns_set = set(db_required_columns)
        required_columns_with_defaults = default_columns_set & required_columns_set

        self._set_current_chunk(saved_chunk, saved_cache)

        return db_required_columns, list(required_columns_with_defaults)

    def column_by_name(self, column_name, *args, **kwargs):
        """Given a column name, return the column data"""

        if (isinstance(self._current_chunk, _MimicRecordArray) and
                column_name not in self._actually_calculated_columns):

            self._actually_calculated_columns.append(column_name)

        getfunc = "get_%s" % column_name
        if hasattr(self, getfunc):
            function = getattr(self, getfunc)

            if self._column_origins_switch:
                self._column_origins[column_name] = self._get_class_that_defined_method(function)

            return function(*args, **kwargs)
        elif column_name in self._compound_column_names:
            getfunc = self._compound_column_names[column_name]
            function = getattr(self, getfunc)

            if self._column_origins_switch and column_name:
                self._column_origins[column_name] = self._get_class_that_defined_method(function)

            compound_column = function(*args, **kwargs)
            return compound_column[column_name]
        elif (isinstance(self._current_chunk, _MimicRecordArray) or
              column_name in self._current_chunk.dtype.names):

            if self._column_origins_switch:
                self._column_origins[column_name] = 'the database'

            return self._current_chunk[column_name]
        else:

            if self._column_origins_switch:
                self._column_origins[column_name] = 'default column'

            return getattr(self, "default_%s" % column_name)(*args, **kwargs)
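
    # Resolution order in column_by_name, in one illustrative sketch (the
    # column name is hypothetical): an explicit getter wins, then a compound
    # getter that provides the name, then the raw database chunk, and finally
    # the default_* method created by the metaclass from default_columns.
    #
    #     cat.column_by_name('magNorm')
    #     # 1. get_magNorm(...)                  if such a getter is defined
    #     # 2. get_<compound>(...)['magNorm']    if part of a compound getter
    #     # 3. self._current_chunk['magNorm']    if queried from the database
    #     # 4. default_magNorm(...)              if listed in default_columns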


    def _check_requirements(self):
        """Check whether the supplied db_obj has the necessary column names"""

        missing_cols = []
        self._active_columns = []
        cols, defaults = self.db_required_columns()

        for col in cols:
            if col not in self.db_obj.columnMap:
                missing_cols.append(col)
            else:
                self._active_columns.append(col)

        self._column_origins_switch = False  # do not want to log column origins any more

        if len(missing_cols) > 0:
            nodefault = []
            for col in missing_cols:
                if col not in defaults:
                    nodefault.append(col)
                else:
                    # Because some earlier part of the code copies default columns
                    # into the same place as columns that exist natively in the
                    # database, this is where we have to mark columns that are
                    # set by default
                    self._column_origins[col] = 'default column'

            if len(nodefault) > 0:
                raise ValueError("Required columns missing from database: "
                                 "({0})".format(', '.join(nodefault)))

        if self.verbose:
            self.print_column_origins()

    def _make_line_template(self, chunk_cols):
        templ_list = []
        for i, col in enumerate(self.iter_column_names()):
            templ = self.override_formats.get(col, None)

            if templ is None:
                typ = chunk_cols[i].dtype.kind
                templ = self.default_formats.get(typ)

            if templ is None:
                if self.verbose:
                    warnings.warn("Using raw formatting for column '%s' "
                                  "with type %s" % (col, chunk_cols[i].dtype))
                templ = "%s"
            templ_list.append(templ)

        return self.delimiter.join(templ_list) + self.endline
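
    # For a catalog whose columns have dtype kinds 'i', 'f', 'S' (in that
    # order), the defaults above produce the template '%i, %.4f, %s\n'.
    # Note that numpy unicode columns report kind 'U', which has no entry
    # in default_formats, so they fall through to raw '%s' formatting
    # (with a warning when verbose is set).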


    def write_header(self, file_handle):
        column_names = list(self.iter_column_names())
        templ = [self.comment_char, ]
        templ += ["%s" for col in column_names]
        file_handle.write("{0}".format(self.comment_char + self.delimiter.join(column_names)) +
                          self.endline)

    def write_catalog(self, filename, chunk_size=None,
                      write_header=True, write_mode='w'):
        """
        Query self.db_obj and write the resulting InstanceCatalog to
        an ASCII output file

        @param [in] filename is the name of the ASCII file to be written

        @param [in] chunk_size is an optional parameter telling the InstanceCatalog
        to query the database in manageable chunks (in case returning the whole catalog
        takes too much memory)

        @param [in] write_header a boolean specifying whether or not to add a header
        to the output catalog (default True)

        @param [in] write_mode is 'w' if you want to overwrite the output file or
        'a' if you want to append to an existing output file (default: 'w')
        """

        self._write_pre_process()

        self._query_and_write(filename, chunk_size=chunk_size,
                              write_header=write_header,
                              write_mode=write_mode,
                              obs_metadata=self.obs_metadata,
                              constraint=self.constraint)
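
    # Typical usage, with a hypothetical subclass and output path:
    #
    #     cat = MyStarCatalog(db_obj, obs_metadata=obs)
    #     cat.write_catalog('stars.txt', chunk_size=10000)
    #
    # chunk_size bounds the memory footprint: rows are queried, filtered
    # against _cannot_be_null, formatted, and written 10000 at a time.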


    def _query_and_write(self, filename, chunk_size=None, write_header=True,
                         write_mode='w', obs_metadata=None, constraint=None):
        """
        This method queries db_obj, and then writes the resulting recarray
        to the specified ASCII output file.

        @param [in] filename is the name of the ASCII file to be written

        @param [in] obs_metadata is an ObservationMetaData instantiation
        characterizing the telescope pointing (optional)

        @param [in] constraint is an optional SQL constraint applied to the database query.

        @param [in] chunk_size is an optional parameter telling the InstanceCatalog
        to query the database in manageable chunks (in case returning the whole catalog
        takes too much memory)

        @param [in] write_header a boolean specifying whether or not to add a header
        to the output catalog (default True)

        @param [in] write_mode is 'w' if you want to overwrite the output file or
        'a' if you want to append to an existing output file (default: 'w')
        """

        with open(filename, write_mode) as file_handle:
            if write_header:
                self.write_header(file_handle)

            query_result = self.db_obj.query_columns(colnames=self._active_columns,
                                                     obs_metadata=obs_metadata,
                                                     constraint=constraint,
                                                     chunk_size=chunk_size)

            for chunk in query_result:
                self._write_recarray(chunk, file_handle)

    def _write_pre_process(self):
        """
        This method verifies the catalog's required columns and initializes
        some member variables that are needed for the catalog-writing process.
        """
        db_required_columns, required_columns_with_defaults = self.db_required_columns()
        self._template = None

    def _update_current_chunk(self, good_dexes):
        """
        Update self._current_chunk and self._column_cache to only include the rows
        specified by good_dexes (which will be a list of indexes).
        """
        # In the event that self._column_cache has already been created,
        # update the cache so that only valid rows remain therein
        new_cache = {}
        if len(self._column_cache) > 0:
            for col_name in self._column_cache:
                if col_name in self._compound_column_names:
                    # this is a sub-column of a compound column;
                    # ignore it, we will update the cache when we come
                    # to the compound column
                    continue
                elif 'get_' + col_name in self._compound_columns:
                    super_col = self._column_cache[col_name]
                    new_cache[col_name] = OrderedDict([(key, super_col[key][good_dexes]) for key in super_col])
                else:
                    new_cache[col_name] = self._column_cache[col_name][good_dexes]

        self._set_current_chunk(self._current_chunk[good_dexes], column_cache=new_cache)

    def _filter_chunk(self, chunk):
        """
        Take a chunk of database rows and select only those that match the criteria
        set by self._cannot_be_null. Set self._current_chunk to be the rows that pass
        this test. Return a numpy array of the indices of those rows relative to
        the original chunk.
        """
        final_dexes = np.arange(len(chunk), dtype=int)

        if self._pre_screen and self._cannot_be_null is not None:
            # go through the database query results and remove all of those
            # rows that have already run afoul of self._cannot_be_null
            for col_name in self._cannot_be_null:
                if col_name in chunk.dtype.names:
                    if chunk[col_name].dtype == float:
                        good_dexes = np.where(np.isfinite(chunk[col_name]))
                    else:
                        str_vec = np.char.lower(chunk[col_name].astype('str'))
                        good_dexes = np.where(np.logical_and(str_vec != 'none',
                                              np.logical_and(str_vec != 'nan', str_vec != 'null')))
                    chunk = chunk[good_dexes]
                    final_dexes = final_dexes[good_dexes]

        self._set_current_chunk(chunk)

        # If some columns are specified as cannot_be_null, loop over those columns,
        # removing rows that run afoul of that criterion from the chunk.
        if self._cannot_be_null is not None:
            filter_switch = None
            for filter_col in self._cannot_be_null:
                filter_vals = self.column_by_name(filter_col)
                if filter_vals.dtype == float:
                    local_switch = np.isfinite(filter_vals)
                else:
                    try:
                        filter_vals = filter_vals.astype(float)
                        local_switch = np.isfinite(filter_vals)
                    except ValueError:
                        filter_vals = np.char.lower(filter_vals.astype('str'))
                        local_switch = np.logical_and(filter_vals != 'none',
                                       np.logical_and(filter_vals != 'nan', filter_vals != 'null'))
                if filter_switch is None:
                    filter_switch = local_switch
                else:
                    filter_switch &= local_switch

            good_dexes = np.where(filter_switch)
            final_dexes = final_dexes[good_dexes]

            if len(good_dexes[0]) < len(chunk):
                self._update_current_chunk(good_dexes)

        return final_dexes
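
    # The per-column rule above, in a small self-contained sketch (values
    # are made up): float columns drop non-finite entries; string columns
    # drop 'None'/'NaN'/'null' (case-insensitively).
    #
    #     vals = np.array([1.0, np.nan, 3.0])
    #     np.where(np.isfinite(vals))                   # -> (array([0, 2]),)
    #
    #     s = np.char.lower(np.array(['a', 'None', 'null']).astype('str'))
    #     np.logical_and(s != 'none',
    #                    np.logical_and(s != 'nan', s != 'null'))
    #     # -> array([ True, False, False])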


    def _write_current_chunk(self, file_handle):
        """
        write self._current_chunk to the file specified by file_handle
        """
        if len(self._current_chunk) == 0:
            return

        list_of_transform_keys = list(self.transformations.keys())

        chunk_cols = [self.transformations[col](self.column_by_name(col))
                      if col in list_of_transform_keys else
                      self.column_by_name(col)
                      for col in self.iter_column_names()]

        # Create the template with the first chunk
        if self._template is None:
            self._template = self._make_line_template(chunk_cols)

        # use a generator expression for lines rather than a list
        # for memory efficiency
        file_handle.writelines(self._template % line for line in zip(*chunk_cols))

    def _write_recarray(self, chunk, file_handle):
        """
        This method takes a recarray (usually returned by querying db_obj),
        and writes it to the catalog. This method also handles any transformation
        of columns that needs to happen before they are written to the catalog.

        @param [in] chunk is the recarray of queried columns to be formatted
        and written to the catalog.

        @param [in] file_handle is a file handle pointing to the file where
        the catalog is being written.
        """
        self._filter_chunk(chunk)
        self._write_current_chunk(file_handle)

    def iter_catalog(self, chunk_size=None):
        """
        Iterate over the lines of a catalog.

        chunk_size controls the number of rows returned at a
        time from the database (smaller chunk_size will result
        in less memory usage but slower performance).

        Catalog rows will be returned as lists.
        """
        self.db_required_columns()

        query_result = self.db_obj.query_columns(colnames=self._active_columns,
                                                 obs_metadata=self.obs_metadata,
                                                 constraint=self.constraint,
                                                 chunk_size=chunk_size)

        list_of_transform_keys = list(self.transformations.keys())

        for chunk in query_result:
            self._filter_chunk(chunk)
            chunk_cols = [self.transformations[col](self.column_by_name(col))
                          if col in list_of_transform_keys else
                          self.column_by_name(col)
                          for col in self.iter_column_names()]
            for line in zip(*chunk_cols):
                yield line

    def iter_catalog_chunks(self, chunk_size=None):
        """
        Iterate over catalog contents one chunk at a time.

        chunk_size controls the number of catalog rows contained
        in each chunk.

        The iterator will return a chunk of the database (a list of lists
        containing the contents of the database chunk). The first dimension
        of the chunk corresponds to the columns of the catalog, i.e. chunk[0]
        is a list containing the 0th column of the catalog.

        The iterator will also yield a colMap, which is a dict mapping the
        names of the columns to their index value in the chunk.

        Usage:

        for chunk, colMap in cat.iter_catalog_chunks(chunk_size=1000):
            for ix in range(len(chunk[0])):
                print(chunk[0][ix], chunk[1][ix], chunk[2][ix])

        will print out the first three columns of the catalog, row by row
        """
        self.db_required_columns()

        query_result = self.db_obj.query_columns(colnames=self._active_columns,
                                                 obs_metadata=self.obs_metadata,
                                                 constraint=self.constraint,
                                                 chunk_size=chunk_size)

        list_of_transform_keys = list(self.transformations.keys())

        for chunk in query_result:
            self._filter_chunk(chunk)
            chunk_cols = [self.transformations[col](self.column_by_name(col))
                          if col in list_of_transform_keys else
                          self.column_by_name(col)
                          for col in self.iter_column_names()]
            chunkColMap = dict([(col, i) for i, col in enumerate(self.iter_column_names())])
            yield chunk_cols, chunkColMap

    def get_objId(self):
        return self.column_by_name(self.refIdCol)

    def get_uniqueId(self, nShift=10):
        arr = self.column_by_name(self.refIdCol)
        if len(arr) > 0:
            return np.left_shift(self.column_by_name(self.refIdCol), nShift) + \
                   self.db_obj.getObjectTypeId()
        else:
            return arr
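
    # The encoding packs the object type into the low nShift bits.  With
    # illustrative numbers, nShift=10 and db_obj.getObjectTypeId() == 3,
    # an object whose reference id is 5 gets
    #
    #     uniqueId = (5 << 10) + 3 == 5123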


    def _get_class_that_defined_method(self, meth):
        """
        This method will return the class that first defined the
        input method.

        This is taken verbatim from
        http://stackoverflow.com/questions/961048/get-class-that-defined-method
        """

        for cls in inspect.getmro(meth.__self__.__class__):
            if meth.__name__ in cls.__dict__:
                return cls

        return None

    def print_column_origins(self):
        """
        Print the origins of the columns in this catalog
        """

        print('\nwhere the columns in ', self.__class__, ' come from')
        for column_name in self._column_origins:
            print(column_name, self._column_origins[column_name])

        print('\n')