Coverage for python/lsst/daf/persistence/registries.py: 16%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

232 statements  

1# 

2# LSST Data Management System 

3# Copyright 2008, 2009, 2010 LSST Corporation. 

4# 

5# This product includes software developed by the 

6# LSST Project (http://www.lsst.org/). 

7# 

8# This program is free software: you can redistribute it and/or modify 

9# it under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, either version 3 of the License, or 

11# (at your option) any later version. 

12# 

13# This program is distributed in the hope that it will be useful, 

14# but WITHOUT ANY WARRANTY; without even the implied warranty of 

15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

16# GNU General Public License for more details. 

17# 

18# You should have received a copy of the LSST License Statement and 

19# the GNU General Public License along with this program. If not, 

20# see <http://www.lsstcorp.org/LegalNotices/>. 

21# 

22 

23"""This module provides registry classes for maintaining dataset metadata 

24for use by the Data Butler. Currently only a SQLite3-based registry is 

25implemented, but registries based on a text file, a policy file, a MySQL 

26(or other) relational database, and data gathered from scanning a filesystem 

27are all anticipated. 

28 

29Currently this module assumes posix access (for both PosixRegistry AND 

30SqliteRegistry). It is possible that it can be factored so that at least the 

31SqliteRegistry can be remote/not on the local filesystem. For now this module 

32is only used by CameraMapper and by PosixStorage, both of which work on the 

33local filesystem only, so this works for the time being. 

34""" 

35import copy 

36from . import fsScanner, sequencify 

37import os 

38import astropy.io.fits 

39import re 

40import yaml 

41 

42try: 

43 import sqlite3 

44 haveSqlite3 = True 

45except ImportError: 

46 try: 

47 # try external pysqlite package; deprecated 

48 import sqlite as sqlite3 

49 haveSqlite3 = True 

50 except ImportError: 

51 haveSqlite3 = False 

52 

53# PostgreSQL support 

54try: 

55 import psycopg2 as pgsql 

56 havePgsql = True 

57except ImportError: 

58 havePgsql = False 

59 

60 

class Registry:
    """The registry base class.

    Concrete registries derive from this class; `Registry.create` is the
    factory that picks the right subclass for a given location.
    """

    def __init__(self):
        pass

    def __del__(self):
        pass

    @staticmethod
    def create(location):
        """Create a registry object of an appropriate type.

        Parameters
        ----------
        location : `str` or `None`
            Path or URL for the registry, or `None` if unavailable.

        Returns
        -------
        registry : `Registry` or `None`
            A `PgsqlRegistry` when ``location`` ends with ``.pgsql``, a
            `SqliteRegistry` when it matches ``.*\\.sqlite3``, or a
            `PosixRegistry` when it is an existing directory. `None` is
            returned when ``location`` is `None` or when the SQLite registry
            ends up without a connection.

        Raises
        ------
        RuntimeError
            If a SQLite registry is requested but no sqlite3 module could be
            imported, or if no registry type fits ``location``.
        """
        if location is None:
            return None

        # PostgreSQL registries are identified purely by file suffix.
        if location.endswith(".pgsql"):
            return PgsqlRegistry(location)

        # SQLite registries: the pattern is anchored at the start only, so
        # anything beginning with "<something>.sqlite3" qualifies.
        wantsSqlite = re.match(r'.*\.sqlite3', location) is not None
        if wantsSqlite:
            if not haveSqlite3:
                raise RuntimeError("sqlite3 registry specified (%s), but unable to import sqlite3 module" %
                                   (location,))
            candidate = SqliteRegistry(location)
            # A registry without a connection is of no use to the caller.
            return candidate if candidate.conn is not None else None

        # Last resort: treat a directory as the root of a PosixRegistry.
        if os.path.isdir(location):
            return PosixRegistry(root=location)

        raise RuntimeError("Unable to create registry using location: " + location)

104 

105 

class PosixRegistry(Registry):
    """A glob-based filesystem registry.

    Parameters
    ----------
    root : `str`
        Directory that is scanned for files matching a template.
    """

    def __init__(self, root):
        Registry.__init__(self)
        self.root = root

    @staticmethod
    def getHduNumber(template, dataId):
        """Look up the HDU number for a given template+dataId.

        Parameters
        ----------
        template : `str`
            Template with an HDU specifier: it ends with brackets holding an
            identifier that can be populated by a key-value pair in dataId,
            e.g. ``"%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"``.
        dataId : `dict`
            Dictionary that hopefully has a key whose name matches the key
            specifier in the template.

        Returns
        -------
        hdu : object or `None`
            The value stored in ``dataId`` under the key named in the
            template's brackets, or `None` if the HDU can not be determined.
        """
        # Sanity check that the template at least ends with a bracket.
        if not template.endswith(']'):
            return None

        # Get the key (with formatting) out of the brackets ...
        hduKey = template[template.rfind('[') + 1:template.rfind(']')]
        # ... then extract the key name from the "%(name)d" formatting.
        hduKey = hduKey[hduKey.rfind('(') + 1:hduKey.rfind(')')]

        if hduKey in dataId:
            return dataId[hduKey]
        return None

    class LookupData:
        """Bookkeeping for matching one candidate file against a query.

        Parameters
        ----------
        lookupProperties : `str` or sequence of `str`
            Keys whose values are wanted; passed through ``sequencify``.
        dataId : `dict`
            Key/value pairs that a candidate must agree with.
        """

        # Sentinel for "key absent"; None might be a valid found value.
        _NOT_FOUND = object()

        def __init__(self, lookupProperties, dataId):
            self.dataId = copy.copy(dataId)
            lookupProperties = sequencify(lookupProperties)
            self.lookupProperties = copy.copy(lookupProperties)
            self.foundItems = {}
            self.cachedStatus = None
            self.neededKeys = set(lookupProperties).union(dataId.keys())

        def __repr__(self):
            return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
                (self.lookupProperties, self.dataId, self.foundItems, self.cachedStatus)

        def status(self):
            """Query the lookup status.

            The result is cached until the found items are changed.

            Returns
            -------
            status : `str`
                ``'match'`` if every key in lookupProperties has been found
                and no found value contradicts dataId;
                ``'incomplete'`` if nothing contradicts dataId but at least
                one key in lookupProperties has not been found yet;
                ``'notMatch'`` if a found value differs from the value in
                dataId for the same key (this takes precedence over
                ``'incomplete'``).
            """
            if self.cachedStatus is not None:
                return self.cachedStatus

            notFound = self._NOT_FOUND
            status = 'match'
            if any(key not in self.foundItems for key in self.lookupProperties):
                status = 'incomplete'
            for dataIdKey, dataIdValue in self.dataId.items():
                foundValue = self.foundItems.get(dataIdKey, notFound)
                if foundValue is not notFound and foundValue != dataIdValue:
                    status = 'notMatch'
                    break
            self.cachedStatus = status
            return status

        def setFoundItems(self, items):
            """Replace the found items and invalidate the cached status."""
            self.cachedStatus = None
            self.foundItems = items

        def addFoundItems(self, items):
            """Merge ``items`` into the found items and invalidate the
            cached status."""
            self.cachedStatus = None
            self.foundItems.update(items)

        def getMissingKeys(self):
            """Return the set of needed keys (lookupProperties plus dataId
            keys) that have not been found yet."""
            return self.neededKeys - set(self.foundItems.keys())

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Return values are refined by the values in dataId.
        e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
        dataId={'visit':1}, and lookupProperties is ['filter'], and the
        filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
        then the return value will be [('g',)]

        Parameters
        ----------
        lookupProperties : `str` or sequence of `str`
            Keys whose values will be returned.
        reference : object
            Other data types that may be used to search for values. Not used
            by this implementation.
        dataId : `dict`
            Key/value pairs a file must agree with; values are compared for
            equality against what is found for the file.
        **kwargs
            ``template`` : required. Template (typically from a policy) used
            to look for files; without it an empty list is returned.
            ``storage`` : optional. Needed to look for metadata inside files.
            Currently supported value: ``'FitsStorage'``.

        Returns
        -------
        values : `list` of `tuple`
            One tuple per matching file, holding the values for the keys in
            lookupProperties, in that order.
        """
        # Required kwargs:
        if 'template' not in kwargs:
            return []
        template = kwargs['template']
        # Optional kwargs:
        storage = kwargs.get('storage')

        lookupData = PosixRegistry.LookupData(lookupProperties, dataId)
        scanner = fsScanner.FsScanner(template)
        allPaths = scanner.processPath(self.root)
        retItems = []  # one item for each found file that matches
        for path, foundProperties in allPaths.items():
            # Start from what the path itself yields; if that leaves keys
            # unresolved, consult the metadata of the file at that path.
            # Files that still do not reach 'match' are not used.
            lookupData.setFoundItems(foundProperties)
            if lookupData.status() == 'incomplete':
                PosixRegistry.lookupMetadata(os.path.join(self.root, path), template, lookupData, storage)
            if lookupData.status() == 'match':
                retItems.append(tuple(lookupData.foundItems[key] for key in lookupData.lookupProperties))
        return retItems

    @staticmethod
    def lookupMetadata(filepath, template, lookupData, storage):
        """Dispatcher for looking up metadata in a file of a given storage type.

        Parameters
        ----------
        filepath : `str`
            Path to the file.
        template : `str`
            Template that was used to discover the file.
        lookupData : `PosixRegistry.LookupData`
            Lookup state; updated in place with anything that is found.
        storage : `str` or `None`
            Storage type name. Only ``'FitsStorage'`` is handled; any other
            value leaves ``lookupData`` untouched.
        """
        if storage == 'FitsStorage':
            # The last argument is the dataId used to resolve the HDU named in
            # the template; the storage name is of no use for that purpose.
            PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, lookupData.dataId)

    @staticmethod
    def lookupFitsMetadata(filepath, template, lookupData, dataId):
        """Look up metadata in a fits file.

        Will try to discover the correct HDU to look in by testing if the
        template has a value in brackets at the end.
        If the HDU is specified but the metadata key is not discovered in
        that HDU, will look in the primary HDU before giving up.

        Parameters
        ----------
        filepath : `str`
            Path to the file. If it can not be opened the method returns
            without touching ``lookupData``.
        template : `str`
            Template that was used to discover the file. This can be used to
            look up the correct HDU as needed.
        lookupData : `PosixRegistry.LookupData`
            Contains the lookupProperties, the dataId, and the data that has
            been found so far. Every missing key is added to it; keys found
            in neither HDU are recorded with the value `None`.
        dataId : `dict`
            Used together with ``template`` to determine the HDU number.
        """
        try:
            hdulist = astropy.io.fits.open(filepath, memmap=True)
        except OSError:
            return

        # The context manager closes the file once the headers have been read.
        with hdulist:
            hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
            if hduNumber is not None and hduNumber < len(hdulist):
                hdu = hdulist[hduNumber]
            else:
                hdu = None
            primaryHdu = hdulist[0] if len(hdulist) > 0 else None

            for key in lookupData.getMissingKeys():
                value = None
                if hdu is not None and key in hdu.header:
                    value = hdu.header[key]
                # If the value is not in the indicated HDU, try the primary HDU.
                elif primaryHdu is not None and key in primaryHdu.header:
                    value = primaryHdu.header[key]
                lookupData.addFoundItems({key: value})

284 

285 

class SqlRegistry(Registry):
    """A base class for SQL-based registries

    Subclasses should define the class variable `placeHolder` (the particular
    placeholder to use for parameter substitution) appropriately. The
    database's python module should define `paramstyle` (see PEP 249), which
    would indicate what to use for a placeholder:
    * paramstyle = "qmark" --> placeHolder = "?"
    * paramstyle = "format" --> placeHolder = "%s"
    Other `paramstyle` values are not currently supported.

    Constructor parameters
    ----------------------
    conn : DBAPI connection object
        Connection object
    """
    placeHolder = "?"  # Placeholder for parameter substitution

    def __init__(self, conn):
        """Constructor.

        Parameters
        ----------
        conn : DBAPI connection object
            Connection object
        """
        Registry.__init__(self)
        self.conn = conn

    def __del__(self):
        # ``conn`` may be missing if a subclass constructor failed before
        # calling SqlRegistry.__init__.
        if hasattr(self, "conn") and self.conn:
            self.conn.close()
        super().__del__()

    def _lookup(self, lookupProperties, dataId, reference, checkColumns=False):
        """Perform a lookup in the registry.

        This is the worker code for `lookup` with the added option of checking
        that all the columns being looked up are in the database. The classic
        example here is adding a template with an hdu, where the hdu in the
        dataId prevents us looking up e.g. dateObs. checkColumns results in a
        performance penalty, so is only invoked when a problem in the dataId
        keys has been seen.

        Return values are refined by the values in dataId.

        Parameters
        ----------
        lookupProperties : sequence of `str`
            Columns to select.
        dataId : `dict` or `None`
            Constraints. A string key ``k`` with value ``v`` yields
            ``k = <v>``. A non-string iterable key of exactly two column
            names ``(lo, hi)`` with value ``v`` yields
            ``<v> BETWEEN lo AND hi``. Values are always passed as bound
            parameters; keys are interpolated into the SQL text.
        reference : sequence of `str`
            Tables to NATURAL JOIN.
        checkColumns : `bool`, optional
            If True, probe each dataId key against the tables and ignore keys
            whose probe fails.

        Returns
        -------
        rows : `list` of `tuple`
            Distinct rows holding the values of lookupProperties.

        Raises
        ------
        RuntimeError
            If an iterable key does not have exactly two elements.
        """
        tables = " NATURAL JOIN ".join(reference)
        cmd = "SELECT DISTINCT " + ", ".join(lookupProperties) + " FROM " + tables

        whereList = []
        valueList = []
        if dataId:
            for k, v in dataId.items():
                isRange = hasattr(k, '__iter__') and not isinstance(k, str)

                if checkColumns:  # check if k is in registry
                    # For a range key probe the column names themselves, not
                    # the repr of the tuple.
                    probeColumns = ", ".join(k) if isRange else k
                    try:
                        self.conn.cursor().execute(f'SELECT {probeColumns} FROM {tables} LIMIT 1')
                    # NOTE(review): only the sqlite3 error type is handled
                    # here; other DBAPI backends raise their own exception
                    # classes -- confirm whether that is intended.
                    except sqlite3.OperationalError:
                        continue

                if isRange:
                    if len(k) != 2:
                        raise RuntimeError("Wrong number of keys for range:%s" % (k,))
                    whereList.append("(%s BETWEEN %s AND %s)" % (self.placeHolder, k[0], k[1]))
                else:
                    whereList.append("%s = %s" % (k, self.placeHolder))
                valueList.append(v)

        # Every key may have been dropped by the column check; a bare
        # "WHERE" would be a syntax error, so only add the clause if needed.
        if whereList:
            cmd += " WHERE " + " AND ".join(whereList)

        cursor = self.conn.cursor()
        cursor.execute(cmd, valueList)
        return list(cursor.fetchall())

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Return values are refined by the values in dataId.
        e.g. if the registry table has a single row with visit=1 and
        filter='g', dataId={'visit':1}, and lookupProperties is ['filter'],
        then the return value will be [('g',)]

        Parameters
        ----------
        lookupProperties : `str` or sequence of `str`
            Columns whose values will be returned.
        reference : `str` or sequence of `str`
            Tables that are searched (NATURAL JOINed together).
        dataId : `dict` or `None`
            Constraints; see `SqlRegistry._lookup` for how string keys and
            two-element iterable keys are translated into SQL.
        **kwargs
            Nothing needed for SQL lookup.

        Returns
        -------
        rows : `list` of `tuple` or `None`
            Values that match keys in lookupProperties, or `None` if the
            registry has no connection.
        """
        if not self.conn:
            return None

        # input variable sanitization:
        reference = sequencify(reference)
        lookupProperties = sequencify(lookupProperties)

        try:
            return self._lookup(lookupProperties, dataId, reference)
        except sqlite3.OperationalError:  # try again, with extra checking of the dataId keys
            return self._lookup(lookupProperties, dataId, reference, checkColumns=True)

    def executeQuery(self, returnFields, joinClause, whereFields, range, values):
        """Extract metadata from the registry.

        Parameters
        ----------
        returnFields : `list` of `str`
            Metadata fields to be extracted.
        joinClause : `list` of `str`
            Tables in which metadata fields are located.
        whereFields : `list` of `tuple`
            First tuple element is the metadata field to query; second is the
            value that field must have (often '?').
        range : `tuple` or `None`
            Value, lower limit, and upper limit for a range condition on the
            metadata. Any of these can be metadata fields.
        values : `tuple`
            Values to be substituted for '?' characters in the whereFields
            values or the range values.

        Returns
        -------
        rows : `list` of `tuple` or `None`
            All sets of field values that meet the criteria, or `None` if the
            registry has no connection.
        """
        if not self.conn:
            return None

        cmd = "SELECT DISTINCT " + ", ".join(returnFields)
        cmd += " FROM " + " NATURAL JOIN ".join(joinClause)

        whereList = ["(%s = %s)" % (k, v) for k, v in whereFields] if whereFields else []
        if range is not None:
            whereList.append("(%s BETWEEN %s AND %s)" % range)
        if whereList:
            cmd += " WHERE " + " AND ".join(whereList)

        cursor = self.conn.cursor()
        cursor.execute(cmd, values)
        return list(cursor.fetchall())

433 

434 

class SqliteRegistry(SqlRegistry):
    """A SQLite-based registry"""
    placeHolder = "?"  # Placeholder for parameter substitution

    def __init__(self, location):
        """Constructor

        Opens the database only when the file is already present; otherwise
        the registry is constructed without a connection (``conn`` is `None`)
        and ``root`` is not set.

        Parameters
        ----------
        location : `str`
            Path to SQLite3 file
        """
        connection = None
        if os.path.exists(location):
            connection = sqlite3.connect(location)
            # Have TEXT columns come back as `str`.
            connection.text_factory = str
            self.root = location
        super().__init__(connection)

454 

455 

class PgsqlRegistry(SqlRegistry):
    """A PostgreSQL-based registry"""
    placeHolder = "%s"

    def __init__(self, location):
        """Constructor

        Parameters
        ----------
        location : `str`
            Path to PostgreSQL configuration file.

        Raises
        ------
        RuntimeError
            If psycopg2 could not be imported, or the configuration file does
            not have the expected content.
        """
        if not havePgsql:
            raise RuntimeError("Cannot use PgsqlRegistry: could not import psycopg2")
        config = self.readYaml(location)
        self._config = config
        conn = pgsql.connect(host=config["host"], port=config["port"], database=config["database"],
                             user=config["user"], password=config["password"])
        self.root = location
        SqlRegistry.__init__(self, conn)

    @staticmethod
    def readYaml(location):
        """Read YAML configuration file

        The YAML configuration file should contain:
        * host : host name for database connection
        * port : port for database connection
        * user : user name for database connection
        * database : database name

        It may also contain:
        * password : password for database connection

        The optional entries are set to `None` in the output configuration.

        Parameters
        ----------
        location : `str`
            Path to PostgreSQL YAML config file.

        Returns
        -------
        config : `dict`
            Configuration

        Raises
        ------
        RuntimeError
            If the file does not hold a mapping, or the mapping's keys are
            not exactly the required keys plus, optionally, the optional ones.
        """
        # SECURITY NOTE(review): the unsafe loader can construct arbitrary
        # Python objects from the file. The configuration only needs plain
        # scalars, so a safe loader would probably suffice -- confirm that no
        # existing configuration relies on Python-specific tags before
        # switching.
        try:
            # PyYAML >=5.1 prefers a different loader
            loader = yaml.UnsafeLoader
        except AttributeError:
            loader = yaml.Loader
        with open(location) as ff:
            data = yaml.load(ff, Loader=loader)

        # An empty file loads as None and a list/scalar document is not a
        # mapping either; report these as configuration errors rather than
        # failing with an AttributeError further down.
        if not isinstance(data, dict):
            raise RuntimeError(
                "PostgreSQL YAML configuration (%s) should be a mapping, but this contains: %s" %
                (location, type(data).__name__)
            )

        requireKeys = {"host", "port", "database", "user"}
        optionalKeys = {"password"}
        haveKeys = set(data.keys())
        if haveKeys - optionalKeys != requireKeys:
            # Sort the key listings so the message is reproducible.
            raise RuntimeError(
                "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
                "but this contains: %s" %
                (location, ",".join("'%s'" % key for key in sorted(requireKeys)),
                 ",".join("'%s'" % key for key in sorted(data.keys(), key=str)))
            )
        for key in optionalKeys:
            data.setdefault(key, None)

        return data

    def lookup(self, *args, **kwargs):
        """Perform a lookup, rolling back the connection on any failure.

        Arguments and return value are those of `SqlRegistry.lookup`. If the
        lookup raises, ``self.conn.rollback()`` is called and the exception
        is re-raised.
        """
        try:
            return SqlRegistry.lookup(self, *args, **kwargs)
        except Exception:
            self.conn.rollback()
            raise