Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# 

2# LSST Data Management System 

3# Copyright 2008, 2009, 2010 LSST Corporation. 

4# 

5# This product includes software developed by the 

6# LSST Project (http://www.lsst.org/). 

7# 

8# This program is free software: you can redistribute it and/or modify 

9# it under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, either version 3 of the License, or 

11# (at your option) any later version. 

12# 

13# This program is distributed in the hope that it will be useful, 

14# but WITHOUT ANY WARRANTY; without even the implied warranty of 

15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

16# GNU General Public License for more details. 

17# 

18# You should have received a copy of the LSST License Statement and 

19# the GNU General Public License along with this program. If not, 

20# see <http://www.lsstcorp.org/LegalNotices/>. 

21# 

22 

23"""This module provides registry classes for maintaining dataset metadata 

24for use by the Data Butler. Currently a glob-based filesystem registry 

25(PosixRegistry) and SQL registries backed by SQLite3 and PostgreSQL are 

26implemented; registries based on a text file, a policy file, or a MySQL 

27(or other) relational database are anticipated. 

28 

29Currently this module assumes posix access (for both PosixRegistry AND 

30SqliteRegistry). It is possible that it can be factored so that at least the 

31SqliteRegistry can be remote/not on the local filesystem. For now this module 

32is only used by CameraMapper and by PosixStorage, both of which work on the 

33local filesystem only, so this works for the time being. 

34""" 

35import copy 

36from . import fsScanner, sequencify 

37import os 

38import astropy.io.fits 

39import re 

40import yaml 

41 

42try: 

43 import sqlite3 

44 haveSqlite3 = True 

45except ImportError: 

46 try: 

47 # try external pysqlite package; deprecated 

48 import sqlite as sqlite3 

49 haveSqlite3 = True 

50 except ImportError: 

51 haveSqlite3 = False 

52 

53# PostgreSQL support 

54try: 

55 import psycopg2 as pgsql 

56 havePgsql = True 

57except ImportError: 

58 havePgsql = False 

59 

60 

class Registry:
    """Common base class for every registry implementation."""

    def __init__(self):
        pass

    def __del__(self):
        pass

    @staticmethod
    def create(location):
        """Build the registry implementation that suits ``location``.

        @param location (string) Path or URL for registry, or None if
                                 unavailable
        @return None when ``location`` is None or when a sqlite3 registry
                could not open a connection; otherwise a PgsqlRegistry,
                SqliteRegistry or PosixRegistry instance.
        @throws RuntimeError if sqlite3 support is missing for a sqlite3
                location, or if no registry type fits ``location``.
        """
        if location is None:
            return None

        # Text-file ('.registry'), policy-file ('.paf') and 'mysql:' registries
        # are anticipated but not implemented, so they are not dispatched here.

        # A PostgreSQL registry is selected purely by file suffix.
        if location.endswith(".pgsql"):
            return PgsqlRegistry(location)

        # Next, look for an sqlite3 registry.
        if re.match(r'.*\.sqlite3', location):
            if haveSqlite3:
                sqliteRegistry = SqliteRegistry(location)
                # A registry without a connection is reported as "no registry".
                return sqliteRegistry if sqliteRegistry.conn is not None else None
            raise RuntimeError("sqlite3 registry specified (%s), but unable to import sqlite3 module" %
                               (location,))

        # Finally, fall back to a PosixRegistry rooted at an existing directory.
        if not os.path.isdir(location):
            raise RuntimeError("Unable to create registry using location: " + location)
        return PosixRegistry(root=location)

106 

107 

class PosixRegistry(Registry):
    """A glob-based filesystem registry.

    Lookups scan the tree under ``root`` with ``fsScanner.FsScanner`` and can
    fall back to reading metadata from the files that were found.
    """

    def __init__(self, root):
        """Constructor.

        :param root: location that lookups are performed under.
        """
        Registry.__init__(self)
        self.root = root

    @staticmethod
    def getHduNumber(template, dataId):
        """Looks up the HDU number for a given template+dataId.

        :param template: template with HDU specifier (ends with brackets and an
            identifier that can be populated by a key-value pair in dataId),
            e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
        :param dataId: dictionary that hopefully has a key-value pair whose key
            matches (has the same name) as the key specifier in the template.
        :return: the HDU specified by the template+dataId pair, or None if the
            HDU can not be determined.
        """
        # sanity check that the template at least ends with a bracket.
        if not template.endswith(']'):
            return None

        # get the key (with formatting) out of the brackets
        hduKey = template[template.rfind('[') + 1:template.rfind(']')]
        # extract the key name from the formatting
        hduKey = hduKey[hduKey.rfind('(') + 1:hduKey.rfind(')')]

        if hduKey in dataId:
            return dataId[hduKey]
        return None

    class LookupData:
        """Bookkeeping for one lookup: what is wanted and what has been found.

        Holds copies of the requested ``lookupProperties`` and of ``dataId``,
        the items found so far for the file under consideration, and a cached
        status that is reset whenever the found items change.
        """

        def __init__(self, lookupProperties, dataId):
            self.dataId = copy.copy(dataId)
            lookupProperties = sequencify(lookupProperties)
            self.lookupProperties = copy.copy(lookupProperties)
            self.foundItems = {}
            self.cachedStatus = None
            # Every key that must be resolved: requested ones plus dataId keys.
            self.neededKeys = set(lookupProperties).union(dataId.keys())

        def __repr__(self):
            return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
                (self.lookupProperties, self.dataId, self.foundItems, self.cachedStatus)

        def status(self):
            """Query the lookup status

            :return: 'match' if every key in lookupProperties is present in
                     foundItems and no found value contradicts dataId
                     'incomplete' if nothing contradicts dataId but at least one
                     key in lookupProperties has not been found yet
                     'notMatch' if a value in foundItems differs from the value
                     for the same key in dataId
            """
            if self.cachedStatus is not None:
                return self.cachedStatus

            # Private sentinel: None might be a valid found value, so it can
            # not be used to signal "not found".
            notFound = object()

            status = 'match'
            if any(self.foundItems.get(key, notFound) is notFound
                   for key in self.lookupProperties):
                status = 'incomplete'
            # A contradiction with dataId overrides 'match' and 'incomplete'.
            for dataIdKey, dataIdValue in self.dataId.items():
                foundValue = self.foundItems.get(dataIdKey, notFound)
                if foundValue is not notFound and foundValue != dataIdValue:
                    status = 'notMatch'
                    break
            self.cachedStatus = status
            return status

        def setFoundItems(self, items):
            """Replace the found items and invalidate the cached status."""
            self.cachedStatus = None
            self.foundItems = items

        def addFoundItems(self, items):
            """Merge ``items`` into the found items and invalidate the cached status."""
            self.cachedStatus = None
            self.foundItems.update(items)

        def getMissingKeys(self):
            """Return the set of needed keys that are not yet in foundItems."""
            return self.neededKeys - set(self.foundItems.keys())

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Return values are refined by the values in dataId.
        Returns a list of values that match keys in lookupProperties.
        e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
        dataId={'visit':1}, and lookupProperties is ['filter'], and the
        filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
        then the return value will be [('g',)]

        :param lookupProperties: keys whose values will be returned.
        :param reference: other data types that may be used to search for values.
            Not used by this implementation.
        :param dataId: dict-like; its keys and values restrict which files match.
            NOTE(review): earlier documentation mentioned 2-item range values,
            but LookupData.status only performs an equality comparison.
        :param **kwargs: keys required for the posix registry to search for items. If required keys are not
            provided will return an empty list.
            'template': required. template parameter (typically from a policy) that can be used to look for files
            'storage': optional. Needed to look for metadata in files. Currently supported values: 'FitsStorage'.
        :return: a list of tuples, one per matching file, holding the values
            for the keys in lookupProperties.
        """
        # required kwargs:
        if 'template' not in kwargs:
            return []
        template = kwargs['template']
        # optional kwargs:
        storage = kwargs.get('storage')

        lookupData = PosixRegistry.LookupData(lookupProperties, dataId)
        scanner = fsScanner.FsScanner(template)
        allPaths = scanner.processPath(self.root)
        retItems = []  # one item for each found file that matches
        for path, foundProperties in allPaths.items():
            # Start from what the path itself revealed; if keys are still
            # missing, try the file's metadata before judging the match.
            lookupData.setFoundItems(foundProperties)
            if lookupData.status() == 'incomplete':
                PosixRegistry.lookupMetadata(os.path.join(self.root, path), template, lookupData, storage)
            if lookupData.status() == 'match':
                retItems.append(tuple(lookupData.foundItems[key] for key in lookupData.lookupProperties))
        return retItems

    @staticmethod
    def lookupMetadata(filepath, template, lookupData, storage):
        """Dispatcher for looking up metadata in a file of a given storage type

        :param filepath: path to the file
        :param template: template that was used to discover the file
        :param lookupData: LookupData instance that is updated in place
        :param storage: storage type name; only 'FitsStorage' is handled, any
            other value leaves lookupData untouched.
        """
        if storage == 'FitsStorage':
            # Hand over the data id (not the storage name) so that the HDU
            # named in the template can actually be resolved.
            PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, lookupData.dataId)

    @staticmethod
    def lookupFitsMetadata(filepath, template, lookupData, dataId):
        """Look up metadata in a fits file.

        Will try to discover the correct HDU to look in by testing if the
        template has a value in brackets at the end.
        If the HDU is specified but the metadata key is not discovered in
        that HDU, will look in the primary HDU before giving up.

        :param filepath: path to the file
        :param template: template that was used to discover the file. This can
            be used to look up the correct HDU as needed.
        :param lookupData: an instance of LookupData that contains the
            lookupProperties, the dataId, and the data that has been found so far.
            Will be updated with new information as discovered; keys that are in
            neither HDU are recorded with the value None.
        :param dataId: dictionary used together with the template to determine
            the HDU number.
        :return: None. Returns without touching lookupData if the file can
            not be opened.
        """
        try:
            hdulist = astropy.io.fits.open(filepath, memmap=True)
        except IOError:
            return
        try:
            hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
            try:
                if hduNumber is not None and hduNumber < len(hdulist):
                    hdu = hdulist[hduNumber]
                else:
                    hdu = None
            except TypeError:
                # The data id value is not comparable to an HDU index, so the
                # HDU can not be determined from it.
                hdu = None
            primaryHdu = hdulist[0] if len(hdulist) > 0 else None

            found = {}
            for key in lookupData.getMissingKeys():
                if hdu is not None and key in hdu.header:
                    found[key] = hdu.header[key]
                # if the value is not in the indicated HDU, try the primary HDU:
                elif primaryHdu is not None and key in primaryHdu.header:
                    found[key] = primaryHdu.header[key]
                else:
                    found[key] = None
            lookupData.addFoundItems(found)
        finally:
            # Always release the file handle, even if a header read fails.
            hdulist.close()

286 

287 

class SqlRegistry(Registry):
    """A base class for SQL-based registries

    Subclasses should define the class variable `placeHolder` (the particular
    placeholder to use for parameter substitution) appropriately. The
    database's python module should define `paramstyle` (see PEP 249), which
    would indicate what to use for a placeholder:
    * paramstyle = "qmark" --> placeHolder = "?"
    * paramstyle = "format" --> placeHolder = "%s"
    Other `paramstyle` values are not currently supported.

    Constructor parameters
    ----------------------
    conn : DBAPI connection object
        Connection object
    """
    placeHolder = "?"  # Placeholder for parameter substitution

    def __init__(self, conn):
        """Constructor.

        Parameters
        ----------
        conn : DBAPI connection object
            Connection object
        """
        Registry.__init__(self)
        self.conn = conn

    def __del__(self):
        # `conn` may be missing if a subclass constructor failed before
        # calling SqlRegistry.__init__.
        if hasattr(self, "conn") and self.conn:
            self.conn.close()
        super().__del__()

    def _fetchAll(self, cmd, values):
        """Execute ``cmd`` with ``values`` and return all rows as a list.

        The cursor is closed whether or not the query succeeds.
        """
        cursor = self.conn.cursor()
        try:
            cursor.execute(cmd, values)
            return list(cursor.fetchall())
        finally:
            cursor.close()

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Builds and runs
        ``SELECT DISTINCT <lookupProperties> FROM <reference tables, NATURAL
        JOINed> [WHERE ...]`` where the WHERE clause is derived from dataId.

        NOTE(review): column and table names are interpolated into the SQL
        text (only the values are bound as parameters), so lookupProperties,
        reference and the keys of dataId must come from trusted sources.

        :param lookupProperties: column name(s) whose values will be returned.
        :param reference: table name(s) to select from.
        :param dataId: dict-like, or None for no restriction.
            If a key is a string then rows are required to have that column
            equal to the value.
            If a key is a 2-item iterable (not a string) then the value is
            required to lie between the columns named key[0] and key[1].
        :param **kwargs: nothing needed for SQL lookup
        :return: a list of rows with the values for lookupProperties, or None
            if there is no database connection.
        :raises RuntimeError: if a non-string dataId key does not have exactly
            two items.
        """
        if not self.conn:
            return None

        # input variable sanitization:
        reference = sequencify(reference)
        lookupProperties = sequencify(lookupProperties)

        cmd = "SELECT DISTINCT "
        cmd += ", ".join(lookupProperties)
        cmd += " FROM " + " NATURAL JOIN ".join(reference)
        valueList = []
        if dataId is not None and len(dataId) > 0:
            whereList = []
            for k, v in dataId.items():
                if hasattr(k, '__iter__') and not isinstance(k, str):
                    if len(k) != 2:
                        raise RuntimeError("Wrong number of keys for range:%s" % (k,))
                    whereList.append("(%s BETWEEN %s AND %s)" % (self.placeHolder, k[0], k[1]))
                else:
                    whereList.append("%s = %s" % (k, self.placeHolder))
                valueList.append(v)
            cmd += " WHERE " + " AND ".join(whereList)
        return self._fetchAll(cmd, valueList)

    def executeQuery(self, returnFields, joinClause, whereFields, range, values):
        """Extract metadata from the registry.
        @param returnFields (list of strings) Metadata fields to be extracted.
        @param joinClause   (list of strings) Tables in which metadata fields
                            are located.
        @param whereFields  (list of tuples) First tuple element is metadata
                            field to query; second is the value that field
                            must have (often '?').
        @param range        (tuple) Value, lower limit, and upper limit for a
                            range condition on the metadata. Any of these can
                            be metadata fields. May be None for no range
                            condition.
        @param values       (tuple) Tuple of values to be substituted for '?'
                            characters in the whereFields values or the range
                            values.
        @return (list of tuples) All sets of field values that meet the
                criteria, or None if there is no database connection"""
        if not self.conn:
            return None
        cmd = "SELECT DISTINCT "
        cmd += ", ".join(returnFields)
        cmd += " FROM " + " NATURAL JOIN ".join(joinClause)
        whereList = []
        if whereFields:
            whereList.extend("(%s = %s)" % (k, v) for k, v in whereFields)
        if range is not None:
            whereList.append("(%s BETWEEN %s AND %s)" % range)
        if whereList:
            cmd += " WHERE " + " AND ".join(whereList)
        return self._fetchAll(cmd, values)

400 

401 

class SqliteRegistry(SqlRegistry):
    """Registry stored in a SQLite3 database file."""
    placeHolder = "?"  # Placeholder for parameter substitution

    def __init__(self, location):
        """Open the SQLite3 file at ``location`` if it exists.

        When the file does not exist the registry is created without a
        connection (``conn`` is None) and ``root`` is not set.

        Parameters
        ----------
        location : `str`
            Path to SQLite3 file
        """
        if not os.path.exists(location):
            SqlRegistry.__init__(self, None)
            return
        connection = sqlite3.connect(location)
        connection.text_factory = str
        self.root = location
        SqlRegistry.__init__(self, connection)

421 

422 

class PgsqlRegistry(SqlRegistry):
    """A PostgreSQL-based registry"""
    placeHolder = "%s"

    def __init__(self, location):
        """Constructor

        Parameters
        ----------
        location : `str`
            Path to PostgreSQL configuration file.

        Raises
        ------
        RuntimeError
            If psycopg2 could not be imported, or the configuration file is
            not valid (see `readYaml`).
        """
        if not havePgsql:
            raise RuntimeError("Cannot use PgsqlRegistry: could not import psycopg2")
        config = self.readYaml(location)
        self._config = config
        conn = pgsql.connect(host=config["host"], port=config["port"], database=config["database"],
                             user=config["user"], password=config["password"])
        self.root = location
        SqlRegistry.__init__(self, conn)

    @staticmethod
    def readYaml(location):
        """Read YAML configuration file

        The YAML configuration file should contain:
        * host : host name for database connection
        * port : port for database connection
        * user : user name for database connection
        * database : database name

        It may also contain:
        * password : password for database connection

        The optional entries are set to `None` in the output configuration.

        Parameters
        ----------
        location : `str`
            Path to PostgreSQL YAML config file.

        Returns
        -------
        config : `dict`
            Configuration

        Raises
        ------
        RuntimeError
            If the file does not hold a mapping, or its keys are not exactly
            the required keys plus, optionally, 'password'.
        """
        # PyYAML >=5.1 prefers a different loader
        loader = getattr(yaml, "FullLoader", yaml.Loader)
        # NOTE(review): neither of these loaders is meant for untrusted input;
        # this assumes the configuration file is trusted. Consider
        # yaml.safe_load if that can not be guaranteed.
        with open(location) as ff:
            data = yaml.load(ff, Loader=loader)
        # An empty file loads as None and a scalar/list file has no keys;
        # report that as a configuration error instead of an AttributeError.
        if not isinstance(data, dict):
            raise RuntimeError(
                "PostgreSQL YAML configuration (%s) should be a mapping, but this contains: %r" %
                (location, data)
            )
        requireKeys = {"host", "port", "database", "user"}
        optionalKeys = {"password"}
        if set(data) - optionalKeys != requireKeys:
            raise RuntimeError(
                "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
                "but this contains: %s" %
                (location, ",".join("'%s'" % key for key in sorted(requireKeys)),
                 ",".join("'%s'" % key for key in data))
            )
        for key in optionalKeys:
            data.setdefault(key, None)

        return data

    def lookup(self, *args, **kwargs):
        """Perform a lookup, rolling back the transaction if it fails.

        Arguments and return value are those of `SqlRegistry.lookup`. On any
        exception the connection is rolled back and the exception re-raised.
        """
        try:
            return SqlRegistry.lookup(self, *args, **kwargs)
        except Exception:
            self.conn.rollback()
            raise