Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# 

2# LSST Data Management System 

3# Copyright 2008, 2009, 2010 LSST Corporation. 

4# 

5# This product includes software developed by the 

6# LSST Project (http://www.lsst.org/). 

7# 

8# This program is free software: you can redistribute it and/or modify 

9# it under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, either version 3 of the License, or 

11# (at your option) any later version. 

12# 

13# This program is distributed in the hope that it will be useful, 

14# but WITHOUT ANY WARRANTY; without even the implied warranty of 

15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

16# GNU General Public License for more details. 

17# 

18# You should have received a copy of the LSST License Statement and 

19# the GNU General Public License along with this program. If not, 

20# see <http://www.lsstcorp.org/LegalNotices/>. 

21# 

22 

23"""This module provides registry classes for maintaining dataset metadata 

24for use by the Data Butler. Currently only a SQLite3-based registry is 

25implemented, but registries based on a text file, a policy file, a MySQL 

26(or other) relational database, and data gathered from scanning a filesystem 

27are all anticipated. 

28 

29Currently this module assumes posix access (for both PosixRegistry AND 

30SqliteRegistry). It is possible that it can be factored so that at least the 

31SqliteRegistry can be remote/not on the local filesystem. For now this module 

32is only used by CameraMapper and by PosixStorage, both of which work on the 

33local filesystem only, so this works for the time being. 

34""" 

35import copy 

36from . import fsScanner, sequencify 

37import os 

38import astropy.io.fits 

39import re 

40import yaml 

41 

# SQLite support: prefer the standard-library module and fall back to the
# deprecated external pysqlite package when it is the only one available.
try:
    import sqlite3
except ImportError:
    try:
        import sqlite as sqlite3
    except ImportError:
        haveSqlite3 = False
    else:
        haveSqlite3 = True
else:
    haveSqlite3 = True

# PostgreSQL support is optional; record whether the driver could be imported.
try:
    import psycopg2 as pgsql
except ImportError:
    havePgsql = False
else:
    havePgsql = True

59 

60 

class Registry:
    """Common base class of all registries.

    It holds no state of its own; its main purpose is to host the `create`
    factory that selects a concrete registry class from a location string.
    """

    def __init__(self):
        pass

    def __del__(self):
        pass

    @staticmethod
    def create(location):
        """Build the registry implementation that matches a location.

        The location is examined in this order: a name ending in ".pgsql"
        selects a PgsqlRegistry, a name matching ``.*\\.sqlite3`` selects a
        SqliteRegistry, and an existing directory selects a PosixRegistry.

        @param location (string) Path or URL for registry, or None if
                                 unavailable
        @return a registry object, or None if location is None or if the
                SqliteRegistry that was built has no connection
        @throws RuntimeError if an sqlite3 registry is requested but the
                sqlite3 module could not be imported, or if no registry type
                matches the location"""
        if location is None:
            return None

        if location.endswith(".pgsql"):
            return PgsqlRegistry(location)

        # An sqlite3 registry is recognised by its file name.
        isSqliteLocation = re.match(r'.*\.sqlite3', location) is not None
        if isSqliteLocation:
            if not haveSqlite3:
                raise RuntimeError("sqlite3 registry specified (%s), but unable to import sqlite3 module" %
                                   (location,))
            candidate = SqliteRegistry(location)
            # A SqliteRegistry without a connection is reported as "no registry".
            return None if candidate.conn is None else candidate

        # Last resort: a directory is served by the filesystem-scanning registry.
        if not os.path.isdir(location):
            raise RuntimeError("Unable to create registry using location: " + location)
        return PosixRegistry(root=location)

104 

105 

class PosixRegistry(Registry):
    """A glob-based filesystem registry.

    Lookups scan the files below ``root`` with a path template. Values that
    can not be recovered from the path itself may be read from metadata
    stored in the matching file (currently only for 'FitsStorage').
    """

    def __init__(self, root):
        """Constructor.

        :param root: directory that file paths are resolved against.
        """
        Registry.__init__(self)
        self.root = root

    @staticmethod
    def getHduNumber(template, dataId):
        """Looks up the HDU number for a given template+dataId.

        :param template: template with HDU specifier (ends with brackets and an
            identifier that can be populated by a key-value pair in dataId),
            e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
        :param dataId: dictionary that hopefully has a key-value pair whose key
            matches (has the same name as) the key specifier in the template.
        :return: the HDU specified by the template+dataId pair, or None if the
            HDU can not be determined.
        """
        # Sanity check that the template at least ends with a bracket.
        if not template.endswith(']'):
            return None

        # Get the key (with formatting) out of the brackets ...
        hduKey = template[template.rfind('[') + 1:template.rfind(']')]
        # ... and extract the key name from the formatting, e.g. "%(ccdnum)d".
        hduKey = hduKey[hduKey.rfind('(') + 1:hduKey.rfind(')')]

        if hduKey in dataId:
            return dataId[hduKey]
        return None

    class LookupData:
        """State used while testing one candidate file against a lookup.

        Holds the requested property names, the dataId constraints, and the
        key/value pairs found so far for the file being examined.
        """

        def __init__(self, lookupProperties, dataId):
            """Constructor.

            :param lookupProperties: key or keys whose values are requested.
            :param dataId: dict of key/value constraints the file must satisfy.
            """
            self.dataId = copy.copy(dataId)
            lookupProperties = sequencify(lookupProperties)
            self.lookupProperties = copy.copy(lookupProperties)
            self.foundItems = {}
            self.cachedStatus = None
            # Every key that has to be known before a verdict is final.
            self.neededKeys = set(lookupProperties).union(dataId.keys())

        def __repr__(self):
            return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
                (self.lookupProperties, self.dataId, self.foundItems, self.cachedStatus)

        def status(self):
            """Query the lookup status.

            The result is cached until the found items are changed.

            :return: 'match' if every key in lookupProperties has been found and
                     no found value contradicts dataId,
                     'incomplete' if some key in lookupProperties has not been found
                     (and no found value contradicts dataId),
                     'notMatch' if a found value differs from the value in dataId.
            """
            class NotFound:
                """Placeholder class for item not found.

                (None might be a valid value so we don't want to use that)
                """
                pass

            if self.cachedStatus is not None:
                return self.cachedStatus
            self.cachedStatus = 'match'
            for key in self.lookupProperties:
                val = self.foundItems.get(key, NotFound)
                if val is NotFound:
                    self.cachedStatus = 'incomplete'
                    break
            # A contradiction overrides both 'match' and 'incomplete'.
            for dataIdKey, dataIdValue in self.dataId.items():
                foundValue = self.foundItems.get(dataIdKey, NotFound)
                if foundValue is not NotFound and foundValue != dataIdValue:
                    self.cachedStatus = 'notMatch'
                    break
            return self.cachedStatus

        def setFoundItems(self, items):
            """Replace the found items and invalidate the cached status."""
            self.cachedStatus = None
            self.foundItems = items

        def addFoundItems(self, items):
            """Merge items into the found items and invalidate the cached status."""
            self.cachedStatus = None
            self.foundItems.update(items)

        def getMissingKeys(self):
            """Return the set of needed keys that have not been found yet."""
            return self.neededKeys - set(self.foundItems.keys())

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Return values are refined by the values in dataId.
        Returns a list of values that match keys in lookupProperties.
        e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
        dataId={'visit':1}, and lookupProperties is ['filter'], and the
        filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
        then the return value will be [('g',)]

        :param lookupProperties: keys whose values will be returned.
        :param reference: other data types that may be used to search for values
            (not used by this implementation).
        :param dataId: dict of key/value pairs; a found value that differs from
            the value given here excludes the file.
        :param **kwargs: keys required for the posix registry to search for items. If required keys are not
            provided will return an empty list.
            'template': required. template parameter (typically from a policy) that can be used to look for files
            'storage': optional. Needed to look for metadata in files. Currently supported values: 'FitsStorage'.
        :return: a list of tuples, one per matching file, holding the values of
            the keys in lookupProperties.
        """
        # required kwargs:
        if 'template' not in kwargs:
            return []
        template = kwargs['template']
        # optional kwargs:
        storage = kwargs.get('storage')

        lookupData = PosixRegistry.LookupData(lookupProperties, dataId)
        scanner = fsScanner.FsScanner(template)
        allPaths = scanner.processPath(self.root)
        retItems = []  # one item for each found file that matches
        for path, foundProperties in allPaths.items():
            # Start from what the path itself provides; if that leaves
            # requested keys unresolved, try the metadata of the file.
            lookupData.setFoundItems(foundProperties)
            if 'incomplete' == lookupData.status():
                PosixRegistry.lookupMetadata(os.path.join(self.root, path), template, lookupData, storage)
            if 'match' == lookupData.status():
                retItems.append(tuple(lookupData.foundItems[key] for key in lookupData.lookupProperties))
        return retItems

    @staticmethod
    def lookupMetadata(filepath, template, lookupData, storage):
        """Dispatcher for looking up metadata in a file of a given storage type.

        Does nothing for storage types other than 'FitsStorage'.

        :param filepath: path to the file
        :param template: template that was used to discover the file
        :param lookupData: LookupData instance to be updated with found values
        :param storage: name of the storage type of the file
        """
        if storage == 'FitsStorage':
            # The last argument is the dataId used to resolve the HDU named in
            # the template; the storage name must not be passed in its place.
            PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, lookupData.dataId)

    @staticmethod
    def lookupFitsMetadata(filepath, template, lookupData, dataId):
        """Look up metadata in a fits file.

        Will try to discover the correct HDU to look in by testing if the
        template has a value in brackets at the end.
        If the HDU is specified but the metadata key is not discovered in
        that HDU, will look in the primary HDU before giving up.
        Keys found in neither HDU are recorded with the value None.

        :param filepath: path to the file
        :param template: template that was used to discover the file. This can
            be used to look up the correct HDU as needed.
        :param lookupData: an instance of LookupData that contains the
            lookupProperties, the dataId, and the data that has been found so far.
            Will be updated with new information as discovered.
        :param dataId: dict used to resolve the HDU key named in the template.
        :return: None. Returns without changes if the file can not be opened.
        """
        try:
            hdulist = astropy.io.fits.open(filepath, memmap=True)
        except IOError:
            return
        try:
            hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
            # Only an in-range integer can be used as an index here; any other
            # value falls back to the primary HDU alone.
            hdu = None
            if isinstance(hduNumber, int) and 0 <= hduNumber < len(hdulist):
                hdu = hdulist[hduNumber]
            primaryHdu = hdulist[0] if len(hdulist) > 0 else None

            for key in lookupData.getMissingKeys():
                propertyValue = None
                if hdu is not None and key in hdu.header:
                    propertyValue = hdu.header[key]
                # if the value is not in the indicated HDU, try the primary HDU:
                elif primaryHdu is not None and key in primaryHdu.header:
                    propertyValue = primaryHdu.header[key]
                lookupData.addFoundItems({key: propertyValue})
        finally:
            # Release the file handle once the headers have been read.
            hdulist.close()

284 

285 

class SqlRegistry(Registry):
    """A base class for SQL-based registries

    Subclasses should define the class variable `placeHolder` (the particular
    placeholder to use for parameter substitution) appropriately. The
    database's python module should define `paramstyle` (see PEP 249), which
    would indicate what to use for a placeholder:
    * paramstyle = "qmark" --> placeHolder = "?"
    * paramstyle = "format" --> placeHolder = "%s"
    Other `paramstyle` values are not currently supported.

    Constructor parameters
    ----------------------
    conn : DBAPI connection object
        Connection object
    """
    placeHolder = "?"  # Placeholder for parameter substitution

    def __init__(self, conn):
        """Constructor.

        Parameters
        ----------
        conn : DBAPI connection object
            Connection object
        """
        Registry.__init__(self)
        self.conn = conn

    def __del__(self):
        # The attribute may be absent if a subclass constructor failed early.
        connection = getattr(self, "conn", None)
        if connection:
            connection.close()
        super().__del__()

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Builds and runs a ``SELECT DISTINCT`` over the natural join of the
        reference tables, restricted by the entries of dataId.

        :param lookupProperties: column name(s) whose values are returned.
        :param reference: table name(s) to be joined for the query.
        :param dataId: dict of constraints, or None. Keys must be strings or
            2-item iterables of strings.
            If key is a string then rows are selected where that column equals the value.
            If key is a 2-item iterable then rows are selected where the value is between
            the columns key[0] and key[1].
        :param **kwargs: nothing needed for SQL lookup
        :return: a list of rows with the values of lookupProperties, or None
            if there is no connection.
        """
        if not self.conn:
            return None

        # input variable sanitization:
        tables = sequencify(reference)
        columns = sequencify(lookupProperties)

        query = "".join(["SELECT DISTINCT ", ", ".join(columns),
                         " FROM ", " NATURAL JOIN ".join(tables)])
        parameters = []
        if dataId is not None and len(dataId) > 0:
            conditions = []
            for key, value in dataId.items():
                isRangeKey = hasattr(key, '__iter__') and not isinstance(key, str)
                if not isRangeKey:
                    conditions.append("%s = %s" % (key, self.placeHolder))
                else:
                    if len(key) != 2:
                        raise RuntimeError("Wrong number of keys for range:%s" % (key,))
                    conditions.append("(%s BETWEEN %s AND %s)" % (self.placeHolder, key[0], key[1]))
                parameters.append(value)
            query += " WHERE " + " AND ".join(conditions)
        cursor = self.conn.cursor()
        cursor.execute(query, parameters)
        return list(cursor.fetchall())

    def executeQuery(self, returnFields, joinClause, whereFields, range, values):
        """Extract metadata from the registry.
        @param returnFields (list of strings) Metadata fields to be extracted.
        @param joinClause   (list of strings) Tables in which metadata fields
                            are located.
        @param whereFields  (list of tuples) First tuple element is metadata
                            field to query; second is the value that field
                            must have (often '?').
        @param range        (tuple) Value, lower limit, and upper limit for a
                            range condition on the metadata. Any of these can
                            be metadata fields.
        @param values       (tuple) Tuple of values to be substituted for '?'
                            characters in the whereFields values or the range
                            values.
        @return (list of tuples) All sets of field values that meet the
                criteria, or None if there is no connection"""
        if not self.conn:
            return None
        query = "".join(["SELECT DISTINCT ", ", ".join(returnFields),
                         " FROM ", " NATURAL JOIN ".join(joinClause)])
        conditions = []
        if whereFields:
            conditions.extend("(%s = %s)" % (field, value) for field, value in whereFields)
        if range is not None:
            conditions.append("(%s BETWEEN %s AND %s)" % range)
        if conditions:
            query += " WHERE " + " AND ".join(conditions)
        cursor = self.conn.cursor()
        cursor.execute(query, values)
        return list(cursor.fetchall())

398 

399 

class SqliteRegistry(SqlRegistry):
    """A SQLite-based registry"""
    placeHolder = "?"  # Placeholder for parameter substitution

    def __init__(self, location):
        """Constructor

        When the file does not exist no connection is made: the registry is
        built with ``conn`` set to `None` and ``root`` is not assigned.

        Parameters
        ----------
        location : `str`
            Path to SQLite3 file
        """
        connection = None
        if os.path.exists(location):
            connection = sqlite3.connect(location)
            connection.text_factory = str
            self.root = location
        SqlRegistry.__init__(self, connection)

419 

420 

class PgsqlRegistry(SqlRegistry):
    """A PostgreSQL-based registry"""
    placeHolder = "%s"

    def __init__(self, location):
        """Constructor

        Parameters
        ----------
        location : `str`
            Path to PostgreSQL configuration file.

        Raises
        ------
        RuntimeError
            If psycopg2 could not be imported, or if the configuration file
            does not have the expected contents.
        """
        if not havePgsql:
            raise RuntimeError("Cannot use PgsqlRegistry: could not import psycopg2")
        config = self.readYaml(location)
        self._config = config
        conn = pgsql.connect(host=config["host"], port=config["port"], database=config["database"],
                             user=config["user"], password=config["password"])
        self.root = location
        SqlRegistry.__init__(self, conn)

    @staticmethod
    def readYaml(location):
        """Read YAML configuration file

        The YAML configuration file should contain:
        * host : host name for database connection
        * port : port for database connection
        * user : user name for database connection
        * database : database name

        It may also contain:
        * password : password for database connection

        The optional entries are set to `None` in the output configuration.

        Parameters
        ----------
        location : `str`
            Path to PostgreSQL YAML config file.

        Returns
        -------
        config : `dict`
            Configuration

        Raises
        ------
        RuntimeError
            If the file does not hold a mapping, or if its keys are not
            exactly the required keys plus, optionally, the optional ones.
        """
        # PyYAML >=5.1 prefers a different loader
        # NOTE(review): the configuration only needs plain scalars, so
        # yaml.safe_load would presumably be sufficient here - confirm
        # before switching loaders.
        loader = getattr(yaml, "FullLoader", yaml.Loader)
        with open(location) as ff:
            data = yaml.load(ff, Loader=loader)
        requireKeys = {"host", "port", "database", "user"}
        optionalKeys = {"password"}
        # Sorted so that the error message is reproducible between runs.
        requireText = ",".join("'%s'" % key for key in sorted(requireKeys))
        # An empty file loads as None and a list or scalar has no keys;
        # report those the same way as a mapping with the wrong keys.
        if not isinstance(data, dict):
            raise RuntimeError(
                "PostgreSQL YAML configuration (%s) should be a mapping containing %s, and may contain "
                "'password', but this contains: %r" % (location, requireText, data)
            )
        haveKeys = set(data.keys())
        if haveKeys - optionalKeys != requireKeys:
            raise RuntimeError(
                "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
                "but this contains: %s" %
                (location, requireText,
                 ",".join("'%s'" % key for key in sorted(data.keys(), key=str)))
            )
        for key in optionalKeys:
            data.setdefault(key, None)

        return data

    def lookup(self, *args, **kwargs):
        """Perform a lookup, rolling the connection back if the lookup fails.

        Arguments and return value are those of `SqlRegistry.lookup`; any
        exception is re-raised after the rollback.
        """
        try:
            return SqlRegistry.lookup(self, *args, **kwargs)
        except Exception:
            self.conn.rollback()
            raise