23 """This module provides registry classes for maintaining dataset metadata 24 for use by the Data Butler. Currently only a SQLite3-based registry is 25 implemented, but registries based on a text file, a policy file, a MySQL 26 (or other) relational database, and data gathered from scanning a filesystem 29 Currently this module assumes posix access (for both PosixRegistry AND 30 SqliteRegistry). It is possible that it can be factored so that at least the 31 SqliteRegistry can be remote/not on the local filesystem. For now this module 32 is only used by CameraMapper and by PosixStorage, both of which work on the 33 local filesystem only, so this works for the time being. 35 from __future__
import absolute_import
36 from builtins
import object
37 from past.builtins
import basestring
40 from .
import fsScanner, sequencify
42 import astropy.io.fits
52 import sqlite
as sqlite3
59 import psycopg2
as pgsql
66 """The registry base class.""" 73 """Create a registry object of an appropriate type. 74 @param location (string) Path or URL for registry, or None if 85 if location.endswith(
".pgsql"):
89 if re.match(
r'.*\.sqlite3', location):
91 raise RuntimeError(
"sqlite3 registry specified (%s), but unable to import sqlite3 module" %
94 if registry.conn
is None:
103 if os.path.isdir(location):
106 raise RuntimeError(
"Unable to create registry using location: " + location)
110 """A glob-based filesystem registry""" 113 Registry.__init__(self)
118 """Looks up the HDU number for a given template+dataId. 119 :param template: template with HDU specifier (ends with brackets and an 120 identifier that can be populated by a key-value pair in dataId). 121 e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]" 122 :param dataId: dictionary that hopefully has a key-value pair whose key 123 matches (has the same name as) the key specifier in the template. 124 :return: the HDU specified by the template+dataId pair, or None if the 125 HDU cannot be determined. 128 if not template.endswith(
']'):
132 hduKey = template[template.rfind(
'[') + 1:template.rfind(
']')]
134 hduKey = hduKey[hduKey.rfind(
'(') + 1:hduKey.rfind(
')')]
137 return dataId[hduKey]
144 lookupProperties =
sequencify(lookupProperties)
151 return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
155 """Query the lookup status 157 :return: 'match' if the key+value pairs in dataId have been satisfied and keys in 158 lookupProperties have been found and their key+value added to foundItems 159 'incomplete' if the found data matches but not all keys in lookupProperties have been matched 160 'notMatch' if data in foundItems does not match data in dataId 163 """Placeholder class for item not found. 165 (None might be a valid value so we don't want to use that) 169 if self.cachedStatus
is not None:
170 return self.cachedStatus
171 self.cachedStatus =
'match' 172 for key
in self.lookupProperties:
173 val = self.foundItems.get(key, NotFound)
175 self.cachedStatus =
'incomplete' 177 for dataIdKey, dataIdValue
in self.dataId.items():
178 foundValue = self.foundItems.get(dataIdKey, NotFound)
179 if foundValue
is not NotFound
and foundValue != dataIdValue:
180 self.cachedStatus =
'notMatch' 182 return self.cachedStatus
195 def lookup(self, lookupProperties, reference, dataId, **kwargs):
196 """Perform a lookup in the registry. 198 Return values are refined by the values in dataId. 199 Returns a list of values that match keys in lookupProperties. 200 e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and 201 dataId={'visit':1}, and lookupProperties is ['filter'], and the 202 filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz' 203 then the return value will be [('g',)] 205 :param lookupProperties: keys whose values will be returned. 206 :param reference: other data types that may be used to search for values. 207 :param dataId: must be an iterable. Keys must be strings. 208 If value is a string then will look for elements in the repository that match value for key. 209 If value is a 2-item iterable then will look for elements in the repository that are between (inclusive) 210 the first and second items in the value. 211 :param **kwargs: keys required for the posix registry to search for items. If required keys are not 212 provided, an empty list will be returned. 213 'template': required. template parameter (typically from a policy) that can be used to look for files 214 'storage': optional. Needed to look for metadata in files. Currently supported values: 'FitsStorage'. 215 :return: a list of values that match keys in lookupProperties. 218 if 'template' in kwargs:
219 template = kwargs[
'template']
223 storage = kwargs[
'storage']
if 'storage' in kwargs
else None 227 allPaths = scanner.processPath(self.
root)
229 for path, foundProperties
in allPaths.items():
234 lookupData.setFoundItems(foundProperties)
235 if 'incomplete' == lookupData.status():
236 PosixRegistry.lookupMetadata(os.path.join(self.
root, path), template, lookupData, storage)
237 if 'match' == lookupData.status():
238 l = tuple(lookupData.foundItems[key]
for key
in lookupData.lookupProperties)
244 """Dispatcher for looking up metadata in a file of a given storage type 246 if storage ==
'FitsStorage':
247 PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, storage)
251 """Look up metadata in a fits file. 252 Will try to discover the correct HDU to look in by testing if the 253 template has a value in brackets at the end. 254 If the HDU is specified but the metadata key is not discovered in 255 that HDU, will look in the primary HDU before giving up. 256 :param filepath: path to the file 257 :param template: template that was used to discover the file. This can 258 be used to look up the correct HDU as needed. 259 :param lookupData: an instance of LookupData that contains the 260 lookupProperties, the dataId, and the data that has been found so far. 261 Will be updated with new information as discovered. 266 hdulist = astropy.io.fits.open(filepath, memmap=
True)
269 hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
270 if hduNumber
is not None and hduNumber < len(hdulist):
271 hdu = hdulist[hduNumber]
275 primaryHdu = hdulist[0]
279 for property
in lookupData.getMissingKeys():
281 if hdu
is not None and property
in hdu.header:
282 propertyValue = hdu.header[property]
284 elif primaryHdu
is not None and property
in primaryHdu.header:
285 propertyValue = primaryHdu.header[property]
286 lookupData.addFoundItems({property: propertyValue})
290 """A base class for SQL-based registries 292 Subclasses should define the class variable `placeHolder` (the particular 293 placeholder to use for parameter substitution) appropriately. The 294 database's python module should define `paramstyle` (see PEP 249), which 295 would indicate what to use for a placeholder: 296 * paramstyle = "qmark" --> placeHolder = "?" 297 * paramstyle = "format" --> placeHolder = "%s" 298 Other `paramstyle` values are not currently supported. 300 Constructor parameters 301 ---------------------- 302 conn : DBAPI connection object 312 conn : DBAPI connection object 315 Registry.__init__(self)
318 def lookup(self, lookupProperties, reference, dataId, **kwargs):
319 """Perform a lookup in the registry. 321 Return values are refined by the values in dataId. 322 Returns a list of values that match keys in lookupProperties. 323 e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and 324 dataId={'visit':1}, and lookupProperties is ['filter'], and the 325 filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz' 326 then the return value will be [('g',)] 328 :param lookupProperties: 329 :param dataId: must be an iterable. Keys must be string. 330 If key is a string then will look for elements in the repository that match value for key. 331 If key is a 2-item iterable then will look for elements in the repository where the value is between 332 the values of key[0] and key[1]. 333 :param reference: other data types that may be used to search for values. 334 :param **kwargs: nothing needed for sqlite lookup 335 :return: a list of values that match keys in lookupProperties. 342 lookupProperties =
sequencify(lookupProperties)
344 cmd =
"SELECT DISTINCT " 345 cmd +=
", ".join(lookupProperties)
346 cmd +=
" FROM " +
" NATURAL JOIN ".join(reference)
348 if dataId
is not None and len(dataId) > 0:
350 for k, v
in dataId.items():
351 if hasattr(k,
'__iter__')
and not isinstance(k, basestring):
353 raise RuntimeError(
"Wrong number of keys for range:%s" % (k,))
354 whereList.append(
"(%s BETWEEN %s AND %s)" % (self.
placeHolder, k[0], k[1]))
357 whereList.append(
"%s = %s" % (k, self.
placeHolder))
359 cmd +=
" WHERE " +
" AND ".join(whereList)
360 cursor = self.
conn.cursor()
361 cursor.execute(cmd, valueList)
362 return [row
for row
in cursor.fetchall()]
364 def executeQuery(self, returnFields, joinClause, whereFields, range, values):
365 """Extract metadata from the registry. 366 @param returnFields (list of strings) Metadata fields to be extracted. 367 @param joinClause (list of strings) Tables in which metadata fields 369 @param whereFields (list of tuples) First tuple element is metadata 370 field to query; second is the value that field 371 must have (often '?'). 372 @param range (tuple) Value, lower limit, and upper limit for a 373 range condition on the metadata. Any of these can 375 @param values (tuple) Tuple of values to be substituted for '?' 376 characters in the whereFields values or the range 378 @return (list of tuples) All sets of field values that meet the 382 cmd =
"SELECT DISTINCT " 383 cmd +=
", ".join(returnFields)
384 cmd +=
" FROM " +
" NATURAL JOIN ".join(joinClause)
387 for k, v
in whereFields:
388 whereList.append(
"(%s = %s)" % (k, v))
389 if range
is not None:
390 whereList.append(
"(%s BETWEEN %s AND %s)" % range)
391 if len(whereList) > 0:
392 cmd +=
" WHERE " +
" AND ".join(whereList)
393 cursor = self.
conn.cursor()
394 cursor.execute(cmd, values)
395 return [row
for row
in cursor.fetchall()]
399 """A SQLite-based registry""" 410 if os.path.exists(location):
411 conn = sqlite3.connect(location)
412 conn.text_factory = str
415 SqlRegistry.__init__(self, conn)
419 """A PostgreSQL-based registry""" 428 Path to PostgreSQL configuration file. 431 raise RuntimeError(
"Cannot use PgsqlRegistry: could not import psycopg2")
434 conn = pgsql.connect(host=config[
"host"], port=config[
"port"], database=config[
"database"],
435 user=config[
"user"], password=config[
"password"])
436 SqlRegistry.__init__(self, conn)
444 """Read YAML configuration file 446 The YAML configuration file should contain: 447 * host : host name for database connection 448 * port : port for database connection 449 * user : user name for database connection 450 * database : database name 453 * password : password for database connection 455 The optional entries are set to `None` in the output configuration. 460 Path to PostgreSQL YAML config file. 467 with open(location)
as ff:
469 requireKeys = set([
"host",
"port",
"database",
"user"])
470 optionalKeys = set([
"password"])
471 haveKeys = set(data.keys())
472 if haveKeys - optionalKeys != requireKeys:
474 "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', " 475 "but this contains: %s" %
476 (location,
",".join(
"'%s'" % key
for key
in requireKeys),
477 ",".join(
"'%s'" % key
for key
in data.keys()))
479 for key
in optionalKeys:
487 return SqlRegistry.lookup(self, *args, **kwargs)
488 except Exception
as exc:
def __init__(self, location)
def lookup(self, lookupProperties, reference, dataId, kwargs)
def lookupFitsMetadata(filepath, template, lookupData, dataId)
def lookup(self, args, kwargs)
def getHduNumber(template, dataId)
def __init__(self, lookupProperties, dataId)
def addFoundItems(self, items)
def lookup(self, lookupProperties, reference, dataId, kwargs)
def executeQuery(self, returnFields, joinClause, whereFields, range, values)
def lookupMetadata(filepath, template, lookupData, storage)
def setFoundItems(self, items)
def __init__(self, location)