lsst.daf.persistence  13.0-36-gb9d951e
 All Classes Namespaces Files Functions Variables Typedefs Friends Macros
registries.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 # Copyright 2008, 2009, 2010 LSST Corporation.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <http://www.lsstcorp.org/LegalNotices/>.
21 #
22 
23 """This module provides registry classes for maintaining dataset metadata
24 for use by the Data Butler. Currently only a SQLite3-based registry is
25 implemented, but registries based on a text file, a policy file, a MySQL
26 (or other) relational database, and data gathered from scanning a filesystem
27 are all anticipated.
28 
29 Currently this module assumes posix access (for both PosixRegistry AND
30 SqliteRegistry). It is possible that it can be factored so that at least the
31 SqliteRegistry can be remote/not on the local filesystem. For now this module
32 is only used by CameraMapper and by PosixStorage, both of which work on the
33 local filesystem only, so this works for the time being.
34 """
35 from __future__ import absolute_import
36 from builtins import object
37 from past.builtins import basestring
38 
39 import copy
40 from . import fsScanner, sequencify
41 import os
42 import astropy.io.fits
43 import re
44 import yaml
45 
46 try:
47  import sqlite3
48  haveSqlite3 = True
49 except ImportError:
50  try:
51  # try external pysqlite package; deprecated
52  import sqlite as sqlite3
53  haveSqlite3 = True
54  except ImportError:
55  haveSqlite3 = False
56 
57 # PostgreSQL support
58 try:
59  import psycopg2 as pgsql
60  havePgsql = True
61 except ImportError:
62  havePgsql = False
63 
64 
class Registry(object):
    """Base class for the dataset-metadata registries in this module."""

    def __init__(self):
        pass

    @staticmethod
    def create(location):
        """Build the registry implementation that fits `location`.

        @param location (string) Path or URL for registry, or None if
                                 unavailable
        @return None when location is None; a PgsqlRegistry when location
                ends with ".pgsql"; a SqliteRegistry when location matches
                the sqlite3 pattern (None if that registry ends up without
                a connection); a PosixRegistry when location is an existing
                directory.
        @throws RuntimeError if a sqlite3 registry is requested but the
                sqlite3 module could not be imported, or if none of the
                registry types fits location.
        """
        if location is None:
            return None

        if location.endswith(".pgsql"):
            return PgsqlRegistry(location)

        # sqlite3 registry files are recognised by name
        if re.match(r'.*\.sqlite3', location):
            if haveSqlite3:
                sqliteRegistry = SqliteRegistry(location)
                # a registry without a connection is reported as "no registry"
                return None if sqliteRegistry.conn is None else sqliteRegistry
            raise RuntimeError("sqlite3 registry specified (%s), but unable to import sqlite3 module" %
                               (location,))

        # fall back to a filesystem-scanning registry for directories
        if os.path.isdir(location):
            return PosixRegistry(root=location)

        raise RuntimeError("Unable to create registry using location: " + location)
107 
108 
110  """A glob-based filesystem registry"""
111 
def __init__(self, root):
    """Construct the registry.

    :param root: stored as ``self.root``; `lookup` scans the filesystem
        starting from this location.
    """
    self.root = root
    Registry.__init__(self)
115 
116  @staticmethod
117  def getHduNumber(template, dataId):
118  """Looks up the HDU number for a given template+dataId.
119  :param template: template with HDU specifier (ends with brackets and an
120  identifier that can be populated by a key-value pair in dataId.
121  e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
122  :param dataId: dictionary that hopefully has a key-value pair whose key
123  matches (has the same name) as the key specifier in the template.
124  :return: the HDU specified by the template+dataId pair, or None if the
125  HDU can not be determined.
126  """
127  # sanity check that the template at least ends with a brace.
128  if not template.endswith(']'):
129  return None
130 
131  # get the key (with formatting) out of the brances
132  hduKey = template[template.rfind('[') + 1:template.rfind(']')]
133  # extract the key name from the formatting
134  hduKey = hduKey[hduKey.rfind('(') + 1:hduKey.rfind(')')]
135 
136  if hduKey in dataId:
137  return dataId[hduKey]
138  return None
139 
class LookupData(object):
    """State of one lookup: the requested keys, the dataId constraints and
    the key+value pairs found so far for the candidate being examined.
    """

    def __init__(self, lookupProperties, dataId):
        self.dataId = copy.copy(dataId)
        propertyList = sequencify(lookupProperties)
        self.lookupProperties = copy.copy(propertyList)
        self.foundItems = {}
        self.cachedStatus = None  # reset whenever foundItems changes
        self.neededKeys = set(propertyList).union(dataId.keys())

    def __repr__(self):
        details = (self.lookupProperties, self.dataId, self.foundItems, self.cachedStatus)
        return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % details

    def status(self):
        """Query the lookup status.

        The result is cached until the found items are next changed.

        :return: 'notMatch' if a key present in both foundItems and dataId
                 has differing values;
                 otherwise 'incomplete' if some key in lookupProperties is
                 absent from foundItems;
                 otherwise 'match'.
        """
        if self.cachedStatus is not None:
            return self.cachedStatus

        # Membership is tested instead of comparing against None because
        # None may be a legitimate found value.
        found = self.foundItems
        if any(key in found and found[key] != wanted for key, wanted in self.dataId.items()):
            result = 'notMatch'
        elif all(key in found for key in self.lookupProperties):
            result = 'match'
        else:
            result = 'incomplete'
        self.cachedStatus = result
        return result

    def setFoundItems(self, items):
        """Replace the found items with `items` and drop the cached status."""
        self.foundItems = items
        self.cachedStatus = None

    def addFoundItems(self, items):
        """Merge `items` into the found items and drop the cached status."""
        self.foundItems.update(items)
        self.cachedStatus = None

    def getMissingKeys(self):
        """Return the set of needed keys that have no found item yet."""
        return self.neededKeys.difference(self.foundItems)
194 
def lookup(self, lookupProperties, reference, dataId, **kwargs):
    """Perform a lookup in the registry by scanning the filesystem.

    Every path found under self.root by scanning with the template is a
    candidate. A candidate contributes one tuple to the result when its
    status is 'match' once the properties parsed from its path (and, if
    those are incomplete, metadata read from the file) are compared with
    dataId.

    e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
    dataId={'visit':1}, and lookupProperties is ['filter'], and the
    filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
    then the return value will be [('g',)]

    :param lookupProperties: keys whose values will be returned.
    :param reference: other data types that may be used to search for values
        (not used by this implementation).
    :param dataId: key+value pairs that refine the result.
    :param **kwargs:
        'template': required. template parameter (typically from a policy)
            used to look for files; without it an empty list is returned.
        'storage': optional. Needed to look for metadata in files.
            Currently supported values: 'FitsStorage'.
    :return: a list of tuples, one per matching file, holding the values for
        the keys in lookupProperties.
    """
    try:
        template = kwargs['template']
    except KeyError:
        return []
    storage = kwargs.get('storage')

    search = PosixRegistry.LookupData(lookupProperties, dataId)
    candidates = fsScanner.FsScanner(template).processPath(self.root)
    matches = []  # one tuple per matching file
    for relPath, pathProperties in candidates.items():
        search.setFoundItems(pathProperties)
        # keys the path could not supply are looked for in the file's metadata
        if search.status() == 'incomplete':
            PosixRegistry.lookupMetadata(os.path.join(self.root, relPath), template, search, storage)
        if search.status() != 'match':
            continue
        matches.append(tuple(search.foundItems[key] for key in search.lookupProperties))
    return matches
241 
@staticmethod
def lookupMetadata(filepath, template, lookupData, storage):
    """Dispatcher for looking up metadata in a file of a given storage type.

    :param filepath: path to the file to inspect.
    :param template: template that was used to discover the file.
    :param lookupData: LookupData instance; updated in place by the
        storage-specific reader.
    :param storage: storage type name. Only 'FitsStorage' is handled; any
        other value leaves lookupData untouched.
    """
    if storage == 'FitsStorage':
        # The last argument of lookupFitsMetadata is the dataId (it is used
        # to resolve the HDU number from the template), not the storage name.
        PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, lookupData.dataId)
248 
@staticmethod
def lookupFitsMetadata(filepath, template, lookupData, dataId):
    """Look up metadata in a fits file.

    Will try to discover the correct HDU to look in by testing if the
    template has a value in brackets at the end.
    If the HDU is specified but the metadata key is not discovered in
    that HDU, will look in the primary HDU before giving up.

    :param filepath: path to the file
    :param template: template that was used to discover the file. This can
        be used to look up the correct HDU as needed.
    :param lookupData: an instance of LookupData that contains the
        lookupProperties, the dataId, and the data that has been found so far.
        Every key it reports as missing is added to it, with the header value
        if one was found and None otherwise.
    :param dataId: passed to getHduNumber to resolve the HDU number.
    :return: None. If the file cannot be opened (IOError) lookupData is left
        unchanged.
    """
    try:
        hdulist = astropy.io.fits.open(filepath, memmap=True)
    except IOError:
        return
    try:
        hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
        if hduNumber is not None and hduNumber < len(hdulist):
            hdu = hdulist[hduNumber]
        else:
            hdu = None
        primaryHdu = hdulist[0] if len(hdulist) > 0 else None

        for key in lookupData.getMissingKeys():
            value = None
            if hdu is not None and key in hdu.header:
                value = hdu.header[key]
            # if the value is not in the indicated HDU, try the primary HDU:
            elif primaryHdu is not None and key in primaryHdu.header:
                value = primaryHdu.header[key]
            lookupData.addFoundItems({key: value})
    finally:
        # release the file handle even if a header lookup raises
        hdulist.close()
287 
288 
290  """A base class for SQL-based registries
291 
292  Subclasses should define the class variable `placeHolder` (the particular
293  placeholder to use for parameter substitution) appropriately. The
294  database's python module should define `paramstyle` (see PEP 249), which
295  would indicate what to use for a placeholder:
296  * paramstyle = "qmark" --> placeHolder = "?"
297  * paramstyle = "format" --> placeHolder = "%s"
298  Other `paramstyle` values are not currently supported.
299 
300  Constructor parameters
301  ----------------------
302  conn : DBAPI connection object
303  Connection object
304  """
305  placeHolder = "?" # Placeholder for parameter substitution
306 
def __init__(self, conn):
    """Constructor.

    Parameters
    ----------
    conn : DBAPI connection object
        Connection object; kept as ``self.conn``.
    """
    self.conn = conn
    Registry.__init__(self)
317 
def __del__(self):
    """Close the database connection, if there is one, on finalisation."""
    # conn can be None (SqliteRegistry stores None when the file is missing)
    # or absent altogether if a constructor raised before assigning it.
    conn = getattr(self, "conn", None)
    if conn is not None:
        conn.close()
    # Neither Registry nor object defines __del__, so only chain up when a
    # parent actually provides one.
    parentDel = getattr(super(SqlRegistry, self), "__del__", None)
    if parentDel is not None:
        parentDel()
321 
def lookup(self, lookupProperties, reference, dataId, **kwargs):
    """Perform a lookup in the registry.

    Builds and executes
    ``SELECT DISTINCT <lookupProperties> FROM <reference tables, NATURAL JOINed>``
    with one WHERE condition per dataId entry; the dataId values are passed
    to the database as query parameters.

    :param lookupProperties: column name(s) whose values will be returned.
    :param reference: table name(s) to select from.
    :param dataId: mapping (or None) that refines the result.
        If a key is a string, rows must have the item's value in that column.
        If a key is a 2-item iterable, the item's value must lie between the
        columns named key[0] and key[1].
    :param **kwargs: not used.
    :return: a list of rows holding the values for lookupProperties, or None
        if there is no database connection.
    :raises RuntimeError: if an iterable key does not have exactly 2 items.
    """
    if not self.conn:
        return None

    # accept single values as well as sequences
    tables = sequencify(reference)
    columns = sequencify(lookupProperties)

    sql = "SELECT DISTINCT " + ", ".join(columns) + " FROM " + " NATURAL JOIN ".join(tables)
    params = []
    if dataId is not None and len(dataId) > 0:
        conditions = []
        for key, value in dataId.items():
            isRange = hasattr(key, '__iter__') and not isinstance(key, basestring)
            if isRange:
                if len(key) != 2:
                    raise RuntimeError("Wrong number of keys for range:%s" % (key,))
                conditions.append("(%s BETWEEN %s AND %s)" % (self.placeHolder, key[0], key[1]))
            else:
                conditions.append("%s = %s" % (key, self.placeHolder))
            params.append(value)
        sql += " WHERE " + " AND ".join(conditions)
    cursor = self.conn.cursor()
    cursor.execute(sql, params)
    return list(cursor.fetchall())
367 
def executeQuery(self, returnFields, joinClause, whereFields, range, values):
    """Extract metadata from the registry.

    @param returnFields (list of strings) Metadata fields to be extracted.
    @param joinClause   (list of strings) Tables in which metadata fields
                        are located; they are combined with NATURAL JOIN.
    @param whereFields  (list of tuples) First tuple element is metadata
                        field to query; second is the value that field
                        must have (often '?').
    @param range        (tuple) Value, lower limit, and upper limit for a
                        range condition on the metadata, or None. Any of
                        these can be metadata fields.
    @param values       (tuple) Tuple of values to be substituted for '?'
                        characters in the whereFields values or the range
                        values.
    @return (list of tuples) All distinct sets of field values that meet
            the criteria, or None if there is no database connection."""
    if not self.conn:
        return None

    sql = "SELECT DISTINCT %s FROM %s" % (", ".join(returnFields), " NATURAL JOIN ".join(joinClause))

    conditions = []
    if whereFields:
        conditions.extend("(%s = %s)" % (field, wanted) for field, wanted in whereFields)
    if range is not None:
        conditions.append("(%s BETWEEN %s AND %s)" % range)
    if conditions:
        sql += " WHERE " + " AND ".join(conditions)

    cursor = self.conn.cursor()
    cursor.execute(sql, values)
    return list(cursor.fetchall())
400 
401 
403  """A SQLite-based registry"""
404  placeHolder = "?" # Placeholder for parameter substitution
405 
def __init__(self, location):
    """Constructor

    Parameters
    ----------
    location : `str`
        Path to SQLite3 file. If nothing exists at this path the registry
        is constructed with ``conn`` set to `None`.
    """
    conn = None
    if os.path.exists(location):
        conn = sqlite3.connect(location)
        conn.text_factory = str
    SqlRegistry.__init__(self, conn)
420 
421 
423  """A PostgreSQL-based registry"""
424  placeHolder = "%s"
425 
def __init__(self, location):
    """Constructor

    Reads the connection settings with `readYaml`, keeps them in
    ``self._config`` and opens the database connection.

    Parameters
    ----------
    location : `str`
        Path to PostgreSQL configuration file.

    Raises
    ------
    RuntimeError
        If psycopg2 could not be imported.
    """
    if not havePgsql:
        raise RuntimeError("Cannot use PgsqlRegistry: could not import psycopg2")
    self._config = self.readYaml(location)
    connectArgs = dict((key, self._config[key])
                       for key in ("host", "port", "database", "user", "password"))
    SqlRegistry.__init__(self, pgsql.connect(**connectArgs))
441 
def __del__(self):
    """Close the database connection, if one was opened."""
    # __del__ also runs when __init__ raised before `conn` was assigned
    # (e.g. psycopg2 missing or a bad config file), so the attribute may
    # not exist at all.
    conn = getattr(self, "conn", None)
    if conn:
        conn.close()
445 
@staticmethod
def readYaml(location):
    """Read YAML configuration file

    The YAML configuration file should contain:
    * host : host name for database connection
    * port : port for database connection
    * user : user name for database connection
    * database : database name

    It may also contain:
    * password : password for database connection

    The optional entries are set to `None` in the output configuration.

    Parameters
    ----------
    location : `str`
        Path to PostgreSQL YAML config file.

    Returns
    -------
    config : `dict`
        Configuration

    Raises
    ------
    RuntimeError
        If the file does not hold a mapping, or if its keys (ignoring the
        optional ones) are not exactly the required keys.
    """
    with open(location) as ff:
        # safe_load only constructs plain Python objects, which is all a
        # connection config needs; yaml.load without an explicit Loader can
        # instantiate arbitrary objects and is rejected by current PyYAML.
        data = yaml.safe_load(ff)
    requireKeys = set(["host", "port", "database", "user"])
    optionalKeys = set(["password"])
    # an empty file loads as None, a bare scalar or list as a non-dict
    haveKeys = set(data.keys()) if isinstance(data, dict) else None
    if haveKeys is None or haveKeys - optionalKeys != requireKeys:
        raise RuntimeError(
            "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
            "but this contains: %s" %
            (location, ",".join("'%s'" % key for key in sorted(requireKeys)),
             ",".join(sorted("'%s'" % key for key in data)) if haveKeys is not None else repr(data))
        )
    for key in optionalKeys:
        data.setdefault(key, None)

    return data
488 
def lookup(self, *args, **kwargs):
    """Run `SqlRegistry.lookup`; if it raises, roll back the connection's
    transaction and re-raise the exception.
    """
    try:
        rows = SqlRegistry.lookup(self, *args, **kwargs)
    except Exception:
        self.conn.rollback()
        raise
    return rows