lsst.daf.persistence  13.0-30-gd2bda26+1
 All Classes Namespaces Files Functions Variables Typedefs Friends Macros
registries.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 # Copyright 2008, 2009, 2010 LSST Corporation.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <http://www.lsstcorp.org/LegalNotices/>.
21 #
22 
23 """This module provides registry classes for maintaining dataset metadata
24 for use by the Data Butler. Currently only a SQLite3-based registry is
25 implemented, but registries based on a text file, a policy file, a MySQL
26 (or other) relational database, and data gathered from scanning a filesystem
27 are all anticipated.
28 
29 Currently this module assumes posix access (for both PosixRegistry AND
30 SqliteRegistry). It is possible that it can be factored so that at least the
31 SqliteRegistry can be remote/not on the local filesystem. For now this module
32 is only used by CameraMapper and by PosixStorage, both of which work on the
33 local filesystem only, so this works for the time being.
34 """
35 from __future__ import absolute_import
36 from builtins import object
37 from past.builtins import basestring
38 
39 import copy
40 from . import fsScanner, sequencify
41 import os
42 import astropy.io.fits
43 import re
44 import yaml
45 
46 try:
47  import sqlite3
48  haveSqlite3 = True
49 except ImportError:
50  try:
51  # try external pysqlite package; deprecated
52  import sqlite as sqlite3
53  haveSqlite3 = True
54  except ImportError:
55  haveSqlite3 = False
56 
57 # PostgreSQL support
58 try:
59  import psycopg2 as pgsql
60  havePgsql = True
61 except ImportError:
62  havePgsql = False
63 
64 
class Registry(object):
    """Common base class of every registry implementation."""

    def __init__(self):
        pass

    @staticmethod
    def create(location):
        """Instantiate the registry implementation that matches ``location``.

        The location is tested, in order, for: a name ending in ".pgsql"
        (PostgreSQL configuration), a name matching ``.*\\.sqlite3`` (SQLite
        file), and an existing directory (posix registry).

        @param location (string) Path or URL for registry, or None if
                                 unavailable
        @return the registry object; None if location is None or if the
                SqliteRegistry ended up without a connection
        @throws RuntimeError if an sqlite3 registry is requested but no
                sqlite3 module could be imported, or if no registry type
                matches the location
        """
        if location is None:
            return None

        # File-based (.registry), policy-based (.paf), MySQL ("mysql:") and
        # generic filesystem registries were planned but are not implemented.

        # PostgreSQL registry, described by a configuration file
        if location.endswith(".pgsql"):
            return PgsqlRegistry(location)

        # SQLite3 registry
        if re.match(r'.*\.sqlite3', location):
            if not haveSqlite3:
                raise RuntimeError("sqlite3 registry specified (%s), but unable to import sqlite3 module" %
                                   (location,))
            sqliteRegistry = SqliteRegistry(location)
            return sqliteRegistry if sqliteRegistry.conn is not None else None

        # Last resort: a directory is served by a PosixRegistry
        if os.path.isdir(location):
            return PosixRegistry(root=location)

        raise RuntimeError("Unable to create registry using location: " + location)
107 
108 
110  """A glob-based filesystem registry"""
111 
def __init__(self, root):
    """Store the directory that lookups will scan.

    :param root: path handed to the filesystem scanner and used as the
        prefix of every file path found by ``lookup``.
    """
    self.root = root
    Registry.__init__(self)
115 
116  @staticmethod
117  def getHduNumber(template, dataId):
118  """Looks up the HDU number for a given template+dataId.
119  :param template: template with HDU specifier (ends with brackets and an
120  identifier that can be populated by a key-value pair in dataId.
121  e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
122  :param dataId: dictionary that hopefully has a key-value pair whose key
123  matches (has the same name) as the key specifier in the template.
124  :return: the HDU specified by the template+dataId pair, or None if the
125  HDU can not be determined.
126  """
127  # sanity check that the template at least ends with a brace.
128  if not template.endswith(']'):
129  return None
130 
131  # get the key (with formatting) out of the brances
132  hduKey = template[template.rfind('[') + 1:template.rfind(']')]
133  # extract the key name from the formatting
134  hduKey = hduKey[hduKey.rfind('(') + 1:hduKey.rfind(')')]
135 
136  if hduKey in dataId:
137  return dataId[hduKey]
138  return None
139 
class LookupData(object):
    """State of one lookup: the requested keys, the dataId that must be
    matched, and the key+value pairs discovered so far."""

    def __init__(self, lookupProperties, dataId):
        """Record what is being looked up.

        :param lookupProperties: key or sequence of keys whose values are wanted.
        :param dataId: dictionary of key+value pairs that found items must match.
        """
        self.dataId = copy.copy(dataId)
        requested = sequencify(lookupProperties)
        self.lookupProperties = copy.copy(requested)
        self.foundItems = {}
        self.cachedStatus = None
        # every key we would like a value for: requested keys plus dataId keys
        self.neededKeys = set(requested).union(dataId.keys())

    def __repr__(self):
        fields = (self.lookupProperties, self.dataId, self.foundItems, self.cachedStatus)
        return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % fields

    def status(self):
        """Report how the found items compare with the request.

        The result is cached until the found items are next changed.

        :return: 'match' if every key in lookupProperties has been found and no found value
                 contradicts dataId
                 'incomplete' if nothing contradicts dataId but some key in lookupProperties
                 has not been found
                 'notMatch' if a found value differs from the dataId value for the same key
        """
        if self.cachedStatus is not None:
            return self.cachedStatus

        found = self.foundItems
        self.cachedStatus = 'match'
        # Membership is tested rather than the value, because None may be a legitimate value.
        if any(key not in found for key in self.lookupProperties):
            self.cachedStatus = 'incomplete'
        # A contradiction with dataId overrides both 'match' and 'incomplete'.
        for key, wanted in self.dataId.items():
            if key in found and found[key] != wanted:
                self.cachedStatus = 'notMatch'
                break
        return self.cachedStatus

    def setFoundItems(self, items):
        """Replace the found items with ``items`` and invalidate the cached status."""
        self.cachedStatus = None
        self.foundItems = items

    def addFoundItems(self, items):
        """Merge ``items`` into the found items and invalidate the cached status."""
        self.cachedStatus = None
        self.foundItems.update(items)

    def getMissingKeys(self):
        """Return the set of needed keys for which nothing has been found yet."""
        return self.neededKeys.difference(self.foundItems)
194 
def lookup(self, lookupProperties, reference, dataId, **kwargs):
    """Look up values by scanning the filesystem under self.root.

    Every path produced by the scanner for the template is a candidate;
    candidates whose properties contradict dataId are dropped.
    e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
    dataId={'visit':1}, and lookupProperties is ['filter'], and the
    filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
    then the return value will be [('g',)]

    :param lookupProperties: keys whose values will be returned.
    :param reference: not used by this implementation.
    :param dataId: dictionary with string keys; a candidate is rejected when the value it
                   provides for a key differs from the value given here.
    :param **kwargs:
        'template': required. template parameter (typically from a policy) used to find files.
                    Without it an empty list is returned.
        'storage': optional. Needed to look for metadata in files. Currently supported
                   values: 'FitsStorage'.
    :return: a list with one tuple per matching file, each holding the values of the keys in
             lookupProperties.
    """
    # required kwargs:
    try:
        template = kwargs['template']
    except KeyError:
        return []
    # optional kwargs:
    storage = kwargs.get('storage')

    state = PosixRegistry.LookupData(lookupProperties, dataId)
    candidates = fsScanner.FsScanner(template).processPath(self.root)
    matches = []  # one tuple per file that satisfies the request
    for relPath, pathProperties in candidates.items():
        state.setFoundItems(pathProperties)
        # Keys that the path itself did not provide may still be available
        # in the metadata of the file.
        if state.status() == 'incomplete':
            PosixRegistry.lookupMetadata(os.path.join(self.root, relPath), template, state, storage)
        if state.status() == 'match':
            matches.append(tuple(state.foundItems[key] for key in state.lookupProperties))
    return matches
241 
242  @staticmethod
243  def lookupMetadata(filepath, template, lookupData, storage):
244  """Dispatcher for looking up metadata in a file of a given storage type
245  """
246  if storage == 'FitsStorage':
247  PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, storage)
248 
@staticmethod
def lookupFitsMetadata(filepath, template, lookupData, dataId):
    """Look up metadata in a fits file.

    Will try to discover the correct HDU to look in by testing if the
    template has a value in brackets at the end.
    If the HDU is specified but the metadata key is not discovered in
    that HDU, will look in the primary HDU before giving up.

    :param filepath: path to the file
    :param template: template that was used to discover the file. This can
        be used to look up the correct HDU as needed.
    :param lookupData: an instance of LookupData that contains the
        lookupProperties, the dataId, and the data that has been found so far.
        Every key it still misses is added to it, with value None when the
        key is in neither header.
    :param dataId: dictionary used together with the template to resolve
        the HDU number.
    :return: None. A file that cannot be opened (IOError) is silently skipped.
    """
    try:
        hdulist = astropy.io.fits.open(filepath, memmap=True)
    except IOError:
        return
    try:
        hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
        if hduNumber is not None and hduNumber < len(hdulist):
            hdu = hdulist[hduNumber]
        else:
            hdu = None
        primaryHdu = hdulist[0] if len(hdulist) > 0 else None

        for key in lookupData.getMissingKeys():
            value = None
            if hdu is not None and key in hdu.header:
                value = hdu.header[key]
            # if the value is not in the indicated HDU, try the primary HDU:
            elif primaryHdu is not None and key in primaryHdu.header:
                value = primaryHdu.header[key]
            lookupData.addFoundItems({key: value})
    finally:
        # Release the file handle; a lookup may open many files in a row.
        hdulist.close()
287 
288 
290  """A base class for SQL-based registries
291 
292  Subclasses should define the class variable `placeHolder` (the particular
293  placeholder to use for parameter substitution) appropriately. The
294  database's python module should define `paramstyle` (see PEP 249), which
295  would indicate what to use for a placeholder:
296  * paramstyle = "qmark" --> placeHolder = "?"
297  * paramstyle = "format" --> placeHolder = "%s"
298  Other `paramstyle` values are not currently supported.
299 
300  Constructor parameters
301  ----------------------
302  conn : DBAPI connection object
303  Connection object
304  """
305  placeHolder = "?" # Placeholder for parameter substitution
306 
def __init__(self, conn):
    """Wrap an existing database connection.

    Parameters
    ----------
    conn : DBAPI connection object
        Connection used by `lookup` and `executeQuery`. A false value
        (e.g. `None`) makes both of them return `None`.
    """
    self.conn = conn
    Registry.__init__(self)
317 
def lookup(self, lookupProperties, reference, dataId, **kwargs):
    """Perform a lookup in the registry.

    Builds and runs a ``SELECT DISTINCT`` of the lookupProperties columns
    over the NATURAL JOIN of the reference tables, restricted by dataId.

    :param lookupProperties: column name(s) whose values will be returned.
    :param reference: table name(s) to query.
    :param dataId: dictionary of restrictions, or None / empty for no restriction.
        If a key is a string, rows must have the dataId value in that column.
        If a key is a 2-item iterable, the dataId value must lie between the columns (or
        expressions) key[0] and key[1].
    :param **kwargs: not used.
    :return: a list of row tuples holding the values of lookupProperties, or None if the
             registry has no connection.
    :raise RuntimeError: if an iterable key does not have exactly two items.
    """
    if not self.conn:
        return None

    # input variable sanitization:
    tables = sequencify(reference)
    columns = sequencify(lookupProperties)

    query = "SELECT DISTINCT " + ", ".join(columns) + " FROM " + " NATURAL JOIN ".join(tables)
    params = []
    if dataId is not None and len(dataId) > 0:
        conditions = []
        for key, value in dataId.items():
            # strings are iterable too, so they have to be excluded explicitly
            isRange = hasattr(key, '__iter__') and not isinstance(key, basestring)
            if isRange:
                if len(key) != 2:
                    raise RuntimeError("Wrong number of keys for range:%s" % (key,))
                conditions.append("(%s BETWEEN %s AND %s)" % (self.placeHolder, key[0], key[1]))
            else:
                conditions.append("%s = %s" % (key, self.placeHolder))
            # one bound value per condition, in the same order as the placeholders
            params.append(value)
        query += " WHERE " + " AND ".join(conditions)

    cursor = self.conn.cursor()
    cursor.execute(query, params)
    return list(cursor.fetchall())
363 
def executeQuery(self, returnFields, joinClause, whereFields, range, values):
    """Run a SELECT DISTINCT query against the registry.

    @param returnFields (list of strings) Metadata fields to be extracted.
    @param joinClause (list of strings) Tables in which metadata fields
                      are located; they are combined with NATURAL JOIN.
    @param whereFields (list of tuples) First tuple element is metadata
                       field to query; second is the value that field
                       must have (often '?'). May be empty or None.
    @param range (tuple) Value, lower limit, and upper limit for a
                 range condition on the metadata, or None. Any of these
                 can be metadata fields.
    @param values (tuple) Tuple of values to be substituted for '?'
                  characters in the whereFields values or the range
                  values.
    @return (list of tuples) All sets of field values that meet the
            criteria, or None if the registry has no connection"""
    if not self.conn:
        return None

    query = "SELECT DISTINCT %s FROM %s" % (", ".join(returnFields), " NATURAL JOIN ".join(joinClause))

    conditions = []
    if whereFields:
        conditions.extend("(%s = %s)" % (field, value) for field, value in whereFields)
    if range is not None:
        conditions.append("(%s BETWEEN %s AND %s)" % range)
    if conditions:
        query += " WHERE " + " AND ".join(conditions)

    cursor = self.conn.cursor()
    cursor.execute(query, values)
    return list(cursor.fetchall())
396 
397 
399  """A SQLite-based registry"""
400  placeHolder = "?" # Placeholder for parameter substitution
401 
def __init__(self, location):
    """Open the SQLite3 database at ``location``, if it exists.

    Parameters
    ----------
    location : `str`
        Path to SQLite3 file. If nothing exists at that path the
        registry is created without a connection (``conn`` is `None`).
    """
    conn = None
    if os.path.exists(location):
        conn = sqlite3.connect(location)
        # have TEXT columns returned as `str`
        conn.text_factory = str
    SqlRegistry.__init__(self, conn)
416 
417 
419  """A PostgreSQL-based registry"""
420  placeHolder = "%s"
421 
def __init__(self, location):
    """Connect to the PostgreSQL database described by a configuration file.

    Parameters
    ----------
    location : `str`
        Path to PostgreSQL configuration file.

    Raises
    ------
    RuntimeError
        If the psycopg2 module could not be imported.
    """
    if not havePgsql:
        raise RuntimeError("Cannot use PgsqlRegistry: could not import psycopg2")
    self._config = config = self.readYaml(location)
    connectArgs = {}
    for name in ("host", "port", "database", "user", "password"):
        connectArgs[name] = config[name]
    SqlRegistry.__init__(self, pgsql.connect(**connectArgs))
437 
438  def __del__(self):
439  if self.conn:
440  self.conn.close()
441 
@staticmethod
def readYaml(location):
    """Read YAML configuration file

    The YAML configuration file should contain:
    * host : host name for database connection
    * port : port for database connection
    * user : user name for database connection
    * database : database name

    It may also contain:
    * password : password for database connection

    The optional entries are set to `None` in the output configuration.

    Parameters
    ----------
    location : `str`
        Path to PostgreSQL YAML config file.

    Returns
    -------
    config : `dict`
        Configuration

    Raises
    ------
    RuntimeError
        If the file does not hold a mapping, or if its keys are not exactly
        the required ones plus, optionally, the optional ones.
    """
    with open(location) as ff:
        # safe_load: the configuration only holds plain scalars, so there is
        # no need to let the file construct arbitrary Python objects; also,
        # yaml.load() without an explicit Loader is rejected by PyYAML >= 6.
        data = yaml.safe_load(ff)
    if not isinstance(data, dict):
        # e.g. an empty file yields None
        raise RuntimeError(
            "PostgreSQL YAML configuration (%s) should contain a mapping, but this contains: %r" %
            (location, data))
    requireKeys = set(["host", "port", "database", "user"])
    optionalKeys = set(["password"])
    haveKeys = set(data.keys())
    if haveKeys - optionalKeys != requireKeys:
        raise RuntimeError(
            "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
            "but this contains: %s" %
            (location, ",".join("'%s'" % key for key in sorted(requireKeys)),
             ",".join("'%s'" % key for key in data.keys()))
        )
    for key in optionalKeys:
        data.setdefault(key, None)

    return data
484 
def lookup(self, *args, **kwargs):
    """Run `SqlRegistry.lookup`, rolling back the connection if it fails.

    Arguments and return value are those of `SqlRegistry.lookup`. Any
    exception raised by the lookup is re-raised after the rollback.
    """
    try:
        rows = SqlRegistry.lookup(self, *args, **kwargs)
    except Exception:
        self.conn.rollback()
        raise
    return rows