lsst.obs.base  19.0.0-18-g955d782+3
parser.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 """Classes that transform (part of) a Gen2 filename template into a regular
22 expression that we can use to extract Gen2 data IDs from files.
23 """
24 from __future__ import annotations
25 
26 __all__ = ["PathElementParser"]
27 
28 
29 from abc import ABC, abstractmethod
30 import re
31 from typing import ClassVar, Dict, Optional
32 
33 from lsst.log import Log
34 
35 
class FormattableRegEx(ABC):
    """An interface that generates a regular expression from a template and
    a data ID.

    This is used by `PathElementParser` to abstract over whether a path
    element's regex needs to include values from a data ID extracted from
    parent path elements or not.
    """

    @abstractmethod
    def format(self, dataId: dict) -> re.Pattern:
        """Substitute values from the given data ID and return a regular
        expression.

        Parameters
        ----------
        dataId : `dict`
            A dictionary whose entries may be used to format the regular
            expression.  May include unused entries.

        Returns
        -------
        regex : `re.Pattern`
            The compiled regular expression to match a path element against.
        """
        raise NotImplementedError()
57 
58 
class FixedRegEx(FormattableRegEx):
    """A trivial implementation of `FormattableRegEx` that does no formatting.

    Parameters
    ----------
    regex : `re.Pattern`
        The fixed regular expression to return.
    """

    __slots__ = ("regex",)

    def __init__(self, regex: re.Pattern):
        self.regex = regex

    def format(self, dataId: dict) -> re.Pattern:
        # Docstring inherited from FormattableRegEx.
        # The data ID is ignored: the stored pattern is returned unchanged.
        return self.regex
75 
76 
# NOTE(review): the ``class`` statement for this definition was missing from
# the reviewed text; the name and the absence of an explicit base class are
# reconstructed -- confirm against the repository.
class SubstitutableRegEx:
    """An implementation of `FormattableRegEx` formed from a concatenation of
    actual regular terms and %-style format strings.

    Terms are stored in the order they are added as ``(text, isSubstitution)``
    pairs and are joined in that order by `format`.
    """

    __slots__ = ("_terms",)

    def __init__(self):
        self._terms = []

    def addRegexTerm(self, regex: str) -> None:
        """Add a regular expression term.

        Parameters
        ----------
        regex : `str`
            Regular expression fragment, inserted verbatim into the final
            pattern.
        """
        self._terms.append((regex, False))

    def addSubstitutionTerm(self, template: str) -> None:
        """Add a %-style format template term.

        Parameters
        ----------
        template : `str`
            A %-style format string; it is formatted with the data ID passed
            to `format` and the result is matched literally.
        """
        self._terms.append((template, True))

    def format(self, dataId: dict) -> re.Pattern:
        # Docstring inherited from FormattableRegEx.
        # Substituted text is escaped so that values from the data ID are
        # matched literally rather than interpreted as regex syntax.
        return re.compile("".join(re.escape(text % dataId) if isSub else text
                                  for text, isSub in self._terms))

    def simplify(self) -> FormattableRegEx:
        """Return a possibly-simplified version of this object.

        If `addSubstitutionTerm` was never called, this returns a simple
        `FixedRegEx`; otherwise it returns ``self``.
        """
        if any(isSub for _, isSub in self._terms):
            return self
        return FixedRegEx(re.compile("".join(text for text, _ in self._terms)))
111 
112 
class PathElementParser:
    """An object that matches Gen2 file names and extracts Gen2 data IDs.

    Parameters
    ----------
    template : `str`
        Either a full Gen2 path template or the part of one that corresponds
        to a single path element (a subdirectory or file name).
    allKeys : `dict` [`str`, `type`]
        A dictionary that provides types for all Gen2 data ID keys that are
        substituted into the given template.  Additional key-value pairs may
        be present and will be ignored.
    previousKeys : `dict` [`str`, `type`], optional
        A dictionary containing key strings and types for Gen2 data ID keys
        that have been extracted from previous path elements of the same
        template.  Values for these keys must be provided via the
        ``lastDataId`` argument when calling `parse`.
    """

    __slots__ = ("keys", "template", "regex")

    TEMPLATE_RE: ClassVar[re.Pattern] = re.compile(r"\%\((?P<name>\w+)\)[^\%]*?(?P<type>[idrs])")
    """Regular expression that matches a single substitution in
    Gen2 CameraMapper template, such as "%(tract)04d".
    """

    def __init__(self, template: str, allKeys: Dict[str, type], *,
                 previousKeys: Optional[Dict[str, type]] = None):
        self.template = template
        terms, self.keys = self._buildTerms(template, allKeys, previousKeys)
        # NOTE(review): the statement creating the regex builder was missing
        # from the reviewed text; ``SubstitutableRegEx`` is assumed to be the
        # term-accumulating class defined earlier in this file -- confirm.
        regex = SubstitutableRegEx()
        for text, isSubstitution in terms:
            if isSubstitution:
                regex.addSubstitutionTerm(text)
            else:
                regex.addRegexTerm(text)
        # If there are no substitutions, join and compile into a single regex
        # now.
        self.regex = regex.simplify()

    @classmethod
    def _buildTerms(cls, template: str, allKeys: Dict[str, type],
                    previousKeys: Optional[Dict[str, type]] = None):
        """Split a template into regex terms and %-style substitution terms.

        Parameters
        ----------
        template : `str`
            Gen2 path template (or a single element of one).
        allKeys : `dict` [`str`, `type`]
            Types for all data ID keys substituted into ``template``.
        previousKeys : `dict` [`str`, `type`], optional
            Keys already extracted from previous path elements.

        Returns
        -------
        terms : `list` [`tuple` [`str`, `bool`]]
            Ordered ``(text, isSubstitution)`` pairs.  Terms with
            ``isSubstitution=False`` are regular expression fragments; the
            others are %-style templates to be formatted with a data ID.
        keys : `dict` [`str`, `type`]
            Types of the keys that are extracted by named groups in ``terms``.

        Raises
        ------
        KeyError
            Raised if a key that must be extracted is missing from
            ``allKeys``.
        """
        terms = []
        keys = {}
        last = 0
        # Iterate over each %-tagged substitution string in the template.
        for match in cls.TEMPLATE_RE.finditer(template):
            # Copy the literal text between the last substitution and this
            # one, escaping it so it is matched verbatim.
            terms.append((re.escape(template[last:match.start()]), False))
            # Pull out the data ID key from the name used in the
            # substitution string.  Use that and the substitution
            # type to come up with the pattern to use in the regex.
            name = match.group("name")
            if name == "patch":
                pattern = r"\d+,\d+"
            elif match.group("type") in ("i", "d"):  # integers
                pattern = r"0*\d+"
            else:
                pattern = ".+"
            if name in keys:
                # Second occurrence within this element: require it to match
                # the text captured by the first one via a named
                # back-reference, which is spelled ``(?P=name)``.
                terms.append((r"(?P=%s)" % name, False))
            elif previousKeys and name in previousKeys:
                # Key is new to this part of the template, but it appeared
                # in some previous part of the template.  The original
                # substitution string is kept and formatted later with the
                # data ID from that previous step.
                terms.append((match.group(0), True))
            else:
                # Key is new; create a named group for its first occurrence
                # and expect to extract a data ID value from it.
                terms.append((r"(?P<%s>%s)" % (name, pattern), False))
                keys[name] = allKeys[name]
            # Remember the end of this match.
            last = match.end()
        # Append anything remaining after the last substitution string.
        terms.append((re.escape(template[last:]), False))
        return terms, keys

    def parse(self, name: str, lastDataId: dict, *, log: Optional[Log] = None) -> Optional[dict]:
        """Parse the path element.

        Parameters
        ----------
        name : `str`
            The path name to parse.
        lastDataId : `dict`
            The cumulative Gen2 data ID obtained by calling `parse` on parsers
            for parent directories of the same path.
        log : `Log`, optional
            Log to use to report warnings and debug information.

        Returns
        -------
        dataId : `dict` or `None`
            Gen2 data ID that combines key-value pairs obtained from this path
            with those from ``lastDataId``.  `None` if ``name`` is not matched
            by this parser.  If the keys extracted are inconsistent with those
            in ``lastDataId``, a warning is sent to ``log`` and `None` is
            returned.
        """
        m = self.regex.format(lastDataId).fullmatch(name)
        if m is None:
            return None
        # Convert each captured string to the type registered for its key.
        newDataId = {k: v(m.group(k)) for k, v in self.keys.items()}
        for commonKey in lastDataId.keys() & newDataId.keys():
            if newDataId[commonKey] != lastDataId[commonKey]:
                if log is not None:
                    log.warn("Inconsistent value %s=%r when parsing %r with %r.",
                             commonKey, newDataId[commonKey], name, lastDataId)
                return None
        newDataId.update(lastDataId)
        return newDataId

    keys: Dict[str, type]
    """Dictionary mapping Gen2 data ID key to the type of its associated
    value, covering only those keys that can be extracted from this path
    element.
    """

    template: str
    """The portion of the original Gen2 filename template that this parser was
    constructed with.
    """

    regex: re.Pattern
    """A regular expression that can be used to match the path element and
    populate the Gen2 data ID items whose keys are in ``keys``.
    """