Coverage for python/lsst/obs/base/gen2to3/repoWalker/parser.py: 33%
80 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 14:44 -0800
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 14:44 -0800
1# This file is part of obs_base.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21"""Classes that transform (part of) a Gen2 filename template into a regular
22expression that we can use to extract Gen2 data IDs from files.
23"""
24from __future__ import annotations
26__all__ = ["PathElementParser"]
29import logging
30import re
31from abc import ABC, abstractmethod
32from typing import ClassVar, Dict, Optional
class FormattableRegEx(ABC):
    """An interface that generates a regular expression from a template and
    a data ID.

    This is used by `PathElementParser` to abstract over whether a path
    element's regex needs to include values from a data ID extracted from
    parent path elements or not.
    """

    # Declare empty slots so that subclasses which define their own
    # ``__slots__`` really are dict-free; without this, every instance of a
    # subclass would still carry a per-instance ``__dict__``.
    __slots__ = ()

    @abstractmethod
    def format(self, dataId: dict) -> re.Pattern:
        """Substitute values from the given data ID and return a regular
        expression.

        Parameters
        ----------
        dataId : `dict`
            A dictionary whose entries may be used to format the regular
            expression.  May include unused entries.

        Returns
        -------
        regex : `re.Pattern`
            The compiled regular expression.

        Raises
        ------
        NotImplementedError
            Raised if a subclass delegates to this base implementation.
        """
        raise NotImplementedError()
class FixedRegEx(FormattableRegEx):
    """A `FormattableRegEx` that wraps an already-compiled pattern and hands
    it back unchanged, ignoring the data ID entirely.

    Parameters
    ----------
    regex : `re.Pattern`
        The fixed regular expression to return.
    """

    __slots__ = ("regex",)

    def __init__(self, regex: re.Pattern):
        self.regex = regex

    def __str__(self):
        clsName = type(self).__name__
        return f"{clsName}({self.regex})"

    def format(self, dataId: dict) -> re.Pattern:
        # Docstring inherited from FormattableRegEx.
        # Nothing to substitute: the stored pattern is returned as-is.
        return self.regex
class SubstitutableRegEx(FormattableRegEx):
    """An implementation of `FormattableRegEx` formed from a concatenation of
    actual regular terms and %-style format strings.

    Terms are kept in insertion order as ``(text, isSubstitution)`` pairs and
    are only joined and compiled when `format` (or `simplify`) is called.
    """

    __slots__ = ("_terms",)

    def __init__(self):
        # List of (text, isSubstitution) tuples, in the order they were added.
        self._terms = []

    def addRegexTerm(self, regex: str):
        """Add a regular expression term.

        Parameters
        ----------
        regex : `str`
            Regular expression source text; it is inserted verbatim into the
            final pattern.
        """
        self._terms.append((regex, False))

    def addSubstitutionTerm(self, template: str):
        """Add a %-style format template term.

        Parameters
        ----------
        template : `str`
            A %-style format string.  When `format` is called it is
            interpolated with the data ID and the result is escaped so that
            it matches literally.
        """
        self._terms.append((template, True))

    def format(self, dataId: dict) -> re.Pattern:
        # Docstring inherited from FormattableRegEx.
        # Substitution terms are interpolated with the data ID and escaped;
        # plain regex terms are passed through unchanged.
        return re.compile("".join(re.escape(s % dataId) if isSub else s for s, isSub in self._terms))

    def simplify(self) -> FormattableRegEx:
        """Return a possibly-simplified version of this object.

        If `addSubstitutionTerm` was never called, this returns a simple
        `FixedRegEx`; otherwise this object itself is returned.
        """
        if not any(isSub for _, isSub in self._terms):
            return FixedRegEx(re.compile("".join(s for s, _ in self._terms)))
        else:
            return self
class PathElementParser:
    """An object that matches Gen2 file names and extracts Gen2 data IDs.

    Parameters
    ----------
    template : `str`
        Either a full Gen2 path template or the part of one that corresponds
        to a single path element (a subdirectory or file name).
    allKeys : `dict` [`str`, `type`]
        A dictionary that provides types for all Gen2 data ID keys that are
        substituted into the given template.  Additional key-value pairs may
        be present and will be ignored.
    previousKeys : `dict` [`str`, `type`], optional
        A dictionary containing key strings and types for Gen2 data ID keys
        that have been extracted from previous path elements of the same
        template.  Values for these keys must be provided via the
        ``lastDataId`` argument when calling `parse`.
    """

    def __init__(
        self, template: str, allKeys: Dict[str, type], *, previousKeys: Optional[Dict[str, type]] = None
    ):
        self.template = template
        self.keys = {}
        # For each template path element, we iterate over each %-tagged
        # substitution string.
        last = 0
        self.regex = SubstitutableRegEx()
        for match in self.TEMPLATE_RE.finditer(self.template):
            # Copy the literal text between the last substitution and this
            # one, escaping it so it matches verbatim.
            self.regex.addRegexTerm(re.escape(self.template[last : match.start()]))
            # Pull out the data ID key from the name used in the
            # substitution string.  Use that and the substitution
            # type to come up with the pattern to use in the regex.
            name = match.group("name")
            if name == "patch":
                pattern = r"\d+,\d+"
            elif match.group("type") in ("i", "d"):  # integers
                pattern = r"0*\d+"
            else:
                pattern = ".+"
            # Create a new named group for the first occurrence of a key
            # within an element.
            if name not in self.keys:
                if previousKeys and name in previousKeys:
                    # Key is new to this part of the template, but it appeared
                    # in some previous part of the template.  We'll format the
                    # original template with the data ID from that previous
                    # step later.
                    start, stop = match.span()
                    self.regex.addSubstitutionTerm(self.template[start:stop])
                else:
                    # Key is new; expect to extract a data ID value from it.
                    self.regex.addRegexTerm(f"(?P<{name}>{pattern})")
                    self.keys[name] = allKeys[name]
            else:
                # Require a match with the earlier group for a second
                # occurrence.  The backreference syntax is ``(?P=name)``
                # (no angle brackets); ``(?P=<name>)`` is rejected by `re`.
                self.regex.addRegexTerm(f"(?P={name})")
            # Remember the end of this match
            last = match.end()
        # Append anything remaining after the last substitution string.
        self.regex.addRegexTerm(re.escape(self.template[last:]))
        # If there are no substitutions, join and compile into a single regex
        # now.
        self.regex = self.regex.simplify()

    __slots__ = ("keys", "template", "regex")

    TEMPLATE_RE: ClassVar[re.Pattern] = re.compile(r"\%\((?P<name>\w+)\)[^\%]*?(?P<type>[idrs])")
    """Regular expression that matches a single substitution in
    Gen2 CameraMapper template, such as "%(tract)04d".
    """

    def __str__(self):
        return f"{type(self).__name__}({self.regex})"

    def parse(self, name: str, lastDataId: dict, *, log: Optional[logging.Logger] = None) -> Optional[dict]:
        """Parse the path element.

        Parameters
        ----------
        name : `str`
            The path name to parse.
        lastDataId : `dict`
            The cumulative Gen2 data ID obtained by calling `parse` on parsers
            for parent directories of the same path.
        log : `logging.Logger`, optional
            Log to use to report warnings and debug information.

        Returns
        -------
        dataId : `dict` or `None`
            Gen2 data ID that combines key-value pairs obtained from this path
            with those from ``lastDataId``.  `None` if ``name`` is not matched
            by this parser.  If the keys extracted are inconsistent with those
            in ``lastDataId``, a warning is sent to ``log`` (when one is
            given) and `None` is returned.
        """
        m = self.regex.format(lastDataId).fullmatch(name)
        if m is None:
            return None
        # Convert each captured string with the type registered for its key.
        newDataId = {k: v(m.group(k)) for k, v in self.keys.items()}
        for commonKey in lastDataId.keys() & newDataId.keys():
            if newDataId[commonKey] != lastDataId[commonKey]:
                if log is not None:
                    log.warning(
                        "Inconsistent value %s=%r when parsing %r with %r.",
                        commonKey,
                        newDataId[commonKey],
                        name,
                        lastDataId,
                    )
                return None
        newDataId.update(lastDataId)
        return newDataId

    keys: Dict[str, type]
    """Dictionary mapping Gen2 data ID key to the type of its associated
    value, covering only those keys that can be extracted from this path
    element.
    """

    template: str
    """The portion of the original Gen2 filename template that this parser was
    constructed with.
    """

    regex: FormattableRegEx
    """An object whose ``format`` method returns the regular expression that
    can be used to match the path element and populate the Gen2 data ID items
    whose keys are in ``keys``.
    """