lsst.obs.base  19.0.0-24-g940be9f+1
builders.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Classes used in `RepoWalker` construction.

The objects here form a temporary tree that is pruned and then transformed
into a similar tree of `PathElementHandler` instances. See `BuilderNode`
method documentation for more information.
"""
from __future__ import annotations

__all__ = ["BuilderSkipInput", "BuilderTargetInput", "BuilderTree"]

from abc import ABC, abstractmethod
import os
import re
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Tuple,
)

from lsst.daf.butler import DatasetType, DimensionUniverse, StorageClass
from ..translators import Translator
from .parser import PathElementParser
from .scanner import PathElementHandler, DirectoryScanner
from .handlers import (IgnoreHandler, SubdirectoryHandler, SkipHandler,
                       TargetFileHandler, MultiExtensionFileHandler)


class BuilderNode(ABC):
    """Abstract interface for nodes in the temporary tree that is used to
    construct a `RepoWalker`.
    """

    @abstractmethod
    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        """Attempt to prune this node and its children from the tree.

        Returns
        -------
        replacement : `BuilderNode`
            The result of recursively pruning child nodes; often just ``self``.
        messages : `list` [`str`]
            Warning messages that should be logged by a parent node when a
            matching path element is encountered, if this node is pruned.
        prune : `bool`
            If `True`, this node may be pruned from the tree (but will not
            necessarily be - it may correspond to a path element that should
            be skipped with siblings that should not be).
        """
        raise NotImplementedError()

    @abstractmethod
    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        """Transform this node in the build tree into a corresponding
        `PathElementHandler`, recursing to any children.

        Must be called after `prune`.

        Parameters
        ----------
        parser : `PathElementParser`
            An object that matches the path element the new handler is
            responsible for and extracts a (partial) Gen2 data ID from it.
        allKeys : `dict` [`str`, `type`]
            A mapping from Gen2 data ID key to the type of its value. Will
            contain all keys that may be extracted by the given parser, and
            possibly others.
        cumulativeKeys : `dict` [`str`, `type`], optional
            A dictionary containing key strings and types for Gen2 data ID
            keys that have been extracted from previous path elements for
            this template, including those extracted by ``parser``.

        Returns
        -------
        handler : `PathElementHandler`
            A new handler object.
        """
        raise NotImplementedError()


class BuilderInput(BuilderNode):
    """An intermediate base for `BuilderNode` classes that are provided as
    direct inputs to a `RepoWalker`, and generally correspond to exactly one
    Gen2 dataset type.

    Parameters
    ----------
    template : `str`
        The complete Gen2 template to be matched (not just the template for
        one path element).
    keys : `dict` [`str`, `type`]
        A mapping from Gen2 data ID key to the type of its value.
    """
    def __init__(self, template: str, keys: Dict[str, type]):
        self.template = template
        self.keys = keys
        self.elements = self.template.split(os.path.sep)

    template: str
    """The complete Gen2 template to be matched (`str`).
    """

    keys: Dict[str, type]
    """A mapping from Gen2 data ID key to the type of its value
    (`dict` [`str`, `type`]).
    """

    elements: List[str]
    """The path elements (file or directory levels) of `template`
    (`list` of `str`).
    """


class BuilderSkipInput(BuilderInput):
    """An input to a `RepoWalker` that indicates that matched files should be
    skipped, possibly with a warning message.

    BuilderSkipInputs can be pruned. When they are not pruned, they build
    `SkipHandler` instances.

    Parameters
    ----------
    template : `str`
        The complete Gen2 template to be matched (not just the template for
        one path element).
    keys : `dict` [`str`, `type`]
        A mapping from Gen2 data ID key to the type of its value.
    message : `str`, optional
        If not `None`, a warning message that should be printed either when a
        matching file is encountered or a directory that may contain such
        files is skipped.
    isForFiles : `bool`, optional
        If `True` (default), this handler should be run on files. Otherwise it
        should be run on directories.
    """
    def __init__(self, template: str, keys: Dict[str, type], message: Optional[str] = None, *,
                 isForFiles: bool = True):
        super().__init__(template=template, keys=keys)
        self._message = message
        self._isForFiles = isForFiles

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        return SkipHandler(parser=parser, isForFiles=self._isForFiles, message=self._message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, [self._message] if self._message is not None else [], True

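
# --- Illustrative example (not part of the original module) -----------------
# A minimal sketch of how a skip rule might be declared and what its `prune`
# contract returns. The helper name, template, key, and message below are
# hypothetical.
def _exampleSkipInput() -> BuilderSkipInput:
    skip = BuilderSkipInput(
        template=os.path.join("logs", "run%(run)d.log"),
        keys={"run": int},
        message="log files are not migrated",
    )
    # Skip inputs always report that they can be pruned, handing their
    # warning message up to the parent node.
    replacement, messages, prunable = skip.prune()
    assert replacement is skip and prunable
    assert messages == ["log files are not migrated"]
    return skip

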
class BuilderTargetInput(BuilderInput):
    """An input to a `RepoWalker` that matches files that correspond to
    datasets that we want to extract.

    BuilderTargetInputs can never be pruned, and always build
    `TargetFileHandler` instances.

    Parameters
    ----------
    datasetTypeName : `str`
        Name of the dataset type.
    template : `str`
        Full Gen2 filename template.
    keys : `dict` [`str`, `type`]
        Dictionary that maps Gen2 data ID key to the type of its value.
    storageClass : `StorageClass`
        `StorageClass` for the Gen3 dataset type.
    universe : `DimensionUniverse`
        All candidate dimensions for the Gen3 dataset type.
    kwargs
        Additional keyword arguments are passed to `Translator.makeMatching`,
        along with ``datasetTypeName`` and ``keys``.
    """
    def __init__(self, *, datasetTypeName: str, template: str, keys: Dict[str, type],
                 storageClass: StorageClass, universe: DimensionUniverse, **kwargs: Any):
        # strip off [%HDU] identifiers from e.g. DECam Community Pipeline
        # products
        template = template.split('[%(')[0]
        super().__init__(template=template, keys=keys)
        self._translator = Translator.makeMatching(datasetTypeName, keys, **kwargs)
        self.datasetType = DatasetType(datasetTypeName, dimensions=self._translator.dimensionNames,
                                       storageClass=storageClass, universe=universe)

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        if self.datasetType.name == 'cpBias' or self.datasetType.name == 'cpFlat':
            # 'cpBias'/'cpFlat' are DECam Community Pipeline calibrations
            # stored as multi-extension FITS files.
            return MultiExtensionFileHandler(parser=parser,
                                             translator=self._translator,
                                             datasetType=self.datasetType)
        else:
            return TargetFileHandler(parser=parser, translator=self._translator, datasetType=self.datasetType)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, [], False

    datasetType: DatasetType
    """The Gen3 dataset type extracted by the handler this object builds
    (`lsst.daf.butler.DatasetType`).
    """


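# --- Illustrative example (not part of the original module) -----------------
# A sketch of the call shape for declaring a target dataset. A real caller
# must supply ``universe`` and ``storageClass`` from a Gen3 butler/registry;
# the helper name, dataset type name, template, keys, and any extra keyword
# arguments forwarded to `Translator.makeMatching` are hypothetical.
def _exampleTargetInput(universe: DimensionUniverse, storageClass: StorageClass,
                        **kwargs: Any) -> BuilderTargetInput:
    target = BuilderTargetInput(
        datasetTypeName="raw",
        template=os.path.join("raw", "%(visit)d", "raw_%(visit)d_%(ccd)d.fits"),
        keys={"visit": int, "ccd": int},
        storageClass=storageClass,
        universe=universe,
        **kwargs,
    )
    # Target inputs are never pruned: `prune` always returns (self, [], False),
    # and `build` yields a TargetFileHandler (or a MultiExtensionFileHandler
    # for the DECam cpBias/cpFlat special case).
    assert target.prune() == (target, [], False)
    return target

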
class BuilderPrunedTree(BuilderNode):
    """A `BuilderNode` that represents a subdirectory to be skipped, created
    by pruning a `BuilderTree` that contained only `BuilderSkipInput`
    instances.

    BuilderPrunedTrees can be pruned. When they are not pruned, they build
    `SkipHandler` instances.

    Parameters
    ----------
    messages : `list` [`str`]
        A list of warning messages to be printed when the handler produced by
        this builder matches a subdirectory.
    """

    def __init__(self, messages: List[str]):
        self._messages = messages

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        message = "; ".join(self._messages) if self._messages else None
        return SkipHandler(parser=parser, isForFiles=False, message=message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        return self, self._messages, True


class BuilderDuplicateInputs(BuilderNode):
    """A `BuilderNode` that represents a collection of `BuilderInput`
    instances that all have the same template.
    """
    def __init__(self, old: BuilderInput, new: BuilderInput):
        self._children = []
        if isinstance(old, BuilderDuplicateInputs):
            self._children.extend(old._children)
        else:
            self._children.append(old)
        self._children.append(new)
        self._messages = []  # populated in prune()

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        message = "; ".join(self._messages) if self._messages else None
        return SkipHandler(parser=parser, isForFiles=False, message=message)

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        unprunable = []
        newChildren = []
        for child in self._children:
            newChild, childMessages, toPruneChild = child.prune()
            if toPruneChild:
                self._messages.extend(childMessages)
            else:
                unprunable.append(newChild)
            newChildren.append(newChild)
        self._children = newChildren
        if len(unprunable) == 0:
            # All children are just skips, so we can prune this node if we
            # remember their messages.
            return self, self._messages, True
        elif len(unprunable) == 1 and not self._messages:
            # Exactly one child is a target, and the others were ignored with
            # no warning messages. Tell the parent node to just use that
            # child, so if we see any matching files, we just assume they're
            # for that target.
            return unprunable[0], [], False
        else:
            # Multiple targets or skips with messages, which means we won't
            # know how to handle any matching files. Replace any messages we
            # have with a single message that combines them all as well as
            # any target dataset types that they are ambiguous with.
            nested = [f"{c.datasetType.name} (target)" for c in unprunable]
            nested.extend(self._messages)
            self._messages = [f"ambiguous match: [{', '.join(nested)}]"]
            return self, self._messages, True

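
# --- Illustrative example (not part of the original module) -----------------
# When two inputs share a template, `BuilderTree.insert` wraps them in a
# `BuilderDuplicateInputs`. If every duplicate is a skip, pruning collapses
# them into a single prunable node whose messages are concatenated. The
# helper name, template, key, and messages below are hypothetical.
def _exampleDuplicateSkips() -> None:
    template = os.path.join("config", "%(visit)d.yaml")
    first = BuilderSkipInput(template, {"visit": int}, "old config is ignored")
    second = BuilderSkipInput(template, {"visit": int}, "new config is ignored")
    duplicates = BuilderDuplicateInputs(first, second)
    replacement, messages, prunable = duplicates.prune()
    assert replacement is duplicates and prunable
    assert messages == ["old config is ignored", "new config is ignored"]

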
class BuilderTree(BuilderNode):
    """A `BuilderNode` that represents a directory.

    This is the only `BuilderNode` class that is not a leaf node. If all
    of its children can be pruned, it is replaced by a `BuilderPrunedTree`
    (which can then be pruned itself). It builds `SubdirectoryHandler`
    instances when not pruned.
    """
    def __init__(self):
        self._children = {}  # Maps template path element to BuilderNode

    def insert(self, level: int, leaf: BuilderInput):
        """Insert an input leaf node into the tree, recursively constructing
        intermediate parents in order to put it at the right level.

        Parameters
        ----------
        level : `int`
            The level ``self`` is at in the larger tree, with zero the
            repository root. The right level for the leaf is given by the
            length of ``leaf.elements``.
        leaf : `BuilderInput`
            The leaf node to insert.
        """
        nextLevel = level + 1
        element = leaf.elements[level]
        if nextLevel == len(leaf.elements):
            conflict = self._children.get(element)
            if conflict is not None:
                # Sadly, the Gen2 butler has some actual dataset types that
                # use the exact same template.
                leaf = BuilderDuplicateInputs(conflict, leaf)
            self._children[element] = leaf
        else:
            child = self._children.setdefault(element, BuilderTree())
            child.insert(nextLevel, leaf)

    def fill(self, scanner: DirectoryScanner, allKeys: Dict[str, type], previousKeys: Dict[str, type], *,
             fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]):
        """Fill a `DirectoryScanner` instance by recursively building all
        child nodes.

        Parameters
        ----------
        scanner : `DirectoryScanner`
            Object to populate.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to its value type, covering all keys
            that could be used in any child template.
        previousKeys : `dict` [`str`, `type`], optional
            A dictionary containing key strings and types for Gen2 data ID
            keys that have been extracted from previous path elements of the
            same template.
        """
        if fileIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(fileIgnoreRegEx, isForFiles=True))
        if dirIgnoreRegEx is not None:
            scanner.add(IgnoreHandler(dirIgnoreRegEx, isForFiles=False))
        for template, child in self._children.items():
            parser = PathElementParser(template, allKeys, previousKeys=previousKeys)
            cumulativeKeys = previousKeys.copy()
            cumulativeKeys.update(parser.keys)
            scanner.add(child.build(parser, allKeys, cumulativeKeys, fileIgnoreRegEx=fileIgnoreRegEx,
                                    dirIgnoreRegEx=dirIgnoreRegEx))

    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        # Docstring inherited from BuilderNode.
        toPruneThis = True
        newChildren = {}
        messages = []
        # Recursively prune children.
        for template, child in list(self._children.items()):
            newChild, childMessages, toPruneChild = child.prune()
            newChildren[template] = newChild
            messages.extend(childMessages)
            if not toPruneChild:
                toPruneThis = False
        self._children = newChildren
        if toPruneThis:
            return BuilderPrunedTree(messages), messages, True
        else:
            return self, [], False

    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        # Docstring inherited from BuilderNode.
        built = SubdirectoryHandler(parser)
        self.fill(built.scanner, allKeys, cumulativeKeys, fileIgnoreRegEx=fileIgnoreRegEx,
                  dirIgnoreRegEx=dirIgnoreRegEx)
        return built
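

# --- Illustrative example (not part of the original module) -----------------
# A sketch of how `RepoWalker` construction is expected to drive this tree,
# assuming ``inputs`` is an iterable of already-constructed BuilderSkipInput
# and BuilderTargetInput instances. The helper name and the no-argument
# `DirectoryScanner()` construction are assumptions.
def _exampleBuildScanner(inputs: List[BuilderInput], *,
                         fileIgnoreRegEx: Optional[re.Pattern] = None,
                         dirIgnoreRegEx: Optional[re.Pattern] = None) -> DirectoryScanner:
    # Gather every Gen2 data ID key any template can provide, then insert
    # each input at the root (level 0) of a temporary tree.
    allKeys: Dict[str, type] = {}
    root = BuilderTree()
    for leaf in inputs:
        allKeys.update(leaf.keys)
        root.insert(0, leaf)
    # Prune skip-only branches first; `fill`/`build` assume a pruned tree.
    pruned, messages, _ = root.prune()
    scanner = DirectoryScanner()
    if isinstance(pruned, BuilderTree):
        # The root has no path element (and hence no parser) of its own, so
        # it fills the top-level scanner directly instead of being built
        # into a handler.
        pruned.fill(scanner, allKeys, {}, fileIgnoreRegEx=fileIgnoreRegEx,
                    dirIgnoreRegEx=dirIgnoreRegEx)
    return scanner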