lsst.obs.base  20.0.0-54-gba713e9+a7d430d1e1
builders.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 """Classes used in `RepoWalker` construction.
22 
23 The objects here form a temporary tree that is pruned and then transformed
24 into a similar tree of `PathElementHandler` instances. See `BuilderNode`
25 method documentation for more information.
26 """
27 from __future__ import annotations
28 
29 __all__ = ["BuilderSkipInput", "BuilderTargetInput", "BuilderTree"]
30 
31 from abc import ABC, abstractmethod
32 import os
33 import re
34 from typing import (
35  Any,
36  Dict,
37  List,
38  Optional,
39  Tuple,
40 )
41 
42 from lsst.daf.butler import DatasetType, DimensionUniverse, StorageClass, FormatterParameter
43 from ..translators import TranslatorFactory
44 from .parser import PathElementParser
45 from .scanner import PathElementHandler, DirectoryScanner
46 from .handlers import (IgnoreHandler, SubdirectoryHandler, SkipHandler,
47  TargetFileHandler)
48 
49 
class BuilderNode(ABC):
    """Abstract base for the nodes of the temporary tree from which a
    `RepoWalker` is constructed.

    A node is first pruned (`prune`) and afterwards converted into a
    `PathElementHandler` (`build`).
    """

    @abstractmethod
    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        """Try to prune this node, together with its children, from the tree.

        Returns
        -------
        replacement : `BuilderNode`
            The node that results from recursively pruning child nodes;
            frequently this is simply ``self``.
        messages : `list` [`str`]
            Warning messages a parent node should log when it encounters a
            matching path element, in the case that this node is pruned.
        prune : `bool`
            `True` if this node may be pruned from the tree.  That does not
            guarantee it will be: the node may correspond to a path element
            that should be skipped while having siblings that should not.
        """
        raise NotImplementedError

    @abstractmethod
    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        """Convert this build-tree node into the matching
        `PathElementHandler`, recursing into any children.

        `prune` must have been called before this method.

        Parameters
        ----------
        parser : `PathElementParser`
            Object that matches the path element the new handler is
            responsible for and extracts a (partial) Gen2 data ID from it.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to the type of its value.  Holds
            every key the given parser may extract, and possibly others.
        cumulativeKeys : `dict` [`str`, `type`]
            Key strings and types for the Gen2 data ID keys extracted from
            previous path elements of this template, including the ones
            extracted by ``parser``.
        fileIgnoreRegEx : `re.Pattern` or `None`
            Pattern identifying non-dataset files that can be ignored.
        dirIgnoreRegEx : `re.Pattern` or `None`
            Pattern identifying non-dataset subdirectories that can be
            ignored.

        Returns
        -------
        handler : `PathElementHandler`
            A new handler object.
        """
        raise NotImplementedError
102 
103 
105  """An intermediate base for `BuilderNode` classes that are provided as
106  direct inputs to a `RepoWalker`, and generally correspond to exactly one
107  Gen2 dataset type.
108 
109  Parameters
110  ----------
111  template : `str`
112  The complete Gen2 template to be matched (not just the template for
113  one path element).
114  keys : `dict` [`str`, `type`]
115  A mapping from Gen2 data ID key to the type of its value.
116  """
def __init__(self, template: str, keys: Dict[str, type]):
    """Record the template and its key types, and split the template
    into its path elements.

    Parameters
    ----------
    template : `str`
        The complete Gen2 template to be matched.
    keys : `dict` [`str`, `type`]
        A mapping from Gen2 data ID key to the type of its value.
    """
    self.keys = keys
    self.template = template
    # One entry per file or directory level, split on the platform's path
    # separator.
    self.elements = template.split(os.path.sep)
121 
122  template: str
123  """The complete Gen2 template to be matched (`str`).
124  """
125 
126  keys: Dict[str, type]
127  """A mapping from Gen2 data ID key to the type of its value
128  (`dict` [`str`, `type`]).
129  """
130 
131  elements: List[str]
132  """The path elements (file or directory levels) of `template`
133  (`list` of `str`).
134  """
135 
136 
138  """An input to a `RepoWalker` that indicates that matched files should be
139  skipped, possibly with a warning message.
140 
141  BuilderSkipInputs can be pruned. When they are not pruned, they build
142  `SkipHandler` instances.
143 
144  Parameters
145  ----------
146  template : `str`
147  The complete Gen2 template to be matched (not just the template for
148  one path element).
149  keys : `dict` [`str`, `type`]
150  A mapping from Gen2 data ID key to the type of its value.
151  message : `str`, optional
152  If not `None`, a warning message that should be printed either when a
153  matching file is encountered or a directory that may contain such files
154  is skipped.
155  isForFiles : `bool`, optional
156  If `True` (default), this handler should be run on files. Otherwise it
157  should be run on directories.
158  """
def __init__(self, template: str, keys: Dict[str, type], message: Optional[str] = None, *,
             isForFiles: bool = True):
    # The base class stores the template and keys and splits the template
    # into path elements.
    super().__init__(template=template, keys=keys)
    # Whether the built handler runs on files (True) or directories (False).
    self._isForFiles = isForFiles
    # Optional warning text; None means matches are skipped silently.
    self._message = message
164 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    #
    # Only ``parser`` is used; the key mappings and ignore patterns are
    # accepted for interface compatibility with the other nodes.
    handler = SkipHandler(parser=parser, isForFiles=self._isForFiles, message=self._message)
    return handler
170 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    # Docstring inherited from BuilderNode.
    #
    # A skip input can always be pruned; it reports its warning message
    # (when it has one) so the parent can log it.
    if self._message is None:
        warnings = []
    else:
        warnings = [self._message]
    return self, warnings, True
174 
175 
177  """An input to a `RepoWalker` that matches files that correspond to
178  datasets that we want to extract.
179 
180  BuilderTargetInputs can never be pruned, and always build
181  `TargetFileHandler` instances.
182 
183  Parameters
184  ----------
185  datasetTypeName : `str`
186  Name of the dataset type.
187  template : `str`
188  Full Gen2 filename template.
189  keys : `dict` [`str`, `type`]
190  Dictionary that maps Gen2 data ID key to the type of its value.
191  storageClass : `StorageClass`
192  `StorageClass` for the Gen3 dataset type.
193  universe : `DimensionUniverse`
194  All candidate dimensions for the Gen3 dataset type.
195  formatter : `lsst.daf.butler.Formatter` or `str`, optional
196  A Gen 3 formatter class or fully-qualified name.
197  translatorFactory : `TranslatorFactory`
198  Object that can be used to construct data ID translators.
199  targetHandler : `PathElementHandler`, optional
200  Override target handler for this dataset type.
201  **kwargs:
202  Additional keyword arguments are passed to `TranslatorFactory.makeMatching`,
203  along with ``datasetTypeName`` and ``keys``.
204  """
def __init__(self, *, datasetTypeName: str, template: str, keys: Dict[str, type],
             storageClass: StorageClass, universe: DimensionUniverse,
             formatter: FormatterParameter, translatorFactory: TranslatorFactory,
             targetHandler: Optional[PathElementHandler] = None,
             **kwargs: Any):
    # Keep only the part of the template before any [%HDU] identifier
    # (seen in e.g. DECam Community Pipeline products).
    baseTemplate = template.partition('[%(')[0]
    super().__init__(template=baseTemplate, keys=keys)
    translator = translatorFactory.makeMatching(datasetTypeName, keys, **kwargs)
    self._translator = translator
    # The Gen3 dimensions come from the translator; a dataset whose Gen2
    # keys include "calibDate" is flagged as a calibration.
    self.datasetType = DatasetType(datasetTypeName, dimensions=translator.dimensionNames,
                                   storageClass=storageClass, universe=universe,
                                   isCalibration=("calibDate" in keys))
    self._formatter = formatter
    # Use TargetFileHandler unless the caller supplied an override.
    self._handler = TargetFileHandler if targetHandler is None else targetHandler
221 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    #
    # ``self._handler`` is the callable chosen at construction; it is
    # invoked with keyword arguments only.  The key mappings and ignore
    # patterns are not used here.
    makeHandler = self._handler
    return makeHandler(
        parser=parser,
        translator=self._translator,
        datasetType=self.datasetType,
        formatter=self._formatter,
    )
228 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    # Docstring inherited from BuilderNode.
    #
    # A target is never prunable and contributes no warning messages.
    noMessages = []
    canPrune = False
    return self, noMessages, canPrune
232 
233  datasetType: DatasetType
234  """The Gen3 dataset type extracted by the handler this object builds
235  (`lsst.daf.butler.DatasetType`).
236  """
237 
238 
240  """A `BuilderNode` that represents a subdirectory to be skipped,
241  created by pruning `BuilderTree` that contained only `BuilderSkipInput`
242  instances.
243 
244  BuilderPrunedTrees can be pruned. When they are not pruned, they
245  build `SkipHandler` instances.
246 
247  Parameters
248  ----------
249  messages : `list` [`str`]
250  A list of warning messages to be printed when the handler produced by
251  this builder matches a subdirectory.
252  """
253 
def __init__(self, messages: List[str]):
    # The list is stored by reference, not copied.
    self._messages = messages
256 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    #
    # All stored warnings are merged into one "; "-separated message;
    # with no warnings the handler gets no message at all.
    if self._messages:
        combined = "; ".join(self._messages)
    else:
        combined = None
    # A pruned tree stands for a subdirectory, hence isForFiles=False.
    return SkipHandler(parser=parser, isForFiles=False, message=combined)
263 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    # Docstring inherited from BuilderNode.
    #
    # Always prunable; hands back the stored message list itself (not a
    # copy).
    storedMessages = self._messages
    return self, storedMessages, True
267 
268 
270  """A `BuilderNode` that represents a collection of `BuilderInput` instances
271  that all have the same template.
272  """
def __init__(self, old: BuilderInput, new: BuilderInput):
    """Group a newly inserted input with the node already registered
    for the same template.

    Parameters
    ----------
    old : `BuilderInput`
        The node previously registered for the template.  If it is itself
        a `BuilderDuplicateInputs`, its children are adopted directly so
        the grouping stays flat.
    new : `BuilderInput`
        The additional input that uses the same template.
    """
    if isinstance(old, BuilderDuplicateInputs):
        children = list(old._children)
    else:
        children = [old]
    children.append(new)
    self._children = children
    self._messages = []  # populated in prune()
281 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    #
    # The messages gathered by prune() are merged into one "; "-separated
    # warning; with no messages the handler gets no message at all.
    if self._messages:
        combined = "; ".join(self._messages)
    else:
        combined = None
    # NOTE(review): isForFiles=False is hard-coded here although this node
    # groups leaf inputs that share a template - confirm that is intended.
    return SkipHandler(parser=parser, isForFiles=False, message=combined)
288 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    # Docstring inherited from BuilderNode.
    #
    # Every child is pruned in turn.  Prunable children (skips) contribute
    # their warning messages; the replacements of non-prunable children
    # (targets) are collected in ``unprunable``.
    unprunable = []
    newChildren = []
    # Rebuilt from scratch on every call so that pruning more than once
    # neither duplicates nor nests messages.
    messages = []
    for child in self._children:
        newChild, childMessages, toPruneChild = child.prune()
        if toPruneChild:
            messages.extend(childMessages)
        else:
            unprunable.append(newChild)
        # Keep the (possibly replaced) child itself.
        newChildren.append(newChild)
    self._children = newChildren
    self._messages = messages
    if not unprunable:
        # All children are just skips, so we can prune this node if we
        # remember their messages.
        return self, self._messages, True
    if len(unprunable) == 1 and not self._messages:
        # Exactly one child is a target, and the others were ignored with
        # no warning messages.  Tell the parent node to just use that
        # child, so any matching files are assumed to be for that target.
        return unprunable[0], [], False
    # Multiple targets or skips with messages, which means we won't know
    # how to handle any matching files.  Replace the messages with a
    # single one that combines them all along with the names of the
    # target dataset types they are ambiguous with.
    nested = [f"{c.datasetType.name} (target)" for c in unprunable]
    nested.extend(self._messages)
    self._messages = [f"ambiguous match: [{', '.join(nested)}]"]
    return self, self._messages, True
320 
321 
323  """A `BuilderNode` that represents a directory.
324 
325  This is the only `BuilderNode` class that is not a leaf node. If all
326  of its children can be pruned, it is replaced by a `BuilderPrunedTree`
327  (which can then be pruned itself). It builds `SubdirectoryHandler`
328  instances when not pruned.
329  """
def __init__(self):
    # Maps each template path element to the BuilderNode registered for it.
    self._children = dict()
332 
def insert(self, level: int, leaf: BuilderInput):
    """Add an input leaf node to the tree, recursively creating the
    intermediate parents needed to put it at the right level.

    Parameters
    ----------
    level : `int`
        The level ``self`` is at in the larger tree, with zero being the
        repository root.  The right level for the leaf is given by the
        length of ``leaf.elements``.
    leaf : `BuilderInput`
        The leaf node to insert.
    """
    element = leaf.elements[level]
    childLevel = level + 1
    if childLevel != len(leaf.elements):
        # Not yet at the leaf's own level: descend into the subtree for
        # this path element, creating it if it does not exist yet.
        subtree = self._children.setdefault(element, BuilderTree())
        subtree.insert(childLevel, leaf)
        return
    existing = self._children.get(element)
    if existing is not None:
        # Sadly, the Gen2 butler has some actual dataset types that use
        # the exact same template; group them into a single node.
        leaf = BuilderDuplicateInputs(existing, leaf)
    self._children[element] = leaf
358 
def fill(self, scanner: DirectoryScanner, allKeys: Dict[str, type], previousKeys: Dict[str, type], *,
         fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]):
    """Populate a `DirectoryScanner` by recursively building every child
    node of this tree.

    Parameters
    ----------
    scanner : `DirectoryScanner`
        Object to populate.
    allKeys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to its value type, covering all keys
        that could be used in any child template.
    previousKeys : `dict` [`str`, `type`]
        Key strings and types for the Gen2 data ID keys that have been
        extracted from previous path elements of the same template.  It is
        copied, not modified.
    fileIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset files that
        can be ignored, to be applied at all levels of the directory tree.
    dirIgnoreRegEx : `re.Pattern`, optional
        A regular expression pattern that identifies non-dataset
        subdirectories that can be ignored, to be applied at all levels of
        the directory tree.
    """
    # Ignore handlers are added before any child handler: files first,
    # then directories.
    for pattern, forFiles in ((fileIgnoreRegEx, True), (dirIgnoreRegEx, False)):
        if pattern is not None:
            scanner.add(IgnoreHandler(pattern, isForFiles=forFiles))
    for element, node in self._children.items():
        parser = PathElementParser(element, allKeys, previousKeys=previousKeys)
        # Keys known to the child: those inherited from parent levels plus
        # the ones this element's parser extracts.
        cumulativeKeys = previousKeys.copy()
        cumulativeKeys.update(parser.keys)
        handler = node.build(parser, allKeys, cumulativeKeys, fileIgnoreRegEx=fileIgnoreRegEx,
                             dirIgnoreRegEx=dirIgnoreRegEx)
        scanner.add(handler)
393 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    # Docstring inherited from BuilderNode.
    #
    # Each child is pruned recursively and replaced by whatever its own
    # prune() returns.  The directory as a whole is prunable only when
    # every child is.
    replacements = {}
    collected = []
    allPrunable = True
    # Iterate over a snapshot of the current children.
    for element, node in list(self._children.items()):
        replacement, nodeMessages, nodePrunable = node.prune()
        replacements[element] = replacement
        collected.extend(nodeMessages)
        if not nodePrunable:
            allPrunable = False
    self._children = replacements
    if not allPrunable:
        # At least one child must be kept, so this directory stays and
        # reports no messages of its own.
        return self, [], False
    # Nothing below needs to be kept: stand in a BuilderPrunedTree that
    # carries the children's messages.
    return BuilderPrunedTree(collected), collected, True
411 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    #
    # The new subdirectory handler owns a scanner, which is filled with
    # the handlers built from this tree's children.  ``cumulativeKeys``
    # is what those children see as the keys from previous path elements.
    handler = SubdirectoryHandler(parser)
    ignorePatterns = dict(fileIgnoreRegEx=fileIgnoreRegEx, dirIgnoreRegEx=dirIgnoreRegEx)
    self.fill(handler.scanner, allKeys, cumulativeKeys, **ignorePatterns)
    return handler
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:222
lsst.obs.base.gen2to3.repoWalker.handlers.SkipHandler
Definition: handlers.py:162
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput._message
_message
Definition: builders.py:161
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput.__init__
def __init__(self, str template, Dict[str, type] keys, Optional[str] message=None, *bool isForFiles=True)
Definition: builders.py:159
lsst.obs.base.gen2to3.repoWalker.handlers.SubdirectoryHandler
Definition: handlers.py:201
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput.__init__
def __init__(self, str template, Dict[str, type] keys)
Definition: builders.py:117
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput.template
template
Definition: builders.py:118
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput._handler
_handler
Definition: builders.py:216
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree
Definition: builders.py:322
lsst.obs.base.gen2to3.repoWalker.builders.BuilderNode.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:56
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput.keys
keys
Definition: builders.py:119
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:412
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.__init__
def __init__(self)
Definition: builders.py:330
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:229
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:171
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput._formatter
_formatter
Definition: builders.py:213
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput.__init__
def __init__(self, *str datasetTypeName, str template, Dict[str, type] keys, StorageClass storageClass, DimensionUniverse universe, FormatterParameter formatter, TranslatorFactory translatorFactory, Optional[PathElementHandler] targetHandler=None, **Any kwargs)
Definition: builders.py:205
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs._messages
_messages
Definition: builders.py:280
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree.__init__
def __init__(self, List[str] messages)
Definition: builders.py:254
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree._children
_children
Definition: builders.py:331
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs
Definition: builders.py:269
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:282
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput
Definition: builders.py:176
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree
Definition: builders.py:239
lsst.obs.base.gen2to3.repoWalker.handlers.IgnoreHandler
Definition: handlers.py:56
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput.elements
elements
Definition: builders.py:120
lsst.obs.base.gen2to3.repoWalker.builders.BuilderNode
Definition: builders.py:50
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput
Definition: builders.py:104
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput._translator
_translator
Definition: builders.py:209
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:394
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput
Definition: builders.py:137
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs._children
_children
Definition: builders.py:274
lsst.obs.base.gen2to3.repoWalker.builders.BuilderNode.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:74
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:264
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:257
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.insert
def insert(self, int level, BuilderInput leaf)
Definition: builders.py:333
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs.__init__
def __init__(self, BuilderInput old, BuilderInput new)
Definition: builders.py:273
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:165
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.fill
def fill(self, DirectoryScanner scanner, Dict[str, type] allKeys, Dict[str, type] previousKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:359
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:289
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput._isForFiles
_isForFiles
Definition: builders.py:162
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput.datasetType
datasetType
Definition: builders.py:210
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree._messages
_messages
Definition: builders.py:255
lsst.obs.base.gen2to3.repoWalker.parser.PathElementParser
Definition: parser.py:116