lsst.obs.base  19.0.0-16-g8258e2a+1
builders.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 """Classes used in `RepoWalker` construction.
22 
23 The objects here form a temporary tree that is pruned and then transformed
24 into a similar tree of `PathElementHandler` instances. See `BuilderNode`
25 method documentation for more information.
26 """
27 from __future__ import annotations
28 
29 __all__ = ["BuilderSkipInput", "BuilderTargetInput", "BuilderTree"]
30 
31 from abc import ABC, abstractmethod
32 import os
33 import re
34 from typing import (
35  Any,
36  Dict,
37  List,
38  Optional,
39  Tuple,
40 )
41 
42 from lsst.daf.butler import DatasetType, DimensionUniverse, StorageClass
43 from ..translators import Translator
44 from .parser import PathElementParser
45 from .scanner import PathElementHandler, DirectoryScanner
46 from .handlers import IgnoreHandler, SubdirectoryHandler, SkipHandler, TargetFileHandler
47 
48 
class BuilderNode(ABC):
    """Common interface for every node of the temporary tree from which a
    `RepoWalker` is constructed.

    The tree is first pruned (`prune`) and then transformed into
    `PathElementHandler` instances (`build`).
    """

    @abstractmethod
    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        """Recursively prune this node's children and report whether this
        node may itself be pruned.

        Returns
        -------
        replacement : `BuilderNode`
            The node that should stand in for this one once its children have
            been pruned; frequently just ``self``.
        messages : `list` [`str`]
            Warnings that a parent node should log when it encounters a
            matching path element, in the event that this node is pruned.
        prune : `bool`
            `True` if this node is allowed to be pruned.  It will not
            necessarily be: a path element to be skipped may have siblings
            that must be kept.
        """
        raise NotImplementedError

    @abstractmethod
    def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
              fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
              ) -> PathElementHandler:
        """Create the `PathElementHandler` that corresponds to this node,
        recursing into any children.

        `prune` must have been called before this method.

        Parameters
        ----------
        parser : `PathElementParser`
            Object that matches the path element the new handler is
            responsible for, and extracts a (partial) Gen2 data ID from it.
        allKeys : `dict` [`str`, `type`]
            Mapping from Gen2 data ID key to the type of its value.  Holds
            every key the given parser may extract, and possibly more.
        cumulativeKeys : `dict` [`str`, `type`]
            Keys and value types of the Gen2 data ID entries extracted from
            earlier path elements of this template, plus those extracted by
            ``parser`` itself.
        fileIgnoreRegEx : `re.Pattern` or `None`
            Pattern for files to ignore.  Within this module only
            `BuilderTree` uses it, registering an `IgnoreHandler` for it on
            nested directory scanners when it is not `None`.
        dirIgnoreRegEx : `re.Pattern` or `None`
            Same as ``fileIgnoreRegEx``, but for directories.

        Returns
        -------
        handler : `PathElementHandler`
            The newly created handler.
        """
        raise NotImplementedError
101 
102 
104  """An intermediate base for `BuilderNode` classes that are provided as
105  direct inputs to a `RepoWalker`, and generally correspond to exactly one
106  Gen2 dataset type.
107 
108  Parameters
109  ----------
110  template : `str`
111  The complete Gen2 template to be matched (not just the template for
112  one path element).
113  keys : `dict` [`str`, `type`]
114  A mapping from Gen2 data ID key to the type of its value.
115  """
116  def __init__(self, template: str, keys: Dict[str, type]):
117  self.template = template
118  self.keys = keys
119  self.elements = self.template.split(os.path.sep)
120 
121  template: str
122  """The complete Gen2 template to be matched (`str`).
123  """
124 
125  keys: Dict[str, type]
126  """A mapping from Gen2 data ID key to the type of its value
127  (`dict` [`str`, `type`]).
128  """
129 
130  elements: List[str]
131  """The path elements (file or directory levels) of `template`
132  (`list` of `str`).
133  """
134 
135 
137  """An input to a `RepoWalker` that indicates that matched files should be
138  skipped, possibly with a warning message.
139 
140  BuilderSkipInputs can be pruned. When they are not pruned, they build
141  `SkipHandler` instances.
142 
143  Parameters
144  ----------
145  template : `str`
146  The complete Gen2 template to be matched (not just the template for
147  one path element).
148  keys : `dict` [`str`, `type`]
149  A mapping from Gen2 data ID key to the type of its value.
150  message : `str`, optional
151  If not `None`, a warning message that should be printed either when a
152  matching file is encountered or a directory that may contain such files
153  is skipped.
154  isForFiles : `bool`, optional
155  If `True` (default), this handler should be run on files. Otherwise it
156  should be run on directories.
157  """
158  def __init__(self, template: str, keys: Dict[str, type], message: Optional[str] = None, *,
159  isForFiles: bool = True):
160  super().__init__(template=template, keys=keys)
161  self._message = message
162  self._isForFiles = isForFiles
163 
164  def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
165  fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
166  ) -> PathElementHandler:
167  # Docstring inherited from BuilderNode.
168  return SkipHandler(parser=parser, isForFiles=self._isForFiles, message=self._message)
169 
170  def prune(self) -> Tuple[BuilderNode, List[str], bool]:
171  # Docstring inherited from BuilderNode.
172  return self, [self._message] if self._message is not None else [], True
173 
174 
176  """An input to a `RepoWalker` that matches files that correspond to
177  datasets that we want to extract.
178 
179  BuilderTargetInputs can never be pruned, and always build
180  `TargetFileHandler` instances.
181 
182  Parameters
183  ----------
184  datasetTypeName : `str`
185  Name of the dataset type.
186  template : `str`
187  Full Gen2 filename template.
188  keys : `dict` [`str`, `type`]
189  Dictionary that maps Gen2 data ID key to the type of its value.
190  storageClass : `StorageClass`
191  `StorageClass` for the Gen3 dataset type.
192  universe : `DimensionUniverse`
193  All candidate dimensions for the Gen3 dataset type.
194  kwargs:
195  Additional keyword arguments are passed to `Translator.makeMatching`,
196  along with ``datasetTypeName`` and ``keys``.
197  """
198  def __init__(self, *, datasetTypeName: str, template: str, keys: Dict[str, type],
199  storageClass: StorageClass, universe: DimensionUniverse, **kwargs: Any):
200  super().__init__(template=template, keys=keys)
201  self._translator = Translator.makeMatching(datasetTypeName, keys, **kwargs)
202  self.datasetType = DatasetType(datasetTypeName, dimensions=self._translator.dimensionNames,
203  storageClass=storageClass, universe=universe)
204 
205  def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
206  fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
207  ) -> PathElementHandler:
208  # Docstring inherited from BuilderNode.
209  return TargetFileHandler(parser=parser, translator=self._translator, datasetType=self.datasetType)
210 
211  def prune(self) -> Tuple[BuilderNode, List[str], bool]:
212  # Docstring inherited from BuilderNode.
213  return self, [], False
214 
215  datasetType: DatasetType
216  """The Gen3 dataset type extracted by the handler this object builds
217  (`lsst.daf.butler.DatasetType`).
218  """
219 
220 
222  """A `BuilderNode` that represents a subdirectory to be skipped,
223  created by pruning `BuilderTree` that contained only `BuilderSkipInput`
224  instances.
225 
226  BuilderPrunedTrees can be pruned. When they are not pruned, they
227  build `SkipHandler` instances.
228 
229  Parameters
230  ----------
231  messages : `list` [`str`]
232  A list of warning messages to be printed when the handler produced by
233  this builder matches a subdirectory.
234  """
235 
236  def __init__(self, messages: List[str]):
237  self._messages = messages
238 
239  def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
240  fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
241  ) -> PathElementHandler:
242  # Docstring inherited from BuilderNode.
243  message = "; ".join(self._messages) if self._messages else None
244  return SkipHandler(parser=parser, isForFiles=False, message=message)
245 
246  def prune(self) -> Tuple[BuilderNode, List[str], bool]:
247  # Docstring inherited from BuilderNode.
248  return self, self._messages, True
249 
250 
252  """A `BuilderNode` that represents a collection of `BuilderInput` instances
253  that all have the same template.
254  """
255  def __init__(self, old: BuilderInput, new: BuilderInput):
256  self._children = []
257  if isinstance(old, BuilderDuplicateInputs):
258  self._children.extend(old._children)
259  else:
260  self._children.append(old)
261  self._children.append(new)
262  self._messages = [] # populated in prune()
263 
264  def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
265  fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
266  ) -> PathElementHandler:
267  # Docstring inherited from BuilderNode.
268  message = "; ".join(self._messages) if self._messages else None
269  return SkipHandler(parser=parser, isForFiles=False, message=message)
270 
271  def prune(self) -> Tuple[BuilderNode, List[str], bool]:
272  # Docstring inherited from BuilderNode.
273  unprunable = []
274  newChildren = []
275  for child in self._children:
276  newChild, childMessages, toPruneChild = child.prune()
277  if toPruneChild:
278  self._messages.extend(childMessages)
279  else:
280  unprunable.append(newChild)
281  newChildren.append(newChildren)
282  self._children = newChildren
283  if len(unprunable) == 0:
284  # All children are just skips, so we can prune this node if we
285  # remember their messages.
286  return self, self._messages, True
287  elif len(unprunable) == 1 and not self._messages:
288  # Exactly one child is a target, and the others were ignored with
289  # no warning messages. Tell parent node to just use that child,
290  # so if we see any matching files, we just assume they're for that
291  # target.
292  return unprunable[0], [], False
293  else:
294  # Multiple targets or skips with messages, which means we won't
295  # know how to handle any matching files. Replace any messages we
296  # have with a single message that combines them all as well as
297  # any target dataset types that they are ambiguous with.
298  nested = [f"{c.datasetType.name} (target)" for c in unprunable]
299  nested.extend(self._messages)
300  self._messages = [f"ambiguous match: [{', '.join(nested)}]"]
301  return self, self._messages, True
302 
303 
305  """A `BuilderNode` that represents a directory.
306 
307  This is the only `BuilderNode` class that is not a leaf node. If all
308  of its children can be pruned, it is replaced by a `BuilderPrunedTree`
309  (which can then be pruned itself). It builds `SubdirectoryHandler`
310  instances when not pruned.
311  """
312  def __init__(self):
313  self._children = {} # Maps template path element to BuilderNode
314 
315  def insert(self, level: int, leaf: BuilderInput):
316  """Insert an input leaf node into the tree, recursively constructing
317  intermediate parents in order to put it at the right level.
318 
319  Parameters
320  ----------
321  level : `int`
322  The level ``self``is at in the larger tree, with zero the
323  repository root. The right level for the leaf is given by the
324  length of ``leaf.elements``.
325  leaf : `BuilderInput`
326  The leaf node to insert.
327  """
328  nextLevel = level + 1
329  element = leaf.elements[level]
330  if nextLevel == len(leaf.elements):
331  conflict = self._children.get(element)
332  if conflict is not None:
333  # Sadly, the Gen2 butler has some actual dataset types that
334  # use the exact same template.
335  leaf = BuilderDuplicateInputs(conflict, leaf)
336  self._children[element] = leaf
337  else:
338  child = self._children.setdefault(element, BuilderTree())
339  child.insert(nextLevel, leaf)
340 
341  def fill(self, scanner: DirectoryScanner, allKeys: Dict[str, type], previousKeys: Dict[str, type], *,
342  fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]):
343  """Fill a `DirectoryScanner` instance by recursively building all
344  child nodes.
345 
346  Parameters
347  ----------
348  scanner : `DirectoryScanner`
349  Object to populate.
350  allKeys : `dict` [`str`, `type`]
351  Mapping from Gen2 data ID key to its value type, covering all keys
352  that could be used in any child template.
353  previousKeys : `dict` [`str`, `type`], optional
354  A dictionary containing key strings and types for Gen2 data ID keys
355  that have been extracted from previous path elements of the same
356  template.
357  """
358  if fileIgnoreRegEx is not None:
359  scanner.add(IgnoreHandler(fileIgnoreRegEx, isForFiles=True))
360  if dirIgnoreRegEx is not None:
361  scanner.add(IgnoreHandler(dirIgnoreRegEx, isForFiles=False))
362  for template, child in self._children.items():
363  parser = PathElementParser(template, allKeys, previousKeys=previousKeys)
364  cumulativeKeys = previousKeys.copy()
365  cumulativeKeys.update(parser.keys)
366  scanner.add(child.build(parser, allKeys, cumulativeKeys, fileIgnoreRegEx=fileIgnoreRegEx,
367  dirIgnoreRegEx=dirIgnoreRegEx))
368 
369  def prune(self) -> Tuple[BuilderNode, List[str], bool]:
370  # Docstring inherited from BuilderNode.
371  toPruneThis = True
372  newChildren = {}
373  messages = []
374  # Recursively prune children.
375  for template, child in list(self._children.items()):
376  newChild, childMessages, toPruneChild = child.prune()
377  newChildren[template] = newChild
378  messages.extend(childMessages)
379  if not toPruneChild:
380  toPruneThis = False
381  self._children = newChildren
382  if toPruneThis:
383  return BuilderPrunedTree(messages), messages, True
384  else:
385  return self, [], False
386 
387  def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
388  fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
389  ) -> PathElementHandler:
390  # Docstring inherited from BuilderNode.
391  built = SubdirectoryHandler(parser)
392  self.fill(built.scanner, allKeys, cumulativeKeys, fileIgnoreRegEx=fileIgnoreRegEx,
393  dirIgnoreRegEx=dirIgnoreRegEx)
394  return built