# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("RepoConverter", "DataIdExtractor")

import os
import pickle
from collections import OrderedDict  # for move_to_end

import yaml

# Register the YAML loader for repositoryCfg.yaml files.
import lsst.daf.persistence.repositoryCfg  # noqa: F401

from lsst.daf.butler import DataId, DatasetType, DatasetRef
from lsst.daf.butler.gen2convert import FilePathParser, Translator
from lsst.log import Log
from lsst.log.utils import temporaryLogLevel
from lsst.utils import doImport


def findMapperClass(root):
    """Find the mapper class associated with a Gen2 data repository root.

    Parameters
    ----------
    root : `str`
        Path to a Gen2 repository root directory.

    Returns
    -------
    cls : `type`
        A subclass of `lsst.obs.base.CameraMapper`.

    Raises
    ------
    ValueError
        Raised if the directory does not appear to be the root of a
        Gen2 data repository.
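
    Examples
    --------
    A minimal sketch; the repository path is hypothetical, and the class
    returned depends entirely on the repository's ``_mapper`` or
    ``repositoryCfg.yaml`` contents:

    >>> cls = findMapperClass("/path/to/gen2repo")  # doctest: +SKIP
    >>> mapper = cls(root="/path/to/gen2repo")  # doctest: +SKIP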
    """
    cfgPath = os.path.join(root, "repositoryCfg.yaml")
    if os.path.exists(cfgPath):
        with open(cfgPath, "r") as f:
            repoCfg = yaml.load(f, Loader=yaml.UnsafeLoader)
            return repoCfg.mapper
    parentLinkPath = os.path.join(root, "_parent")
    if os.path.exists(parentLinkPath):
        return findMapperClass(os.readlink(parentLinkPath))
    mapperFilePath = os.path.join(root, "_mapper")
    if os.path.exists(mapperFilePath):
        with open(mapperFilePath, "r") as f:
            mapperClassPath = f.read().strip()
        return doImport(mapperClassPath)
    calibRegistryPath = os.path.join(root, "calibRegistry.sqlite3")
    if os.path.exists(calibRegistryPath):
        return findMapperClass(os.path.normpath(os.path.join(root, os.path.pardir)))
    raise ValueError(f"Could not determine (Gen2) mapper class for repo at '{root}'.")


class DataIdExtractor:
    """A class that extracts Gen3 data IDs from Gen2 filenames for a
    particular dataset type.

    Parameters
    ----------
    datasetTypeName : `str`
        Name of the dataset type the object will process.
    storageClass : `str` or `lsst.daf.butler.StorageClass`
        Gen3 storage class of the dataset type.
    universe : `lsst.daf.butler.DimensionUniverse`
        Object containing all dimension definitions.
    baseDataId : `dict`
        Key-value pairs that may need to appear in the Gen3 data ID, but can
        never be inferred from a Gen2 filename. This should always include
        the instrument name (even Gen3 data IDs that don't involve the
        instrument dimension have instrument-dependent Gen2 filenames) and
        should also include the skymap name for any data ID that involves
        tracts or patches.
    filePathParser : `lsst.daf.butler.gen2convert.FilePathParser`, optional
        Object responsible for reading a Gen2 data ID from a filename. Will
        be created from ``mapper`` if not provided.
    translator : `lsst.daf.butler.gen2convert.Translator`, optional
        Object responsible for converting a Gen2 data ID into a Gen3 data ID.
        Will be created if not provided.
    mapper : `lsst.obs.base.CameraMapper`, optional
        Object that defines Gen2 filename templates. Must be provided if
        ``filePathParser`` is not.
    skyMap : `lsst.skymap.BaseSkyMap`, optional
        SkyMap that defines tracts and patches. Must be provided for datasets
        with a ``patch`` key in their data IDs.
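
    Examples
    --------
    A sketch of typical construction; the storage class name, instrument
    name, and ``mapper`` instance are illustrative assumptions:

    >>> extractor = DataIdExtractor(  # doctest: +SKIP
    ...     "calexp", "ExposureF",
    ...     universe=butler.registry.dimensions,
    ...     baseDataId={"instrument": "HSC"},
    ...     mapper=mapper)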
    """

    def __init__(self, datasetTypeName, storageClass, *, universe, baseDataId,
                 filePathParser=None, translator=None, mapper=None, skyMap=None):
        if filePathParser is None:
            filePathParser = FilePathParser.fromMapping(mapper.mappings[datasetTypeName])
        self.filePathParser = filePathParser
        if translator is None:
            translator = Translator.makeMatching(filePathParser.datasetType, baseDataId, skyMap=skyMap)
        self.translator = translator
        self.datasetType = DatasetType(datasetTypeName, dimensions=self.translator.dimensionNames,
                                       storageClass=storageClass, universe=universe)

    def apply(self, fileNameInRoot):
        """Extract a Gen3 data ID from the given filename.

        Parameters
        ----------
        fileNameInRoot : `str`
            Filename relative to a Gen2 data repository root.

        Returns
        -------
        dataId : `lsst.daf.butler.DataId` or `None`
            The Gen3 data ID, or `None` if the file was not recognized as an
            instance of the extractor's dataset type.
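
        Examples
        --------
        The filename below is a made-up illustration; real paths are
        defined by the mapper's templates:

        >>> dataId = extractor.apply("calexp/r/calexp-0001234-056.fits")  # doctest: +SKIP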
        """
        gen2id = self.filePathParser(fileNameInRoot)
        if gen2id is None:
            return None
        return DataId(self.translator(gen2id), dimensions=self.datasetType.dimensions)


class RepoConverter:
    """A helper class that ingests (some of) the contents of a Gen2 data
    repository into a Gen3 data repository.

    Parameters
    ----------
    root : `str`
        Root of the Gen2 data repository.
    universe : `lsst.daf.butler.DimensionUniverse`
        Object containing all dimension definitions.
    baseDataId : `dict`
        Key-value pairs that may need to appear in the Gen3 data ID, but can
        never be inferred from a Gen2 filename. This should always include
        the instrument name (even Gen3 data IDs that don't involve the
        instrument dimension have instrument-dependent Gen2 filenames) and
        should also include the skymap name in order to process any data IDs
        that involve tracts or patches.
    mapper : `lsst.obs.base.CameraMapper`, optional
        Object that defines Gen2 filename templates. Will be identified,
        imported, and constructed from ``root`` if not provided.
    skyMap : `lsst.skymap.BaseSkyMap`, optional
        SkyMap that defines tracts and patches. Must be provided in order to
        process datasets with a ``patch`` key in their data IDs.
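
    Examples
    --------
    A sketch of the intended workflow; the path, instrument name, and
    dataset type are illustrative, and ``butler`` is assumed to be an
    existing `lsst.daf.butler.Butler`:

    >>> converter = RepoConverter("/path/to/gen2repo",  # doctest: +SKIP
    ...                           universe=butler.registry.dimensions,
    ...                           baseDataId={"instrument": "HSC"})
    >>> converter.addDatasetType("calexp", "ExposureF")  # doctest: +SKIP
    >>> converter.convertRepo(butler, transfer="symlink")  # doctest: +SKIP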
    """

    COADD_NAMES = ("deep", "goodSeeing", "dcr")
    REPO_ROOT_FILES = ("registry.sqlite3", "_mapper", "repositoryCfg.yaml",
                       "calibRegistry.sqlite3", "_parent")

    def __init__(self, root, *, universe, baseDataId, mapper=None, skyMap=None):
        self.root = root
        if mapper is None:
            # Shush spurious log messages from Gen2 Mapper classes.
            # These aren't spurious in other contexts - we're just playing
            # fast and loose with mapper initialization, because we don't
            # care about things like parent lookups (we just want the set
            # of templates).
            with temporaryLogLevel("CameraMapper", Log.ERROR):
                with temporaryLogLevel("HscMapper", Log.ERROR):
                    cls = findMapperClass(root)
                    mapper = cls(root=root)
        self.mapper = mapper
        self.universe = universe
        self.baseDataId = baseDataId
        self.extractors = OrderedDict()  # for move_to_end
        if "skymap" in baseDataId:
            if skyMap is None:
                for name in self.COADD_NAMES:
                    mapping = self.mapper.mappings.get(f"{name}Coadd_skyMap", None)
                    if mapping is None:
                        continue
                    filename = os.path.join(self.root, mapping.template)
                    if os.path.exists(filename):
                        if skyMap is not None:
                            raise ValueError("Multiple SkyMaps found in repository; please use "
                                             "multiple RepoConverters with an explicit skyMap "
                                             "argument for each.")
                        with open(filename, "rb") as f:
                            skyMap = pickle.load(f, encoding="latin1")
        self.skyMap = skyMap

    def addDatasetType(self, datasetTypeName, storageClass):
        """Add a dataset type to those recognized by the converter.

        Parameters
        ----------
        datasetTypeName : `str`
            String name of the dataset type.
        storageClass : `str` or `lsst.daf.butler.StorageClass`
            Gen3 storage class of the dataset type.

        Returns
        -------
        extractor : `DataIdExtractor`
            The object that will be used to extract data IDs for instances of
            this dataset type (also held internally, so the return value can
            usually be ignored).
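
        Examples
        --------
        The dataset type and storage class names here are illustrative:

        >>> extractor = converter.addDatasetType("calexp", "ExposureF")  # doctest: +SKIP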
        """
        r = DataIdExtractor(datasetTypeName, storageClass, mapper=self.mapper,
                            universe=self.universe, baseDataId=self.baseDataId,
                            skyMap=self.skyMap)
        self.extractors[datasetTypeName] = r
        return r

    def extractDatasetRef(self, fileNameInRoot):
        """Extract a Gen3 `~lsst.daf.butler.DatasetRef` from a filename in a
        Gen2 data repository.

        Parameters
        ----------
        fileNameInRoot : `str`
            Name of the file, relative to the root of its Gen2 repository.

        Returns
        -------
        ref : `lsst.daf.butler.DatasetRef` or `None`
            Reference to the Gen3 dataset that would be created by converting
            this file, or `None` if the file is not recognized as an instance
            of a dataset type known to this converter.
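
        Examples
        --------
        A sketch with a made-up filename:

        >>> ref = converter.extractDatasetRef("calexp/r/calexp-0001234-056.fits")  # doctest: +SKIP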
        """
        for datasetTypeName, extractor in self.extractors.items():
            dataId = extractor.apply(fileNameInRoot)
            if dataId is not None:
                # Move the extractor that matched to the front of the
                # dictionary, as we're likely to see instances of the
                # same DatasetType together.
                self.extractors.move_to_end(datasetTypeName, last=False)
                return DatasetRef(extractor.datasetType, dataId=dataId)
        return None

    def walkRepo(self, directory=None, skipDirs=()):
        """Recursively walk (a subset of) a Gen2 data repository, yielding
        files that may be convertible.

        Parameters
        ----------
        directory : `str`, optional
            A subdirectory of the repository root to process, instead of
            processing the entire repository.
        skipDirs : sequence of `str`
            Subdirectories that should be skipped.

        Yields
        ------
        fileNameInRoot : `str`
            Name of a file in the repository, relative to the root of the
            repository.
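
        Examples
        --------
        A sketch of listing candidate files while skipping a hypothetical
        ``CALIB`` subdirectory:

        >>> for fileNameInRoot in converter.walkRepo(skipDirs=("CALIB",)):  # doctest: +SKIP
        ...     print(fileNameInRoot)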
        """
        if directory is None:
            directory = self.root
        for dirPath, subdirNamesInDir, fileNamesInDir in os.walk(directory, followlinks=True):
            # Remove subdirectories that appear to be repositories themselves
            # from the walk.
            def isRepoRoot(dirName):
                return any(os.path.exists(os.path.join(dirPath, dirName, f))
                           for f in self.REPO_ROOT_FILES)
            subdirNamesInDir[:] = [d for d in subdirNamesInDir
                                   if not isRepoRoot(d) and d not in skipDirs]
            # Loop over files in this directory, skipping repository
            # bookkeeping files; data ID extraction and ingest are handled
            # by the caller (e.g. `convertRepo`).
            dirPathInRoot = dirPath[len(self.root) + len(os.path.sep):]
            for fileNameInDir in fileNamesInDir:
                fileNameInRoot = os.path.join(dirPathInRoot, fileNameInDir)
                if fileNameInRoot in self.REPO_ROOT_FILES:
                    continue
                yield fileNameInRoot

    def convertRepo(self, butler, *, directory=None, transfer=None, formatter=None, skipDirs=()):
        """Ingest all recognized files into a Gen3 repository.

        Parameters
        ----------
        butler : `lsst.daf.butler.Butler`
            Gen3 butler that files should be ingested into.
        directory : `str`, optional
            A subdirectory of the repository root to process, instead of
            processing the entire repository.
        transfer : `str`, optional
            If not `None`, must be one of 'move', 'copy', 'hardlink', or
            'symlink', indicating how to transfer the file.
        formatter : `lsst.daf.butler.Formatter`, optional
            Formatter that should be used to retrieve the Dataset. If not
            provided, the formatter will be constructed according to
            Datastore configuration. This should only be used when converting
            only a single dataset type or multiple dataset types of the same
            storage class.
        skipDirs : sequence of `str`
            Subdirectories that should be skipped.
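
        Examples
        --------
        A sketch assuming ``butler`` is an existing Gen3
        `lsst.daf.butler.Butler` and dataset types have already been added
        via `addDatasetType`:

        >>> converter.convertRepo(butler, transfer="symlink")  # doctest: +SKIP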
        """
        log = Log.getLogger("RepoConverter")
        for extractor in self.extractors.values():
            butler.registry.registerDatasetType(extractor.datasetType)
        skipped = {}
        for file in self.walkRepo(directory=directory, skipDirs=skipDirs):
            ref = self.extractDatasetRef(file)
            if ref is not None:
                try:
                    butler.ingest(os.path.join(self.root, file), ref, transfer=transfer,
                                  formatter=formatter)
                except Exception as err:
                    skipped.setdefault(type(err), []).append(str(err))
        if skipped:
            for cls, messages in skipped.items():
                log.warn("Skipped %s files due to exceptions of type %s.",
                         len(messages), cls.__name__)
                if log.isDebugEnabled():
                    for message in messages:
                        log.debug(message)