lsst.obs.base  19.0.0-43-gbcf6a3c
rootRepoConverter.py
# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["RootRepoConverter"]

import os
import re
import itertools
from typing import TYPE_CHECKING, Iterator, Optional, Tuple, List, Set

from lsst.skymap import BaseSkyMap
from lsst.daf.butler import DatasetType, DatasetRef, FileDataset
from .standardRepoConverter import StandardRepoConverter

SKYMAP_DATASET_TYPES = {
    coaddName: f"{coaddName}Coadd_skyMap" for coaddName in ("deep", "goodSeeing", "dcr")
}
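# For reference, the comprehension above expands to:
# {"deep": "deepCoadd_skyMap", "goodSeeing": "goodSeeingCoadd_skyMap",
#  "dcr": "dcrCoadd_skyMap"}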

if TYPE_CHECKING:
    from lsst.daf.butler import SkyPixDimension
    from ..ingest import RawExposureData


def getDataPaths(dataRefs):
    """Strip HDU identifiers from paths and return a unique set of paths.

    Parameters
    ----------
    dataRefs : iterable of `lsst.daf.persistence.ButlerDataRef`
        The gen2 datarefs to strip "[HDU]" values from.

    Returns
    -------
    paths : `set` [`str`]
        The unique file paths without appended "[HDU]".
    """
    paths = set()
    for dataRef in dataRefs:
        path = dataRef.getUri()
        # Handle FITS files with multiple HDUs (e.g. DECam raws), whose
        # gen2 URIs carry a trailing "[<HDU>]" suffix.
        paths.add(path.split('[')[0])
    return paths

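# A minimal sketch of `getDataPaths` in action; `_FakeDataRef` is a
# hypothetical stand-in for `lsst.daf.persistence.ButlerDataRef`, whose
# getUri() may return a path with a trailing "[<HDU>]" suffix:
#
#     class _FakeDataRef:
#         def __init__(self, uri):
#             self._uri = uri
#
#         def getUri(self):
#             return self._uri
#
#     refs = [_FakeDataRef("/repo/raw/c4d_150218_000001.fits.fz[1]"),
#             _FakeDataRef("/repo/raw/c4d_150218_000001.fits.fz[2]")]
#     assert getDataPaths(refs) == {"/repo/raw/c4d_150218_000001.fits.fz"}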

class RootRepoConverter(StandardRepoConverter):
    """A specialization of `RepoConverter` for root data repositories.

    `RootRepoConverter` adds support for raw images (mostly delegated to the
    parent task's `RawIngestTask` subtask) and reference catalogs.

    Parameters
    ----------
    kwds
        Keyword arguments are forwarded to (and required by) `RepoConverter`.
    """

    def __init__(self, **kwds):
        super().__init__(run=None, **kwds)
        self._exposureData: List[RawExposureData] = []
        self._refCats: List[Tuple[str, SkyPixDimension]] = []
        if self.task.config.rootSkyMapName is not None:
            self._rootSkyMap = self.task.config.skyMaps[self.task.config.rootSkyMapName].skyMap.apply()
        else:
            self._rootSkyMap = None
        self._chain = None

    def isDatasetTypeSpecial(self, datasetTypeName: str) -> bool:
        # Docstring inherited from RepoConverter.
        return (
            super().isDatasetTypeSpecial(datasetTypeName)
            or datasetTypeName in ("raw", "ref_cat", "ref_cat_config")
            # In Gen2, some of these are in the root repo, not a calib repo.
            or datasetTypeName in self.task.config.curatedCalibrations
        )

    def getSpecialDirectories(self) -> List[str]:
        # Docstring inherited from RepoConverter.
        return super().getSpecialDirectories() + ["CALIB", "ref_cats", "rerun"]

    def findMatchingSkyMap(self, datasetTypeName: str) -> Tuple[Optional[BaseSkyMap], Optional[str]]:
        # Docstring inherited from StandardRepoConverter.findMatchingSkyMap.
        skyMap, name = super().findMatchingSkyMap(datasetTypeName)
        if skyMap is None and self.task.config.rootSkyMapName is not None:
            self.task.log.debug(
                "Assuming configured root skymap with name '%s' for dataset %s.",
                self.task.config.rootSkyMapName, datasetTypeName
            )
            skyMap = self._rootSkyMap
            name = self.task.config.rootSkyMapName
        return skyMap, name

    def prep(self):
        # Docstring inherited from RepoConverter.
        # Gather information about raws.
        if self.task.raws is not None:
            self.task.log.info(f"Preparing raws from root {self.root}.")
            if self.subset is not None:
                dataRefs = itertools.chain.from_iterable(
                    self.butler2.subset(self.task.config.rawDatasetType,
                                        visit=visit) for visit in self.subset.visits
                )
            else:
                dataRefs = self.butler2.subset(self.task.config.rawDatasetType)
            dataPaths = getDataPaths(dataRefs)
            self.task.log.debug("Prepping files: %s", dataPaths)
            self._exposureData.extend(self.task.raws.prep(dataPaths))
        # Gather information about reference catalogs.
        if self.task.isDatasetTypeIncluded("ref_cat") and len(self.task.config.refCats) != 0:
            from lsst.meas.algorithms import DatasetConfig as RefCatDatasetConfig
            for refCat in os.listdir(os.path.join(self.root, "ref_cats")):
                path = os.path.join(self.root, "ref_cats", refCat)
                configFile = os.path.join(path, "config.py")
                if not os.path.exists(configFile):
                    continue
                if refCat not in self.task.config.refCats:
                    continue
                self.task.log.info(f"Preparing ref_cat {refCat} from root {self.root}.")
                onDiskConfig = RefCatDatasetConfig()
                onDiskConfig.load(configFile)
                if onDiskConfig.indexer.name != "HTM":
                    raise ValueError(f"Reference catalog '{refCat}' uses unsupported "
                                     f"pixelization '{onDiskConfig.indexer.name}'.")
                level = onDiskConfig.indexer["HTM"].depth
                try:
                    dimension = self.task.universe[f"htm{level}"]
                except KeyError as err:
                    raise ValueError(f"Reference catalog {refCat} uses HTM level {level}, but no htm{level} "
                                     f"skypix dimension is configured for this registry.") from err
                self.task.useSkyPix(dimension)
                self._refCats.append((refCat, dimension))
        if self.task.isDatasetTypeIncluded("brightObjectMask") and self.task.config.rootSkyMapName:
            self.task.useSkyMap(self._rootSkyMap, self.task.config.rootSkyMapName)
        super().prep()

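    # For context, `prep` above expects a gen2 reference catalog tree laid
    # out roughly as follows (names here are hypothetical, not taken from a
    # real repository):
    #
    #     <root>/ref_cats/
    #         gaia_dr2/
    #             config.py      # must configure an "HTM" indexer
    #             131072.fits    # one SimpleCatalog shard per HTM pixel ID
    #             ...
    #
    # where config.py sets the HTM depth that is matched to an htm<level>
    # skypix dimension, e.g.:
    #
    #     config.indexer.name = "HTM"
    #     config.indexer["HTM"].depth = 7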
    def insertDimensionData(self):
        # Docstring inherited from RepoConverter.
        self.task.log.info(f"Inserting observation dimension records from {self.root}.")
        records = {"visit": [], "exposure": [], "visit_detector_region": []}
        for exposure in self._exposureData:
            for dimension, recordsForDimension in exposure.records.items():
                records[dimension].extend(recordsForDimension)
        self.task.raws.insertDimensionData(records)

    def iterDatasets(self) -> Iterator[FileDataset]:
        # Docstring inherited from RepoConverter.
        # Iterate over reference catalog files.
        for refCat, dimension in self._refCats:
            datasetType = DatasetType(refCat, dimensions=[dimension], universe=self.task.universe,
                                      storageClass="SimpleCatalog")
            if self.subset is None:
                regex = re.compile(r"(\d+)\.fits")
                for fileName in os.listdir(os.path.join(self.root, "ref_cats", refCat)):
                    m = regex.match(fileName)
                    if m is not None:
                        htmId = int(m.group(1))
                        dataId = self.task.registry.expandDataId({dimension: htmId})
                        yield FileDataset(path=os.path.join(self.root, "ref_cats", refCat, fileName),
                                          refs=DatasetRef(datasetType, dataId))
            else:
                for begin, end in self.subset.skypix[dimension]:
                    for htmId in range(begin, end):
                        dataId = self.task.registry.expandDataId({dimension: htmId})
                        yield FileDataset(path=os.path.join(self.root, "ref_cats", refCat, f"{htmId}.fits"),
                                          refs=DatasetRef(datasetType, dataId))
        yield from super().iterDatasets()

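    # For illustration, the shard regex in `iterDatasets` maps gen2 shard
    # file names to HTM pixel IDs and skips everything else (file names
    # below are made up):
    #
    #     >>> regex = re.compile(r"(\d+)\.fits")
    #     >>> regex.match("131072.fits").group(1)
    #     '131072'
    #     >>> regex.match("master_schema.fits") is None
    #     True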
    def ingest(self):
        # Docstring inherited from RepoConverter.
        self._chain = {}
        if self.task.raws is not None:
            self.task.log.info("Ingesting raws from root %s into run %s.", self.root,
                               self.task.raws.butler.run)
            self.task.registry.registerDatasetType(self.task.raws.datasetType)
            self._chain.setdefault(self.task.raws.butler.run, set()).add(self.task.raws.datasetType.name)
            # We need to delegate to RawIngestTask to actually ingest raws,
            # rather than just including those datasets in iterDatasets for
            # the base class to handle, because we don't want to assume we
            # can use the Datastore-configured Formatter for raw data.
            for exposure in self._exposureData:
                self.task.raws.ingestExposureDatasets(exposure)
        super().ingest()

    def getRun(self, datasetTypeName: str) -> str:
        # Docstring inherited from RepoConverter.
        run = self.task.config.runs[datasetTypeName]
        self._chain.setdefault(run, set()).add(datasetTypeName)
        return run

    def getCollectionChain(self) -> List[Tuple[str, Set[str]]]:
        """Return tuples of run name and associated dataset type names that
        can be used to construct a chained collection that refers to the
        converted root repository (`list` [ `tuple` ]).
        """
        return list(self._chain.items())
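
# A sketch of how the chain accumulated by `getRun` and `ingest` comes back
# out of `getCollectionChain` (run and dataset type names here are
# hypothetical):
#
#     converter.getCollectionChain()
#     # [("raw/all", {"raw"}),
#     #  ("refcats", {"gaia_dr2"}),
#     #  ("shared/calexp", {"calexp", "src"})]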