lsst.obs.base  18.0.0-7-g898c2a1+1
bootstrapRepo.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 
22 
# Public API of this module.  The generic-ingest and bright-object-mask config
# classes are exported as well because they are the types of the
# ``calibrations`` and ``brightObjectMasks`` fields of `BootstrapRepoConfig`.
__all__ = (
    "BootstrapRepoConfig",
    "BootstrapRepoTask",
    "BootstrapRepoInputs",
    "BootstrapRepoSkyMapConfig",
    "BootstrapRepoRefCatConfig",
    "BootstrapRepoGenericIngestConfig",
    "BootstrapRepoBrightObjectMasksConfig",
)
25 
26 import os.path
27 from dataclasses import dataclass
28 from typing import List
29 import glob
30 
31 from lsst import sphgeom
32 from lsst.daf.butler import Butler, DatasetType
33 from lsst.daf.butler.instrument import Instrument
34 from lsst.pex.config import Config, Field, ConfigurableField, ConfigDictField, ConfigField
35 from lsst.pipe.base import Task
36 from lsst.obs.base.gen3 import RawIngestTask, makeTransferChoiceField
37 from lsst.skymap import skyMapRegistry
38 from lsst.meas.algorithms import DatasetConfig
39 
40 from .repoConverter import RepoConverter
41 from .calibRepoConverter import CalibRepoConverter
42 
43 
# NOTE(review): the class header line was missing from this listing; the name
# is taken from ``__all__`` and from ``BootstrapRepoConfig.skymaps``, and the
# ``Config`` base is inferred from the ``Field`` members -- confirm.
class BootstrapRepoSkyMapConfig(Config):
    """Configuration for one SkyMap to register in the repository."""

    datasetTypeName = Field(
        doc=("DatasetType used to write the SkyMap instance.  If None, the instance will "
             "not be written, and only the Registry will be modified."),
        dtype=str,
        default="deepCoadd_skyMap",
        optional=True,
    )
    collection = Field(
        doc=("Butler collection the SkyMap instance should be written to.  If None, the "
             "collection used to initialize the butler will be used."),
        dtype=str,
        default="skymaps",
        optional=True,
    )
    skyMap = skyMapRegistry.makeField(
        doc="Type and parameters for the SkyMap itself.",
        default="dodeca",
    )
55 
56 
# NOTE(review): the class header line was missing from this listing; the name
# is taken from ``__all__`` and from ``BootstrapRepoConfig.refCats``, and the
# ``Config`` base is inferred from the ``Field`` members -- confirm.
class BootstrapRepoRefCatConfig(Config):
    """Configuration for ingesting one reference catalog."""

    # NOTE(review): `BootstrapRepoTask.bootstrapRefCats` registers a dataset
    # type with the literal name "ref_cat" and does not read this field.
    datasetTypeName = Field(
        doc="DatasetType used to write the catalog shards.",
        dtype=str,
        default="ref_cat",
    )
    filterByRawRegions = Field(
        doc=("If True, do not ingest shards that do not overlap visits.  "
             "Does not guarantee that all ingested shards will overlap a visit."),
        dtype=bool,
        default=True,
    )
    collection = Field(
        doc=("Butler collection the reference catalog should be written to.  If None, the "
             "collection used to initialize the butler will be used.  May also be a string with "
             "the format placeholder '{name}', which will be replaced with the reference "
             "catalog name (i.e. the key of the configuration dictionary)."),
        dtype=str,
        default="refcats/{name}",
        optional=True,
    )
    transfer = makeTransferChoiceField(default="symlink")
69 
70 
# NOTE(review): the class header line was missing from this listing; the name
# is taken from ``BootstrapRepoConfig.calibrations`` and the ``Config`` base is
# inferred from the ``Field`` members -- confirm.
class BootstrapRepoGenericIngestConfig(Config):
    """Options shared by ingest steps: target collection and transfer mode."""

    collection = Field(
        doc=("Butler collection that datasets should be ingested into.  "
             "If None, the collection used to initialize the butler will be used."),
        dtype=str,
        default=None,
        optional=True,
    )
    transfer = makeTransferChoiceField(default="symlink")
76 
77 
# NOTE(review): the class header line was missing from this listing.  The name
# is taken from ``BootstrapRepoConfig.brightObjectMasks``.  The base class is
# inferred: `BootstrapRepoTask.bootstrapBrightObjectMasks` reads ``collection``
# and ``transfer`` from this config, and those fields are declared on
# `BootstrapRepoGenericIngestConfig`, not here -- confirm.
class BootstrapRepoBrightObjectMasksConfig(BootstrapRepoGenericIngestConfig):
    """Configuration for ingesting bright object masks."""

    skymap = Field(
        doc="SkyMap dimension name used to define the tracts and patches for bright object masks.",
        dtype=str,
        default=None,
        optional=False,
    )
    filterByRawRegions = Field(
        doc=("If True, do not ingest files that do not overlap visits.  "
             "Does not guarantee that all ingested files will overlap a visit."),
        dtype=bool,
        default=True,
    )
84 
85 
class BootstrapRepoConfig(Config):
    """Top-level configuration for `BootstrapRepoTask`.

    Holds one sub-configuration per bootstrap step: raw ingest, skymaps,
    reference catalogs, bright object masks and master calibrations.
    """

    # Subtask that ingests raw files.
    raws = ConfigurableField(
        target=RawIngestTask,
        doc=("Configuration for subtask responsible for ingesting raws and adding "
             "visit and exposure dimension entries."),
    )

    # One entry per skymap; the dictionary key is the skymap dimension name.
    skymaps = ConfigDictField(
        doc=("SkyMap definitions to register and ingest into the repo, keyed by "
             "skymap dimension name."),
        keytype=str,
        itemtype=BootstrapRepoSkyMapConfig,
        default={},
    )

    # One entry per reference catalog; the key is its subdirectory name.
    refCats = ConfigDictField(
        doc=("Reference catalogs to ingest into the repo, keyed by their subdirectory "
             "within the overall reference catalog root."),
        keytype=str,
        itemtype=BootstrapRepoRefCatConfig,
        default={},
    )

    brightObjectMasks = ConfigField(
        doc="Configuration for ingesting brightObjectMask files.",
        dtype=BootstrapRepoBrightObjectMasksConfig,
    )

    calibrations = ConfigField(
        doc="Configuration for ingesting and creating master calibration products.",
        dtype=BootstrapRepoGenericIngestConfig,
    )

    def setDefaults(self):
        """Default the raw-ingest subtask to symlinking files."""
        self.raws.transfer = "symlink"
107 
108 
# NOTE(review): the class header line was missing from this listing; the name
# is taken from ``__all__`` and from the `BootstrapRepoTask.run` docstring.
@dataclass
class BootstrapRepoInputs:
    """Simple struct that aggregates all non-config inputs to
    `BootstrapRepoTask`.

    Generally, this struct contains inputs that depend on the organization
    of the input files on a particular system, while the config includes
    everything else.  The exception is the ``instrument`` attribute, which
    cannot be included in the config because it's expected that driver code
    will actually use it (via
    `~lsst.daf.butler.instrument.Instrument.applyConfigOverrides`) to define
    the config.
    """

    instrument: Instrument
    """Instrument subclass instance for the raws and calibrations to be
    included in the initial repo.
    """

    raws: List[str]
    """List of filenames for raw files to ingest (complete paths).
    """

    refCatRoot: str
    """Root of the directory containing the reference catalogs, with immediate
    subdirectories that correspond to different reference catalogs.
    """

    brightObjectMaskRoot: str
    """Root of the Gen2 repository containing bright object masks.
    """

    calibRoot: str
    """Root of the Gen2 calibration repository containing flats, biases,
    darks, and fringes.
    """
145 
146 
class BootstrapRepoTask(Task):
    """A Task that populates a Gen3 repo with the minimum content needed to
    run the DRP pipelines.

    BootstrapRepoTask currently relies on Gen2 data repository information
    for both bright object masks and master calibrations, but nothing else;
    unlike dedicated Gen2->Gen3 conversion code, it will be updated in the
    future as more pure-Gen3 approaches become available.

    Like other Gen3 Tasks that are not PipelineTasks, BootstrapRepoTask does
    not yet have a dedicated, general-purpose command-line driver.  At least
    for now, it is instead expected that custom driver scripts will be written
    for different contexts and predefined datasets.

    Parameters
    ----------
    config : `BootstrapRepoConfig`
        Configuration for the task.
    butler : `lsst.daf.butler.Butler`
        Gen3 Butler defining the repository to populate.  New butlers with
        different output collections will be created as necessary from this
        butler to match the output collections defined in the configuration.
    kwds
        Additional keyword arguments are forwarded to the
        `lsst.pipe.base.Task` constructor.
    """

    ConfigClass = BootstrapRepoConfig

    _DefaultName = "bootstrapRepo"

    def __init__(self, config=None, *, butler, **kwds):
        super().__init__(config, **kwds)
        self.butler = butler
        self.makeSubtask("raws", butler=self.butler)
        # Filled in by bootstrapSkyMaps (skymap name -> SkyMap instance) and
        # read back by bootstrapBrightObjectMasks.
        self.skyMaps = {}

    def getButler(self, collection=None):
        """Create a new butler that writes into the given collection.

        Parameters
        ----------
        collection : `str`, optional
            The new output collection.  If `None`, ``self.butler`` is returned
            directly.

        Returns
        -------
        butler : `lsst.daf.butler.Butler`
            Butler instance pointing at the same repository as
            ``self.butler``, but possibly a different collection.
        """
        if collection is None:
            return self.butler
        return Butler(butler=self.butler, run=collection)

    def run(self, inputs):
        """Run all steps involved in populating the new repository.

        The steps run in this order: instrument, calibrations, raws,
        reference catalogs, skymaps, bright object masks.

        Parameters
        ----------
        inputs : `BootstrapRepoInputs`
            Filenames and paths for the data to be ingested.
        """
        self.bootstrapInstrument(inputs.instrument)
        self.bootstrapCalibrations(inputs.instrument, inputs.calibRoot)
        self.bootstrapRaws(inputs.raws)
        self.bootstrapRefCats(inputs.refCatRoot)
        self.bootstrapSkyMaps()
        self.bootstrapBrightObjectMasks(inputs.instrument, inputs.brightObjectMaskRoot)

    def bootstrapInstrument(self, instrument):
        """Add an instrument, associated metadata, and human-curated
        calibrations to the repository.

        Parameters
        ----------
        instrument : `lsst.daf.butler.instrument.Instrument`
            Instrument class that defines detectors, physical filters, and
            curated calibrations to ingest.
        """
        self.log.info("Registering instrument '%s' and adding curated calibrations.", instrument.getName())
        with self.butler.transaction():
            instrument.register(self.butler.registry)
            # Curated calibrations go to the same collection as the master
            # calibrations ingested by bootstrapCalibrations.
            instrument.writeCuratedCalibrations(self.getButler(self.config.calibrations.collection))

    def bootstrapSkyMaps(self):
        """Add configured SkyMaps to the repository.

        This both registers skymap dimension entries (the skymap, tract, and
        patch tables, and their associated join tables) and adds a
        ``<something>Coadd_skyMap`` dataset.  Each constructed SkyMap is also
        stored in ``self.skyMaps`` under its dimension name.
        """
        for name, config in self.config.skymaps.items():
            self.log.info("Registering skymap '%s'.", name)
            with self.butler.transaction():
                skyMap = config.skyMap.apply()
                skyMap.register(name, self.butler.registry)
                # A datasetTypeName of None means "registry entries only".
                if config.datasetTypeName is not None:
                    datasetType = DatasetType(config.datasetTypeName, dimensions=["skymap"],
                                              storageClass="SkyMap",
                                              universe=self.butler.registry.dimensions)
                    self.butler.registry.registerDatasetType(datasetType)
                    self.getButler(config.collection).put(skyMap, datasetType, skymap=name)
            self.skyMaps[name] = skyMap

    def bootstrapRaws(self, files):
        """Ingest raw images.

        This step must be run after `bootstrapInstrument`, but may be run
        multiple times with different arguments (which may be overlapping if
        the nested `RawIngestTask` is configured to ignore duplicates).

        Parameters
        ----------
        files : sequence of `str`
            The complete path names of the files to be ingested.

        Returns
        -------
        result
            Whatever the ``raws`` subtask's ``run`` method returns.
        """
        self.log.info("Ingesting raw images.")
        return self.raws.run(files)  # transaction handled internally, according to config.

    # NOTE(review): the ``def`` line of this method was missing from the
    # listing; the name is taken from the call in `bootstrapRefCats`.
    def computeRawSkyPixels(self):
        """Compute and return the skypix dimension entries that overlap
        already-ingested visits.

        Returns
        -------
        skypix : `list`
            Distinct ``skypix`` values found in ``visit_skypix_join``.
        """
        # TODO: provide a non-SQL way to efficiently perform this query?
        return [
            row["skypix"] for row in self.butler.registry.query(
                "SELECT DISTINCT skypix FROM visit_skypix_join"
            )
        ]

    def bootstrapRefCats(self, root):
        """Ingest reference catalogs.

        This step must be run after `bootstrapRaws` if the
        ``filterByRawRegions`` config option is `True` for any reference
        catalog.

        Parameters
        ----------
        root : `str`
            Root of the directory containing the reference catalogs, with
            immediate subdirectories that correspond to different reference
            catalogs.

        Raises
        ------
        ValueError
            Raised if a catalog is not HTM-indexed, if the registry does not
            use an HTM pixelization, or if the two HTM levels differ.
        """
        if not self.config.refCats:
            return
        # Only query the registry when at least one catalog needs filtering.
        rawSkyPixels = None
        if any(config.filterByRawRegions for config in self.config.refCats.values()):
            rawSkyPixels = self.computeRawSkyPixels()
        # NOTE(review): the dataset type name is fixed to "ref_cat" here; the
        # per-catalog ``datasetTypeName`` config field is not consulted.
        datasetType = DatasetType("ref_cat", dimensions=["skypix"], storageClass="SimpleCatalog",
                                  universe=self.butler.registry.dimensions)
        self.butler.registry.registerDatasetType(datasetType)
        for name, config in self.config.refCats.items():
            self.log.info("Ingesting reference catalog '%s'.", name)
            with self.butler.transaction():
                onDiskConfig = DatasetConfig()
                onDiskConfig.load(os.path.join(root, name, "config.py"))
                if onDiskConfig.indexer.name != "HTM":
                    raise ValueError(f"Reference catalog '{name}' uses unsupported "
                                     f"pixelization '{onDiskConfig.indexer.name}'.")
                pixelization = self.butler.registry.pixelization
                if not isinstance(pixelization, sphgeom.HtmPixelization):
                    raise ValueError(f"Registry uses unsupported pixelization class "
                                     f"{pixelization.__class__}.")
                # Read the depth the same way for the check and for the error
                # message (the message used to read ``indexer.depth``).
                onDiskDepth = onDiskConfig.indexer["HTM"].depth
                if onDiskDepth != pixelization.getLevel():
                    raise ValueError(f"Registry HTM level {pixelization.getLevel()} "
                                     f"does not match reference catalog level {onDiskDepth}.")
                # The template uses the *named* placeholder '{name}', so it
                # must be filled by keyword; None means "use the default
                # butler" and must not be formatted at all.
                collection = config.collection
                if collection is not None:
                    collection = collection.format(name=name)
                butler = self.getButler(collection)
                if config.filterByRawRegions:
                    missing = []
                    for index in rawSkyPixels:
                        path = os.path.join(root, name, f"{index}.fits")
                        if os.path.exists(path):
                            butler.ingest(path, datasetType, transfer=config.transfer, skypix=index)
                        else:
                            missing.append(index)
                    if missing:
                        self.log.warn("Some overlapping reference catalog shards missing: %s", missing)
                else:
                    for path in glob.glob(os.path.join(root, name, "*.fits")):
                        if path.endswith("master_schema.fits"):
                            continue
                        # Shard files are named "<integer index>.fits".
                        basename, _ = os.path.splitext(os.path.basename(path))
                        try:
                            index = int(basename)
                        except ValueError:
                            self.log.warn("Unrecognized file in reference catalog root: '%s'.", path)
                            continue
                        butler.ingest(path, datasetType, transfer=config.transfer, skypix=index)

    def computeRawTracts(self, skymap):
        """Compute and return the tract dimension entries that overlap
        already-ingested visits.

        Parameters
        ----------
        skymap : `str`
            Skymap dimension name used to restrict the query.

        Returns
        -------
        tracts : `list`
            Distinct ``tract`` values found in ``visit_tract_join`` for the
            given skymap.
        """
        # TODO: provide a non-SQL way to efficiently perform this query?
        return [
            row["tract"] for row in self.butler.registry.query(
                "SELECT DISTINCT tract FROM visit_tract_join WHERE skymap=:skymap",
                skymap=skymap
            )
        ]

    def bootstrapBrightObjectMasks(self, instrument, root):
        """Ingest bright object masks from a Gen2 data repository.

        This step must be run after `bootstrapRaws` if the
        ``filterByRawRegions`` config option is `True` for bright object
        masks, and must always be run after `bootstrapSkyMaps`.

        Parameters
        ----------
        instrument : `lsst.daf.butler.instrument.Instrument`
            Instrument subclass instance; used to relate Gen2 filter
            strings to Gen3 physical_filters and abstract_filters.
        root : `str`
            Root of the Gen2 repository containing bright object masks.

        Raises
        ------
        KeyError
            Raised if the configured skymap name is not in ``self.skyMaps``
            (i.e. it was not registered by `bootstrapSkyMaps`).
        """
        self.log.info("Ingesting bright object masks.")
        maskConfig = self.config.brightObjectMasks
        butler = self.getButler(maskConfig.collection)
        baseDataId = {
            "skymap": maskConfig.skymap,
            "instrument": instrument.getName()
        }
        converter = RepoConverter(root, universe=butler.registry.dimensions, baseDataId=baseDataId,
                                  skyMap=self.skyMaps[maskConfig.skymap])
        converter.addDatasetType("brightObjectMask", "ObjectMaskCatalog")
        if maskConfig.filterByRawRegions:
            # Convert one tract directory at a time, each in its own
            # transaction.
            for tract in self.computeRawTracts(maskConfig.skymap):
                with self.butler.transaction():
                    converter.convertRepo(butler, directory=f"{root}/deepCoadd/BrightObjectMasks/{tract:d}",
                                          transfer=maskConfig.transfer)
        else:
            with self.butler.transaction():
                converter.convertRepo(butler, transfer=maskConfig.transfer)

    def bootstrapCalibrations(self, instrument, root):
        """Ingest master calibrations from a Gen2 calibration data repository.

        At present, all master calibrations in the Gen2 repository are
        transferred, even those unrelated to the ingested raws.

        This step must be run after `bootstrapInstrument`.

        Parameters
        ----------
        instrument : `lsst.daf.butler.instrument.Instrument`
            Instrument subclass instance for the raws and calibrations to be
            included in the initial repo.
        root : `str`
            Root of the Gen2 calibration data repository.
        """
        self.log.info("Ingesting calibrations.")
        calibConfig = self.config.calibrations
        baseDataId = {"instrument": instrument.getName()}
        butler = self.getButler(calibConfig.collection)
        converter = CalibRepoConverter(root, universe=butler.registry.dimensions, baseDataId=baseDataId)
        converter.addDatasetType("flat", "MaskedImageF")
        converter.addDatasetType("bias", "ImageF")
        converter.addDatasetType("dark", "ImageF")
        converter.addDatasetType("sky", "ExposureF")
        converter.addDatasetType("fringe", "ExposureF")
        # TODO, DM-16805: No StorageClass/Formatter for yBackground in Gen3.
        with self.butler.transaction():
            # Use the calibrations transfer mode; this previously read the
            # bright-object-mask setting by mistake.
            converter.convertRepo(butler, transfer=calibConfig.transfer)
def __init__(self, config=None, butler, kwds)
def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default=None)
Definition: ingest.py:48
def bootstrapBrightObjectMasks(self, instrument, root)