lsst.pipe.tasks  21.0.0-55-g0be6b205+66ae927d20
postprocess.py
1 # This file is part of pipe_tasks
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 
22 import functools
23 import pandas as pd
24 import numpy as np
25 from collections import defaultdict
26 
27 import lsst.geom
28 import lsst.pex.config as pexConfig
29 import lsst.pipe.base as pipeBase
30 import lsst.daf.base as dafBase
31 from lsst.pipe.base import connectionTypes
32 import lsst.afw.table as afwTable
33 from lsst.meas.base import SingleFrameMeasurementTask
34 from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
35 from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer
36 
37 from .parquetTable import ParquetTable
38 from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
39 from .functors import CompositeFunctor, RAColumn, DecColumn, Column
40 
41 
42 def flattenFilters(df, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
43  """Flattens a dataframe with multilevel column index
44  """
45  newDf = pd.DataFrame()
46  for band in set(df.columns.to_frame()['band']):
47  subdf = df[band]
48  columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
49  newColumns = {c: columnFormat.format(band, c)
50  for c in subdf.columns if c not in noDupCols}
51  cols = list(newColumns.keys())
52  newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)
53 
54  newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
55  return newDf
56 
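# Example (not part of the original file): a minimal sketch of what
# flattenFilters does to a two-band frame with a ('band', 'column') MultiIndex
# like the one built in TransformObjectCatalogTask.run below; the toy column
# names are assumptions for illustration only.
#
# >>> import pandas as pd
# >>> columns = pd.MultiIndex.from_product(
# ...     [["g", "r"], ["coord_ra", "coord_dec", "PsFlux"]],
# ...     names=["band", "column"])
# >>> df = pd.DataFrame([[10.0, -5.0, 1.0, 10.0, -5.0, 2.0]], columns=columns)
# >>> sorted(flattenFilters(df, camelCase=True).columns)
# ['coord_dec', 'coord_ra', 'gPsFlux', 'rPsFlux']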
57 
58 class WriteObjectTableConfig(pexConfig.Config):
59  engine = pexConfig.Field(
60  dtype=str,
61  default="pyarrow",
62  doc="Parquet engine for writing (pyarrow or fastparquet)"
63  )
64  coaddName = pexConfig.Field(
65  dtype=str,
66  default="deep",
67  doc="Name of coadd"
68  )
69 
70 
71 class WriteObjectTableTask(CmdLineTask):
72  """Write filter-merged source tables to parquet
73  """
74  _DefaultName = "writeObjectTable"
75  ConfigClass = WriteObjectTableConfig
76  RunnerClass = MergeSourcesRunner
77 
78  # Names of table datasets to be merged
79  inputDatasets = ('forced_src', 'meas', 'ref')
80 
81  # Tag of output dataset written by `MergeSourcesTask.write`
82  outputDataset = 'obj'
83 
84  def __init__(self, butler=None, schema=None, **kwargs):
85  # It is a shame that this class can't use the default init for CmdLineTask
86  # But to do so would require its own special task runner, which is many
87  # more lines of specialization, so this is how it is for now
88  CmdLineTask.__init__(self, **kwargs)
89 
90  def runDataRef(self, patchRefList):
91  """!
92  @brief Merge coadd sources from multiple bands. Calls @ref `run` which must be defined in
93  subclasses that inherit from MergeSourcesTask.
94  @param[in] patchRefList list of data references for each filter
95  """
96  catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
97  dataId = patchRefList[0].dataId
98  mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
99  self.write(patchRefList[0], mergedCatalog)
100 
101  @classmethod
102  def _makeArgumentParser(cls):
103  """Create a suitable ArgumentParser.
104 
105  We will use the ArgumentParser to get a list of data
106  references for patches; the RunnerClass will sort them into lists
107  of data references for the same patch.
108 
109  References the first element of self.inputDatasets, rather than
110  self.inputDataset.
111  """
112  return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])
113 
114  def readCatalog(self, patchRef):
115  """Read input catalogs
116 
117  Read all the input datasets given by the 'inputDatasets'
118  attribute.
119 
120  Parameters
121  ----------
122  patchRef : `lsst.daf.persistence.ButlerDataRef`
123  Data reference for patch
124 
125  Returns
126  -------
127  Tuple consisting of band name and a dict of catalogs, keyed by
128  dataset name
129  """
130  band = patchRef.get(self.config.coaddName + "Coadd_filterLabel", immediate=True).bandLabel
131  catalogDict = {}
132  for dataset in self.inputDatasets:
133  catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
134  self.log.info("Read %d sources from %s for band %s: %s" %
135  (len(catalog), dataset, band, patchRef.dataId))
136  catalogDict[dataset] = catalog
137  return band, catalogDict
138 
139  def run(self, catalogs, tract, patch):
140  """Merge multiple catalogs.
141 
142  Parameters
143  ----------
144  catalogs : `dict`
145  Mapping from filter names to dict of catalogs.
146  tract : int
147  tractId to use for the tractId column
148  patch : str
149  patchId to use for the patchId column
150 
151  Returns
152  -------
153  catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
154  Merged dataframe, with each column prefixed by
155  `filter_tag(filt)`, wrapped in the parquet writer shim class.
156  """
157 
158  dfs = []
159  for filt, tableDict in catalogs.items():
160  for dataset, table in tableDict.items():
161  # Convert afwTable to pandas DataFrame
162  df = table.asAstropy().to_pandas().set_index('id', drop=True)
163 
164  # Sort columns by name, to ensure matching schema among patches
165  df = df.reindex(sorted(df.columns), axis=1)
166  df['tractId'] = tract
167  df['patchId'] = patch
168 
169  # Make columns a 3-level MultiIndex
170  df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
171  names=('dataset', 'band', 'column'))
172  dfs.append(df)
173 
174  catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
175  return ParquetTable(dataFrame=catalog)
176 
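# Example (not part of the original file): a toy illustration of how the
# three-level column index produced by run() above is addressed; the flux
# column name is an assumption for illustration only.
#
# >>> import pandas as pd
# >>> columns = pd.MultiIndex.from_tuples(
# ...     [("meas", "g", "base_PsfFlux_instFlux"),
# ...      ("meas", "r", "base_PsfFlux_instFlux")],
# ...     names=("dataset", "band", "column"))
# >>> df = pd.DataFrame([[1.0, 2.0]], columns=columns)
# >>> float(df["meas"]["g"]["base_PsfFlux_instFlux"].iloc[0])
# 1.0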
177  def write(self, patchRef, catalog):
178  """Write the output.
179 
180  Parameters
181  ----------
182  catalog : `ParquetTable`
183  Catalog to write
184  patchRef : `lsst.daf.persistence.ButlerDataRef`
185  Data reference for patch
186  """
187  patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
188  # since the filter isn't actually part of the data ID for the dataset we're saving,
189  # it's confusing to see it in the log message, even if the butler simply ignores it.
190  mergeDataId = patchRef.dataId.copy()
191  del mergeDataId["filter"]
192  self.log.info("Wrote merged catalog: %s" % (mergeDataId,))
193 
194  def writeMetadata(self, dataRefList):
195  """No metadata to write, and not sure how to write it for a list of dataRefs.
196  """
197  pass
198 
199 
200 class WriteSourceTableConnections(pipeBase.PipelineTaskConnections,
201  dimensions=("instrument", "visit", "detector")):
202 
203  catalog = connectionTypes.Input(
204  doc="Input full-depth catalog of sources produced by CalibrateTask",
205  name="src",
206  storageClass="SourceCatalog",
207  dimensions=("instrument", "visit", "detector")
208  )
209  outputCatalog = connectionTypes.Output(
210  doc="Catalog of sources, `src` in Parquet format",
211  name="source",
212  storageClass="DataFrame",
213  dimensions=("instrument", "visit", "detector")
214  )
215 
216 
217 class WriteSourceTableConfig(pipeBase.PipelineTaskConfig,
218  pipelineConnections=WriteSourceTableConnections):
219  doApplyExternalPhotoCalib = pexConfig.Field(
220  dtype=bool,
221  default=False,
222  doc=("Add local photoCalib columns from the calexp.photoCalib? Should only be set True if "
223  "generating Source Tables from older src tables which do not already have local calib columns")
224  )
225  doApplyExternalSkyWcs = pexConfig.Field(
226  dtype=bool,
227  default=False,
228  doc=("Add local WCS columns from the calexp.wcs? Should only be set True if "
229  "generating Source Tables from older src tables which do not already have local calib columns")
230  )
231 
232 
233 class WriteSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
234  """Write source table to parquet
235  """
236  _DefaultName = "writeSourceTable"
237  ConfigClass = WriteSourceTableConfig
238 
239  def runDataRef(self, dataRef):
240  src = dataRef.get('src')
241  if self.config.doApplyExternalPhotoCalib or self.config.doApplyExternalSkyWcs:
242  src = self.addCalibColumns(src, dataRef)
243 
244  ccdVisitId = dataRef.get('ccdExposureId')
245  result = self.run(src, ccdVisitId=ccdVisitId)
246  dataRef.put(result.table, 'source')
247 
248  def runQuantum(self, butlerQC, inputRefs, outputRefs):
249  inputs = butlerQC.get(inputRefs)
250  inputs['ccdVisitId'] = butlerQC.quantum.dataId.pack("visit_detector")
251  result = self.run(**inputs).table
252  outputs = pipeBase.Struct(outputCatalog=result.toDataFrame())
253  butlerQC.put(outputs, outputRefs)
254 
255  def run(self, catalog, ccdVisitId=None):
256  """Convert `src` catalog to parquet
257 
258  Parameters
259  ----------
260  catalog: `afwTable.SourceCatalog`
261  catalog to be converted
262  ccdVisitId: `int`
263  ccdVisitId to be added as a column
264 
265  Returns
266  -------
267  result : `lsst.pipe.base.Struct`
268  ``table``
269  `ParquetTable` version of the input catalog
270  """
271  self.log.info("Generating parquet table from src catalog %s", ccdVisitId)
272  df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
273  df['ccdVisitId'] = ccdVisitId
274  return pipeBase.Struct(table=ParquetTable(dataFrame=df))
275 
276  def addCalibColumns(self, catalog, dataRef):
277  """Add columns with local calibration evaluated at each centroid
278 
279  for backwards compatibility with old repos.
280  This exists for the purpose of converting old src catalogs
281  (which don't have the expected local calib columns) to Source Tables.
282 
283  Parameters
284  ----------
285  catalog: `afwTable.SourceCatalog`
286  catalog to which calib columns will be added
287  dataRef: `lsst.daf.persistence.ButlerDataRef`
288  for fetching the calibs from disk.
289 
290  Returns
291  -------
292  newCat: `afwTable.SourceCatalog`
293  Source Catalog with requested local calib columns
294  """
295  mapper = afwTable.SchemaMapper(catalog.schema)
296  measureConfig = SingleFrameMeasurementTask.ConfigClass()
297  measureConfig.doReplaceWithNoise = False
298 
299  # Just need the WCS or the PhotoCalib attached to an exposure, so read a minimal cutout
300  exposure = dataRef.get('calexp_sub',
301  bbox=lsst.geom.BoxI(lsst.geom.PointI(0, 0), lsst.geom.PointI(1, 1)))
302 
303  mapper = afwTable.SchemaMapper(catalog.schema)
304  mapper.addMinimalSchema(catalog.schema, True)
305  schema = mapper.getOutputSchema()
306 
307  exposureIdInfo = dataRef.get("expIdInfo")
308  measureConfig.plugins.names = []
309  if self.config.doApplyExternalSkyWcs:
310  plugin = 'base_LocalWcs'
311  if plugin in schema:
312  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalSkyWcs=False")
313  else:
314  measureConfig.plugins.names.add(plugin)
315 
316  if self.config.doApplyExternalPhotoCalib:
317  plugin = 'base_LocalPhotoCalib'
318  if plugin in schema:
319  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalPhotoCalib=False")
320  else:
321  measureConfig.plugins.names.add(plugin)
322 
323  measurement = SingleFrameMeasurementTask(config=measureConfig, schema=schema)
324  newCat = afwTable.SourceCatalog(schema)
325  newCat.extend(catalog, mapper=mapper)
326  measurement.run(measCat=newCat, exposure=exposure, exposureId=exposureIdInfo.expId)
327  return newCat
328 
329  def writeMetadata(self, dataRef):
330  """No metadata to write.
331  """
332  pass
333 
334  @classmethod
335  def _makeArgumentParser(cls):
336  parser = ArgumentParser(name=cls._DefaultName)
337  parser.add_id_argument("--id", 'src',
338  help="data ID, e.g. --id visit=12345 ccd=0")
339  return parser
340 
341 
342 class PostprocessAnalysis(object):
343  """Calculate columns from ParquetTable
344 
345  This object manages and organizes an arbitrary set of computations
346  on a catalog. The catalog is defined by a
347  `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
348  `deepCoadd_obj` dataset, and the computations are defined by a collection
349  of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently,
350  a `CompositeFunctor`).
351 
352  After the object is initialized, accessing the `.df` attribute (which
353  holds the `pandas.DataFrame` containing the results of the calculations) triggers
354  computation of said dataframe.
355 
356  One of the conveniences of using this object is the ability to define a desired common
357  filter for all functors. This enables the same functor collection to be passed to
358  several different `PostprocessAnalysis` objects without having to change the original
359  functor collection, since the `filt` keyword argument of this object triggers an
360  overwrite of the `filt` property for all functors in the collection.
361 
362  This object also allows a list of refFlags to be passed, and defines a set of default
363  refFlags that are always included even if not requested.
364 
365  If a list of `ParquetTable` objects is passed, rather than a single one, then the
366  calculations will be mapped over all the input catalogs. In principle, it should
367  be straightforward to parallelize this activity, but initial tests have failed
368  (see TODO in code comments).
369 
370  Parameters
371  ----------
372  parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
373  Source catalog(s) for computation
374 
375  functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
376  Computations to do (functors that act on `parq`).
377  If a dict, the output
378  DataFrame will have columns keyed accordingly.
379  If a list, the column keys will come from the
380  `.shortname` attribute of each functor.
381 
382  filt : `str` (optional)
383  Filter in which to calculate. If provided,
384  this will overwrite any existing `.filt` attribute
385  of the provided functors.
386 
387  flags : `list` (optional)
388  List of flags (per-band) to include in output table.
389 
390  refFlags : `list` (optional)
391  List of refFlags (only reference band) to include in output table.
392 
393 
394  """
395  _defaultRefFlags = []
396  _defaultFuncs = (('coord_ra', RAColumn()),
397  ('coord_dec', DecColumn()))
398 
399  def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
400  self.parq = parq
401  self.functors = functors
402 
403  self.filt = filt
404  self.flags = list(flags) if flags is not None else []
405  self.refFlags = list(self._defaultRefFlags)
406  if refFlags is not None:
407  self.refFlags += list(refFlags)
408 
409  self._df = None
410 
411  @property
412  def defaultFuncs(self):
413  funcs = dict(self._defaultFuncs)
414  return funcs
415 
416  @property
417  def func(self):
418  additionalFuncs = self.defaultFuncs
419  additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
420  additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})
421 
422  if isinstance(self.functors, CompositeFunctor):
423  func = self.functors
424  else:
425  func = CompositeFunctor(self.functors)
426 
427  func.funcDict.update(additionalFuncs)
428  func.filt = self.filt
429 
430  return func
431 
432  @property
433  def noDupCols(self):
434  return [name for name, func in self.func.funcDict.items() if func.noDup or func.dataset == 'ref']
435 
436  @property
437  def df(self):
438  if self._df is None:
439  self.compute()
440  return self._df
441 
442  def compute(self, dropna=False, pool=None):
443  # map over multiple parquet tables
444  if type(self.parq) in (list, tuple):
445  if pool is None:
446  dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
447  else:
448  # TODO: Figure out why this doesn't work (pyarrow pickling issues?)
449  dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
450  self._df = pd.concat(dflist)
451  else:
452  self._df = self.func(self.parq, dropna=dropna)
453 
454  return self._df
455 
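# Usage sketch (not part of the original file; the parquet path and functor
# choices below are assumptions): PostprocessAnalysis is normally driven by
# TransformCatalogBaseTask, but it can be exercised directly on a deepCoadd_obj
# table, with the computation deferred until `.df` is first accessed.
#
# >>> from lsst.pipe.tasks.parquetTable import MultilevelParquetTable
# >>> from lsst.pipe.tasks.functors import Mag, Column
# >>> parq = MultilevelParquetTable("deepCoadd_obj.parq")       # hypothetical file
# >>> funcs = {'psfMag': Mag('base_PsfFlux', dataset='meas'),
# ...          'inputCount': Column('base_InputCount_value', dataset='meas')}
# >>> analysis = PostprocessAnalysis(parq, funcs, filt='g',
# ...                                refFlags=['detect_isPrimary'])
# >>> df = analysis.df                                          # triggers compute()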
456 
457 class TransformCatalogBaseConnections(pipeBase.PipelineTaskConnections,
458  dimensions=()):
459  """Expected Connections for subclasses of TransformCatalogBaseTask.
460 
461  Must be subclassed.
462  """
463  inputCatalog = connectionTypes.Input(
464  name="",
465  storageClass="DataFrame",
466  )
467  outputCatalog = connectionTypes.Output(
468  name="",
469  storageClass="DataFrame",
470  )
471 
472 
473 class TransformCatalogBaseConfig(pipeBase.PipelineTaskConfig,
474  pipelineConnections=TransformCatalogBaseConnections):
475  functorFile = pexConfig.Field(
476  dtype=str,
477  doc='Path to YAML file specifying functors to be computed',
478  default=None,
479  optional=True
480  )
481 
482 
483 class TransformCatalogBaseTask(CmdLineTask, pipeBase.PipelineTask):
484  """Base class for transforming/standardizing a catalog
485 
486  by applying functors that convert units and apply calibrations.
487  The purpose of this task is to perform a set of computations on
488  an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
489  results to a new dataset (which needs to be declared in an `outputDataset`
490  attribute).
491 
492  The calculations to be performed are defined in a YAML file that specifies
493  a set of functors to be computed, provided as
494  a `--functorFile` config parameter. An example of such a YAML file
495  is the following:
496 
497  funcs:
498  psfMag:
499  functor: Mag
500  args:
501  - base_PsfFlux
502  filt: HSC-G
503  dataset: meas
504  cmodel_magDiff:
505  functor: MagDiff
506  args:
507  - modelfit_CModel
508  - base_PsfFlux
509  filt: HSC-G
510  gauss_magDiff:
511  functor: MagDiff
512  args:
513  - base_GaussianFlux
514  - base_PsfFlux
515  filt: HSC-G
516  count:
517  functor: Column
518  args:
519  - base_InputCount_value
520  filt: HSC-G
521  deconvolved_moments:
522  functor: DeconvolvedMoments
523  filt: HSC-G
524  dataset: forced_src
525  refFlags:
526  - calib_psfUsed
527  - merge_measurement_i
528  - merge_measurement_r
529  - merge_measurement_z
530  - merge_measurement_y
531  - merge_measurement_g
532  - base_PixelFlags_flag_inexact_psfCenter
533  - detect_isPrimary
534 
535  The names for each entry under "func" will become the names of columns in the
536  output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`.
537  Positional arguments to be passed to each functor are in the `args` list,
538  and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
539  `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.
540 
541  The "refFlags" entry is a shortcut for a set of `Column` functors that keep the original column
542  names and are taken from the `'ref'` dataset.
543 
544  The "flags" entry will be expanded out per band.
545 
546  This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
547  to organize and execute the calculations.
548 
549  """
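# Illustrative sketch (not part of the original file; the YAML path is
# hypothetical): a functor file like the example above is loaded the same way
# __init__ below does it, via CompositeFunctor.from_file, and the names defined
# under "funcs" (plus Column functors for the refFlags) end up in its funcDict.
#
# >>> from lsst.pipe.tasks.functors import CompositeFunctor
# >>> funcs = CompositeFunctor.from_file("myObjectFunctors.yaml")  # doctest: +SKIP
# >>> list(funcs.funcDict)                                         # doctest: +SKIP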
550  @property
551  def _DefaultName(self):
552  raise NotImplementedError('Subclass must define "_DefaultName" attribute')
553 
554  @property
555  def outputDataset(self):
556  raise NotImplementedError('Subclass must define "outputDataset" attribute')
557 
558  @property
559  def inputDataset(self):
560  raise NotImplementedError('Subclass must define "inputDataset" attribute')
561 
562  @property
563  def ConfigClass(self):
564  raise NotImplementedError('Subclass must define "ConfigClass" attribute')
565 
566  def __init__(self, *args, **kwargs):
567  super().__init__(*args, **kwargs)
568  if self.config.functorFile:
569  self.log.info('Loading transform functor definitions from %s',
570  self.config.functorFile)
571  self.funcs = CompositeFunctor.from_file(self.config.functorFile)
572  self.funcs.update(dict(PostprocessAnalysis._defaultFuncs))
573  else:
574  self.funcs = None
575 
576  def runQuantum(self, butlerQC, inputRefs, outputRefs):
577  inputs = butlerQC.get(inputRefs)
578  if self.funcs is None:
579  raise ValueError("config.functorFile is None. "
580  "Must be a valid path to yaml in order to run Task as a PipelineTask.")
581  result = self.run(parq=inputs['inputCatalog'], funcs=self.funcs,
582  dataId=outputRefs.outputCatalog.dataId.full)
583  outputs = pipeBase.Struct(outputCatalog=result)
584  butlerQC.put(outputs, outputRefs)
585 
586  def runDataRef(self, dataRef):
587  parq = dataRef.get()
588  if self.funcs is None:
589  raise ValueError("config.functorFile is None. "
590  "Must be a valid path to yaml in order to run as a CommandlineTask.")
591  df = self.run(parq, funcs=self.funcs, dataId=dataRef.dataId)
592  self.write(df, dataRef)
593  return df
594 
595  def run(self, parq, funcs=None, dataId=None, band=None):
596  """Do postprocessing calculations
597 
598  Takes a `ParquetTable` object and dataId,
599  returns a dataframe with results of postprocessing calculations.
600 
601  Parameters
602  ----------
603  parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
604  ParquetTable from which calculations are done.
605  funcs : `lsst.pipe.tasks.functors.Functors`
606  Functors to apply to the table's columns
607  dataId : dict, optional
608  Used to add a `patchId` column to the output dataframe.
609  band : `str`, optional
610  Filter band that is being processed.
611 
612  Returns
613  -------
614  `pandas.DataFrame`
615 
616  """
617  self.log.info("Transforming/standardizing the source table dataId: %s", dataId)
618 
619  df = self.transform(band, parq, funcs, dataId).df
620  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
621  return df
622 
623  def getFunctors(self):
624  return self.funcs
625 
626  def getAnalysis(self, parq, funcs=None, band=None):
627  if funcs is None:
628  funcs = self.funcs
629  analysis = PostprocessAnalysis(parq, funcs, filt=band)
630  return analysis
631 
632  def transform(self, band, parq, funcs, dataId):
633  analysis = self.getAnalysis(parq, funcs=funcs, band=band)
634  df = analysis.df
635  if dataId is not None:
636  for key, value in dataId.items():
637  df[key] = value
638 
639  return pipeBase.Struct(
640  df=df,
641  analysis=analysis
642  )
643 
644  def write(self, df, parqRef):
645  parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)
646 
647  def writeMetadata(self, dataRef):
648  """No metadata to write.
649  """
650  pass
651 
652 
653 class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
654  coaddName = pexConfig.Field(
655  dtype=str,
656  default="deep",
657  doc="Name of coadd"
658  )
659  # TODO: remove in DM-27177
660  filterMap = pexConfig.DictField(
661  keytype=str,
662  itemtype=str,
663  default={},
664  doc=("Dictionary mapping full filter name to short one for column name munging. "
665  "These filters determine the output columns no matter what filters the "
666  "input data actually contain."),
667  deprecated=("Coadds are now identified by the band, so this transform is unused. "
668  "Will be removed after v22.")
669  )
670  outputBands = pexConfig.ListField(
671  dtype=str,
672  default=None,
673  optional=True,
674  doc=("These bands and only these bands will appear in the output,"
675  " NaN-filled if the input does not include them."
676  " If None, then use all bands found in the input.")
677  )
678  camelCase = pexConfig.Field(
679  dtype=bool,
680  default=True,
681  doc=("Write per-band column names with camelCase, else underscore. "
682  "For example: gPsFlux instead of g_PsFlux.")
683  )
684  multilevelOutput = pexConfig.Field(
685  dtype=bool,
686  default=False,
687  doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
688  "and name-munged (False).")
689  )
690 
691 
692 class TransformObjectCatalogTask(TransformCatalogBaseTask):
693  """Produce a flattened Object Table to match the format specified in
694  sdm_schemas.
695 
696  Do the same set of postprocessing calculations on all bands
697 
698  This is identical to `TransformCatalogBaseTask`, except that it does the
699  specified functor calculations for all filters present in the
700  input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
701  by the YAML file will be superseded.
702  """
703  _DefaultName = "transformObjectCatalog"
704  ConfigClass = TransformObjectCatalogConfig
705 
706  inputDataset = 'deepCoadd_obj'
707  outputDataset = 'objectTable'
708 
709  @classmethod
710  def _makeArgumentParser(cls):
711  parser = ArgumentParser(name=cls._DefaultName)
712  parser.add_id_argument("--id", cls.inputDataset,
713  ContainerClass=CoaddDataIdContainer,
714  help="data ID, e.g. --id tract=12345 patch=1,2")
715  return parser
716 
717  def run(self, parq, funcs=None, dataId=None, band=None):
718  # NOTE: band kwarg is ignored here.
719  dfDict = {}
720  analysisDict = {}
721  templateDf = pd.DataFrame()
722  outputBands = parq.columnLevelNames['band'] if self.config.outputBands is None else \
723  self.config.outputBands
724 
725  # Perform transform for data of filters that exist in parq.
726  for inputBand in parq.columnLevelNames['band']:
727  if inputBand not in outputBands:
728  self.log.info("Ignoring %s band data in the input", inputBand)
729  continue
730  self.log.info("Transforming the catalog of band %s", inputBand)
731  result = self.transform(inputBand, parq, funcs, dataId)
732  dfDict[inputBand] = result.df
733  analysisDict[inputBand] = result.analysis
734  if templateDf.empty:
735  templateDf = result.df
736 
737  # Fill NaNs in columns of other wanted bands
738  for filt in outputBands:
739  if filt not in dfDict:
740  self.log.info("Adding empty columns for band %s", filt)
741  dfDict[filt] = pd.DataFrame().reindex_like(templateDf)
742 
743  # This makes a multilevel column index, with band as first level
744  df = pd.concat(dfDict, axis=1, names=['band', 'column'])
745 
746  if not self.config.multilevelOutput:
747  noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
748  if dataId is not None:
749  noDupCols += list(dataId.keys())
750  df = flattenFilters(df, noDupCols=noDupCols, camelCase=self.config.camelCase)
751 
752  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
753  return df
754 
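# Example (not part of the original file): a minimal sketch of the
# reindex_like padding used in TransformObjectCatalogTask.run above, which
# yields an all-NaN frame with the same index and columns as the template.
#
# >>> import pandas as pd
# >>> template = pd.DataFrame({"psfMag": [20.5]}, index=[42])
# >>> empty = pd.DataFrame().reindex_like(template)
# >>> bool(empty.isna().all().all())
# True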
755 
756 class TractObjectDataIdContainer(CoaddDataIdContainer):
757 
758  def makeDataRefList(self, namespace):
759  """Make self.refList from self.idList
760 
761  Generate a list of data references given tract and/or patch.
762  This was adapted from `TractQADataIdContainer`, which was
763  `TractDataIdContainer` modified to not require "filter".
764  Only existing dataRefs are returned.
765  """
766  def getPatchRefList(tract):
767  return [namespace.butler.dataRef(datasetType=self.datasetType,
768  tract=tract.getId(),
769  patch="%d,%d" % patch.getIndex()) for patch in tract]
770 
771  tractRefs = defaultdict(list) # Data references for each tract
772  for dataId in self.idList:
773  skymap = self.getSkymap(namespace)
774 
775  if "tract" in dataId:
776  tractId = dataId["tract"]
777  if "patch" in dataId:
778  tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
779  tract=tractId,
780  patch=dataId['patch']))
781  else:
782  tractRefs[tractId] += getPatchRefList(skymap[tractId])
783  else:
784  tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
785  for tract in skymap)
786  outputRefList = []
787  for tractRefList in tractRefs.values():
788  existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
789  outputRefList.append(existingRefs)
790 
791  self.refList = outputRefList
792 
793 
794 class ConsolidateObjectTableConfig(pexConfig.Config):
795  coaddName = pexConfig.Field(
796  dtype=str,
797  default="deep",
798  doc="Name of coadd"
799  )
800 
801 
802 class ConsolidateObjectTableTask(CmdLineTask):
803  """Write patch-merged source tables to a tract-level parquet file
804  """
805  _DefaultName = "consolidateObjectTable"
806  ConfigClass = ConsolidateObjectTableConfig
807 
808  inputDataset = 'objectTable'
809  outputDataset = 'objectTable_tract'
810 
811  @classmethod
812  def _makeArgumentParser(cls):
813  parser = ArgumentParser(name=cls._DefaultName)
814 
815  parser.add_id_argument("--id", cls.inputDataset,
816  help="data ID, e.g. --id tract=12345",
817  ContainerClass=TractObjectDataIdContainer)
818  return parser
819 
820  def runDataRef(self, patchRefList):
821  df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
822  patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
823 
824  def writeMetadata(self, dataRef):
825  """No metadata to write.
826  """
827  pass
828 
829 
830 class TransformSourceTableConnections(pipeBase.PipelineTaskConnections,
831  dimensions=("instrument", "visit", "detector")):
832 
833  inputCatalog = connectionTypes.Input(
834  doc="Wide input catalog of sources produced by WriteSourceTableTask",
835  name="source",
836  storageClass="DataFrame",
837  dimensions=("instrument", "visit", "detector"),
838  deferLoad=True
839  )
840  outputCatalog = connectionTypes.Output(
841  doc="Narrower, per-detector Source Table transformed and converted per a "
842  "specified set of functors",
843  name="sourceTable",
844  storageClass="DataFrame",
845  dimensions=("instrument", "visit", "detector")
846  )
847 
848 
849 class TransformSourceTableConfig(TransformCatalogBaseConfig,
850  pipelineConnections=TransformSourceTableConnections):
851  pass
852 
853 
854 class TransformSourceTableTask(TransformCatalogBaseTask):
855  """Transform/standardize a source catalog
856  """
857  _DefaultName = "transformSourceTable"
858  ConfigClass = TransformSourceTableConfig
859 
860  inputDataset = 'source'
861  outputDataset = 'sourceTable'
862 
863  @classmethod
864  def _makeArgumentParser(cls):
865  parser = ArgumentParser(name=cls._DefaultName)
866  parser.add_id_argument("--id", datasetType=cls.inputDataset,
867  level="sensor",
868  help="data ID, e.g. --id visit=12345 ccd=0")
869  return parser
870 
871  def runDataRef(self, dataRef):
872  """Override to specify band label to run()."""
873  parq = dataRef.get()
874  funcs = self.getFunctors()
875  band = dataRef.get("calexp_filterLabel", immediate=True).bandLabel
876  df = self.run(parq, funcs=funcs, dataId=dataRef.dataId, band=band)
877  self.write(df, dataRef)
878  return df
879 
880 
881 class ConsolidateVisitSummaryConnections(pipeBase.PipelineTaskConnections,
882  dimensions=("instrument", "visit",),
883  defaultTemplates={}):
884  calexp = connectionTypes.Input(
885  doc="Processed exposures used for metadata",
886  name="calexp",
887  storageClass="ExposureF",
888  dimensions=("instrument", "visit", "detector"),
889  deferLoad=True,
890  multiple=True,
891  )
892  visitSummary = connectionTypes.Output(
893  doc=("Per-visit consolidated exposure metadata. These catalogs use "
894  "detector id for the id and are sorted for fast lookups of a "
895  "detector."),
896  name="visitSummary",
897  storageClass="ExposureCatalog",
898  dimensions=("instrument", "visit"),
899  )
900 
901 
902 class ConsolidateVisitSummaryConfig(pipeBase.PipelineTaskConfig,
903  pipelineConnections=ConsolidateVisitSummaryConnections):
904  """Config for ConsolidateVisitSummaryTask"""
905  pass
906 
907 
908 class ConsolidateVisitSummaryTask(pipeBase.PipelineTask, pipeBase.CmdLineTask):
909  """Task to consolidate per-detector visit metadata.
910 
911  This task aggregates the following metadata from all the detectors in a
912  single visit into an exposure catalog:
913  - The visitInfo.
914  - The wcs.
915  - The photoCalib.
916  - The physical_filter and band (if available).
917  - The psf size, shape, and effective area at the center of the detector.
918  - The corners of the bounding box in right ascension/declination.
919 
920  Other quantities such as Psf, ApCorrMap, and TransmissionCurve are not
921  persisted here because of storage concerns, and because of their limited
922  utility as summary statistics.
923 
924  Tests for this task are performed in ci_hsc_gen3.
925  """
926  _DefaultName = "consolidateVisitSummary"
927  ConfigClass = ConsolidateVisitSummaryConfig
928 
929  @classmethod
930  def _makeArgumentParser(cls):
931  parser = ArgumentParser(name=cls._DefaultName)
932 
933  parser.add_id_argument("--id", "calexp",
934  help="data ID, e.g. --id visit=12345",
935  ContainerClass=VisitDataIdContainer)
936  return parser
937 
938  def writeMetadata(self, dataRef):
939  """No metadata to persist, so override to remove metadata persistence.
940  """
941  pass
942 
943  def writeConfig(self, butler, clobber=False, doBackup=True):
944  """No config to persist, so override to remove config persistence.
945  """
946  pass
947 
948  def runDataRef(self, dataRefList):
949  visit = dataRefList[0].dataId['visit']
950 
951  self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
952  (len(dataRefList), visit))
953 
954  expCatalog = self._combineExposureMetadata(visit, dataRefList, isGen3=False)
955 
956  dataRefList[0].put(expCatalog, 'visitSummary', visit=visit)
957 
958  def runQuantum(self, butlerQC, inputRefs, outputRefs):
959  dataRefs = butlerQC.get(inputRefs.calexp)
960  visit = dataRefs[0].dataId.byName()['visit']
961 
962  self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
963  (len(dataRefs), visit))
964 
965  expCatalog = self._combineExposureMetadata(visit, dataRefs)
966 
967  butlerQC.put(expCatalog, outputRefs.visitSummary)
968 
969  def _combineExposureMetadata(self, visit, dataRefs, isGen3=True):
970  """Make a combined exposure catalog from a list of dataRefs.
971 
972  Parameters
973  ----------
974  visit : `int`
975  Visit identification number
976  dataRefs : `list`
977  List of calexp dataRefs in visit. May be list of
978  `lsst.daf.persistence.ButlerDataRef` (Gen2) or
979  `lsst.daf.butler.DeferredDatasetHandle` (Gen3).
980  isGen3 : `bool`, optional
981  Specifies if this is a Gen3 list of datarefs.
982 
983  Returns
984  -------
985  visitSummary : `lsst.afw.table.ExposureCatalog`
986  Exposure catalog with per-detector summary information.
987  """
988  schema = afwTable.ExposureTable.makeMinimalSchema()
989  schema.addField('visit', type='I', doc='Visit number')
990  schema.addField('physical_filter', type='String', size=32, doc='Physical filter')
991  schema.addField('band', type='String', size=32, doc='Name of band')
992  schema.addField('psfSigma', type='F',
993  doc='PSF model second-moments determinant radius (center of chip) (pixel)')
994  schema.addField('psfArea', type='F',
995  doc='PSF model effective area (center of chip) (pixel**2)')
996  schema.addField('psfIxx', type='F',
997  doc='PSF model Ixx (center of chip) (pixel**2)')
998  schema.addField('psfIyy', type='F',
999  doc='PSF model Iyy (center of chip) (pixel**2)')
1000  schema.addField('psfIxy', type='F',
1001  doc='PSF model Ixy (center of chip) (pixel**2)')
1002  schema.addField('raCorners', type='ArrayD', size=4,
1003  doc='Right Ascension of bounding box corners (degrees)')
1004  schema.addField('decCorners', type='ArrayD', size=4,
1005  doc='Declination of bounding box corners (degrees)')
1006 
1007  cat = afwTable.ExposureCatalog(schema)
1008  cat.resize(len(dataRefs))
1009 
1010  cat['visit'] = visit
1011 
1012  for i, dataRef in enumerate(dataRefs):
1013  if isGen3:
1014  visitInfo = dataRef.get(component='visitInfo')
1015  filterLabel = dataRef.get(component='filterLabel')
1016  psf = dataRef.get(component='psf')
1017  wcs = dataRef.get(component='wcs')
1018  photoCalib = dataRef.get(component='photoCalib')
1019  detector = dataRef.get(component='detector')
1020  bbox = dataRef.get(component='bbox')
1021  validPolygon = dataRef.get(component='validPolygon')
1022  else:
1023  # Note that we need to read the calexp because there is
1024  # no magic access to the psf except through the exposure.
1025  gen2_read_bbox = lsst.geom.BoxI(lsst.geom.PointI(0, 0), lsst.geom.PointI(1, 1))
1026  exp = dataRef.get(datasetType='calexp_sub', bbox=gen2_read_bbox)
1027  visitInfo = exp.getInfo().getVisitInfo()
1028  filterLabel = dataRef.get("calexp_filterLabel")
1029  psf = exp.getPsf()
1030  wcs = exp.getWcs()
1031  photoCalib = exp.getPhotoCalib()
1032  detector = exp.getDetector()
1033  bbox = dataRef.get(datasetType='calexp_bbox')
1034  validPolygon = exp.getInfo().getValidPolygon()
1035 
1036  rec = cat[i]
1037  rec.setBBox(bbox)
1038  rec.setVisitInfo(visitInfo)
1039  rec.setWcs(wcs)
1040  rec.setPhotoCalib(photoCalib)
1041  rec.setDetector(detector)
1042  rec.setValidPolygon(validPolygon)
1043 
1044  rec['physical_filter'] = filterLabel.physicalLabel if filterLabel.hasPhysicalLabel() else ""
1045  rec['band'] = filterLabel.bandLabel if filterLabel.hasBandLabel() else ""
1046  rec.setId(detector.getId())
1047  shape = psf.computeShape(bbox.getCenter())
1048  rec['psfSigma'] = shape.getDeterminantRadius()
1049  rec['psfIxx'] = shape.getIxx()
1050  rec['psfIyy'] = shape.getIyy()
1051  rec['psfIxy'] = shape.getIxy()
1052  im = psf.computeKernelImage(bbox.getCenter())
1053  # The calculation of effective psf area is taken from
1054  # meas_base/src/PsfFlux.cc#L112. See
1055  # https://github.com/lsst/meas_base/blob/
1056  # 750bffe6620e565bda731add1509507f5c40c8bb/src/PsfFlux.cc#L112
1057  rec['psfArea'] = np.sum(im.array)/np.sum(im.array**2.)
1058 
1059  sph_pts = wcs.pixelToSky(lsst.geom.Box2D(bbox).getCorners())
1060  rec['raCorners'][:] = [sph.getRa().asDegrees() for sph in sph_pts]
1061  rec['decCorners'][:] = [sph.getDec().asDegrees() for sph in sph_pts]
1062 
1063  metadata = dafBase.PropertyList()
1064  metadata.add("COMMENT", "Catalog id is detector id, sorted.")
1065  # We are looping over existing datarefs, so the following is true
1066  metadata.add("COMMENT", "Only detectors with data have entries.")
1067  cat.setMetadata(metadata)
1068 
1069  cat.sort()
1070  return cat
1071 
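# Example (not part of the original file): for a flat, normalized kernel the
# effective-area statistic used above, sum(im)/sum(im**2), reduces to the
# number of pixels, which is a useful sanity check on the units (pixel**2).
#
# >>> import numpy as np
# >>> im = np.full((4, 4), 1.0 / 16.0)   # toy, uniform "PSF" kernel image
# >>> float(np.sum(im) / np.sum(im ** 2))
# 16.0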
1072 
1073 class VisitDataIdContainer(DataIdContainer):
1074  """DataIdContainer that groups sensor-level IDs by visit
1075  """
1076 
1077  def makeDataRefList(self, namespace):
1078  """Make self.refList from self.idList
1079 
1080  Generate a list of data references grouped by visit.
1081 
1082  Parameters
1083  ----------
1084  namespace : `argparse.Namespace`
1085  Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments
1086  """
1087  # Group by visits
1088  visitRefs = defaultdict(list)
1089  for dataId in self.idList:
1090  if "visit" in dataId:
1091  visitId = dataId["visit"]
1092  # append all subsets to the list of refs for this visit
1093  subset = namespace.butler.subset(self.datasetType, dataId=dataId)
1094  visitRefs[visitId].extend([dataRef for dataRef in subset])
1095 
1096  outputRefList = []
1097  for refList in visitRefs.values():
1098  existingRefs = [ref for ref in refList if ref.datasetExists()]
1099  if existingRefs:
1100  outputRefList.append(existingRefs)
1101 
1102  self.refList = outputRefList
1103 
1104 
1105 class ConsolidateSourceTableConnections(pipeBase.PipelineTaskConnections,
1106  dimensions=("instrument", "visit")):
1107  inputCatalogs = connectionTypes.Input(
1108  doc="Input per-detector Source Tables",
1109  name="sourceTable",
1110  storageClass="DataFrame",
1111  dimensions=("instrument", "visit", "detector"),
1112  multiple=True
1113  )
1114  outputCatalog = connectionTypes.Output(
1115  doc="Per-visit concatenation of Source Table",
1116  name="sourceTable_visit",
1117  storageClass="DataFrame",
1118  dimensions=("instrument", "visit")
1119  )
1120 
1121 
1122 class ConsolidateSourceTableConfig(pipeBase.PipelineTaskConfig,
1123  pipelineConnections=ConsolidateSourceTableConnections):
1124  pass
1125 
1126 
1127 class ConsolidateSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
1128  """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`
1129  """
1130  _DefaultName = 'consolidateSourceTable'
1131  ConfigClass = ConsolidateSourceTableConfig
1132 
1133  inputDataset = 'sourceTable'
1134  outputDataset = 'sourceTable_visit'
1135 
1136  def runQuantum(self, butlerQC, inputRefs, outputRefs):
1137  inputs = butlerQC.get(inputRefs)
1138  self.log.info("Concatenating %s per-detector Source Tables",
1139  len(inputs['inputCatalogs']))
1140  df = pd.concat(inputs['inputCatalogs'])
1141  butlerQC.put(pipeBase.Struct(outputCatalog=df), outputRefs)
1142 
1143  def runDataRef(self, dataRefList):
1144  self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
1145  df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
1146  dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
1147 
1148  @classmethod
1149  def _makeArgumentParser(cls):
1150  parser = ArgumentParser(name=cls._DefaultName)
1151 
1152  parser.add_id_argument("--id", cls.inputDataset,
1153  help="data ID, e.g. --id visit=12345",
1154  ContainerClass=VisitDataIdContainer)
1155  return parser
1156 
1157  def writeMetadata(self, dataRef):
1158  """No metadata to write.
1159  """
1160  pass
1161 
1162  def writeConfig(self, butler, clobber=False, doBackup=True):
1163  """No config to write.
1164  """
1165  pass