lsst.pipe.tasks  21.0.0-46-g880d6fab+d14224996b
postprocess.py
1 # This file is part of pipe_tasks
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 
22 import functools
23 import pandas as pd
24 import numpy as np
25 from collections import defaultdict
26 
27 import lsst.geom
28 import lsst.pex.config as pexConfig
29 import lsst.pipe.base as pipeBase
30 from lsst.pipe.base import connectionTypes
31 import lsst.afw.table as afwTable
32 from lsst.meas.base import SingleFrameMeasurementTask
33 from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
34 from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer
35 
36 from .parquetTable import ParquetTable
37 from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
38 from .functors import CompositeFunctor, RAColumn, DecColumn, Column
39 
40 
41 def flattenFilters(df, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
42  """Flattens a dataframe with multilevel column index
43  """
44  newDf = pd.DataFrame()
45  for band in set(df.columns.to_frame()['band']):
46  subdf = df[band]
47  columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
48  newColumns = {c: columnFormat.format(band, c)
49  for c in subdf.columns if c not in noDupCols}
50  cols = list(newColumns.keys())
51  newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)
52 
53  newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
54  return newDf
55 
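# An illustrative sketch of what flattenFilters does (the column names below are invented,
# not pipeline columns). Given a DataFrame whose columns form a MultiIndex with a 'band'
# level, e.g.
#
#     columns = pd.MultiIndex.from_tuples(
#         [('g', 'coord_ra'), ('g', 'coord_dec'), ('g', 'PsFlux'),
#          ('r', 'coord_ra'), ('r', 'coord_dec'), ('r', 'PsFlux')],
#         names=('band', 'column'))
#     df = pd.DataFrame(np.ones((2, 6)), columns=columns)
#
# flattenFilters(df, camelCase=True) yields a flat table whose columns are
# ['coord_ra', 'coord_dec', 'gPsFlux', 'rPsFlux'] (band order may vary, since bands are
# iterated as a set); columns listed in noDupCols are kept once rather than once per band.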
56 
57 class WriteObjectTableConfig(pexConfig.Config):
58  engine = pexConfig.Field(
59  dtype=str,
60  default="pyarrow",
61  doc="Parquet engine for writing (pyarrow or fastparquet)"
62  )
63  coaddName = pexConfig.Field(
64  dtype=str,
65  default="deep",
66  doc="Name of coadd"
67  )
68 
69 
70 class WriteObjectTableTask(CmdLineTask):
71  """Write filter-merged source tables to parquet
72  """
73  _DefaultName = "writeObjectTable"
74  ConfigClass = WriteObjectTableConfig
75  RunnerClass = MergeSourcesRunner
76 
77  # Names of table datasets to be merged
78  inputDatasets = ('forced_src', 'meas', 'ref')
79 
80  # Tag of output dataset written by `MergeSourcesTask.write`
81  outputDataset = 'obj'
82 
83  def __init__(self, butler=None, schema=None, **kwargs):
84  # It is a shame that this class can't use the default init for CmdLineTask
85  # But to do so would require its own special task runner, which is many
86  # more lines of specialization, so this is how it is for now
87  CmdLineTask.__init__(self, **kwargs)
88 
89  def runDataRef(self, patchRefList):
90  """!
91  @brief Merge coadd sources from multiple bands. Calls @ref `run` which must be defined in
92  subclasses that inherit from MergeSourcesTask.
93  @param[in] patchRefList list of data references for each filter
94  """
95  catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
96  dataId = patchRefList[0].dataId
97  mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
98  self.write(patchRefList[0], mergedCatalog)
99 
100  @classmethod
101  def _makeArgumentParser(cls):
102  """Create a suitable ArgumentParser.
103 
104  We will use the ArgumentParser to get a list of data
105  references for patches; the RunnerClass will sort them into lists
106  of data references for the same patch.
107 
108  References the first of self.inputDatasets, rather than
109  self.inputDataset.
110  """
111  return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])
112 
113  def readCatalog(self, patchRef):
114  """Read input catalogs
115 
116  Read all the input datasets given by the 'inputDatasets'
117  attribute.
118 
119  Parameters
120  ----------
121  patchRef : `lsst.daf.persistence.ButlerDataRef`
122  Data reference for patch
123 
124  Returns
125  -------
126  Tuple consisting of band name and a dict of catalogs, keyed by
127  dataset name
128  """
129  band = patchRef.get(self.config.coaddName + "Coadd_filterLabel", immediate=True).bandLabel
130  catalogDict = {}
131  for dataset in self.inputDatasets:
132  catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
133  self.log.info("Read %d sources from %s for band %s: %s" %
134  (len(catalog), dataset, band, patchRef.dataId))
135  catalogDict[dataset] = catalog
136  return band, catalogDict
137 
138  def run(self, catalogs, tract, patch):
139  """Merge multiple catalogs.
140 
141  Parameters
142  ----------
143  catalogs : `dict`
144  Mapping from filter names to dict of catalogs.
145  tract : int
146  tractId to use for the tractId column
147  patch : str
148  patchId to use for the patchId column
149 
150  Returns
151  -------
152  catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
153  Merged dataframe, with each column prefixed by
154  `filter_tag(filt)`, wrapped in the parquet writer shim class.
155  """
156 
157  dfs = []
158  for filt, tableDict in catalogs.items():
159  for dataset, table in tableDict.items():
160  # Convert afwTable to pandas DataFrame
161  df = table.asAstropy().to_pandas().set_index('id', drop=True)
162 
163  # Sort columns by name, to ensure matching schema among patches
164  df = df.reindex(sorted(df.columns), axis=1)
165  df['tractId'] = tract
166  df['patchId'] = patch
167 
168  # Make columns a 3-level MultiIndex
169  df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
170  names=('dataset', 'band', 'column'))
171  dfs.append(df)
172 
173  catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
174  return ParquetTable(dataFrame=catalog)
175 
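 # Sketch of the column structure produced by run() above (example names only, not a real
 # schema): each merged column key is a 3-level tuple such as
 #     ('meas', 'g', 'base_PsfFlux_instFlux')
 #     ('forced_src', 'r', 'base_PsfFlux_instFlux')
 # so that catalog['meas']['g'] recovers the per-band measurement table for one dataset.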
176  def write(self, patchRef, catalog):
177  """Write the output.
178 
179  Parameters
180  ----------
181  catalog : `ParquetTable`
182  Catalog to write
183  patchRef : `lsst.daf.persistence.ButlerDataRef`
184  Data reference for patch
185  """
186  patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
187  # since the filter isn't actually part of the data ID for the dataset we're saving,
188  # it's confusing to see it in the log message, even if the butler simply ignores it.
189  mergeDataId = patchRef.dataId.copy()
190  del mergeDataId["filter"]
191  self.log.info("Wrote merged catalog: %s" % (mergeDataId,))
192 
193  def writeMetadata(self, dataRefList):
194  """No metadata to write, and not sure how to write it for a list of dataRefs.
195  """
196  pass
197 
198 
199 class WriteSourceTableConnections(pipeBase.PipelineTaskConnections,
200  dimensions=("instrument", "visit", "detector")):
201 
202  catalog = connectionTypes.Input(
203  doc="Input full-depth catalog of sources produced by CalibrateTask",
204  name="src",
205  storageClass="SourceCatalog",
206  dimensions=("instrument", "visit", "detector")
207  )
208  outputCatalog = connectionTypes.Output(
209  doc="Catalog of sources, `src` in Parquet format",
210  name="source",
211  storageClass="DataFrame",
212  dimensions=("instrument", "visit", "detector")
213  )
214 
215 
216 class WriteSourceTableConfig(pipeBase.PipelineTaskConfig,
217  pipelineConnections=WriteSourceTableConnections):
218  doApplyExternalPhotoCalib = pexConfig.Field(
219  dtype=bool,
220  default=False,
221  doc=("Add local photoCalib columns from the calexp.photoCalib? Should only be set to True if "
222  "generating Source Tables from older src tables which do not already have local calib columns")
223  )
224  doApplyExternalSkyWcs = pexConfig.Field(
225  dtype=bool,
226  default=False,
227  doc=("Add local WCS columns from the calexp.wcs? Should only be set to True if "
228  "generating Source Tables from older src tables which do not already have local calib columns")
229  )
230 
231 
232 class WriteSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
233  """Write source table to parquet
234  """
235  _DefaultName = "writeSourceTable"
236  ConfigClass = WriteSourceTableConfig
237 
238  def runDataRef(self, dataRef):
239  src = dataRef.get('src')
240  if self.config.doApplyExternalPhotoCalib or self.config.doApplyExternalSkyWcs:
241  src = self.addCalibColumns(src, dataRef)
242 
243  ccdVisitId = dataRef.get('ccdExposureId')
244  result = self.run(src, ccdVisitId=ccdVisitId)
245  dataRef.put(result.table, 'source')
246 
247  def runQuantum(self, butlerQC, inputRefs, outputRefs):
248  inputs = butlerQC.get(inputRefs)
249  inputs['ccdVisitId'] = butlerQC.quantum.dataId.pack("visit_detector")
250  result = self.run(**inputs).table
251  outputs = pipeBase.Struct(outputCatalog=result.toDataFrame())
252  butlerQC.put(outputs, outputRefs)
253 
254  def run(self, catalog, ccdVisitId=None):
255  """Convert `src` catalog to parquet
256 
257  Parameters
258  ----------
259  catalog: `afwTable.SourceCatalog`
260  catalog to be converted
261  ccdVisitId: `int`
262  ccdVisitId to be added as a column
263 
264  Returns
265  -------
266  result : `lsst.pipe.base.Struct`
267  ``table``
268  `ParquetTable` version of the input catalog
269  """
270  self.log.info("Generating parquet table from src catalog %s", ccdVisitId)
271  df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
272  df['ccdVisitId'] = ccdVisitId
273  return pipeBase.Struct(table=ParquetTable(dataFrame=df))
274 
275  def addCalibColumns(self, catalog, dataRef):
276  """Add columns with local calibration evaluated at each centroid
277 
278  for backwards compatibility with old repos.
279  This exists for the purpose of converting old src catalogs
280  (which don't have the expected local calib columns) to Source Tables.
281 
282  Parameters
283  ----------
284  catalog: `afwTable.SourceCatalog`
285  catalog to which calib columns will be added
286  dataRef: `lsst.daf.persistence.ButlerDataRef`
287  for fetching the calibs from disk.
288 
289  Returns
290  -------
291  newCat: `afwTable.SourceCatalog`
292  Source Catalog with requested local calib columns
293  """
294  mapper = afwTable.SchemaMapper(catalog.schema)
295  measureConfig = SingleFrameMeasurementTask.ConfigClass()
296  measureConfig.doReplaceWithNoise = False
297 
298  # Just need the WCS or the PhotoCalib attached to an exposure
299  exposure = dataRef.get('calexp_sub',
300                         bbox=lsst.geom.Box2I(lsst.geom.Point2I(0, 0), lsst.geom.Point2I(0, 0)))
301 
302  mapper = afwTable.SchemaMapper(catalog.schema)
303  mapper.addMinimalSchema(catalog.schema, True)
304  schema = mapper.getOutputSchema()
305 
306  exposureIdInfo = dataRef.get("expIdInfo")
307  measureConfig.plugins.names = []
308  if self.config.doApplyExternalSkyWcs:
309  plugin = 'base_LocalWcs'
310  if plugin in schema:
311  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalSkyWcs=False")
312  else:
313  measureConfig.plugins.names.add(plugin)
314 
315  if self.config.doApplyExternalPhotoCalib:
316  plugin = 'base_LocalPhotoCalib'
317  if plugin in schema:
318  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalPhotoCalib=False")
319  else:
320  measureConfig.plugins.names.add(plugin)
321 
322  measurement = SingleFrameMeasurementTask(config=measureConfig, schema=schema)
323  newCat = afwTable.SourceCatalog(schema)
324  newCat.extend(catalog, mapper=mapper)
325  measurement.run(measCat=newCat, exposure=exposure, exposureId=exposureIdInfo.expId)
326  return newCat
327 
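 # A hypothetical config usage for converting old repos (both fields are defined in
 # WriteSourceTableConfig above):
 #     config.doApplyExternalPhotoCalib = True
 #     config.doApplyExternalSkyWcs = True
 # With either flag set, runDataRef calls addCalibColumns to evaluate the local calib
 # plugins at each source centroid before the Source Table is written.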
328  def writeMetadata(self, dataRef):
329  """No metadata to write.
330  """
331  pass
332 
333  @classmethod
334  def _makeArgumentParser(cls):
335  parser = ArgumentParser(name=cls._DefaultName)
336  parser.add_id_argument("--id", 'src',
337  help="data ID, e.g. --id visit=12345 ccd=0")
338  return parser
339 
340 
341 class PostprocessAnalysis(object):
342  """Calculate columns from ParquetTable
343 
344  This object manages and organizes an arbitrary set of computations
345  on a catalog. The catalog is defined by a
346  `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
347  `deepCoadd_obj` dataset, and the computations are defined by a collection
348  of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently,
349  a `CompositeFunctor`).
350 
351  After the object is initialized, accessing the `.df` attribute (which
352  holds the `pandas.DataFrame` containing the results of the calculations) triggers
353  computation of said dataframe.
354 
355  One of the conveniences of using this object is the ability to define a desired common
356  filter for all functors. This enables the same functor collection to be passed to
357  several different `PostprocessAnalysis` objects without having to change the original
358  functor collection, since the `filt` keyword argument of this object triggers an
359  overwrite of the `filt` property for all functors in the collection.
360 
361  This object also allows a list of refFlags to be passed, and defines a set of default
362  refFlags that are always included even if not requested.
363 
364  If a list of `ParquetTable` objects is passed, rather than a single one, then the
365  calculations will be mapped over all the input catalogs. In principle, it should
366  be straightforward to parallelize this activity, but initial tests have failed
367  (see TODO in code comments).
368 
369  Parameters
370  ----------
371  parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
372  Source catalog(s) for computation
373 
374  functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
375  Computations to do (functors that act on `parq`).
376  If a dict, the output
377  DataFrame will have columns keyed accordingly.
378  If a list, the column keys will come from the
379  `.shortname` attribute of each functor.
380 
381  filt : `str` (optional)
382  Filter in which to calculate. If provided,
383  this will overwrite any existing `.filt` attribute
384  of the provided functors.
385 
386  flags : `list` (optional)
387  List of flags (per-band) to include in output table.
388 
389  refFlags : `list` (optional)
390  List of refFlags (only reference band) to include in output table.
391 
392 
393  """
394  _defaultRefFlags = []
395  _defaultFuncs = (('coord_ra', RAColumn()),
396  ('coord_dec', DecColumn()))
397 
398  def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
399  self.parq = parq
400  self.functors = functors
401 
402  self.filt = filt
403  self.flags = list(flags) if flags is not None else []
404  self.refFlags = list(self._defaultRefFlags)
405  if refFlags is not None:
406  self.refFlags += list(refFlags)
407 
408  self._df = None
409 
410  @property
411  def defaultFuncs(self):
412  funcs = dict(self._defaultFuncs)
413  return funcs
414 
415  @property
416  def func(self):
417  additionalFuncs = self.defaultFuncs
418  additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
419  additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})
420 
421  if isinstance(self.functors, CompositeFunctor):
422  func = self.functors
423  else:
424  func = CompositeFunctor(self.functors)
425 
426  func.funcDict.update(additionalFuncs)
427  func.filt = self.filt
428 
429  return func
430 
431  @property
432  def noDupCols(self):
433  return [name for name, func in self.func.funcDict.items() if func.noDup or func.dataset == 'ref']
434 
435  @property
436  def df(self):
437  if self._df is None:
438  self.compute()
439  return self._df
440 
441  def compute(self, dropna=False, pool=None):
442  # map over multiple parquet tables
443  if type(self.parq) in (list, tuple):
444  if pool is None:
445  dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
446  else:
447  # TODO: Figure out why this doesn't work (pyarrow pickling issues?)
448  dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
449  self._df = pd.concat(dflist)
450  else:
451  self._df = self.func(self.parq, dropna=dropna)
452 
453  return self._df
454 
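# A usage sketch for PostprocessAnalysis (the functor and flag choices below are
# illustrative; Mag is one of the functors defined in lsst.pipe.tasks.functors):
#
#     from lsst.pipe.tasks.functors import Mag
#     funcs = {'psfMag': Mag('base_PsfFlux')}
#     analysis = PostprocessAnalysis(parq, funcs, filt='g', refFlags=['detect_isPrimary'])
#     df = analysis.df   # accessing .df triggers the computation
#
# Because filt='g' overwrites the .filt attribute of every functor, the same functor
# collection can be reused for several bands or several PostprocessAnalysis objects.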
455 
456 class TransformCatalogBaseConnections(pipeBase.PipelineTaskConnections,
457  dimensions=()):
458  """Expected Connections for subclasses of TransformCatalogBaseTask.
459 
460  Must be subclassed.
461  """
462  inputCatalog = connectionTypes.Input(
463  name="",
464  storageClass="DataFrame",
465  )
466  outputCatalog = connectionTypes.Output(
467  name="",
468  storageClass="DataFrame",
469  )
470 
471 
472 class TransformCatalogBaseConfig(pipeBase.PipelineTaskConfig,
473  pipelineConnections=TransformCatalogBaseConnections):
474  functorFile = pexConfig.Field(
475  dtype=str,
476  doc='Path to YAML file specifying functors to be computed',
477  default=None,
478  optional=True
479  )
480 
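# A hypothetical config override for subclasses of TransformCatalogBaseConfig (the path
# below is illustrative only; the YAML format is described in the TransformCatalogBaseTask
# docstring that follows):
#
#     config.functorFile = '/path/to/myFunctors.yaml'
#
# If functorFile is left as None, runQuantum and runDataRef raise a ValueError.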
481 
482 class TransformCatalogBaseTask(CmdLineTask, pipeBase.PipelineTask):
483  """Base class for transforming/standardizing a catalog
484 
485  by applying functors that convert units and apply calibrations.
486  The purpose of this task is to perform a set of computations on
487  an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
488  results to a new dataset (which needs to be declared in an `outputDataset`
489  attribute).
490 
491  The calculations to be performed are defined in a YAML file that specifies
492  a set of functors to be computed, provided as
493  a `--functorFile` config parameter. An example of such a YAML file
494  is the following:
495 
496  funcs:
497  psfMag:
498  functor: Mag
499  args:
500  - base_PsfFlux
501  filt: HSC-G
502  dataset: meas
503  cmodel_magDiff:
504  functor: MagDiff
505  args:
506  - modelfit_CModel
507  - base_PsfFlux
508  filt: HSC-G
509  gauss_magDiff:
510  functor: MagDiff
511  args:
512  - base_GaussianFlux
513  - base_PsfFlux
514  filt: HSC-G
515  count:
516  functor: Column
517  args:
518  - base_InputCount_value
519  filt: HSC-G
520  deconvolved_moments:
521  functor: DeconvolvedMoments
522  filt: HSC-G
523  dataset: forced_src
524  refFlags:
525  - calib_psfUsed
526  - merge_measurement_i
527  - merge_measurement_r
528  - merge_measurement_z
529  - merge_measurement_y
530  - merge_measurement_g
531  - base_PixelFlags_flag_inexact_psfCenter
532  - detect_isPrimary
533 
534  The names for each entry under "func" will become the names of columns in the
535  output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`.
536  Positional arguments to be passed to each functor are in the `args` list,
537  and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
538  `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.
539 
540  The "refFlags" entry is a shorthand for a set of `Column` functors that keep the original
541  column names and are taken from the `'ref'` dataset.
542 
543  The "flags" entry will be expanded out per band.
544 
545  This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
546  to organize and execute the calculations.
547 
548  """
549  @property
550  def _DefaultName(self):
551  raise NotImplementedError('Subclass must define "_DefaultName" attribute')
552 
553  @property
554  def outputDataset(self):
555  raise NotImplementedError('Subclass must define "outputDataset" attribute')
556 
557  @property
558  def inputDataset(self):
559  raise NotImplementedError('Subclass must define "inputDataset" attribute')
560 
561  @property
562  def ConfigClass(self):
563  raise NotImplementedError('Subclass must define "ConfigClass" attribute')
564 
565  def __init__(self, *args, **kwargs):
566  super().__init__(*args, **kwargs)
567  if self.config.functorFile:
568  self.log.info('Loading transform functor definitions from %s',
569  self.config.functorFile)
570  self.funcs = CompositeFunctor.from_file(self.config.functorFile)
571  self.funcs.update(dict(PostprocessAnalysis._defaultFuncs))
572  else:
573  self.funcs = None
574 
575  def runQuantum(self, butlerQC, inputRefs, outputRefs):
576  inputs = butlerQC.get(inputRefs)
577  if self.funcs is None:
578  raise ValueError("config.functorFile is None. "
579  "Must be a valid path to yaml in order to run this Task as a PipelineTask.")
580  result = self.run(parq=inputs['inputCatalog'], funcs=self.funcs,
581  dataId=outputRefs.outputCatalog.dataId.full)
582  outputs = pipeBase.Struct(outputCatalog=result)
583  butlerQC.put(outputs, outputRefs)
584 
585  def runDataRef(self, dataRef):
586  parq = dataRef.get()
587  if self.funcs is None:
588  raise ValueError("config.functorFile is None. "
589  "Must be a valid path to yaml in order to run as a CmdLineTask.")
590  df = self.run(parq, funcs=self.funcs, dataId=dataRef.dataId)
591  self.write(df, dataRef)
592  return df
593 
594  def run(self, parq, funcs=None, dataId=None, band=None):
595  """Do postprocessing calculations
596 
597  Takes a `ParquetTable` object and dataId,
598  returns a dataframe with results of postprocessing calculations.
599 
600  Parameters
601  ----------
602  parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
603  ParquetTable from which calculations are done.
604  funcs : `lsst.pipe.tasks.functors.Functors`
605  Functors to apply to the table's columns
606  dataId : dict, optional
607  Used to add a `patchId` column to the output dataframe.
608  band : `str`, optional
609  Filter band that is being processed.
610 
611  Returns
612  ------
613  `pandas.DataFrame`
614 
615  """
616  self.log.info("Transforming/standardizing the source table dataId: %s", dataId)
617 
618  df = self.transform(band, parq, funcs, dataId).df
619  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
620  return df
621 
622  def getFunctors(self):
623  return self.funcs
624 
625  def getAnalysis(self, parq, funcs=None, band=None):
626  if funcs is None:
627  funcs = self.funcs
628  analysis = PostprocessAnalysis(parq, funcs, filt=band)
629  return analysis
630 
631  def transform(self, band, parq, funcs, dataId):
632  analysis = self.getAnalysis(parq, funcs=funcs, band=band)
633  df = analysis.df
634  if dataId is not None:
635  for key, value in dataId.items():
636  df[key] = value
637 
638  return pipeBase.Struct(
639  df=df,
640  analysis=analysis
641  )
642 
643  def write(self, df, parqRef):
644  parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)
645 
646  def writeMetadata(self, dataRef):
647  """No metadata to write.
648  """
649  pass
650 
651 
652 class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
653  coaddName = pexConfig.Field(
654  dtype=str,
655  default="deep",
656  doc="Name of coadd"
657  )
658  # TODO: remove in DM-27177
659  filterMap = pexConfig.DictField(
660  keytype=str,
661  itemtype=str,
662  default={},
663  doc=("Dictionary mapping full filter name to short one for column name munging. "
664  "These filters determine the output columns no matter what filters the "
665  "input data actually contain."),
666  deprecated=("Coadds are now identified by the band, so this transform is unused. "
667  "Will be removed after v22.")
668  )
669  outputBands = pexConfig.ListField(
670  dtype=str,
671  default=None,
672  optional=True,
673  doc=("These bands and only these bands will appear in the output,"
674  " NaN-filled if the input does not include them."
675  " If None, then use all bands found in the input.")
676  )
677  camelCase = pexConfig.Field(
678  dtype=bool,
679  default=True,
680  doc=("Write per-band column names in camelCase, otherwise with an underscore separator. "
681  "For example: gPsFlux instead of g_PsFlux.")
682  )
683  multilevelOutput = pexConfig.Field(
684  dtype=bool,
685  default=False,
686  doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
687  "and name-munged (False).")
688  )
689 
690 
691 class TransformObjectCatalogTask(TransformCatalogBaseTask):
692  """Produce a flattened Object Table to match the format specified in
693  sdm_schemas.
694 
695  Do the same set of postprocessing calculations on all bands
696 
697  This is identical to `TransformCatalogBaseTask`, except that it does the
698  specified functor calculations for all filters present in the
699  input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
700  by the YAML file will be superseded.
701  """
702  _DefaultName = "transformObjectCatalog"
703  ConfigClass = TransformObjectCatalogConfig
704 
705  inputDataset = 'deepCoadd_obj'
706  outputDataset = 'objectTable'
707 
708  @classmethod
709  def _makeArgumentParser(cls):
710  parser = ArgumentParser(name=cls._DefaultName)
711  parser.add_id_argument("--id", cls.inputDataset,
712  ContainerClass=CoaddDataIdContainer,
713  help="data ID, e.g. --id tract=12345 patch=1,2")
714  return parser
715 
716  def run(self, parq, funcs=None, dataId=None, band=None):
717  # NOTE: band kwarg is ignored here.
718  dfDict = {}
719  analysisDict = {}
720  templateDf = pd.DataFrame()
721  outputBands = parq.columnLevelNames['band'] if self.config.outputBands is None else \
722  self.config.outputBands
723 
724  # Perform transform for data of filters that exist in parq.
725  for inputBand in parq.columnLevelNames['band']:
726  if inputBand not in outputBands:
727  self.log.info("Ignoring %s band data in the input", inputBand)
728  continue
729  self.log.info("Transforming the catalog of band %s", inputBand)
730  result = self.transform(inputBand, parq, funcs, dataId)
731  dfDict[inputBand] = result.df
732  analysisDict[inputBand] = result.analysis
733  if templateDf.empty:
734  templateDf = result.df
735 
736  # Fill NaNs in columns of other wanted bands
737  for filt in outputBands:
738  if filt not in dfDict:
739  self.log.info("Adding empty columns for band %s", filt)
740  dfDict[filt] = pd.DataFrame().reindex_like(templateDf)
741 
742  # This makes a multilevel column index, with band as first level
743  df = pd.concat(dfDict, axis=1, names=['band', 'column'])
744 
745  if not self.config.multilevelOutput:
746  noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
747  if dataId is not None:
748  noDupCols += list(dataId.keys())
749  df = flattenFilters(df, noDupCols=noDupCols, camelCase=self.config.camelCase)
750 
751  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
752  return df
753 
754 
755 class TractObjectDataIdContainer(CoaddDataIdContainer):
756 
757  def makeDataRefList(self, namespace):
758  """Make self.refList from self.idList
759 
760  Generate a list of data references given tract and/or patch.
761  This was adapted from `TractQADataIdContainer`, which was
762  `TractDataIdContainer` modified to not require "filter".
763  Only existing dataRefs are returned.
764  """
765  def getPatchRefList(tract):
766  return [namespace.butler.dataRef(datasetType=self.datasetType,
767  tract=tract.getId(),
768  patch="%d,%d" % patch.getIndex()) for patch in tract]
769 
770  tractRefs = defaultdict(list) # Data references for each tract
771  for dataId in self.idList:
772  skymap = self.getSkymap(namespace)
773 
774  if "tract" in dataId:
775  tractId = dataId["tract"]
776  if "patch" in dataId:
777  tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
778  tract=tractId,
779  patch=dataId['patch']))
780  else:
781  tractRefs[tractId] += getPatchRefList(skymap[tractId])
782  else:
783  tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
784  for tract in skymap)
785  outputRefList = []
786  for tractRefList in tractRefs.values():
787  existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
788  outputRefList.append(existingRefs)
789 
790  self.refList = outputRefList
791 
792 
793 class ConsolidateObjectTableConfig(pexConfig.Config):
794  coaddName = pexConfig.Field(
795  dtype=str,
796  default="deep",
797  doc="Name of coadd"
798  )
799 
800 
801 class ConsolidateObjectTableTask(CmdLineTask):
802  """Write patch-merged source tables to a tract-level parquet file
803  """
804  _DefaultName = "consolidateObjectTable"
805  ConfigClass = ConsolidateObjectTableConfig
806 
807  inputDataset = 'objectTable'
808  outputDataset = 'objectTable_tract'
809 
810  @classmethod
811  def _makeArgumentParser(cls):
812  parser = ArgumentParser(name=cls._DefaultName)
813 
814  parser.add_id_argument("--id", cls.inputDataset,
815  help="data ID, e.g. --id tract=12345",
816  ContainerClass=TractObjectDataIdContainer)
817  return parser
818 
819  def runDataRef(self, patchRefList):
820  df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
821  patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
822 
823  def writeMetadata(self, dataRef):
824  """No metadata to write.
825  """
826  pass
827 
828 
829 class TransformSourceTableConnections(pipeBase.PipelineTaskConnections,
830  dimensions=("instrument", "visit", "detector")):
831 
832  inputCatalog = connectionTypes.Input(
833  doc="Wide input catalog of sources produced by WriteSourceTableTask",
834  name="source",
835  storageClass="DataFrame",
836  dimensions=("instrument", "visit", "detector"),
837  deferLoad=True
838  )
839  outputCatalog = connectionTypes.Output(
840  doc="Narrower, per-detector Source Table transformed and converted per a "
841  "specified set of functors",
842  name="sourceTable",
843  storageClass="DataFrame",
844  dimensions=("instrument", "visit", "detector")
845  )
846 
847 
848 class TransformSourceTableConfig(TransformCatalogBaseConfig,
849  pipelineConnections=TransformSourceTableConnections):
850  pass
851 
852 
853 class TransformSourceTableTask(TransformCatalogBaseTask):
854  """Transform/standardize a source catalog
855  """
856  _DefaultName = "transformSourceTable"
857  ConfigClass = TransformSourceTableConfig
858 
859  inputDataset = 'source'
860  outputDataset = 'sourceTable'
861 
862  @classmethod
863  def _makeArgumentParser(cls):
864  parser = ArgumentParser(name=cls._DefaultName)
865  parser.add_id_argument("--id", datasetType=cls.inputDataset,
866  level="sensor",
867  help="data ID, e.g. --id visit=12345 ccd=0")
868  return parser
869 
870  def runDataRef(self, dataRef):
871  """Override to specify band label to run()."""
872  parq = dataRef.get()
873  funcs = self.getFunctors()
874  band = dataRef.get("calexp_filterLabel", immediate=True).bandLabel
875  df = self.run(parq, funcs=funcs, dataId=dataRef.dataId, band=band)
876  self.write(df, dataRef)
877  return df
878 
879 
880 class ConsolidateVisitSummaryConnections(pipeBase.PipelineTaskConnections,
881  dimensions=("instrument", "visit",),
882  defaultTemplates={}):
883  calexp = connectionTypes.Input(
884  doc="Processed exposures used for metadata",
885  name="calexp",
886  storageClass="ExposureF",
887  dimensions=("instrument", "visit", "detector"),
888  deferLoad=True,
889  multiple=True,
890  )
891  visitSummary = connectionTypes.Output(
892  doc="Consolidated visit-level exposure metadata",
893  name="visitSummary",
894  storageClass="ExposureCatalog",
895  dimensions=("instrument", "visit"),
896  )
897 
898 
899 class ConsolidateVisitSummaryConfig(pipeBase.PipelineTaskConfig,
900  pipelineConnections=ConsolidateVisitSummaryConnections):
901  """Config for ConsolidateVisitSummaryTask"""
902  pass
903 
904 
905 class ConsolidateVisitSummaryTask(pipeBase.PipelineTask, pipeBase.CmdLineTask):
906  """Task to consolidate per-detector visit metadata.
907 
908  This task aggregates the following metadata from all the detectors in a
909  single visit into an exposure catalog:
910  - The visitInfo.
911  - The wcs.
912  - The photoCalib.
913  - The physical_filter and band (if available).
914  - The psf size, shape, and effective area at the center of the detector.
915  - The corners of the bounding box in right ascension/declination.
916 
917  Other quantities such as Psf, ApCorrMap, and TransmissionCurve are not
918  persisted here because of storage concerns, and because of their limited
919  utility as summary statistics.
920 
921  Tests for this task are performed in ci_hsc_gen3.
922  """
923  _DefaultName = "consolidateVisitSummary"
924  ConfigClass = ConsolidateVisitSummaryConfig
925 
926  @classmethod
927  def _makeArgumentParser(cls):
928  parser = ArgumentParser(name=cls._DefaultName)
929 
930  parser.add_id_argument("--id", "calexp",
931  help="data ID, e.g. --id visit=12345",
932  ContainerClass=VisitDataIdContainer)
933  return parser
934 
935  def writeMetadata(self, dataRef):
936  """No metadata to persist, so override to remove metadata persistence.
937  """
938  pass
939 
940  def writeConfig(self, butler, clobber=False, doBackup=True):
941  """No config to persist, so override to remove config persistence.
942  """
943  pass
944 
945  def runDataRef(self, dataRefList):
946  visit = dataRefList[0].dataId['visit']
947 
948  self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
949  (len(dataRefList), visit))
950 
951  expCatalog = self._combineExposureMetadata(visit, dataRefList, isGen3=False)
952 
953  dataRefList[0].put(expCatalog, 'visitSummary', visit=visit)
954 
955  def runQuantum(self, butlerQC, inputRefs, outputRefs):
956  dataRefs = butlerQC.get(inputRefs.calexp)
957  visit = dataRefs[0].dataId.byName()['visit']
958 
959  self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
960  (len(dataRefs), visit))
961 
962  expCatalog = self._combineExposureMetadata(visit, dataRefs)
963 
964  butlerQC.put(expCatalog, outputRefs.visitSummary)
965 
966  def _combineExposureMetadata(self, visit, dataRefs, isGen3=True):
967  """Make a combined exposure catalog from a list of dataRefs.
968 
969  Parameters
970  ----------
971  visit : `int`
972  Visit identification number
973  dataRefs : `list`
974  List of calexp dataRefs in visit. May be list of
975  `lsst.daf.persistence.ButlerDataRef` (Gen2) or
976  `lsst.daf.butler.DeferredDatasetHandle` (Gen3).
977  isGen3 : `bool`, optional
978  Specifies if this is a Gen3 list of datarefs.
979 
980  Returns
981  -------
982  visitSummary : `lsst.afw.table.ExposureCatalog`
983  Exposure catalog with per-detector summary information.
984  """
985  schema = afwTable.ExposureTable.makeMinimalSchema()
986  schema.addField('visit', type='I', doc='Visit number')
987  schema.addField('detector_id', type='I', doc='Detector number')
988  schema.addField('physical_filter', type='String', size=32, doc='Physical filter')
989  schema.addField('band', type='String', size=32, doc='Name of band')
990  schema.addField('psfSigma', type='F',
991  doc='PSF model second-moments determinant radius (center of chip) (pixel)')
992  schema.addField('psfArea', type='F',
993  doc='PSF model effective area (center of chip) (pixel**2)')
994  schema.addField('psfIxx', type='F',
995  doc='PSF model Ixx (center of chip) (pixel**2)')
996  schema.addField('psfIyy', type='F',
997  doc='PSF model Iyy (center of chip) (pixel**2)')
998  schema.addField('psfIxy', type='F',
999  doc='PSF model Ixy (center of chip) (pixel**2)')
1000  schema.addField('raCorners', type='ArrayD', size=4,
1001  doc='Right Ascension of bounding box corners (degrees)')
1002  schema.addField('decCorners', type='ArrayD', size=4,
1003  doc='Declination of bounding box corners (degrees)')
1004 
1005  cat = afwTable.ExposureCatalog(schema)
1006  cat.resize(len(dataRefs))
1007 
1008  cat['visit'] = visit
1009 
1010  for i, dataRef in enumerate(dataRefs):
1011  if isGen3:
1012  visitInfo = dataRef.get(component='visitInfo')
1013  filterLabel = dataRef.get(component='filterLabel')
1014  psf = dataRef.get(component='psf')
1015  wcs = dataRef.get(component='wcs')
1016  photoCalib = dataRef.get(component='photoCalib')
1017  detector = dataRef.get(component='detector')
1018  bbox = dataRef.get(component='bbox')
1019  validPolygon = dataRef.get(component='validPolygon')
1020  else:
1021  # Note that we need to read the calexp because there is
1022  # no magic access to the psf except through the exposure.
1023  gen2_read_bbox = lsst.geom.BoxI(lsst.geom.PointI(0, 0), lsst.geom.PointI(1, 1))
1024  exp = dataRef.get(datasetType='calexp_sub', bbox=gen2_read_bbox)
1025  visitInfo = exp.getInfo().getVisitInfo()
1026  filterLabel = dataRef.get("calexp_filterLabel")
1027  psf = exp.getPsf()
1028  wcs = exp.getWcs()
1029  photoCalib = exp.getPhotoCalib()
1030  detector = exp.getDetector()
1031  bbox = dataRef.get(datasetType='calexp_bbox')
1032  validPolygon = exp.getInfo().getValidPolygon()
1033 
1034  rec = cat[i]
1035  rec.setBBox(bbox)
1036  rec.setVisitInfo(visitInfo)
1037  rec.setWcs(wcs)
1038  rec.setPhotoCalib(photoCalib)
1039  rec.setDetector(detector)
1040  rec.setValidPolygon(validPolygon)
1041 
1042  rec['physical_filter'] = filterLabel.physicalLabel if filterLabel.hasPhysicalLabel() else ""
1043  rec['band'] = filterLabel.bandLabel if filterLabel.hasBandLabel() else ""
1044  rec['detector_id'] = detector.getId()
1045  shape = psf.computeShape(bbox.getCenter())
1046  rec['psfSigma'] = shape.getDeterminantRadius()
1047  rec['psfIxx'] = shape.getIxx()
1048  rec['psfIyy'] = shape.getIyy()
1049  rec['psfIxy'] = shape.getIxy()
1050  im = psf.computeKernelImage(bbox.getCenter())
1051  # The calculation of effective psf area is taken from
1052  # meas_base/src/PsfFlux.cc#L112. See
1053  # https://github.com/lsst/meas_base/blob/
1054  # 750bffe6620e565bda731add1509507f5c40c8bb/src/PsfFlux.cc#L112
1055  rec['psfArea'] = np.sum(im.array)/np.sum(im.array**2.)
1056 
1057  sph_pts = wcs.pixelToSky(lsst.geom.Box2D(bbox).getCorners())
1058  rec['raCorners'][:] = [sph.getRa().asDegrees() for sph in sph_pts]
1059  rec['decCorners'][:] = [sph.getDec().asDegrees() for sph in sph_pts]
1060 
1061  return cat
1062 
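# Usage sketch (hypothetical data ID): the resulting ExposureCatalog can be read back and
# inspected like any afw table, e.g.
#
#     summary = butler.get('visitSummary', visit=12345)
#     for rec in summary:
#         print(rec['detector_id'], rec['psfSigma'], rec['raCorners'])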
1063 
1064 class VisitDataIdContainer(DataIdContainer):
1065  """DataIdContainer that groups sensor-level IDs by visit
1066  """
1067 
1068  def makeDataRefList(self, namespace):
1069  """Make self.refList from self.idList
1070 
1071  Generate a list of data references grouped by visit.
1072 
1073  Parameters
1074  ----------
1075  namespace : `argparse.Namespace`
1076  Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments
1077  """
1078  # Group by visits
1079  visitRefs = defaultdict(list)
1080  for dataId in self.idList:
1081  if "visit" in dataId:
1082  visitId = dataId["visit"]
1083  # Collect all dataRefs matching this dataId and group them under the visit
1084  subset = namespace.butler.subset(self.datasetType, dataId=dataId)
1085  visitRefs[visitId].extend([dataRef for dataRef in subset])
1086 
1087  outputRefList = []
1088  for refList in visitRefs.values():
1089  existingRefs = [ref for ref in refList if ref.datasetExists()]
1090  if existingRefs:
1091  outputRefList.append(existingRefs)
1092 
1093  self.refList = outputRefList
1094 
1095 
1096 class ConsolidateSourceTableConnections(pipeBase.PipelineTaskConnections,
1097  dimensions=("instrument", "visit")):
1098  inputCatalogs = connectionTypes.Input(
1099  doc="Input per-detector Source Tables",
1100  name="sourceTable",
1101  storageClass="DataFrame",
1102  dimensions=("instrument", "visit", "detector"),
1103  multiple=True
1104  )
1105  outputCatalog = connectionTypes.Output(
1106  doc="Per-visit concatenation of Source Table",
1107  name="sourceTable_visit",
1108  storageClass="DataFrame",
1109  dimensions=("instrument", "visit")
1110  )
1111 
1112 
1113 class ConsolidateSourceTableConfig(pipeBase.PipelineTaskConfig,
1114  pipelineConnections=ConsolidateSourceTableConnections):
1115  pass
1116 
1117 
1118 class ConsolidateSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
1119  """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`
1120  """
1121  _DefaultName = 'consolidateSourceTable'
1122  ConfigClass = ConsolidateSourceTableConfig
1123 
1124  inputDataset = 'sourceTable'
1125  outputDataset = 'sourceTable_visit'
1126 
1127  def runQuantum(self, butlerQC, inputRefs, outputRefs):
1128  inputs = butlerQC.get(inputRefs)
1129  self.log.info("Concatenating %s per-detector Source Tables",
1130  len(inputs['inputCatalogs']))
1131  df = pd.concat(inputs['inputCatalogs'])
1132  butlerQC.put(pipeBase.Struct(outputCatalog=df), outputRefs)
1133 
1134  def runDataRef(self, dataRefList):
1135  self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
1136  df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
1137  dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
1138 
1139  @classmethod
1140  def _makeArgumentParser(cls):
1141  parser = ArgumentParser(name=cls._DefaultName)
1142 
1143  parser.add_id_argument("--id", cls.inputDataset,
1144  help="data ID, e.g. --id visit=12345",
1145  ContainerClass=VisitDataIdContainer)
1146  return parser
1147 
1148  def writeMetadata(self, dataRef):
1149  """No metadata to write.
1150  """
1151  pass
1152 
1153  def writeConfig(self, butler, clobber=False, doBackup=True):
1154  """No config to write.
1155  """
1156  pass