lsst.pipe.tasks  21.0.0-38-g070523fc+49d011e2c2
postprocess.py
1 # This file is part of pipe_tasks
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 
22 import functools
23 import pandas as pd
24 import numpy as np
25 from collections import defaultdict
26 
27 import lsst.geom
28 import lsst.pex.config as pexConfig
29 import lsst.pipe.base as pipeBase
30 from lsst.pipe.base import connectionTypes
31 import lsst.afw.table as afwTable
32 from lsst.meas.base import SingleFrameMeasurementTask
33 from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
34 from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer
35 
36 from .parquetTable import ParquetTable
37 from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
38 from .functors import CompositeFunctor, RAColumn, DecColumn, Column
39 
40 
41 def flattenFilters(df, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
42  """Flattens a dataframe with multilevel column index
43  """
44  newDf = pd.DataFrame()
45  for band in set(df.columns.to_frame()['band']):
46  subdf = df[band]
47  columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
48  newColumns = {c: columnFormat.format(band, c)
49  for c in subdf.columns if c not in noDupCols}
50  cols = list(newColumns.keys())
51  newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)
52 
53  newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
54  return newDf
55 
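# Illustrative usage sketch, not part of the original postprocess.py:
# flattenFilters expects a dataframe whose columns form a two-level
# ('band', 'column') MultiIndex, as built by TransformObjectCatalogTask.run
# further down. The column names below are invented for the example.
example_columns = pd.MultiIndex.from_tuples(
    [('g', 'coord_ra'), ('g', 'coord_dec'), ('g', 'PsfMag'),
     ('r', 'coord_ra'), ('r', 'coord_dec'), ('r', 'PsfMag')],
    names=('band', 'column'))
example_df = pd.DataFrame([[150.0, 2.2, 20.1, 150.0, 2.2, 19.8]],
                          columns=example_columns)
example_flat = flattenFilters(example_df, camelCase=True)
# 'coord_ra' and 'coord_dec' appear once; every other column becomes a
# per-band column such as 'gPsfMag'/'rPsfMag' (band order follows a set and
# is therefore not deterministic).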
56 
57 class WriteObjectTableConfig(pexConfig.Config):
58  priorityList = pexConfig.ListField(
59  dtype=str,
60  default=[],
61  doc="Priority-ordered list of bands for the merge."
62  )
63  engine = pexConfig.Field(
64  dtype=str,
65  default="pyarrow",
66  doc="Parquet engine for writing (pyarrow or fastparquet)"
67  )
68  coaddName = pexConfig.Field(
69  dtype=str,
70  default="deep",
71  doc="Name of coadd"
72  )
73 
74  def validate(self):
75  pexConfig.Config.validate(self)
76  if len(self.priorityList) == 0:
77  raise RuntimeError("No priority list provided")
78 
79 
80 class WriteObjectTableTask(CmdLineTask):
81  """Write filter-merged source tables to parquet
82  """
83  _DefaultName = "writeObjectTable"
84  ConfigClass = WriteObjectTableConfig
85  RunnerClass = MergeSourcesRunner
86 
87  # Names of table datasets to be merged
88  inputDatasets = ('forced_src', 'meas', 'ref')
89 
90  # Tag of output dataset written by `MergeSourcesTask.write`
91  outputDataset = 'obj'
92 
93  def __init__(self, butler=None, schema=None, **kwargs):
94  # It is a shame that this class can't use the default init for CmdLineTask,
95  # but to do so would require its own special task runner, which is many
96  # more lines of specialization, so this is how it is for now.
97  CmdLineTask.__init__(self, **kwargs)
98 
99  def runDataRef(self, patchRefList):
100  """!
101  @brief Merge coadd sources from multiple bands. Calls @ref `run` which must be defined in
102  subclasses that inherit from MergeSourcesTask.
103  @param[in] patchRefList list of data references for each filter
104  """
105  catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
106  dataId = patchRefList[0].dataId
107  mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
108  self.write(patchRefList[0], mergedCatalog)
109 
110  @classmethod
111  def _makeArgumentParser(cls):
112  """Create a suitable ArgumentParser.
113 
114  We will use the ArgumentParser to get a list of data
115  references for patches; the RunnerClass will sort them into lists
116  of data references for the same patch.
117 
118  References the first element of self.inputDatasets, rather than
119  self.inputDataset.
120  """
121  return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])
122 
123  def readCatalog(self, patchRef):
124  """Read input catalogs
125 
126  Read all the input datasets given by the 'inputDatasets'
127  attribute.
128 
129  Parameters
130  ----------
131  patchRef : `lsst.daf.persistence.ButlerDataRef`
132  Data reference for patch
133 
134  Returns
135  -------
136  Tuple consisting of band name and a dict of catalogs, keyed by
137  dataset name
138  """
139  band = patchRef.get(self.config.coaddName + "Coadd_filterLabel", immediate=True).bandLabel
140  catalogDict = {}
141  for dataset in self.inputDatasets:
142  catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
143  self.log.info("Read %d sources from %s for band %s: %s" %
144  (len(catalog), dataset, band, patchRef.dataId))
145  catalogDict[dataset] = catalog
146  return band, catalogDict
147 
148  def run(self, catalogs, tract, patch):
149  """Merge multiple catalogs.
150 
151  Parameters
152  ----------
153  catalogs : `dict`
154  Mapping from filter names to dict of catalogs.
155  tract : int
156  tractId to use for the tractId column
157  patch : str
158  patchId to use for the patchId column
159 
160  Returns
161  -------
162  catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
163  Merged dataframe, with each column prefixed by
164  `filter_tag(filt)`, wrapped in the parquet writer shim class.
165  """
166 
167  dfs = []
168  for filt, tableDict in catalogs.items():
169  for dataset, table in tableDict.items():
170  # Convert afwTable to pandas DataFrame
171  df = table.asAstropy().to_pandas().set_index('id', drop=True)
172 
173  # Sort columns by name, to ensure matching schema among patches
174  df = df.reindex(sorted(df.columns), axis=1)
175  df['tractId'] = tract
176  df['patchId'] = patch
177 
178  # Make columns a 3-level MultiIndex
179  df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
180  names=('dataset', 'band', 'column'))
181  dfs.append(df)
182 
183  catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
184  return ParquetTable(dataFrame=catalog)
185 
186  def write(self, patchRef, catalog):
187  """Write the output.
188 
189  Parameters
190  ----------
191  catalog : `ParquetTable`
192  Catalog to write
193  patchRef : `lsst.daf.persistence.ButlerDataRef`
194  Data reference for patch
195  """
196  patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
197  # since the filter isn't actually part of the data ID for the dataset we're saving,
198  # it's confusing to see it in the log message, even if the butler simply ignores it.
199  mergeDataId = patchRef.dataId.copy()
200  del mergeDataId["filter"]
201  self.log.info("Wrote merged catalog: %s" % (mergeDataId,))
202 
203  def writeMetadata(self, dataRefList):
204  """No metadata to write, and not sure how to write it for a list of dataRefs.
205  """
206  pass
207 
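# Illustrative access sketch, not part of the original file: the merged
# dataframe written by WriteObjectTableTask carries a three-level
# ('dataset', 'band', 'column') column index, so a single dataset/band slab
# can be selected with a tuple key. The variables 'task' and 'catalogs' and
# the specific column names are assumptions for the example.
#
#     parq = task.run(catalogs, tract=9813, patch='1,1')
#     df = parq.toDataFrame()
#     gMeas = df['meas']['g']                         # all 'meas' columns for band g
#     primary = df[('ref', 'g', 'detect_isPrimary')]  # a single column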
208 
209 class WriteSourceTableConfig(pexConfig.Config):
210  doApplyExternalPhotoCalib = pexConfig.Field(
211  dtype=bool,
212  default=False,
213  doc=("Add local photoCalib columns from the calexp.photoCalib? Should only set True if "
214  "generating Source Tables from older src tables which do not already have local calib columns")
215  )
216  doApplyExternalSkyWcs = pexConfig.Field(
217  dtype=bool,
218  default=False,
219  doc=("Add local WCS columns from the calexp.wcs? Should only set True if "
220  "generating Source Tables from older src tables which do not already have local calib columns")
221  )
222 
223 
224 class WriteSourceTableTask(CmdLineTask):
225  """Write source table to parquet
226  """
227  _DefaultName = "writeSourceTable"
228  ConfigClass = WriteSourceTableConfig
229 
230  def runDataRef(self, dataRef):
231  src = dataRef.get('src')
232  if self.config.doApplyExternalPhotoCalib or self.config.doApplyExternalSkyWcs:
233  src = self.addCalibColumns(src, dataRef)
234 
235  ccdVisitId = dataRef.get('ccdExposureId')
236  result = self.run(src, ccdVisitId=ccdVisitId)
237  dataRef.put(result.table, 'source')
238 
239  def run(self, catalog, ccdVisitId=None):
240  """Convert `src` catalog to parquet
241 
242  Parameters
243  ----------
244  catalog: `afwTable.SourceCatalog`
245  catalog to be converted
246  ccdVisitId: `int`
247  ccdVisitId to be added as a column
248 
249  Returns
250  -------
251  result : `lsst.pipe.base.Struct`
252  ``table``
253  `ParquetTable` version of the input catalog
254  """
255  self.log.info("Generating parquet table from src catalog")
256  df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
257  df['ccdVisitId'] = ccdVisitId
258  return pipeBase.Struct(table=ParquetTable(dataFrame=df))
259 
260  def addCalibColumns(self, catalog, dataRef):
261  """Add columns with local calibration evaluated at each centroid
262 
263  for backwards compatibility with old repos.
264  This exists for the purpose of converting old src catalogs
265  (which don't have the expected local calib columns) to Source Tables.
266 
267  Parameters
268  ----------
269  catalog: `afwTable.SourceCatalog`
270  catalog to which calib columns will be added
271  dataRef: `lsst.daf.persistence.ButlerDataRef`
272  for fetching the calibs from disk.
273 
274  Returns
275  -------
276  newCat: `afwTable.SourceCatalog`
277  Source Catalog with requested local calib columns
278  """
279  mapper = afwTable.SchemaMapper(catalog.schema)
280  measureConfig = SingleFrameMeasurementTask.ConfigClass()
281  measureConfig.doReplaceWithNoise = False
282 
283  # Just need the WCS or the PhotoCalib attached to an exposure
284  exposure = dataRef.get('calexp_sub',
286 
287  mapper = afwTable.SchemaMapper(catalog.schema)
288  mapper.addMinimalSchema(catalog.schema, True)
289  schema = mapper.getOutputSchema()
290 
291  exposureIdInfo = dataRef.get("expIdInfo")
292  measureConfig.plugins.names = []
293  if self.config.doApplyExternalSkyWcs:
294  plugin = 'base_LocalWcs'
295  if plugin in schema:
296  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalSkyWcs=False")
297  else:
298  measureConfig.plugins.names.add(plugin)
299 
300  if self.config.doApplyExternalPhotoCalib:
301  plugin = 'base_LocalPhotoCalib'
302  if plugin in schema:
303  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalPhotoCalib=False")
304  else:
305  measureConfig.plugins.names.add(plugin)
306 
307  measurement = SingleFrameMeasurementTask(config=measureConfig, schema=schema)
308  newCat = afwTable.SourceCatalog(schema)
309  newCat.extend(catalog, mapper=mapper)
310  measurement.run(measCat=newCat, exposure=exposure, exposureId=exposureIdInfo.expId)
311  return newCat
312 
313  def writeMetadata(self, dataRef):
314  """No metadata to write.
315  """
316  pass
317 
318  @classmethod
319  def _makeArgumentParser(cls):
320  parser = ArgumentParser(name=cls._DefaultName)
321  parser.add_id_argument("--id", 'src',
322  help="data ID, e.g. --id visit=12345 ccd=0")
323  return parser
324 
325 
326 class PostprocessAnalysis(object):
327  """Calculate columns from ParquetTable
328 
329  This object manages and organizes an arbitrary set of computations
330  on a catalog. The catalog is defined by a
331  `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
332  `deepCoadd_obj` dataset, and the computations are defined by a collection
333  of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently,
334  a `CompositeFunctor`).
335 
336  After the object is initialized, accessing the `.df` attribute (which
337  holds the `pandas.DataFrame` containing the results of the calculations) triggers
338  computation of said dataframe.
339 
340  One of the conveniences of using this object is the ability to define a desired common
341  filter for all functors. This enables the same functor collection to be passed to
342  several different `PostprocessAnalysis` objects without having to change the original
343  functor collection, since the `filt` keyword argument of this object triggers an
344  overwrite of the `filt` property for all functors in the collection.
345 
346  This object also allows a list of refFlags to be passed, and defines a set of default
347  refFlags that are always included even if not requested.
348 
349  If a list of `ParquetTable` objects is passed, rather than a single one, then the
350  calculations will be mapped over all the input catalogs. In principle, it should
351  be straightforward to parallelize this activity, but initial tests have failed
352  (see TODO in code comments).
353 
354  Parameters
355  ----------
356  parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
357  Source catalog(s) for computation
358 
359  functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
360  Computations to do (functors that act on `parq`).
361  If a dict, the output
362  DataFrame will have columns keyed accordingly.
363  If a list, the column keys will come from the
364  `.shortname` attribute of each functor.
365 
366  filt : `str` (optional)
367  Filter in which to calculate. If provided,
368  this will overwrite any existing `.filt` attribute
369  of the provided functors.
370 
371  flags : `list` (optional)
372  List of flags (per-band) to include in output table.
373 
374  refFlags : `list` (optional)
375  List of refFlags (only reference band) to include in output table.
376 
377 
378  """
379  _defaultRefFlags = []
380  _defaultFuncs = (('coord_ra', RAColumn()),
381  ('coord_dec', DecColumn()))
382 
383  def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
384  self.parq = parq
385  self.functors = functors
386 
387  self.filt = filt
388  self.flags = list(flags) if flags is not None else []
389  self.refFlags = list(self._defaultRefFlags)
390  if refFlags is not None:
391  self.refFlags += list(refFlags)
392 
393  self._df = None
394 
395  @property
396  def defaultFuncs(self):
397  funcs = dict(self._defaultFuncs)
398  return funcs
399 
400  @property
401  def func(self):
402  additionalFuncs = self.defaultFuncs
403  additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
404  additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})
405 
406  if isinstance(self.functors, CompositeFunctor):
407  func = self.functors
408  else:
409  func = CompositeFunctor(self.functors)
410 
411  func.funcDict.update(additionalFuncs)
412  func.filt = self.filt
413 
414  return func
415 
416  @property
417  def noDupCols(self):
418  return [name for name, func in self.func.funcDict.items() if func.noDup or func.dataset == 'ref']
419 
420  @property
421  def df(self):
422  if self._df is None:
423  self.compute()
424  return self._df
425 
426  def compute(self, dropna=False, pool=None):
427  # map over multiple parquet tables
428  if type(self.parq) in (list, tuple):
429  if pool is None:
430  dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
431  else:
432  # TODO: Figure out why this doesn't work (pyarrow pickling issues?)
433  dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
434  self._df = pd.concat(dflist)
435  else:
436  self._df = self.func(self.parq, dropna=dropna)
437 
438  return self._df
439 
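# Illustrative usage sketch, not part of the original module: a minimal
# PostprocessAnalysis run with a couple of functors. 'parq' is assumed to be
# an existing ParquetTable (e.g. a deepCoadd_obj patch), and the functor and
# column names are placeholders.
#
#     from lsst.pipe.tasks.functors import Mag, Column
#
#     funcs = {'psfMag': Mag('base_PsfFlux', dataset='meas'),
#              'inputCount': Column('base_InputCount_value', dataset='meas')}
#     analysis = PostprocessAnalysis(parq, funcs, filt='g',
#                                    refFlags=['detect_isPrimary'])
#     df = analysis.df   # first access triggers compute()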
440 
441 class TransformCatalogBaseConfig(pexConfig.Config):
442  functorFile = pexConfig.Field(
443  dtype=str,
444  doc='Path to YAML file specifying functors to be computed',
445  default=None,
446  optional=True
447  )
448 
449 
450 class TransformCatalogBaseTask(CmdLineTask):
451  """Base class for transforming/standardizing a catalog
452 
453  by applying functors that convert units and apply calibrations.
454  The purpose of this task is to perform a set of computations on
455  an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
456  results to a new dataset (which needs to be declared in an `outputDataset`
457  attribute).
458 
459  The calculations to be performed are defined in a YAML file that specifies
460  a set of functors to be computed, provided as
461  a `--functorFile` config parameter. An example of such a YAML file
462  is the following:
463 
464  funcs:
465  psfMag:
466  functor: Mag
467  args:
468  - base_PsfFlux
469  filt: HSC-G
470  dataset: meas
471  cmodel_magDiff:
472  functor: MagDiff
473  args:
474  - modelfit_CModel
475  - base_PsfFlux
476  filt: HSC-G
477  gauss_magDiff:
478  functor: MagDiff
479  args:
480  - base_GaussianFlux
481  - base_PsfFlux
482  filt: HSC-G
483  count:
484  functor: Column
485  args:
486  - base_InputCount_value
487  filt: HSC-G
488  deconvolved_moments:
489  functor: DeconvolvedMoments
490  filt: HSC-G
491  dataset: forced_src
492  refFlags:
493  - calib_psfUsed
494  - merge_measurement_i
495  - merge_measurement_r
496  - merge_measurement_z
497  - merge_measurement_y
498  - merge_measurement_g
499  - base_PixelFlags_flag_inexact_psfCenter
500  - detect_isPrimary
501 
502  The names for each entry under "funcs" will become the names of columns in the
503  output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`.
504  Positional arguments to be passed to each functor are in the `args` list,
505  and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
506  `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.
507 
508  The "refFlags" entry is shortcut for a bunch of `Column` functors with the original column and
509  taken from the `'ref'` dataset.
510 
511  The "flags" entry will be expanded out per band.
512 
513  This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
514  to organize and execute the calculations.
515 
516  """
517  @property
518  def _DefaultName(self):
519  raise NotImplementedError('Subclass must define "_DefaultName" attribute')
520 
521  @property
522  def outputDataset(self):
523  raise NotImplementedError('Subclass must define "outputDataset" attribute')
524 
525  @property
526  def inputDataset(self):
527  raise NotImplementedError('Subclass must define "inputDataset" attribute')
528 
529  @property
530  def ConfigClass(self):
531  raise NotImplementedError('Subclass must define "ConfigClass" attribute')
532 
533  def runDataRef(self, dataRef):
534  parq = dataRef.get()
535  funcs = self.getFunctors()
536  df = self.run(parq, funcs=funcs, dataId=dataRef.dataId)
537  self.write(df, dataRef)
538  return df
539 
540  def run(self, parq, funcs=None, dataId=None, band=None):
541  """Do postprocessing calculations
542 
543  Takes a `ParquetTable` object and dataId,
544  returns a dataframe with results of postprocessing calculations.
545 
546  Parameters
547  ----------
548  parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
549  ParquetTable from which calculations are done.
550  funcs : `lsst.pipe.tasks.functors.Functors`
551  Functors to apply to the table's columns
552  dataId : dict, optional
553  Used to add a `patchId` column to the output dataframe.
554  band : `str`, optional
555  Filter band that is being processed.
556 
557  Returns
558  -------
559  `pandas.DataFrame`
560 
561  """
562  self.log.info("Transforming/standardizing the source table dataId: %s", dataId)
563 
564  df = self.transform(band, parq, funcs, dataId).df
565  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
566  return df
567 
568  def getFunctors(self):
569  funcs = CompositeFunctor.from_file(self.config.functorFile)
570  funcs.update(dict(PostprocessAnalysis._defaultFuncs))
571  return funcs
572 
573  def getAnalysis(self, parq, funcs=None, band=None):
574  # Avoids disk access if funcs is passed
575  if funcs is None:
576  funcs = self.getFunctors()
577  analysis = PostprocessAnalysis(parq, funcs, filt=band)
578  return analysis
579 
580  def transform(self, band, parq, funcs, dataId):
581  analysis = self.getAnalysis(parq, funcs=funcs, band=band)
582  df = analysis.df
583  if dataId is not None:
584  for key, value in dataId.items():
585  df[key] = value
586 
587  return pipeBase.Struct(
588  df=df,
589  analysis=analysis
590  )
591 
592  def write(self, df, parqRef):
593  parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)
594 
595  def writeMetadata(self, dataRef):
596  """No metadata to write.
597  """
598  pass
599 
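# Illustrative sketch, not part of the original file: the functor YAML
# described in the TransformCatalogBaseTask docstring above is loaded with
# CompositeFunctor.from_file, mirroring getFunctors(); the file path and
# 'parq' are placeholders.
#
#     funcs = CompositeFunctor.from_file('/path/to/functors.yaml')
#     funcs.update(dict(PostprocessAnalysis._defaultFuncs))
#     df = funcs(parq, dropna=False)   # evaluate every functor on a ParquetTable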
600 
601 class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
602  coaddName = pexConfig.Field(
603  dtype=str,
604  default="deep",
605  doc="Name of coadd"
606  )
607  # TODO: remove in DM-27177
608  filterMap = pexConfig.DictField(
609  keytype=str,
610  itemtype=str,
611  default={},
612  doc=("Dictionary mapping full filter name to short one for column name munging."
613  "These filters determine the output columns no matter what filters the "
614  "input data actually contain."),
615  deprecated=("Coadds are now identified by the band, so this transform is unused."
616  "Will be removed after v22.")
617  )
618  outputBands = pexConfig.ListField(
619  dtype=str,
620  default=None,
621  optional=True,
622  doc=("These bands and only these bands will appear in the output,"
623  " NaN-filled if the input does not include them."
624  " If None, then use all bands found in the input.")
625  )
626  camelCase = pexConfig.Field(
627  dtype=bool,
628  default=True,
629  doc=("Write per-band columns names with camelCase, else underscore "
630  "For example: gPsFlux instead of g_PsFlux.")
631  )
632  multilevelOutput = pexConfig.Field(
633  dtype=bool,
634  default=False,
635  doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
636  "and name-munged (False).")
637  )
638 
639 
641  """Produce a flattened Object Table to match the format specified in
642  sdm_schemas.
643 
644  Do the same set of postprocessing calculations on all bands
645 
646  This is identical to `TransformCatalogBaseTask`, except that it does the
647  specified functor calculations for all filters present in the
648  input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
649  by the YAML file will be superseded.
650  """
651  _DefaultName = "transformObjectCatalog"
652  ConfigClass = TransformObjectCatalogConfig
653 
654  inputDataset = 'deepCoadd_obj'
655  outputDataset = 'objectTable'
656 
657  @classmethod
658  def _makeArgumentParser(cls):
659  parser = ArgumentParser(name=cls._DefaultName)
660  parser.add_id_argument("--id", cls.inputDataset,
661  ContainerClass=CoaddDataIdContainer,
662  help="data ID, e.g. --id tract=12345 patch=1,2")
663  return parser
664 
665  def run(self, parq, funcs=None, dataId=None, band=None):
666  # NOTE: band kwarg is ignored here.
667  dfDict = {}
668  analysisDict = {}
669  templateDf = pd.DataFrame()
670  outputBands = parq.columnLevelNames['band'] if self.config.outputBands is None else \
671  self.config.outputBands
672 
673  # Perform transform for data of filters that exist in parq.
674  for inputBand in parq.columnLevelNames['band']:
675  if inputBand not in outputBands:
676  self.log.info("Ignoring %s band data in the input", inputBand)
677  continue
678  self.log.info("Transforming the catalog of band %s", inputBand)
679  result = self.transform(inputBand, parq, funcs, dataId)
680  dfDict[inputBand] = result.df
681  analysisDict[inputBand] = result.analysis
682  if templateDf.empty:
683  templateDf = result.df
684 
685  # Fill NaNs in columns of other wanted bands
686  for filt in outputBands:
687  if filt not in dfDict:
688  self.log.info("Adding empty columns for band %s", filt)
689  dfDict[filt] = pd.DataFrame().reindex_like(templateDf)
690 
691  # This makes a multilevel column index, with band as first level
692  df = pd.concat(dfDict, axis=1, names=['band', 'column'])
693 
694  if not self.config.multilevelOutput:
695  noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
696  if dataId is not None:
697  noDupCols += list(dataId.keys())
698  df = flattenFilters(df, noDupCols=noDupCols, camelCase=self.config.camelCase)
699 
700  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
701  return df
702 
703 
704 class TractObjectDataIdContainer(CoaddDataIdContainer):
705 
706  def makeDataRefList(self, namespace):
707  """Make self.refList from self.idList
708 
709  Generate a list of data references given tract and/or patch.
710  This was adapted from `TractQADataIdContainer`, which was
711  `TractDataIdContainer` modified to not require "filter".
712  Only existing dataRefs are returned.
713  """
714  def getPatchRefList(tract):
715  return [namespace.butler.dataRef(datasetType=self.datasetType,
716  tract=tract.getId(),
717  patch="%d,%d" % patch.getIndex()) for patch in tract]
718 
719  tractRefs = defaultdict(list) # Data references for each tract
720  for dataId in self.idList:
721  skymap = self.getSkymap(namespace)
722 
723  if "tract" in dataId:
724  tractId = dataId["tract"]
725  if "patch" in dataId:
726  tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
727  tract=tractId,
728  patch=dataId['patch']))
729  else:
730  tractRefs[tractId] += getPatchRefList(skymap[tractId])
731  else:
732  tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
733  for tract in skymap)
734  outputRefList = []
735  for tractRefList in tractRefs.values():
736  existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
737  outputRefList.append(existingRefs)
738 
739  self.refList = outputRefList
740 
741 
742 class ConsolidateObjectTableConfig(pexConfig.Config):
743  coaddName = pexConfig.Field(
744  dtype=str,
745  default="deep",
746  doc="Name of coadd"
747  )
748 
749 
750 class ConsolidateObjectTableTask(CmdLineTask):
751  """Write patch-merged source tables to a tract-level parquet file
752  """
753  _DefaultName = "consolidateObjectTable"
754  ConfigClass = ConsolidateObjectTableConfig
755 
756  inputDataset = 'objectTable'
757  outputDataset = 'objectTable_tract'
758 
759  @classmethod
760  def _makeArgumentParser(cls):
761  parser = ArgumentParser(name=cls._DefaultName)
762 
763  parser.add_id_argument("--id", cls.inputDataset,
764  help="data ID, e.g. --id tract=12345",
765  ContainerClass=TractObjectDataIdContainer)
766  return parser
767 
768  def runDataRef(self, patchRefList):
769  df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
770  patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
771 
772  def writeMetadata(self, dataRef):
773  """No metadata to write.
774  """
775  pass
776 
777 
778 class TransformSourceTableConfig(TransformCatalogBaseConfig):
779  pass
780 
781 
783  """Transform/standardize a source catalog
784  """
785  _DefaultName = "transformSourceTable"
786  ConfigClass = TransformSourceTableConfig
787 
788  inputDataset = 'source'
789  outputDataset = 'sourceTable'
790 
791  def writeMetadata(self, dataRef):
792  """No metadata to write.
793  """
794  pass
795 
796  @classmethod
797  def _makeArgumentParser(cls):
798  parser = ArgumentParser(name=cls._DefaultName)
799  parser.add_id_argument("--id", datasetType=cls.inputDataset,
800  level="sensor",
801  help="data ID, e.g. --id visit=12345 ccd=0")
802  return parser
803 
804  def runDataRef(self, dataRef):
805  """Override to specify band label to run()."""
806  parq = dataRef.get()
807  funcs = self.getFunctors()
808  band = dataRef.get("calexp_filterLabel", immediate=True).bandLabel
809  df = self.run(parq, funcs=funcs, dataId=dataRef.dataId, band=band)
810  self.write(df, dataRef)
811  return df
812 
813 
814 class ConsolidateVisitSummaryConnections(pipeBase.PipelineTaskConnections,
815  dimensions=("instrument", "visit",),
816  defaultTemplates={}):
817  calexp = connectionTypes.Input(
818  doc="Processed exposures used for metadata",
819  name="calexp",
820  storageClass="ExposureF",
821  dimensions=("instrument", "visit", "detector"),
822  deferLoad=True,
823  multiple=True,
824  )
825  visitSummary = connectionTypes.Output(
826  doc="Consolidated visit-level exposure metadata",
827  name="visitSummary",
828  storageClass="ExposureCatalog",
829  dimensions=("instrument", "visit"),
830  )
831 
832 
833 class ConsolidateVisitSummaryConfig(pipeBase.PipelineTaskConfig,
834  pipelineConnections=ConsolidateVisitSummaryConnections):
835  """Config for ConsolidateVisitSummaryTask"""
836  pass
837 
838 
839 class ConsolidateVisitSummaryTask(pipeBase.PipelineTask, pipeBase.CmdLineTask):
840  """Task to consolidate per-detector visit metadata.
841 
842  This task aggregates the following metadata from all the detectors in a
843  single visit into an exposure catalog:
844  - The visitInfo.
845  - The wcs.
846  - The photoCalib.
847  - The physical_filter and band (if available).
848  - The psf size, shape, and effective area at the center of the detector.
849  - The corners of the bounding box in right ascension/declination.
850 
851  Other quantities such as Psf, ApCorrMap, and TransmissionCurve are not
852  persisted here because of storage concerns, and because of their limited
853  utility as summary statistics.
854 
855  Tests for this task are performed in ci_hsc_gen3.
856  """
857  _DefaultName = "consolidateVisitSummary"
858  ConfigClass = ConsolidateVisitSummaryConfig
859 
860  @classmethod
861  def _makeArgumentParser(cls):
862  parser = ArgumentParser(name=cls._DefaultName)
863 
864  parser.add_id_argument("--id", "calexp",
865  help="data ID, e.g. --id visit=12345",
866  ContainerClass=VisitDataIdContainer)
867  return parser
868 
869  def writeMetadata(self, dataRef):
870  """No metadata to persist, so override to remove metadata persistance.
871  """
872  pass
873 
874  def writeConfig(self, butler, clobber=False, doBackup=True):
875  """No config to persist, so override to remove config persistance.
876  """
877  pass
878 
879  def runDataRef(self, dataRefList):
880  visit = dataRefList[0].dataId['visit']
881 
882  self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
883  (len(dataRefList), visit))
884 
885  expCatalog = self._combineExposureMetadata(visit, dataRefList, isGen3=False)
886 
887  dataRefList[0].put(expCatalog, 'visitSummary', visit=visit)
888 
889  def runQuantum(self, butlerQC, inputRefs, outputRefs):
890  dataRefs = butlerQC.get(inputRefs.calexp)
891  visit = dataRefs[0].dataId.byName()['visit']
892 
893  self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
894  (len(dataRefs), visit))
895 
896  expCatalog = self._combineExposureMetadata(visit, dataRefs)
897 
898  butlerQC.put(expCatalog, outputRefs.visitSummary)
899 
900  def _combineExposureMetadata(self, visit, dataRefs, isGen3=True):
901  """Make a combined exposure catalog from a list of dataRefs.
902 
903  Parameters
904  ----------
905  visit : `int`
906  Visit identification number
907  dataRefs : `list`
908  List of calexp dataRefs in visit. May be list of
909  `lsst.daf.persistence.ButlerDataRef` (Gen2) or
910  `lsst.daf.butler.DeferredDatasetHandle` (Gen3).
911  isGen3 : `bool`, optional
912  Specifies if this is a Gen3 list of datarefs.
913 
914  Returns
915  -------
916  visitSummary : `lsst.afw.table.ExposureCatalog`
917  Exposure catalog with per-detector summary information.
918  """
919  schema = afwTable.ExposureTable.makeMinimalSchema()
920  schema.addField('visit', type='I', doc='Visit number')
921  schema.addField('detector_id', type='I', doc='Detector number')
922  schema.addField('physical_filter', type='String', size=32, doc='Physical filter')
923  schema.addField('band', type='String', size=32, doc='Name of band')
924  schema.addField('psfSigma', type='F',
925  doc='PSF model second-moments determinant radius (center of chip) (pixel)')
926  schema.addField('psfArea', type='F',
927  doc='PSF model effective area (center of chip) (pixel**2)')
928  schema.addField('psfIxx', type='F',
929  doc='PSF model Ixx (center of chip) (pixel**2)')
930  schema.addField('psfIyy', type='F',
931  doc='PSF model Iyy (center of chip) (pixel**2)')
932  schema.addField('psfIxy', type='F',
933  doc='PSF model Ixy (center of chip) (pixel**2)')
934  schema.addField('raCorners', type='ArrayD', size=4,
935  doc='Right Ascension of bounding box corners (degrees)')
936  schema.addField('decCorners', type='ArrayD', size=4,
937  doc='Declination of bounding box corners (degrees)')
938 
939  cat = afwTable.ExposureCatalog(schema)
940  cat.resize(len(dataRefs))
941 
942  cat['visit'] = visit
943 
944  for i, dataRef in enumerate(dataRefs):
945  if isGen3:
946  visitInfo = dataRef.get(component='visitInfo')
947  filterLabel = dataRef.get(component='filterLabel')
948  psf = dataRef.get(component='psf')
949  wcs = dataRef.get(component='wcs')
950  photoCalib = dataRef.get(component='photoCalib')
951  detector = dataRef.get(component='detector')
952  bbox = dataRef.get(component='bbox')
953  validPolygon = dataRef.get(component='validPolygon')
954  else:
955  # Note that we need to read the calexp because there is
956  # no magic access to the psf except through the exposure.
957  gen2_read_bbox = lsst.geom.BoxI(lsst.geom.PointI(0, 0), lsst.geom.PointI(1, 1))
958  exp = dataRef.get(datasetType='calexp_sub', bbox=gen2_read_bbox)
959  visitInfo = exp.getInfo().getVisitInfo()
960  filterLabel = exp.getFilterLabel()
961  psf = exp.getPsf()
962  wcs = exp.getWcs()
963  photoCalib = exp.getPhotoCalib()
964  detector = exp.getDetector()
965  bbox = dataRef.get(datasetType='calexp_bbox')
966  validPolygon = exp.getInfo().getValidPolygon()
967 
968  rec = cat[i]
969  rec.setBBox(bbox)
970  rec.setVisitInfo(visitInfo)
971  rec.setWcs(wcs)
972  rec.setPhotoCalib(photoCalib)
973  rec.setDetector(detector)
974  rec.setValidPolygon(validPolygon)
975 
976  rec['physical_filter'] = filterLabel.physicalLabel if filterLabel.hasPhysicalLabel() else ""
977  rec['band'] = filterLabel.bandLabel if filterLabel.hasBandLabel() else ""
978  rec['detector_id'] = detector.getId()
979  shape = psf.computeShape(bbox.getCenter())
980  rec['psfSigma'] = shape.getDeterminantRadius()
981  rec['psfIxx'] = shape.getIxx()
982  rec['psfIyy'] = shape.getIyy()
983  rec['psfIxy'] = shape.getIxy()
984  im = psf.computeKernelImage(bbox.getCenter())
985  # The calculation of effective psf area is taken from
986  # meas_base/src/PsfFlux.cc#L112. See
987  # https://github.com/lsst/meas_base/blob/
988  # 750bffe6620e565bda731add1509507f5c40c8bb/src/PsfFlux.cc#L112
989  rec['psfArea'] = np.sum(im.array)/np.sum(im.array**2.)
990 
991  sph_pts = wcs.pixelToSky(lsst.geom.Box2D(bbox).getCorners())
992  rec['raCorners'][:] = [sph.getRa().asDegrees() for sph in sph_pts]
993  rec['decCorners'][:] = [sph.getDec().asDegrees() for sph in sph_pts]
994 
995  return cat
996 
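# Illustrative sketch, not part of the original file: the per-visit
# ExposureCatalog written above can be read back and its summary columns
# accessed directly; field names follow the schema defined in
# _combineExposureMetadata. 'butler' and the visit number are placeholders.
#
#     visitSummary = butler.get('visitSummary', visit=12345)
#     psfSigma = visitSummary['psfSigma']           # per-detector PSF sigma
#     bands = [rec['band'] for rec in visitSummary]
#     wcs0 = visitSummary[0].getWcs()               # attached WCS for one detector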
997 
998 class VisitDataIdContainer(DataIdContainer):
999  """DataIdContainer that groups sensor-level id's by visit
1000  """
1001 
1002  def makeDataRefList(self, namespace):
1003  """Make self.refList from self.idList
1004 
1005  Generate a list of data references grouped by visit.
1006 
1007  Parameters
1008  ----------
1009  namespace : `argparse.Namespace`
1010  Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments
1011  """
1012  # Group by visits
1013  visitRefs = defaultdict(list)
1014  for dataId in self.idList:
1015  if "visit" in dataId:
1016  visitId = dataId["visit"]
1017  # append all matching data refs for this visit
1018  subset = namespace.butler.subset(self.datasetType, dataId=dataId)
1019  visitRefs[visitId].extend([dataRef for dataRef in subset])
1020 
1021  outputRefList = []
1022  for refList in visitRefs.values():
1023  existingRefs = [ref for ref in refList if ref.datasetExists()]
1024  if existingRefs:
1025  outputRefList.append(existingRefs)
1026 
1027  self.refList = outputRefList
1028 
1029 
1030 class ConsolidateSourceTableConfig(pexConfig.Config):
1031  pass
1032 
1033 
1034 class ConsolidateSourceTableTask(CmdLineTask):
1035  """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`
1036  """
1037  _DefaultName = 'consolidateSourceTable'
1038  ConfigClass = ConsolidateSourceTableConfig
1039 
1040  inputDataset = 'sourceTable'
1041  outputDataset = 'sourceTable_visit'
1042 
1043  def runDataRef(self, dataRefList):
1044  self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
1045  df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
1046  dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
1047 
1048  @classmethod
1049  def _makeArgumentParser(cls):
1050  parser = ArgumentParser(name=cls._DefaultName)
1051 
1052  parser.add_id_argument("--id", cls.inputDataset,
1053  help="data ID, e.g. --id visit=12345",
1054  ContainerClass=VisitDataIdContainer)
1055  return parser
1056 
1057  def writeMetadata(self, dataRef):
1058  """No metadata to write.
1059  """
1060  pass
1061 
1062  def writeConfig(self, butler, clobber=False, doBackup=True):
1063  """No config to write.
1064  """
1065  pass