lsst.pipe.tasks  21.0.0-40-gd3a68701+eacd05cfb3
postprocess.py
1 # This file is part of pipe_tasks
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (https://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 
22 import functools
23 import pandas as pd
24 import numpy as np
25 from collections import defaultdict
26 
27 import lsst.geom
28 import lsst.pex.config as pexConfig
29 import lsst.pipe.base as pipeBase
30 from lsst.pipe.base import connectionTypes
31 import lsst.afw.table as afwTable
32 from lsst.meas.base import SingleFrameMeasurementTask
33 from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
34 from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer
35 
36 from .parquetTable import ParquetTable
37 from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
38 from .functors import CompositeFunctor, RAColumn, DecColumn, Column
39 
40 
41 def flattenFilters(df, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
42  """Flattens a dataframe with a multilevel column index
43  """
44  newDf = pd.DataFrame()
45  for band in set(df.columns.to_frame()['band']):
46  subdf = df[band]
47  columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
48  newColumns = {c: columnFormat.format(band, c)
49  for c in subdf.columns if c not in noDupCols}
50  cols = list(newColumns.keys())
51  newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)
52 
53  newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
54  return newDf
55 
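# The sketch below is illustrative only (not part of the original module): it shows how
# flattenFilters turns a two-level ('band', 'column') column index into flat, band-prefixed
# column names. The bands, column names, and values are made up; it relies on the
# module-level pandas import above.
def _exampleFlattenFilters():
    columns = pd.MultiIndex.from_tuples(
        [('g', 'coord_ra'), ('g', 'coord_dec'), ('g', 'PsFlux'),
         ('r', 'coord_ra'), ('r', 'coord_dec'), ('r', 'PsFlux')],
        names=('band', 'column'))
    df = pd.DataFrame([[1.0, 2.0, 3.0, 1.0, 2.0, 4.0]], columns=columns)
    # camelCase=True gives gPsFlux/rPsFlux; coord_ra and coord_dec stay unprefixed.
    return flattenFilters(df, camelCase=True)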
56 
57 class WriteObjectTableConfig(pexConfig.Config):
58  engine = pexConfig.Field(
59  dtype=str,
60  default="pyarrow",
61  doc="Parquet engine for writing (pyarrow or fastparquet)"
62  )
63  coaddName = pexConfig.Field(
64  dtype=str,
65  default="deep",
66  doc="Name of coadd"
67  )
68 
69 
70 class WriteObjectTableTask(CmdLineTask):
71  """Write filter-merged source tables to parquet
72  """
73  _DefaultName = "writeObjectTable"
74  ConfigClass = WriteObjectTableConfig
75  RunnerClass = MergeSourcesRunner
76 
77  # Names of table datasets to be merged
78  inputDatasets = ('forced_src', 'meas', 'ref')
79 
80  # Tag of output dataset written by `MergeSourcesTask.write`
81  outputDataset = 'obj'
82 
83  def __init__(self, butler=None, schema=None, **kwargs):
84  # It is a shame that this class can't use the default init for CmdLineTask
85  # But to do so would require its own special task runner, which is many
86  # more lines of specialization, so this is how it is for now
87  CmdLineTask.__init__(self, **kwargs)
88 
89  def runDataRef(self, patchRefList):
90  """!
91  @brief Merge coadd sources from multiple bands. Calls @ref `run` which must be defined in
92  subclasses that inherit from MergeSourcesTask.
93  @param[in] patchRefList list of data references for each filter
94  """
95  catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
96  dataId = patchRefList[0].dataId
97  mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
98  self.write(patchRefList[0], mergedCatalog)
99 
100  @classmethod
101  def _makeArgumentParser(cls):
102  """Create a suitable ArgumentParser.
103 
104  We will use the ArgumentParser to get a list of data
105  references for patches; the RunnerClass will sort them into lists
106  of data references for the same patch.
107 
108  References the first of self.inputDatasets, rather than
109  self.inputDataset.
110  """
111  return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])
112 
113  def readCatalog(self, patchRef):
114  """Read input catalogs
115 
116  Read all the input datasets given by the 'inputDatasets'
117  attribute.
118 
119  Parameters
120  ----------
121  patchRef : `lsst.daf.persistence.ButlerDataRef`
122  Data reference for patch
123 
124  Returns
125  -------
126  Tuple consisting of band name and a dict of catalogs, keyed by
127  dataset name
128  """
129  band = patchRef.get(self.config.coaddName + "Coadd_filterLabel", immediate=True).bandLabel
130  catalogDict = {}
131  for dataset in self.inputDatasets:
132  catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
133  self.log.info("Read %d sources from %s for band %s: %s" %
134  (len(catalog), dataset, band, patchRef.dataId))
135  catalogDict[dataset] = catalog
136  return band, catalogDict
137 
138  def run(self, catalogs, tract, patch):
139  """Merge multiple catalogs.
140 
141  Parameters
142  ----------
143  catalogs : `dict`
144  Mapping from filter names to dict of catalogs.
145  tract : int
146  tractId to use for the tractId column
147  patch : str
148  patchId to use for the patchId column
149 
150  Returns
151  -------
152  catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
153  Merged dataframe, with each column prefixed by
154  `filter_tag(filt)`, wrapped in the parquet writer shim class.
155  """
156 
157  dfs = []
158  for filt, tableDict in catalogs.items():
159  for dataset, table in tableDict.items():
160  # Convert afwTable to pandas DataFrame
161  df = table.asAstropy().to_pandas().set_index('id', drop=True)
162 
163  # Sort columns by name, to ensure matching schema among patches
164  df = df.reindex(sorted(df.columns), axis=1)
165  df['tractId'] = tract
166  df['patchId'] = patch
167 
168  # Make columns a 3-level MultiIndex
169  df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
170  names=('dataset', 'band', 'column'))
171  dfs.append(df)
172 
173  catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
174  return ParquetTable(dataFrame=catalog)
175 
176  def write(self, patchRef, catalog):
177  """Write the output.
178 
179  Parameters
180  ----------
181  catalog : `ParquetTable`
182  Catalog to write
183  patchRef : `lsst.daf.persistence.ButlerDataRef`
184  Data reference for patch
185  """
186  patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
187  # since the filter isn't actually part of the data ID for the dataset we're saving,
188  # it's confusing to see it in the log message, even if the butler simply ignores it.
189  mergeDataId = patchRef.dataId.copy()
190  del mergeDataId["filter"]
191  self.log.info("Wrote merged catalog: %s" % (mergeDataId,))
192 
193  def writeMetadata(self, dataRefList):
194  """No metadata to write, and not sure how to write it for a list of dataRefs.
195  """
196  pass
197 
198 
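# Illustrative sketch only (not part of the original module): the shape of the merge
# performed in WriteObjectTableTask.run. Per-dataset, per-band DataFrames are given a
# three-level ('dataset', 'band', 'column') column index and joined on the object id.
# The datasets, bands, column, and flux values below are made up.
def _exampleObjectTableMultiIndex():
    dfs = []
    for dataset in ('meas', 'forced_src'):
        for band in ('g', 'r'):
            df = pd.DataFrame({'base_PsfFlux_instFlux': [1.0, 2.0]},
                              index=pd.Index([10, 11], name='id'))
            df.columns = pd.MultiIndex.from_tuples(
                [(dataset, band, c) for c in df.columns],
                names=('dataset', 'band', 'column'))
            dfs.append(df)
    # Same reduce-with-join pattern as in run(); the result has one row per object id.
    return functools.reduce(lambda d1, d2: d1.join(d2), dfs)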
199 class WriteSourceTableConfig(pexConfig.Config):
200  doApplyExternalPhotoCalib = pexConfig.Field(
201  dtype=bool,
202  default=False,
203  doc=("Add local photoCalib columns from the calexp.photoCalib? Should only set True if "
204  "generating Source Tables from older src tables which do not already have local calib columns")
205  )
206  doApplyExternalSkyWcs = pexConfig.Field(
207  dtype=bool,
208  default=False,
209  doc=("Add local WCS columns from the calexp.wcs? Should only set True if "
210  "generating Source Tables from older src tables which do not already have local calib columns")
211  )
212 
213 
214 class WriteSourceTableTask(CmdLineTask):
215  """Write source table to parquet
216  """
217  _DefaultName = "writeSourceTable"
218  ConfigClass = WriteSourceTableConfig
219 
220  def runDataRef(self, dataRef):
221  src = dataRef.get('src')
222  if self.config.doApplyExternalPhotoCalib or self.config.doApplyExternalSkyWcs:
223  src = self.addCalibColumns(src, dataRef)
224 
225  ccdVisitId = dataRef.get('ccdExposureId')
226  result = self.run(src, ccdVisitId=ccdVisitId)
227  dataRef.put(result.table, 'source')
228 
229  def run(self, catalog, ccdVisitId=None):
230  """Convert `src` catalog to parquet
231 
232  Parameters
233  ----------
234  catalog: `afwTable.SourceCatalog`
235  catalog to be converted
236  ccdVisitId: `int`
237  ccdVisitId to be added as a column
238 
239  Returns
240  -------
241  result : `lsst.pipe.base.Struct`
242  ``table``
243  `ParquetTable` version of the input catalog
244  """
245  self.log.info("Generating parquet table from src catalog")
246  df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
247  df['ccdVisitId'] = ccdVisitId
248  return pipeBase.Struct(table=ParquetTable(dataFrame=df))
249 
250  def addCalibColumns(self, catalog, dataRef):
251  """Add columns with local calibration evaluated at each centroid
252 
253  This exists for backwards compatibility with old repos, converting old src
254  catalogs (which don't have the expected local calib columns) so they can be
255  used to produce Source Tables.
256 
257  Parameters
258  ----------
259  catalog: `afwTable.SourceCatalog`
260  catalog to which calib columns will be added
261  dataRef: `lsst.daf.persistence.ButlerDataRef`
262  for fetching the calibs from disk.
263 
264  Returns
265  -------
266  newCat: `afwTable.SourceCatalog`
267  Source Catalog with requested local calib columns
268  """
269  mapper = afwTable.SchemaMapper(catalog.schema)
270  measureConfig = SingleFrameMeasurementTask.ConfigClass()
271  measureConfig.doReplaceWithNoise = False
272 
273  # Just need the WCS or the PhotoCalib attached to an exposure
274  exposure = dataRef.get('calexp_sub',
275  bbox=lsst.geom.Box2I(lsst.geom.Point2I(0, 0), lsst.geom.Point2I(0, 0)))
276 
277  mapper = afwTable.SchemaMapper(catalog.schema)
278  mapper.addMinimalSchema(catalog.schema, True)
279  schema = mapper.getOutputSchema()
280 
281  exposureIdInfo = dataRef.get("expIdInfo")
282  measureConfig.plugins.names = []
283  if self.config.doApplyExternalSkyWcs:
284  plugin = 'base_LocalWcs'
285  if plugin in schema:
286  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalSkyWcs=False")
287  else:
288  measureConfig.plugins.names.add(plugin)
289 
290  if self.config.doApplyExternalPhotoCalib:
291  plugin = 'base_LocalPhotoCalib'
292  if plugin in schema:
293  raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalPhotoCalib=False")
294  else:
295  measureConfig.plugins.names.add(plugin)
296 
297  measurement = SingleFrameMeasurementTask(config=measureConfig, schema=schema)
298  newCat = afwTable.SourceCatalog(schema)
299  newCat.extend(catalog, mapper=mapper)
300  measurement.run(measCat=newCat, exposure=exposure, exposureId=exposureIdInfo.expId)
301  return newCat
302 
303  def writeMetadata(self, dataRef):
304  """No metadata to write.
305  """
306  pass
307 
308  @classmethod
309  def _makeArgumentParser(cls):
310  parser = ArgumentParser(name=cls._DefaultName)
311  parser.add_id_argument("--id", 'src',
312  help="data ID, e.g. --id visit=12345 ccd=0")
313  return parser
314 
315 
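# Illustrative sketch only (not part of the original module): converting a minimal
# afwTable.SourceCatalog with WriteSourceTableTask.run. The ccdVisitId value is made up.
def _exampleWriteSourceTableRun():
    schema = afwTable.SourceTable.makeMinimalSchema()
    catalog = afwTable.SourceCatalog(schema)
    catalog.addNew()
    task = WriteSourceTableTask()
    result = task.run(catalog, ccdVisitId=123456)
    # result.table is a ParquetTable; every row carries the ccdVisitId column.
    return result.table.toDataFrame()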
316 class PostprocessAnalysis(object):
317  """Calculate columns from ParquetTable
318 
319  This object manages and organizes an arbitrary set of computations
320  on a catalog. The catalog is defined by a
321  `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
322  `deepCoadd_obj` dataset, and the computations are defined by a collection
323  of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently,
324  a `CompositeFunctor`).
325 
326  After the object is initialized, accessing the `.df` attribute (which
327  holds the `pandas.DataFrame` containing the results of the calculations) triggers
328  computation of said dataframe.
329 
330  One of the conveniences of using this object is the ability to define a desired common
331  filter for all functors. This enables the same functor collection to be passed to
332  several different `PostprocessAnalysis` objects without having to change the original
333  functor collection, since the `filt` keyword argument of this object triggers an
334  overwrite of the `filt` property for all functors in the collection.
335 
336  This object also allows a list of refFlags to be passed, and defines a set of default
337  refFlags that are always included even if not requested.
338 
339  If a list of `ParquetTable` objects is passed, rather than a single one, then the
340  calculations will be mapped over all the input catalogs. In principle, it should
341  be straightforward to parallelize this activity, but initial tests have failed
342  (see TODO in code comments).
343 
344  Parameters
345  ----------
346  parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
347  Source catalog(s) for computation
348 
349  functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
350  Computations to do (functors that act on `parq`).
351  If a dict, the output
352  DataFrame will have columns keyed accordingly.
353  If a list, the column keys will come from the
354  `.shortname` attribute of each functor.
355 
356  filt : `str` (optional)
357  Filter in which to calculate. If provided,
358  this will overwrite any existing `.filt` attribute
359  of the provided functors.
360 
361  flags : `list` (optional)
362  List of flags (per-band) to include in output table.
363 
364  refFlags : `list` (optional)
365  List of refFlags (only reference band) to include in output table.
366 
367 
368  """
369  _defaultRefFlags = []
370  _defaultFuncs = (('coord_ra', RAColumn()),
371  ('coord_dec', DecColumn()))
372 
373  def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
374  self.parq = parq
375  self.functors = functors
376 
377  self.filt = filt
378  self.flags = list(flags) if flags is not None else []
379  self.refFlags = list(self._defaultRefFlags)
380  if refFlags is not None:
381  self.refFlags += list(refFlags)
382 
383  self._df = None
384 
385  @property
386  def defaultFuncs(self):
387  funcs = dict(self._defaultFuncs)
388  return funcs
389 
390  @property
391  def func(self):
392  additionalFuncs = self.defaultFuncs
393  additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
394  additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})
395 
396  if isinstance(self.functors, CompositeFunctor):
397  func = self.functors
398  else:
399  func = CompositeFunctor(self.functors)
400 
401  func.funcDict.update(additionalFuncs)
402  func.filt = self.filt
403 
404  return func
405 
406  @property
407  def noDupCols(self):
408  return [name for name, func in self.func.funcDict.items() if func.noDup or func.dataset == 'ref']
409 
410  @property
411  def df(self):
412  if self._df is None:
413  self.compute()
414  return self._df
415 
416  def compute(self, dropna=False, pool=None):
417  # map over multiple parquet tables
418  if type(self.parq) in (list, tuple):
419  if pool is None:
420  dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
421  else:
422  # TODO: Figure out why this doesn't work (pyarrow pickling issues?)
423  dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
424  self._df = pd.concat(dflist)
425  else:
426  self._df = self.func(self.parq, dropna=dropna)
427 
428  return self._df
429 
430 
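# Illustrative sketch only (not part of the original module): computing a small set of
# columns from a deepCoadd_obj ParquetTable with PostprocessAnalysis. The Mag functor,
# flux column, and flag name below are assumptions, and `parq` is expected to come from
# something like butler.get('deepCoadd_obj', dataId).
def _examplePostprocessAnalysis(parq):
    from .functors import Mag
    funcs = {'psfMag': Mag('base_PsfFlux', dataset='meas')}
    analysis = PostprocessAnalysis(parq, funcs, filt='g',
                                   refFlags=['detect_isPrimary'])
    # Accessing .df triggers the computation; the result has columns
    # psfMag, coord_ra, coord_dec, and detect_isPrimary.
    return analysis.df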
431 class TransformCatalogBaseConfig(pexConfig.Config):
432  functorFile = pexConfig.Field(
433  dtype=str,
434  doc='Path to YAML file specifying functors to be computed',
435  default=None,
436  optional=True
437  )
438 
439 
440 class TransformCatalogBaseTask(CmdLineTask):
441  """Base class for transforming/standardizing a catalog
442 
443  by applying functors that convert units and apply calibrations.
444  The purpose of this task is to perform a set of computations on
445  an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
446  results to a new dataset (which needs to be declared in an `outputDataset`
447  attribute).
448 
449  The calculations to be performed are defined in a YAML file that specifies
450  a set of functors to be computed, provided as
451  a `--functorFile` config parameter. An example of such a YAML file
452  is the following:
453 
454  funcs:
455  psfMag:
456  functor: Mag
457  args:
458  - base_PsfFlux
459  filt: HSC-G
460  dataset: meas
461  cmodel_magDiff:
462  functor: MagDiff
463  args:
464  - modelfit_CModel
465  - base_PsfFlux
466  filt: HSC-G
467  gauss_magDiff:
468  functor: MagDiff
469  args:
470  - base_GaussianFlux
471  - base_PsfFlux
472  filt: HSC-G
473  count:
474  functor: Column
475  args:
476  - base_InputCount_value
477  filt: HSC-G
478  deconvolved_moments:
479  functor: DeconvolvedMoments
480  filt: HSC-G
481  dataset: forced_src
482  refFlags:
483  - calib_psfUsed
484  - merge_measurement_i
485  - merge_measurement_r
486  - merge_measurement_z
487  - merge_measurement_y
488  - merge_measurement_g
489  - base_PixelFlags_flag_inexact_psfCenter
490  - detect_isPrimary
491 
492  The names for each entry under "func" will become the names of columns in the
493  output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`.
494  Positional arguments to be passed to each functor are in the `args` list,
495  and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
496  `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.
497 
498  The "refFlags" entry is a shortcut for a set of `Column` functors taken from the
499  `'ref'` dataset, keeping the original column names.
500 
501  The "flags" entry will be expanded out per band.
502 
503  This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
504  to organize and execute the calculations.
505 
506  """
507  @property
508  def _DefaultName(self):
509  raise NotImplementedError('Subclass must define "_DefaultName" attribute')
510 
511  @property
512  def outputDataset(self):
513  raise NotImplementedError('Subclass must define "outputDataset" attribute')
514 
515  @property
516  def inputDataset(self):
517  raise NotImplementedError('Subclass must define "inputDataset" attribute')
518 
519  @property
520  def ConfigClass(self):
521  raise NotImplementedError('Subclass must define "ConfigClass" attribute')
522 
523  def runDataRef(self, dataRef):
524  parq = dataRef.get()
525  funcs = self.getFunctors()
526  df = self.run(parq, funcs=funcs, dataId=dataRef.dataId)
527  self.write(df, dataRef)
528  return df
529 
530  def run(self, parq, funcs=None, dataId=None, band=None):
531  """Do postprocessing calculations
532 
533  Takes a `ParquetTable` object and dataId,
534  returns a dataframe with results of postprocessing calculations.
535 
536  Parameters
537  ----------
538  parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
539  ParquetTable from which calculations are done.
540  funcs : `lsst.pipe.tasks.functors.Functors`
541  Functors to apply to the table's columns
542  dataId : dict, optional
543  Used to add a `patchId` column to the output dataframe.
544  band : `str`, optional
545  Filter band that is being processed.
546 
547  Returns
548  -------
549  `pandas.DataFrame`
550 
551  """
552  self.log.info("Transforming/standardizing the source table dataId: %s", dataId)
553 
554  df = self.transform(band, parq, funcs, dataId).df
555  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
556  return df
557 
558  def getFunctors(self):
559  funcs = CompositeFunctor.from_file(self.config.functorFile)
560  funcs.update(dict(PostprocessAnalysis._defaultFuncs))
561  return funcs
562 
563  def getAnalysis(self, parq, funcs=None, band=None):
564  # Avoids disk access if funcs is passed
565  if funcs is None:
566  funcs = self.getFunctors()
567  analysis = PostprocessAnalysis(parq, funcs, filt=band)
568  return analysis
569 
570  def transform(self, band, parq, funcs, dataId):
571  analysis = self.getAnalysis(parq, funcs=funcs, band=band)
572  df = analysis.df
573  if dataId is not None:
574  for key, value in dataId.items():
575  df[key] = value
576 
577  return pipeBase.Struct(
578  df=df,
579  analysis=analysis
580  )
581 
582  def write(self, df, parqRef):
583  parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)
584 
585  def writeMetadata(self, dataRef):
586  """No metadata to write.
587  """
588  pass
589 
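# Illustrative sketch only (not part of the original module): loading a functor YAML file
# like the one shown in the TransformCatalogBaseTask docstring and applying it by hand.
# The file path is a placeholder; `parq` would be a ParquetTable such as a deepCoadd_obj
# dataset retrieved from the butler.
def _exampleApplyFunctorFile(parq):
    funcs = CompositeFunctor.from_file('functors.yaml')
    # getFunctors() adds the same defaults (coord_ra, coord_dec) before running.
    funcs.update(dict(PostprocessAnalysis._defaultFuncs))
    return funcs(parq)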
590 
591 class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
592  coaddName = pexConfig.Field(
593  dtype=str,
594  default="deep",
595  doc="Name of coadd"
596  )
597  # TODO: remove in DM-27177
598  filterMap = pexConfig.DictField(
599  keytype=str,
600  itemtype=str,
601  default={},
602  doc=("Dictionary mapping full filter name to short one for column name munging. "
603  "These filters determine the output columns no matter what filters the "
604  "input data actually contain."),
605  deprecated=("Coadds are now identified by the band, so this transform is unused. "
606  "Will be removed after v22.")
607  )
608  outputBands = pexConfig.ListField(
609  dtype=str,
610  default=None,
611  optional=True,
612  doc=("These bands and only these bands will appear in the output,"
613  " NaN-filled if the input does not include them."
614  " If None, then use all bands found in the input.")
615  )
616  camelCase = pexConfig.Field(
617  dtype=bool,
618  default=True,
619  doc=("Write per-band column names with camelCase, else underscore. "
620  "For example: gPsFlux instead of g_PsFlux.")
621  )
622  multilevelOutput = pexConfig.Field(
623  dtype=bool,
624  default=False,
625  doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
626  "and name-munged (False).")
627  )
628 
629 
630 class TransformObjectCatalogTask(TransformCatalogBaseTask):
631  """Produce a flattened Object Table to match the format specified in
632  sdm_schemas.
633 
634  Do the same set of postprocessing calculations on all bands
635 
636  This is identical to `TransformCatalogBaseTask`, except that it does the
637  specified functor calculations for all filters present in the
638  input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
639  by the YAML file will be superseded.
640  """
641  _DefaultName = "transformObjectCatalog"
642  ConfigClass = TransformObjectCatalogConfig
643 
644  inputDataset = 'deepCoadd_obj'
645  outputDataset = 'objectTable'
646 
647  @classmethod
648  def _makeArgumentParser(cls):
649  parser = ArgumentParser(name=cls._DefaultName)
650  parser.add_id_argument("--id", cls.inputDataset,
651  ContainerClass=CoaddDataIdContainer,
652  help="data ID, e.g. --id tract=12345 patch=1,2")
653  return parser
654 
655  def run(self, parq, funcs=None, dataId=None, band=None):
656  # NOTE: band kwarg is ignored here.
657  dfDict = {}
658  analysisDict = {}
659  templateDf = pd.DataFrame()
660  outputBands = parq.columnLevelNames['band'] if self.config.outputBands is None else \
661  self.config.outputBands
662 
663  # Perform transform for data of filters that exist in parq.
664  for inputBand in parq.columnLevelNames['band']:
665  if inputBand not in outputBands:
666  self.log.info("Ignoring %s band data in the input", inputBand)
667  continue
668  self.log.info("Transforming the catalog of band %s", inputBand)
669  result = self.transform(inputBand, parq, funcs, dataId)
670  dfDict[inputBand] = result.df
671  analysisDict[inputBand] = result.analysis
672  if templateDf.empty:
673  templateDf = result.df
674 
675  # Fill NaNs in columns of other wanted bands
676  for filt in outputBands:
677  if filt not in dfDict:
678  self.log.info("Adding empty columns for band %s", filt)
679  dfDict[filt] = pd.DataFrame().reindex_like(templateDf)
680 
681  # This makes a multilevel column index, with band as first level
682  df = pd.concat(dfDict, axis=1, names=['band', 'column'])
683 
684  if not self.config.multilevelOutput:
685  noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
686  if dataId is not None:
687  noDupCols += list(dataId.keys())
688  df = flattenFilters(df, noDupCols=noDupCols, camelCase=self.config.camelCase)
689 
690  self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
691  return df
692 
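# Illustrative sketch only (not part of the original module): how run() fills bands that
# are requested in config.outputBands but absent from the input, using a NaN-filled copy
# shaped like an already-transformed band. The band names, column, and values are made up.
def _exampleEmptyBandFill():
    templateDf = pd.DataFrame({'psfMag': [20.5, 21.0]},
                              index=pd.Index([10, 11], name='id'))
    dfDict = {'g': templateDf}
    for band in ('g', 'r'):  # pretend config.outputBands = ['g', 'r'] and input had only g
        if band not in dfDict:
            dfDict[band] = pd.DataFrame().reindex_like(templateDf)
    # Same concat as run(): two-level ('band', 'column') columns, r columns all NaN.
    return pd.concat(dfDict, axis=1, names=['band', 'column'])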
693 
694 class TractObjectDataIdContainer(CoaddDataIdContainer):
695 
696  def makeDataRefList(self, namespace):
697  """Make self.refList from self.idList
698 
699  Generate a list of data references given tract and/or patch.
700  This was adapted from `TractQADataIdContainer`, which was
701  `TractDataIdContainer` modified to not require "filter".
702  Only existing dataRefs are returned.
703  """
704  def getPatchRefList(tract):
705  return [namespace.butler.dataRef(datasetType=self.datasetType,
706  tract=tract.getId(),
707  patch="%d,%d" % patch.getIndex()) for patch in tract]
708 
709  tractRefs = defaultdict(list) # Data references for each tract
710  for dataId in self.idList:
711  skymap = self.getSkymap(namespace)
712 
713  if "tract" in dataId:
714  tractId = dataId["tract"]
715  if "patch" in dataId:
716  tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
717  tract=tractId,
718  patch=dataId['patch']))
719  else:
720  tractRefs[tractId] += getPatchRefList(skymap[tractId])
721  else:
722  tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
723  for tract in skymap)
724  outputRefList = []
725  for tractRefList in tractRefs.values():
726  existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
727  outputRefList.append(existingRefs)
728 
729  self.refList = outputRefList
730 
731 
732 class ConsolidateObjectTableConfig(pexConfig.Config):
733  coaddName = pexConfig.Field(
734  dtype=str,
735  default="deep",
736  doc="Name of coadd"
737  )
738 
739 
740 class ConsolidateObjectTableTask(CmdLineTask):
741  """Write patch-merged source tables to a tract-level parquet file
742  """
743  _DefaultName = "consolidateObjectTable"
744  ConfigClass = ConsolidateObjectTableConfig
745 
746  inputDataset = 'objectTable'
747  outputDataset = 'objectTable_tract'
748 
749  @classmethod
750  def _makeArgumentParser(cls):
751  parser = ArgumentParser(name=cls._DefaultName)
752 
753  parser.add_id_argument("--id", cls.inputDataset,
754  help="data ID, e.g. --id tract=12345",
755  ContainerClass=TractObjectDataIdContainer)
756  return parser
757 
758  def runDataRef(self, patchRefList):
759  df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
760  patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
761 
762  def writeMetadata(self, dataRef):
763  """No metadata to write.
764  """
765  pass
766 
767 
768 class TransformSourceTableConfig(TransformCatalogBaseConfig):
769  pass
770 
771 
772 class TransformSourceTableTask(TransformCatalogBaseTask):
773  """Transform/standardize a source catalog
774  """
775  _DefaultName = "transformSourceTable"
776  ConfigClass = TransformSourceTableConfig
777 
778  inputDataset = 'source'
779  outputDataset = 'sourceTable'
780 
781  def writeMetadata(self, dataRef):
782  """No metadata to write.
783  """
784  pass
785 
786  @classmethod
787  def _makeArgumentParser(cls):
788  parser = ArgumentParser(name=cls._DefaultName)
789  parser.add_id_argument("--id", datasetType=cls.inputDataset,
790  level="sensor",
791  help="data ID, e.g. --id visit=12345 ccd=0")
792  return parser
793 
794  def runDataRef(self, dataRef):
795  """Override to specify band label to run()."""
796  parq = dataRef.get()
797  funcs = self.getFunctors()
798  band = dataRef.get("calexp_filterLabel", immediate=True).bandLabel
799  df = self.run(parq, funcs=funcs, dataId=dataRef.dataId, band=band)
800  self.write(df, dataRef)
801  return df
802 
803 
804 class ConsolidateVisitSummaryConnections(pipeBase.PipelineTaskConnections,
805  dimensions=("instrument", "visit",),
806  defaultTemplates={}):
807  calexp = connectionTypes.Input(
808  doc="Processed exposures used for metadata",
809  name="calexp",
810  storageClass="ExposureF",
811  dimensions=("instrument", "visit", "detector"),
812  deferLoad=True,
813  multiple=True,
814  )
815  visitSummary = connectionTypes.Output(
816  doc="Consolidated visit-level exposure metadata",
817  name="visitSummary",
818  storageClass="ExposureCatalog",
819  dimensions=("instrument", "visit"),
820  )
821 
822 
823 class ConsolidateVisitSummaryConfig(pipeBase.PipelineTaskConfig,
824  pipelineConnections=ConsolidateVisitSummaryConnections):
825  """Config for ConsolidateVisitSummaryTask"""
826  pass
827 
828 
829 class ConsolidateVisitSummaryTask(pipeBase.PipelineTask, pipeBase.CmdLineTask):
830  """Task to consolidate per-detector visit metadata.
831 
832  This task aggregates the following metadata from all the detectors in a
833  single visit into an exposure catalog:
834  - The visitInfo.
835  - The wcs.
836  - The photoCalib.
837  - The physical_filter and band (if available).
838  - The psf size, shape, and effective area at the center of the detector.
839  - The corners of the bounding box in right ascension/declination.
840 
841  Other quantities such as Psf, ApCorrMap, and TransmissionCurve are not
842  persisted here because of storage concerns, and because of their limited
843  utility as summary statistics.
844 
845  Tests for this task are performed in ci_hsc_gen3.
846  """
847  _DefaultName = "consolidateVisitSummary"
848  ConfigClass = ConsolidateVisitSummaryConfig
849 
850  @classmethod
851  def _makeArgumentParser(cls):
852  parser = ArgumentParser(name=cls._DefaultName)
853 
854  parser.add_id_argument("--id", "calexp",
855  help="data ID, e.g. --id visit=12345",
856  ContainerClass=VisitDataIdContainer)
857  return parser
858 
859  def writeMetadata(self, dataRef):
860  """No metadata to persist, so override to remove metadata persistence.
861  """
862  pass
863 
864  def writeConfig(self, butler, clobber=False, doBackup=True):
865  """No config to persist, so override to remove config persistence.
866  """
867  pass
868 
869  def runDataRef(self, dataRefList):
870  visit = dataRefList[0].dataId['visit']
871 
872  self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
873  (len(dataRefList), visit))
874 
875  expCatalog = self._combineExposureMetadata(visit, dataRefList, isGen3=False)
876 
877  dataRefList[0].put(expCatalog, 'visitSummary', visit=visit)
878 
879  def runQuantum(self, butlerQC, inputRefs, outputRefs):
880  dataRefs = butlerQC.get(inputRefs.calexp)
881  visit = dataRefs[0].dataId.byName()['visit']
882 
883  self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
884  (len(dataRefs), visit))
885 
886  expCatalog = self._combineExposureMetadata(visit, dataRefs)
887 
888  butlerQC.put(expCatalog, outputRefs.visitSummary)
889 
890  def _combineExposureMetadata(self, visit, dataRefs, isGen3=True):
891  """Make a combined exposure catalog from a list of dataRefs.
892 
893  Parameters
894  ----------
895  visit : `int`
896  Visit identification number
897  dataRefs : `list`
898  List of calexp dataRefs in visit. May be list of
899  `lsst.daf.persistence.ButlerDataRef` (Gen2) or
900  `lsst.daf.butler.DeferredDatasetHandle` (Gen3).
901  isGen3 : `bool`, optional
902  Specifies if this is a Gen3 list of datarefs.
903 
904  Returns
905  -------
906  visitSummary : `lsst.afw.table.ExposureCatalog`
907  Exposure catalog with per-detector summary information.
908  """
909  schema = afwTable.ExposureTable.makeMinimalSchema()
910  schema.addField('visit', type='I', doc='Visit number')
911  schema.addField('detector_id', type='I', doc='Detector number')
912  schema.addField('physical_filter', type='String', size=32, doc='Physical filter')
913  schema.addField('band', type='String', size=32, doc='Name of band')
914  schema.addField('psfSigma', type='F',
915  doc='PSF model second-moments determinant radius (center of chip) (pixel)')
916  schema.addField('psfArea', type='F',
917  doc='PSF model effective area (center of chip) (pixel**2)')
918  schema.addField('psfIxx', type='F',
919  doc='PSF model Ixx (center of chip) (pixel**2)')
920  schema.addField('psfIyy', type='F',
921  doc='PSF model Iyy (center of chip) (pixel**2)')
922  schema.addField('psfIxy', type='F',
923  doc='PSF model Ixy (center of chip) (pixel**2)')
924  schema.addField('raCorners', type='ArrayD', size=4,
925  doc='Right Ascension of bounding box corners (degrees)')
926  schema.addField('decCorners', type='ArrayD', size=4,
927  doc='Declination of bounding box corners (degrees)')
928 
929  cat = afwTable.ExposureCatalog(schema)
930  cat.resize(len(dataRefs))
931 
932  cat['visit'] = visit
933 
934  for i, dataRef in enumerate(dataRefs):
935  if isGen3:
936  visitInfo = dataRef.get(component='visitInfo')
937  filterLabel = dataRef.get(component='filterLabel')
938  psf = dataRef.get(component='psf')
939  wcs = dataRef.get(component='wcs')
940  photoCalib = dataRef.get(component='photoCalib')
941  detector = dataRef.get(component='detector')
942  bbox = dataRef.get(component='bbox')
943  validPolygon = dataRef.get(component='validPolygon')
944  else:
945  # Note that we need to read the calexp because there is
946  # no magic access to the psf except through the exposure.
947  gen2_read_bbox = lsst.geom.BoxI(lsst.geom.PointI(0, 0), lsst.geom.PointI(1, 1))
948  exp = dataRef.get(datasetType='calexp_sub', bbox=gen2_read_bbox)
949  visitInfo = exp.getInfo().getVisitInfo()
950  filterLabel = dataRef.get("calexp_filterLabel")
951  psf = exp.getPsf()
952  wcs = exp.getWcs()
953  photoCalib = exp.getPhotoCalib()
954  detector = exp.getDetector()
955  bbox = dataRef.get(datasetType='calexp_bbox')
956  validPolygon = exp.getInfo().getValidPolygon()
957 
958  rec = cat[i]
959  rec.setBBox(bbox)
960  rec.setVisitInfo(visitInfo)
961  rec.setWcs(wcs)
962  rec.setPhotoCalib(photoCalib)
963  rec.setDetector(detector)
964  rec.setValidPolygon(validPolygon)
965 
966  rec['physical_filter'] = filterLabel.physicalLabel if filterLabel.hasPhysicalLabel() else ""
967  rec['band'] = filterLabel.bandLabel if filterLabel.hasBandLabel() else ""
968  rec['detector_id'] = detector.getId()
969  shape = psf.computeShape(bbox.getCenter())
970  rec['psfSigma'] = shape.getDeterminantRadius()
971  rec['psfIxx'] = shape.getIxx()
972  rec['psfIyy'] = shape.getIyy()
973  rec['psfIxy'] = shape.getIxy()
974  im = psf.computeKernelImage(bbox.getCenter())
975  # The calculation of effective psf area is taken from
976  # meas_base/src/PsfFlux.cc#L112. See
977  # https://github.com/lsst/meas_base/blob/
978  # 750bffe6620e565bda731add1509507f5c40c8bb/src/PsfFlux.cc#L112
979  rec['psfArea'] = np.sum(im.array)/np.sum(im.array**2.)
980 
981  sph_pts = wcs.pixelToSky(lsst.geom.Box2D(bbox).getCorners())
982  rec['raCorners'][:] = [sph.getRa().asDegrees() for sph in sph_pts]
983  rec['decCorners'][:] = [sph.getDec().asDegrees() for sph in sph_pts]
984 
985  return cat
986 
987 
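# Illustrative numeric check only (not part of the original module): the psfArea
# statistic computed above is sum(im) / sum(im**2), the noise-equivalent area of the
# normalized PSF kernel image. For a Gaussian PSF this comes out close to
# 4*pi*sigma**2 (about 50.3 pixel**2 for sigma=2).
def _examplePsfEffectiveArea(sigma=2.0, size=51):
    half = size // 2
    y, x = np.mgrid[-half:half + 1, -half:half + 1]
    im = np.exp(-(x**2 + y**2)/(2.0*sigma**2))
    im /= im.sum()  # normalize like a PSF kernel image
    return np.sum(im)/np.sum(im**2)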
988 class VisitDataIdContainer(DataIdContainer):
989  """DataIdContainer that groups sensor-level id's by visit
990  """
991 
992  def makeDataRefList(self, namespace):
993  """Make self.refList from self.idList
994 
995  Generate a list of data references grouped by visit.
996 
997  Parameters
998  ----------
999  namespace : `argparse.Namespace`
1000  Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments
1001  """
1002  # Group by visits
1003  visitRefs = defaultdict(list)
1004  for dataId in self.idList:
1005  if "visit" in dataId:
1006  visitId = dataId["visit"]
1007  # append all dataRefs from this subset to this visit's list
1008  subset = namespace.butler.subset(self.datasetType, dataId=dataId)
1009  visitRefs[visitId].extend([dataRef for dataRef in subset])
1010 
1011  outputRefList = []
1012  for refList in visitRefs.values():
1013  existingRefs = [ref for ref in refList if ref.datasetExists()]
1014  if existingRefs:
1015  outputRefList.append(existingRefs)
1016 
1017  self.refList = outputRefList
1018 
1019 
1020 class ConsolidateSourceTableConfig(pexConfig.Config):
1021  pass
1022 
1023 
1024 class ConsolidateSourceTableTask(CmdLineTask):
1025  """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`
1026  """
1027  _DefaultName = 'consolidateSourceTable'
1028  ConfigClass = ConsolidateSourceTableConfig
1029 
1030  inputDataset = 'sourceTable'
1031  outputDataset = 'sourceTable_visit'
1032 
1033  def runDataRef(self, dataRefList):
1034  self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
1035  df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
1036  dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
1037 
1038  @classmethod
1039  def _makeArgumentParser(cls):
1040  parser = ArgumentParser(name=cls._DefaultName)
1041 
1042  parser.add_id_argument("--id", cls.inputDataset,
1043  help="data ID, e.g. --id visit=12345",
1044  ContainerClass=VisitDataIdContainer)
1045  return parser
1046 
1047  def writeMetadata(self, dataRef):
1048  """No metadata to write.
1049  """
1050  pass
1051 
1052  def writeConfig(self, butler, clobber=False, doBackup=True):
1053  """No config to write.
1054  """
1055  pass