import functools
import pandas as pd
from collections import defaultdict

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer

from .parquetTable import ParquetTable
from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
from .functors import CompositeFunctor, RAColumn, DecColumn, Column


def flattenFilters(df, filterDict, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
    """Flatten a dataframe with a multilevel column index.
    """
    newDf = pd.DataFrame()
    for filt, filtShort in filterDict.items():
        subdf = df[filt]
        columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
        newColumns = {c: columnFormat.format(filtShort, c)
                      for c in subdf.columns if c not in noDupCols}
        cols = list(newColumns.keys())
        newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)

    newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
    return newDf
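

# A minimal usage sketch (illustrative only; the filter and column names below
# are assumptions, not taken from a real repository):
#
#     bands = {'HSC-G': 'g', 'HSC-R': 'r'}
#     df = pd.concat({b: pd.DataFrame({'PsfFlux': [1.0], 'coord_ra': [10.0],
#                                      'coord_dec': [20.0]}) for b in bands},
#                    axis=1, names=['filter', 'column'])
#     flat = flattenFilters(df, bands)
#     # flat columns: coord_ra, coord_dec, g_PsfFlux, r_PsfFlux
#     # (gPsfFlux, rPsfFlux with camelCase=True)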


class WriteObjectTableConfig(pexConfig.Config):
    priorityList = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Priority-ordered list of bands for the merge."
    )
    engine = pexConfig.Field(
        dtype=str,
        default="pyarrow",
        doc="Parquet engine for writing (pyarrow or fastparquet)"
    )
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if len(self.priorityList) == 0:
            raise RuntimeError("No priority list provided")
76 """Write filter-merged source tables to parquet
78 _DefaultName =
"writeObjectTable"
79 ConfigClass = WriteObjectTableConfig
80 RunnerClass = MergeSourcesRunner
83 inputDatasets = (
'forced_src',
'meas',
'ref')
88 def __init__(self, butler=None, schema=None, **kwargs):
92 CmdLineTask.__init__(self, **kwargs)

    def runDataRef(self, patchRefList):
        """
        @brief Merge coadd sources from multiple bands. Calls @ref `run` which
        must be defined in subclasses that inherit from MergeSourcesTask.

        @param[in] patchRefList list of data references for each filter
        """
        catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
        dataId = patchRefList[0].dataId
        mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
        self.write(patchRefList[0], mergedCatalog)

    @classmethod
    def _makeArgumentParser(cls):
        """Create a suitable ArgumentParser.

        We will use the ArgumentParser to get a list of data
        references for patches; the RunnerClass will sort them into lists
        of data references for the same patch.

        References first of self.inputDatasets, rather than
        self.inputDataset
        """
        return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])
119 """Read input catalogs
121 Read all the input datasets given by the 'inputDatasets'
126 patchRef : `lsst.daf.persistence.ButlerDataRef`
127 Data reference for patch
131 Tuple consisting of filter name and a dict of catalogs, keyed by
134 filterName = patchRef.dataId[
"filter"]
137 catalog = patchRef.get(self.config.coaddName +
"Coadd_" + dataset, immediate=
True)
138 self.log.info(
"Read %d sources from %s for filter %s: %s" %
139 (len(catalog), dataset, filterName, patchRef.dataId))
140 catalogDict[dataset] = catalog
141 return filterName, catalogDict

    def run(self, catalogs, tract, patch):
        """Merge multiple catalogs.

        Parameters
        ----------
        catalogs : `dict`
            Mapping from filter names to dict of catalogs.
        tract : `int`
            tractId to use for the tractId column
        patch : `str`
            patchId to use for the patchId column

        Returns
        -------
        catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
            Merged dataframe, with each column prefixed by
            `filter_tag(filt)`, wrapped in the parquet writer shim class.
        """
        dfs = []
        for filt, tableDict in catalogs.items():
            for dataset, table in tableDict.items():
                # Convert afw table to pandas DataFrame, indexed by source id
                df = table.asAstropy().to_pandas().set_index('id', drop=True)

                # Sort columns by name, to ensure matching schema among patches
                df = df.reindex(sorted(df.columns), axis=1)
                df['tractId'] = tract
                df['patchId'] = patch

                # Make columns a 3-level MultiIndex
                df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                       names=('dataset', 'filter', 'column'))
                dfs.append(df)

        catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
        return ParquetTable(dataFrame=catalog)
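
    # A sketch of how the merged table can be inspected afterwards (column
    # names and data ID values are illustrative, not from a real repository):
    #
    #     merged = task.run(catalogs, tract=9813, patch='4,4')
    #     merged.toDataFrame()[('meas', 'HSC-G', 'base_PsfFlux_instFlux')]
    #
    # where `catalogs` maps filter name to a dict of the 'forced_src', 'meas'
    # and 'ref' SourceCatalogs for that filter.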

    def write(self, patchRef, catalog):
        """Write the output.

        Parameters
        ----------
        catalog : `ParquetTable`
            Catalog to write
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch
        """
        patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
        # Since the filter isn't actually part of the data ID for the dataset
        # we're writing, remove it from the data ID used in the log message.
        mergeDataId = patchRef.dataId.copy()
        del mergeDataId["filter"]
        self.log.info("Wrote merged catalog: %s" % (mergeDataId,))
199 """No metadata to write, and not sure how to write it for a list of dataRefs.


class WriteSourceTableConfig(pexConfig.Config):
    pass


class WriteSourceTableTask(CmdLineTask):
    """Write source table to parquet
    """
    _DefaultName = "writeSourceTable"
    ConfigClass = WriteSourceTableConfig

    def runDataRef(self, dataRef):
        src = dataRef.get('src')
        ccdVisitId = dataRef.get('ccdExposureId')
        result = self.run(src, ccdVisitId=ccdVisitId)
        dataRef.put(result.table, 'source')

    def run(self, catalog, ccdVisitId=None):
        """Convert `src` catalog to parquet

        Parameters
        ----------
        catalog : `lsst.afw.table.SourceCatalog`
            catalog to be converted
        ccdVisitId : `int`
            ccdVisitId to be added as a column

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            ``table``
                `ParquetTable` version of the input catalog
        """
        self.log.info("Generating parquet table from src catalog")
        df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
        df['ccdVisitId'] = ccdVisitId
        return pipeBase.Struct(table=ParquetTable(dataFrame=df))
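
    # A minimal sketch of calling `run` outside the command-line driver
    # (illustrative; `srcCat` is assumed to be an `lsst.afw.table.SourceCatalog`
    # obtained elsewhere, e.g. via `butler.get('src', dataId)`, and the
    # ccdVisitId value is made up):
    #
    #     task = WriteSourceTableTask()
    #     result = task.run(srcCat, ccdVisitId=123456)
    #     df = result.table.toDataFrame()   # pandas view, indexed by source id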
242 """No metadata to write.
247 """No config to write.

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", 'src',
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser
260 """Calculate columns from ParquetTable
262 This object manages and organizes an arbitrary set of computations
263 on a catalog. The catalog is defined by a
264 `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
265 `deepCoadd_obj` dataset, and the computations are defined by a collection
266 of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently,
267 a `CompositeFunctor`).
269 After the object is initialized, accessing the `.df` attribute (which
270 holds the `pandas.DataFrame` containing the results of the calculations) triggers
271 computation of said dataframe.
273 One of the conveniences of using this object is the ability to define a desired common
274 filter for all functors. This enables the same functor collection to be passed to
275 several different `PostprocessAnalysis` objects without having to change the original
276 functor collection, since the `filt` keyword argument of this object triggers an
277 overwrite of the `filt` property for all functors in the collection.
279 This object also allows a list of refFlags to be passed, and defines a set of default
280 refFlags that are always included even if not requested.
282 If a list of `ParquetTable` object is passed, rather than a single one, then the
283 calculations will be mapped over all the input catalogs. In principle, it should
284 be straightforward to parallelize this activity, but initial tests have failed
285 (see TODO in code comments).
289 parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
290 Source catalog(s) for computation
292 functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
293 Computations to do (functors that act on `parq`).
294 If a dict, the output
295 DataFrame will have columns keyed accordingly.
296 If a list, the column keys will come from the
297 `.shortname` attribute of each functor.
299 filt : `str` (optional)
300 Filter in which to calculate. If provided,
301 this will overwrite any existing `.filt` attribute
302 of the provided functors.
304 flags : `list` (optional)
305 List of flags (per-band) to include in output table.
307 refFlags : `list` (optional)
308 List of refFlags (only reference band) to include in output table.
    _defaultRefFlags = []
    _defaultFuncs = (('coord_ra', RAColumn()),
                     ('coord_dec', DecColumn()))

    def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
        self.parq = parq
        self.functors = functors

        self.filt = filt
        self.flags = list(flags) if flags is not None else []
        self.refFlags = list(self._defaultRefFlags)
        if refFlags is not None:
            self.refFlags += list(refFlags)

        self._df = None

    @property
    def func(self):
        additionalFuncs = dict(self._defaultFuncs)
        additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
        additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})

        if isinstance(self.functors, CompositeFunctor):
            func = self.functors
        else:
            func = CompositeFunctor(self.functors)

        func.funcDict.update(additionalFuncs)
        func.filt = self.filt

        return func

    @property
    def noDupCols(self):
        return [name for name, func in self.func.funcDict.items()
                if func.noDup or func.dataset == 'ref']

    @property
    def df(self):
        if self._df is None:
            self.compute()
        return self._df

    def compute(self, dropna=False, pool=None):
        # Map over multiple parquet tables if a list/tuple was provided
        if type(self.parq) in (list, tuple):
            if pool is None:
                dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
            else:
                # TODO: Figure out why this fails (see class docstring)
                dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
            self._df = pd.concat(dflist)
        else:
            self._df = self.func(self.parq, dropna=dropna)

        return self._df
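
    # A minimal usage sketch (illustrative; the column name and flag below are
    # assumptions and may not exist in a given catalog):
    #
    #     funcs = {'psfFlux': Column('base_PsfFlux_instFlux', dataset='meas')}
    #     analysis = PostprocessAnalysis(parq, funcs, filt='HSC-G',
    #                                    refFlags=['detect_isPrimary'])
    #     df = analysis.df   # accessing .df triggers the computation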


class TransformCatalogBaseConfig(pexConfig.Config):
    functorFile = pexConfig.Field(
        dtype=str,
        doc='Path to YAML file specifying functors to be computed',
        default=None,
        optional=True
    )
384 """Base class for transforming/standardizing a catalog
386 by applying functors that convert units and apply calibrations.
387 The purpose of this task is to perform a set of computations on
388 an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
389 results to a new dataset (which needs to be declared in an `outputDataset`
392 The calculations to be performed are defined in a YAML file that specifies
393 a set of functors to be computed, provided as
394 a `--functorFile` config parameter. An example of such a YAML file

        funcs:
            ...
            count:
                functor: Column
                args:
                    - base_InputCount_value
            deconvolved_moments:
                functor: DeconvolvedMoments
        refFlags:
            - merge_measurement_i
            - merge_measurement_r
            - merge_measurement_z
            - merge_measurement_y
            - merge_measurement_g
            - base_PixelFlags_flag_inexact_psfCenter

    The names for each entry under "func" will become the names of columns in the
    output dataset.  All the functors referenced are defined in `lsst.pipe.tasks.functors`.
    Positional arguments to be passed to each functor are in the `args` list,
    and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
    `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.

    The "refFlags" entry is a shortcut for a set of `Column` functors with the original
    column names, taken from the `'ref'` dataset.

    The "flags" entry will be expanded out per band.

    Note, if `'filter'` is provided as part of the `dataId` when running this task (even though
    `deepCoadd_obj` does not use `'filter'`), then this will override the `filt` kwargs
    provided in the YAML file, and the calculations will be done in that filter.

    This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
    to organize and execute the calculations.
    """
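
    # Subclasses are run like any other CmdLineTask; a sketch of a
    # command-line-style invocation (repository path, data ID and functor file
    # are illustrative assumptions):
    #
    #     TransformObjectCatalogTask.parseAndRun(args=[
    #         '/path/to/repo', '--output', '/path/to/output',
    #         '--id', 'tract=9813', 'patch=4,4',
    #         '--config', 'functorFile=/path/to/functors.yaml'])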

    @property
    def _DefaultName(self):
        raise NotImplementedError('Subclass must define "_DefaultName" attribute')

    @property
    def outputDataset(self):
        raise NotImplementedError('Subclass must define "outputDataset" attribute')

    @property
    def inputDataset(self):
        raise NotImplementedError('Subclass must define "inputDataset" attribute')

    @property
    def ConfigClass(self):
        raise NotImplementedError('Subclass must define "ConfigClass" attribute')

    def runDataRef(self, dataRef):
        parq = dataRef.get()
        funcs = self.getFunctors()
        df = self.run(parq, funcs=funcs, dataId=dataRef.dataId)
        self.write(df, dataRef)
        return df

    def run(self, parq, funcs=None, dataId=None):
        """Do postprocessing calculations

        Takes a `ParquetTable` object and dataId,
        returns a dataframe with results of postprocessing calculations.

        Parameters
        ----------
        parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
            ParquetTable from which calculations are done.
        funcs : `lsst.pipe.tasks.functors.Functors`
            Functors to apply to the table's columns
        dataId : dict, optional
            Used to add a `patchId` column to the output dataframe.

        Returns
        -------
        df : `pandas.DataFrame`
        """
        self.log.info("Transforming/standardizing the source table dataId: %s", dataId)

        filt = dataId.get('filter', None)
        df = self.transform(filt, parq, funcs, dataId).df
        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df

    def getFunctors(self):
        funcs = CompositeFunctor.from_file(self.config.functorFile)
        funcs.update(dict(PostprocessAnalysis._defaultFuncs))
        return funcs

    def getAnalysis(self, parq, funcs=None, filt=None):
        if funcs is None:
            funcs = self.getFunctors()
        analysis = PostprocessAnalysis(parq, funcs, filt=filt)
        return analysis

    def transform(self, filt, parq, funcs, dataId):
        analysis = self.getAnalysis(parq, funcs=funcs, filt=filt)
        df = analysis.df
        if dataId is not None:
            for key, value in dataId.items():
                df[key] = value

        return pipeBase.Struct(
            df=df,
            analysis=analysis
        )
532 """No metadata to write.


class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
    filterMap = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        doc=("Dictionary mapping full filter name to short one for column name munging. "
             "These filters determine the output columns no matter what filters the "
             "input data actually contain.")
    )
    camelCase = pexConfig.Field(
        dtype=bool,
        doc=("Write per-filter column names with camelCase, else underscore. "
             "For example: gPsfFlux instead of g_PsfFlux.")
    )
    multilevelOutput = pexConfig.Field(
        dtype=bool,
        doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
             "and name-munged (False).")
    )
566 """Compute Flatted Object Table as defined in the DPDD
568 Do the same set of postprocessing calculations on all bands
570 This is identical to `TransformCatalogBaseTask`, except for that it does the
571 specified functor calculations for all filters present in the
572 input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
573 by the YAML file will be superceded.
575 _DefaultName =
"transformObjectCatalog"
576 ConfigClass = TransformObjectCatalogConfig
578 inputDataset =
'deepCoadd_obj'
579 outputDataset =
'objectTable'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               ContainerClass=CoaddDataIdContainer,
                               help="data ID, e.g. --id tract=12345 patch=1,2")
        return parser

    def run(self, parq, funcs=None, dataId=None):
        dfDict = {}
        analysisDict = {}
        templateDf = pd.DataFrame()
        # Perform the transform for each filter that is present in the input
        # and listed in the filterMap
        for filt in parq.columnLevelNames['filter']:
            if filt not in self.config.filterMap:
                self.log.info("Ignoring %s data in the input", filt)
                continue
            self.log.info("Transforming the catalog of filter %s", filt)
            result = self.transform(filt, parq, funcs, dataId)
            dfDict[filt] = result.df
            analysisDict[filt] = result.analysis
            # Keep a template dataframe for filters with no input data
            templateDf = result.df

        # Put in empty (all-NaN) columns for filters in the filterMap that had no input data
        for filt in self.config.filterMap:
            if filt not in dfDict:
                self.log.info("Adding empty columns for filter %s", filt)
                dfDict[filt] = pd.DataFrame().reindex_like(templateDf)

        # This makes a multilevel column index, with filter as the first level
        df = pd.concat(dfDict, axis=1, names=['filter', 'column'])

        if not self.config.multilevelOutput:
            noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
            if dataId is not None:
                noDupCols += list(dataId.keys())
            df = flattenFilters(df, self.config.filterMap, noDupCols=noDupCols,
                                camelCase=self.config.camelCase)

        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df
629 """Make self.refList from self.idList
631 Generate a list of data references given tract and/or patch.
632 This was adapted from `TractQADataIdContainer`, which was
633 `TractDataIdContainer` modifie to not require "filter".
634 Only existing dataRefs are returned.
636 def getPatchRefList(tract):
637 return [namespace.butler.dataRef(datasetType=self.datasetType,
639 patch=
"%d,%d" % patch.getIndex())
for patch
in tract]
641 tractRefs = defaultdict(list)
642 for dataId
in self.idList:
645 if "tract" in dataId:
646 tractId = dataId[
"tract"]
647 if "patch" in dataId:
648 tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
650 patch=dataId[
'patch']))
652 tractRefs[tractId] += getPatchRefList(skymap[tractId])
654 tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
657 for tractRefList
in tractRefs.values():
658 existingRefs = [ref
for ref
in tractRefList
if ref.datasetExists()]
659 outputRefList.append(existingRefs)


class ConsolidateObjectTableConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
673 """Write patch-merged source tables to a tract-level parquet file
675 _DefaultName =
"consolidateObjectTable"
676 ConfigClass = ConsolidateObjectTableConfig
678 inputDataset =
'objectTable'
679 outputDataset =
'objectTable_tract'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id tract=12345",
                               ContainerClass=TractObjectDataIdContainer)
        return parser

    def runDataRef(self, patchRefList):
        df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
        patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)
695 """No metadata to write.


class TransformSourceTableConfig(TransformCatalogBaseConfig):
    pass


class TransformSourceTableTask(TransformCatalogBaseTask):
    """Transform/standardize a source catalog
    """
    _DefaultName = "transformSourceTable"
    ConfigClass = TransformSourceTableConfig

    inputDataset = 'source'
    outputDataset = 'sourceTable'
714 """No metadata to write.

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", datasetType=cls.inputDataset,
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser
728 """DataIdContainer that groups sensor-level id's by visit
732 """Make self.refList from self.idList
734 Generate a list of data references grouped by visit.
738 namespace : `argparse.Namespace`
739 Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments
741 def ccdDataRefList(visitId):
742 """Get all possible ccds for a given visit"""
743 ccds = namespace.butler.queryMetadata(
'src', [
'ccd'], dataId={
'visit': visitId})
744 return [namespace.butler.dataRef(datasetType=self.datasetType,
746 ccd=ccd)
for ccd
in ccds]

        visitRefs = defaultdict(list)
        for dataId in self.idList:
            if "visit" in dataId:
                visitId = dataId["visit"]
                if "ccd" in dataId:
                    visitRefs[visitId].append(namespace.butler.dataRef(datasetType=self.datasetType,
                                                                       visit=visitId, ccd=dataId['ccd']))
                else:
                    visitRefs[visitId] += ccdDataRefList(visitId)

        outputRefList = []
        for refList in visitRefs.values():
            existingRefs = [ref for ref in refList if ref.datasetExists()]
            outputRefList.append(existingRefs)

        self.refList = outputRefList


class ConsolidateSourceTableConfig(pexConfig.Config):
    pass


class ConsolidateSourceTableTask(CmdLineTask):
    """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`
    """
    _DefaultName = 'consolidateSourceTable'
    ConfigClass = ConsolidateSourceTableConfig

    inputDataset = 'sourceTable'
    outputDataset = 'sourceTable_visit'

    def runDataRef(self, dataRefList):
        self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
        df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
        dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id visit=12345",
                               ContainerClass=VisitDataIdContainer)
        return parser
793 """No metadata to write.
798 """No config to write.