import functools

import pandas as pd
from collections import defaultdict

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
from lsst.pipe.base import CmdLineTask

from .parquetTable import ParquetTable
from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
from .functors import CompositeFunctor, RAColumn, DecColumn, Column
def flattenFilters(df, filterDict, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
    """Flatten a dataframe with a multilevel column index.
    """
    newDf = pd.DataFrame()
    for filt, filtShort in filterDict.items():
        # Select the single-filter slice of the multilevel dataframe
        subdf = df[filt]
        columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
        newColumns = {c: columnFormat.format(filtShort, c)
                      for c in subdf.columns if c not in noDupCols}
        cols = list(newColumns.keys())
        newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)

    newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
    return newDf
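
# A minimal usage sketch for `flattenFilters` (the filter and column names
# below are illustrative, not taken from a real catalog):
#
#     cols = pd.MultiIndex.from_product(
#         [['HSC-G', 'HSC-R'], ['coord_ra', 'coord_dec', 'PsfFlux']],
#         names=('filter', 'column'))
#     df = pd.DataFrame([[0.0] * 6], columns=cols)
#     flat = flattenFilters(df, {'HSC-G': 'g', 'HSC-R': 'r'}, camelCase=True)
#     list(flat.columns)  # ['coord_ra', 'coord_dec', 'gPsfFlux', 'rPsfFlux']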
class WriteObjectTableConfig(pexConfig.Config):
    priorityList = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Priority-ordered list of bands for the merge."
    )
    engine = pexConfig.Field(
        dtype=str,
        default="pyarrow",
        doc="Parquet engine for writing (pyarrow or fastparquet)"
    )
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if len(self.priorityList) == 0:
            raise RuntimeError("No priority list provided")
class WriteObjectTableTask(CmdLineTask):
    """Write filter-merged source tables to parquet.
    """
    _DefaultName = "writeObjectTable"
    ConfigClass = WriteObjectTableConfig
    RunnerClass = MergeSourcesRunner

    # Names of the single-filter dataset types to be merged for each patch
    inputDatasets = ('forced_src', 'meas', 'ref')

    def __init__(self, butler=None, schema=None, **kwargs):
        # `butler` and `schema` are accepted but unused here.
        CmdLineTask.__init__(self, **kwargs)
    def runDataRef(self, patchRefList):
        """!
        @brief Merge coadd sources from multiple bands. Calls @ref run, which must be defined in
        subclasses that inherit from MergeSourcesTask.
        @param[in] patchRefList list of data references for each filter
        """
        catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
        dataId = patchRefList[0].dataId
        mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
        self.write(patchRefList[0], mergedCatalog)
    @classmethod
    def _makeArgumentParser(cls):
        """Create a suitable ArgumentParser.

        We will use the ArgumentParser to get a list of data
        references for patches; the RunnerClass will sort them into lists
        of data references for the same patch.

        References the first of ``self.inputDatasets``, rather than
        ``self.inputDataset``.
        """
        return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])
    def readCatalog(self, patchRef):
        """Read input catalogs.

        Read all the input datasets given by the 'inputDatasets'
        attribute.

        Parameters
        ----------
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch

        Returns
        -------
        Tuple consisting of filter name and a dict of catalogs, keyed by
        dataset name.
        """
        filterName = patchRef.dataId["filter"]
        catalogDict = {}
        for dataset in self.inputDatasets:
            catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
            self.log.info("Read %d sources from %s for filter %s: %s" %
                          (len(catalog), dataset, filterName, patchRef.dataId))
            catalogDict[dataset] = catalog
        return filterName, catalogDict
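
    # Illustrative shape of one readCatalog() return value (the band name is
    # hypothetical):
    #
    #     ('HSC-R', {'forced_src': <SourceCatalog>,
    #                'meas': <SourceCatalog>,
    #                'ref': <SourceCatalog>})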
    def run(self, catalogs, tract, patch):
        """Merge multiple catalogs.

        Parameters
        ----------
        catalogs : `dict`
            Mapping from filter names to dict of catalogs.
        tract : int
            tractId to use for the tractId column
        patch : str
            patchId to use for the patchId column

        Returns
        -------
        catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
            Merged dataframe, with each column prefixed by
            `filter_tag(filt)`, wrapped in the parquet writer shim class.
        """
        dfs = []
        for filt, tableDict in catalogs.items():
            for dataset, table in tableDict.items():
                # Convert afw table to pandas DataFrame, indexed by source id
                df = table.asAstropy().to_pandas().set_index('id', drop=True)

                # Sort columns by name, to ensure matching schema among patches
                df = df.reindex(sorted(df.columns), axis=1)
                df['tractId'] = tract
                df['patchId'] = patch

                # Make columns a 3-level MultiIndex
                df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                        names=('dataset', 'filter', 'column'))
                dfs.append(df)

        # Join the per-dataset, per-filter frames into one wide table
        catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
        return ParquetTable(dataFrame=catalog)
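
    # The merged catalog produced above carries a three-level column index;
    # illustrative (hypothetical) selections from the underlying dataframe:
    #
    #     catalog['meas']['HSC-R']                       # per-band measurement columns
    #     catalog[('ref', 'HSC-R', 'detect_isPrimary')]  # a single column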
    def write(self, patchRef, catalog):
        """Write the output.

        Parameters
        ----------
        catalog : `ParquetTable`
            Catalog to write
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch
        """
        patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
        # The filter is not part of the output dataset's data ID, so drop it
        # from the data ID used in the log message.
        mergeDataId = patchRef.dataId.copy()
        del mergeDataId["filter"]
        self.log.info("Wrote merged catalog: %s" % (mergeDataId,))
    def writeMetadata(self, dataRefList):
        """No metadata to write, and not sure how to write it for a list of dataRefs.
        """
        pass
class PostprocessAnalysis(object):
    """Calculate columns from a ParquetTable.

    This object manages and organizes an arbitrary set of computations
    on a catalog. The catalog is defined by a
    `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
    `deepCoadd_obj` dataset, and the computations are defined by a collection
    of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently,
    a `CompositeFunctor`).

    After the object is initialized, accessing the `.df` attribute (which
    holds the `pandas.DataFrame` containing the results of the calculations) triggers
    computation of said dataframe.

    One of the conveniences of using this object is the ability to define a desired common
    filter for all functors. This enables the same functor collection to be passed to
    several different `PostprocessAnalysis` objects without having to change the original
    functor collection, since the `filt` keyword argument of this object triggers an
    overwrite of the `filt` property for all functors in the collection.

    This object also allows a list of refFlags to be passed, and defines a set of default
    refFlags that are always included even if not requested.

    If a list of `ParquetTable` objects is passed, rather than a single one, then the
    calculations will be mapped over all the input catalogs. In principle, it should
    be straightforward to parallelize this activity, but initial tests have failed
    (see TODO in code comments).

    Parameters
    ----------
    parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
        Source catalog(s) for computation.

    functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
        Computations to do (functors that act on `parq`).
        If a dict, the output
        DataFrame will have columns keyed accordingly.
        If a list, the column keys will come from the
        `.shortname` attribute of each functor.

    filt : `str` (optional)
        Filter in which to calculate. If provided,
        this will overwrite any existing `.filt` attribute
        of the provided functors.

    flags : `list` (optional)
        List of flags (per-band) to include in output table.

    refFlags : `list` (optional)
        List of refFlags (only reference band) to include in output table.
    """
    _defaultRefFlags = ('calib_psf_used', 'detect_isPrimary')
    _defaultFuncs = (('coord_ra', RAColumn()),
                     ('coord_dec', DecColumn()))
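
    # A minimal usage sketch, given a ParquetTable `parq` (the flag and functor
    # arguments shown are illustrative):
    #
    #     funcs = {'ra': RAColumn(), 'dec': DecColumn(),
    #              'psfFlux': Column('base_PsfFlux_instFlux', dataset='meas')}
    #     analysis = PostprocessAnalysis(parq, funcs, filt='HSC-R',
    #                                    refFlags=['merge_measurement_i'])
    #     df = analysis.df  # accessing .df triggers the computation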
    def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
        self.parq = parq
        self.functors = functors

        self.filt = filt
        self.flags = list(flags) if flags is not None else []
        self.refFlags = list(self._defaultRefFlags)
        if refFlags is not None:
            self.refFlags += list(refFlags)

        self._df = None

    @property
    def func(self):
        # Start from the default functors and add a `Column` functor for each
        # requested flag: reference-band flags come from the 'ref' dataset,
        # per-band flags from the 'meas' dataset.
        additionalFuncs = dict(self._defaultFuncs)
        additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
        additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})

        if isinstance(self.functors, CompositeFunctor):
            func = self.functors
        else:
            func = CompositeFunctor(self.functors)

        func.funcDict.update(additionalFuncs)
        func.filt = self.filt
        return func
    @property
    def noDupCols(self):
        return [name for name, func in self.func.funcDict.items()
                if func.noDup or func.dataset == 'ref']
    @property
    def df(self):
        if self._df is None:
            self.compute()
        return self._df

    def compute(self, dropna=False, pool=None):
        if type(self.parq) in (list, tuple):
            if pool is None:
                dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
            else:
                # Parallel evaluation; initial tests of this have failed (see
                # the TODO note in the class docstring).
                dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
            self._df = pd.concat(dflist)
        else:
            self._df = self.func(self.parq, dropna=dropna)

        return self._df
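
    # Sketch: when `parq` is a list of tables, the functor computations are
    # mapped over each input and the per-table results are concatenated, e.g.
    #
    #     analysis = PostprocessAnalysis([parq1, parq2], funcs)
    #     analysis.df  # rows from both inputs, same columns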
class TransformCatalogBaseConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
    functorFile = pexConfig.Field(
        dtype=str,
        doc='Path to YAML file specifying functors to be computed',
        default=''
    )
class TransformCatalogBaseTask(CmdLineTask):
    """Base class for transforming/standardizing a catalog
    by applying functors that convert units and apply calibrations.

    The purpose of this task is to perform a set of computations on
    an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
    results to a new dataset (which needs to be declared in an `outputDataset`
    attribute).

    The calculations to be performed are defined in a YAML file that specifies
    a set of functors to be computed, provided as
    a `--functorFile` config parameter. An example of such a YAML file
    is the following:

        ...
            - base_InputCount_value
        ...
            functor: DeconvolvedMoments
        ...
        - merge_measurement_i
        - merge_measurement_r
        - merge_measurement_z
        - merge_measurement_y
        - merge_measurement_g
        - base_PixelFlags_flag_inexact_psfCenter

    The names for each entry under "func" will become the names of columns in the
    output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`.
    Positional arguments to be passed to each functor are in the `args` list,
    and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
    `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.

    The "refFlags" entry is a shortcut for a set of `Column` functors with the original
    column names, taken from the `'ref'` dataset.

    The "flags" entry will be expanded out per band.

    Note, if `'filter'` is provided as part of the `dataId` when running this task (even though
    `deepCoadd_obj` does not use `'filter'`), then this will override the `filt` kwargs
    provided in the YAML file, and the calculations will be done in that filter.

    This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
    to organize and execute the calculations.
    """
    @property
    def _DefaultName(self):
        raise NotImplementedError('Subclass must define "_DefaultName" attribute')

    @property
    def outputDataset(self):
        raise NotImplementedError('Subclass must define "outputDataset" attribute')

    @property
    def inputDataset(self):
        raise NotImplementedError('Subclass must define "inputDataset" attribute')

    @property
    def ConfigClass(self):
        raise NotImplementedError('Subclass must define "ConfigClass" attribute')
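
    # Sketch of how a concrete subclass is typically wired up (the names below,
    # other than the required attributes themselves, are illustrative):
    #
    #     class TransformFooCatalogTask(TransformCatalogBaseTask):
    #         _DefaultName = 'transformFooCatalog'
    #         ConfigClass = TransformCatalogBaseConfig
    #         inputDataset = 'foo'
    #         outputDataset = 'fooTable'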
    def runDataRef(self, patchRef):
        parq = patchRef.get()
        dataId = patchRef.dataId
        funcs = self.getFunctors()

        self.log.info("Transforming/standardizing the catalog of %s", dataId)
        df = self.run(parq, funcs=funcs, dataId=dataId)
        self.write(df, patchRef)
    def run(self, parq, funcs=None, dataId=None):
        """Do postprocessing calculations

        Takes a `ParquetTable` object and dataId,
        returns a dataframe with results of postprocessing calculations.

        Parameters
        ----------
        parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
            ParquetTable from which calculations are done.
        funcs : `lsst.pipe.tasks.functors.Functors`
            Functors to apply to the table's columns
        dataId : dict, optional
            Used to add a `patchId` column to the output dataframe.

        Returns
        -------
        `pandas.DataFrame`
        """
        filt = dataId.get('filter', None)
        return self.transform(filt, parq, funcs, dataId).df
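
    # Minimal usage sketch (the functor-file path and dataId values are
    # illustrative):
    #
    #     funcs = CompositeFunctor.from_file('functors.yaml')
    #     df = task.run(parq, funcs=funcs, dataId={'tract': 9813, 'patch': '4,4'})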
    def getFunctors(self):
        funcs = CompositeFunctor.from_file(self.config.functorFile)
        funcs.update(dict(PostprocessAnalysis._defaultFuncs))
        return funcs
    def transform(self, filt, parq, funcs, dataId):
        analysis = self.getAnalysis(parq, funcs=funcs, filt=filt)
        df = analysis.df
        if dataId is not None:
            # Attach the dataId entries (e.g. tract, patch) as columns
            for key, value in dataId.items():
                df[key] = value

        return pipeBase.Struct(
            df=df,
            analysis=analysis
        )
    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass
class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
    filterMap = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        doc=("Dictionary mapping full filter name to short one for column name munging. "
             "These filters determine the output columns no matter what filters the "
             "input data actually contain.")
    )
    camelCase = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Write per-filter column names with camelCase, else underscore. "
             "For example: gPsfFlux instead of g_PsfFlux.")
    )
    multilevelOutput = pexConfig.Field(
        dtype=bool,
        default=True,
        doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
             "and name-munged (False).")
    )
class TransformObjectCatalogTask(TransformCatalogBaseTask):
    """Compute Flattened Object Table as defined in the DPDD

    Do the same set of postprocessing calculations on all bands.

    This is identical to `TransformCatalogBaseTask`, except that it does the
    specified functor calculations for all filters present in the
    input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
    by the YAML file will be superseded.
    """
    _DefaultName = "transformObjectCatalog"
    ConfigClass = TransformObjectCatalogConfig

    inputDataset = 'deepCoadd_obj'
    outputDataset = 'objectTable'
    @classmethod
    def _makeArgumentParser(cls):
        parser = pipeBase.ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               ContainerClass=CoaddDataIdContainer,
                               help="data ID, e.g. --id tract=12345 patch=1,2")
        return parser
    def run(self, parq, funcs=None, dataId=None):
        dfDict = {}
        analysisDict = {}
        templateDf = pd.DataFrame()
        # Perform the calculations for each filter present in the input
        for filt in parq.columnLevelNames['filter']:
            if filt not in self.config.filterMap:
                self.log.info("Ignoring %s data in the input", filt)
                continue
            self.log.info("Transforming the catalog of filter %s", filt)
            result = self.transform(filt, parq, funcs, dataId)
            dfDict[filt] = result.df
            analysisDict[filt] = result.analysis
            if templateDf.empty:
                templateDf = result.df

        # Put filler values in columns of any wanted filter missing from the input
        for filt in self.config.filterMap:
            if filt not in dfDict:
                self.log.info("Adding empty columns for filter %s", filt)
                dfDict[filt] = pd.DataFrame().reindex_like(templateDf)

        # This makes a multilevel column index, with filter as the first level
        df = pd.concat(dfDict, axis=1, names=['filter', 'column'])

        if not self.config.multilevelOutput:
            noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
            if dataId is not None:
                noDupCols += list(dataId.keys())
            df = flattenFilters(df, self.config.filterMap, noDupCols=noDupCols,
                                camelCase=self.config.camelCase)
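
            # Illustrative effect of the flattening (band and column names are
            # hypothetical): with
            #
            #     config.filterMap = {'HSC-G': 'g', 'HSC-R': 'r'}
            #     config.camelCase = True
            #
            # the flattened columns look like
            # ['coord_ra', 'coord_dec', 'tractId', 'patchId', 'gPsfFlux', 'rPsfFlux'],
            # while multilevelOutput=True keeps the ('filter', 'column') MultiIndex.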
        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df
class TractObjectDataIdContainer(CoaddDataIdContainer):

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList

        Generate a list of data references given tract and/or patch.
        This was adapted from `TractQADataIdContainer`, which was
        `TractDataIdContainer` modified to not require "filter".
        Only existing dataRefs are returned.
        """
        def getPatchRefList(tract):
            return [namespace.butler.dataRef(datasetType=self.datasetType,
                                             tract=tract.getId(),
                                             patch="%d,%d" % patch.getIndex())
                    for patch in tract]
        tractRefs = defaultdict(list)  # Data references for each tract
        for dataId in self.idList:
            skymap = self.getSkymap(namespace)

            if "tract" in dataId:
                tractId = dataId["tract"]
                if "patch" in dataId:
                    tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
                                                                       tract=tractId,
                                                                       patch=dataId['patch']))
                else:
                    tractRefs[tractId] += getPatchRefList(skymap[tractId])
            else:
                tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
                                 for tract in skymap)
        outputRefList = []
        for tractRefList in tractRefs.values():
            existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
            outputRefList.append(existingRefs)

        self.refList = outputRefList
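
        # For example (illustrative): `--id tract=12345` expands to dataRefs
        # for every existing patch in the tract, while
        # `--id tract=12345 patch=1,2` selects a single patch.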
class ConsolidateObjectTableConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
class ConsolidateObjectTableTask(CmdLineTask):
    """Write patch-merged source tables to a tract-level parquet file.
    """
    _DefaultName = "consolidateObjectTable"
    ConfigClass = ConsolidateObjectTableConfig

    inputDataset = 'objectTable'
    outputDataset = 'objectTable_tract'
    @classmethod
    def _makeArgumentParser(cls):
        parser = pipeBase.ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id tract=12345",
                               ContainerClass=TractObjectDataIdContainer)
        return parser
    def runDataRef(self, patchRefList):
        df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
    def writeMetadata(self, dataRefList):
        """No metadata to write.
        """
        pass