import functools
import pandas as pd
from collections import defaultdict

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
from lsst.pipe.base import CmdLineTask, ArgumentParser
from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer

from .parquetTable import ParquetTable
from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
from .functors import CompositeFunctor, RAColumn, DecColumn, Column
def flattenFilters(df, filterDict, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
    """Flatten a dataframe with a multilevel column index into per-filter columns.
    """
    newDf = pd.DataFrame()
    for filt, filtShort in filterDict.items():
        subdf = df[filt]
        columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
        newColumns = {c: columnFormat.format(filtShort, c)
                      for c in subdf.columns if c not in noDupCols}
        cols = list(newColumns.keys())
        newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)

    newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
    return newDf
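
# Illustrative sketch (not part of the pipeline): how flattenFilters reshapes a
# per-filter MultiIndex dataframe.  The filter names and columns below are made up.
#
#     cols = pd.MultiIndex.from_product([['HSC-G', 'HSC-R'],
#                                         ['coord_ra', 'coord_dec', 'PsfFlux']],
#                                        names=('filter', 'column'))
#     df = pd.DataFrame([[1.0] * 6, [2.0] * 6], columns=cols)
#     flat = flattenFilters(df, {'HSC-G': 'g', 'HSC-R': 'r'})
#     # flat has columns: coord_ra, coord_dec, g_PsfFlux, r_PsfFlux
#     # (or gPsfFlux, rPsfFlux with camelCase=True)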


class WriteObjectTableConfig(pexConfig.Config):
    priorityList = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Priority-ordered list of bands for the merge."
    )
    engine = pexConfig.Field(
        dtype=str,
        default="pyarrow",
        doc="Parquet engine for writing (pyarrow or fastparquet)"
    )
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if len(self.priorityList) == 0:
            raise RuntimeError("No priority list provided")


class WriteObjectTableTask(CmdLineTask):
    """Write filter-merged source tables to parquet.
    """
    _DefaultName = "writeObjectTable"
    ConfigClass = WriteObjectTableConfig
    RunnerClass = MergeSourcesRunner

    # Names of table datasets to be merged
    inputDatasets = ('forced_src', 'meas', 'ref')

    # Tag of output dataset written by `write`
    outputDataset = 'obj'

    def __init__(self, butler=None, schema=None, **kwargs):
        # butler and schema are accepted so this task can be constructed by
        # MergeSourcesRunner; they are not used here.
        CmdLineTask.__init__(self, **kwargs)

    def runDataRef(self, patchRefList):
        """!
        @brief Merge coadd sources from multiple bands. Calls @ref `run` which must be defined in
        subclasses that inherit from MergeSourcesTask.
        @param[in] patchRefList list of data references for each filter
        """
        catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
        dataId = patchRefList[0].dataId
        mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
        self.write(patchRefList[0], mergedCatalog)

    @classmethod
    def _makeArgumentParser(cls):
        """Create a suitable ArgumentParser.

        We will use the ArgumentParser to get a list of data
        references for patches; the RunnerClass will sort them into lists
        of data references for the same patch.

        References the first of self.inputDatasets, rather than
        self.inputDataset.
        """
        return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])

    def readCatalog(self, patchRef):
        """Read input catalogs

        Read all the input datasets given by the 'inputDatasets'
        attribute.

        Parameters
        ----------
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch

        Returns
        -------
        Tuple consisting of filter name and a dict of catalogs, keyed by
        dataset name
        """
        filterName = patchRef.dataId["filter"]
        catalogDict = {}
        for dataset in self.inputDatasets:
            catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
            self.log.info("Read %d sources from %s for filter %s: %s" %
                          (len(catalog), dataset, filterName, patchRef.dataId))
            catalogDict[dataset] = catalog
        return filterName, catalogDict

    def run(self, catalogs, tract, patch):
        """Merge multiple catalogs.

        Parameters
        ----------
        catalogs : `dict`
            Mapping from filter names to dict of catalogs.
        tract : `int`
            tractId to use for the tractId column
        patch : `str`
            patchId to use for the patchId column

        Returns
        -------
        catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
            Merged dataframe, with each column prefixed by
            `filter_tag(filt)`, wrapped in the parquet writer shim class.
        """
        dfs = []
        for filt, tableDict in catalogs.items():
            for dataset, table in tableDict.items():
                # Convert afw table to pandas DataFrame, indexed by source id
                df = table.asAstropy().to_pandas().set_index('id', drop=True)

                # Sort columns by name, to ensure matching schema among patches
                df = df.reindex(sorted(df.columns), axis=1)
                df['tractId'] = tract
                df['patchId'] = patch

                # Make columns a 3-level MultiIndex
                df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                       names=('dataset', 'filter', 'column'))
                dfs.append(df)

        catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
        return ParquetTable(dataFrame=catalog)
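
    # Shape of the merged table (illustrative; the dataset/filter/column values below
    # are examples, not fixed names): the joined dataframe has a 3-level column
    # MultiIndex such as
    #     ('meas',       'HSC-G', 'base_PsfFlux_instFlux')
    #     ('forced_src', 'HSC-R', 'base_PsfFlux_instFlux')
    #     ('ref',        'HSC-G', 'detect_isPrimary')
    # so a single per-patch file carries every input dataset and filter side by side.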

    def write(self, patchRef, catalog):
        """Write the output.

        Parameters
        ----------
        catalog : `ParquetTable`
            Catalog to write
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch
        """
        patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
        # The filter is not part of the written dataset's data ID, so drop it from
        # the log message to avoid confusion.
        mergeDataId = patchRef.dataId.copy()
        del mergeDataId["filter"]
        self.log.info("Wrote merged catalog: %s" % (mergeDataId,))

    def writeMetadata(self, dataRefList):
        """No metadata to write, and not sure how to write it for a list of dataRefs.
        """
        pass


class PostprocessAnalysis(object):
    """Calculate columns from ParquetTable

    This object manages and organizes an arbitrary set of computations
    on a catalog.  The catalog is defined by a
    `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
    `deepCoadd_obj` dataset, and the computations are defined by a collection
    of `lsst.pipe.tasks.functors.Functor` objects (or, equivalently,
    a `CompositeFunctor`).

    After the object is initialized, accessing the `.df` attribute (which
    holds the `pandas.DataFrame` containing the results of the calculations) triggers
    computation of said dataframe.

    One of the conveniences of using this object is the ability to define a desired common
    filter for all functors.  This enables the same functor collection to be passed to
    several different `PostprocessAnalysis` objects without having to change the original
    functor collection, since the `filt` keyword argument of this object triggers an
    overwrite of the `filt` property for all functors in the collection.

    This object also allows a list of flags to be passed, and defines a set of default
    flags that are always included even if not requested.

    If a list of `ParquetTable` objects is passed, rather than a single one, then the
    calculations will be mapped over all the input catalogs.  In principle, it should
    be straightforward to parallelize this activity, but initial tests have failed
    (see TODO in code comments).

    Parameters
    ----------
    parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
        Source catalog(s) for computation

    functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
        Computations to do (functors that act on `parq`).
        If a dict, the output DataFrame will have columns keyed accordingly.
        If a list, the column keys will come from the
        `.shortname` attribute of each functor.

    filt : `str` (optional)
        Filter in which to calculate.  If provided,
        this will overwrite any existing `.filt` attribute
        of the provided functors.

    flags : `list` (optional)
        List of flags to include in output table.
    """
    _defaultFlags = ('calib_psf_used', 'detect_isPrimary')
    _defaultFuncs = (('coord_ra', RAColumn()),
                     ('coord_dec', DecColumn()))

    def __init__(self, parq, functors, filt=None, flags=None):
        self.parq = parq
        self.functors = functors

        self.filt = filt
        self.flags = list(self._defaultFlags)
        if flags is not None:
            self.flags += list(flags)

        self._df = None

    @property
    def func(self):
        additionalFuncs = dict(self._defaultFuncs)
        additionalFuncs.update({flag: Column(flag) for flag in self.flags})

        if isinstance(self.functors, CompositeFunctor):
            func = self.functors
        else:
            func = CompositeFunctor(self.functors)

        func.funcDict.update(additionalFuncs)
        func.filt = self.filt
        return func

    @property
    def noDupCols(self):
        return [name for name, func in self.func.funcDict.items()
                if func.noDup or func.dataset == 'ref']

    @property
    def df(self):
        if self._df is None:
            self.compute()
        return self._df

    def compute(self, dropna=False, pool=None):
        # Map the computation over a list of input tables, if a list was given
        if type(self.parq) in (list, tuple):
            if pool is None:
                dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
            else:
                # TODO: parallel mapping via a pool has not worked reliably (see class docstring)
                dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
            self._df = pd.concat(dflist)
        else:
            self._df = self.func(self.parq, dropna=dropna)

        return self._df
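
# Illustrative usage sketch (variable names are hypothetical; ``parq`` stands for a
# previously-retrieved ``deepCoadd_obj`` ParquetTable):
#
#     funcs = {'ra': RAColumn(), 'dec': DecColumn()}
#     analysis = PostprocessAnalysis(parq, funcs, filt='HSC-G',
#                                    flags=['base_PixelFlags_flag'])
#     df = analysis.df   # the dataframe is computed on first access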


class TransformCatalogBaseConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
    functorFile = pexConfig.Field(
        dtype=str,
        doc='Path to YAML file specifying functors to be computed',
    )


class TransformCatalogBaseTask(CmdLineTask):
    """Base class for transforming/standardizing a catalog

    by applying functors that convert units and apply calibrations.
    The purpose of this task is to perform a set of computations on
    an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
    results to a new dataset (which needs to be declared in an `outputDataset`
    attribute).

    The calculations to be performed are defined in a YAML file that specifies
    a set of functors to be computed, provided as
    a `--functorFile` config parameter.  An example of such a YAML file
    is the following:

        funcs:
            count:
                functor: Column
                args:
                    - base_InputCount_value
            deconvolved_moments:
                functor: DeconvolvedMoments
        flags:
            - merge_measurement_i
            - merge_measurement_r
            - merge_measurement_z
            - merge_measurement_y
            - merge_measurement_g
            - base_PixelFlags_flag_inexact_psfCenter

    The names for each entry under "funcs" will become the names of columns in the
    output dataset.  All the functors referenced are defined in `lsst.pipe.tasks.functors`.
    Positional arguments to be passed to each functor are in the `args` list,
    and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
    `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.

    The "flags" entry is a shortcut for a set of `Column` functors that keep the original
    column names, taken from the `'ref'` dataset.

    Note, if `'filter'` is provided as part of the `dataId` when running this task (even though
    `deepCoadd_obj` does not use `'filter'`), then this will override the `filt` kwargs
    provided in the YAML file, and the calculations will be done in that filter.

    This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
    to organize and execute the calculations.
    """
    @property
    def _DefaultName(self):
        raise NotImplementedError('Subclass must define "_DefaultName" attribute')

    @property
    def outputDataset(self):
        raise NotImplementedError('Subclass must define "outputDataset" attribute')

    @property
    def inputDataset(self):
        raise NotImplementedError('Subclass must define "inputDataset" attribute')

    @property
    def ConfigClass(self):
        raise NotImplementedError('Subclass must define "ConfigClass" attribute')

    def runDataRef(self, patchRef):
        parq = patchRef.get()
        funcs = self.getFunctors()
        dataId = patchRef.dataId
        self.log.info("Transforming/standardizing the catalog of %s", dataId)
        df = self.run(parq, funcs=funcs, dataId=dataId)
        self.write(df, patchRef)

    def run(self, parq, funcs=None, dataId=None):
        """Do postprocessing calculations

        Takes a `ParquetTable` object and dataId,
        returns a dataframe with results of postprocessing calculations.

        Parameters
        ----------
        parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
            ParquetTable from which calculations are done.
        funcs : `lsst.pipe.tasks.functors.Functors`
            Functors to apply to the table's columns
        dataId : dict, optional
            Used to add a `patchId` column to the output dataframe.

        Returns
        -------
        df : `pandas.DataFrame`
        """
        filt = dataId.get('filter', None)
        return self.transform(filt, parq, funcs, dataId).df

    def getFunctors(self):
        funcs = CompositeFunctor.from_file(self.config.functorFile)
        funcs.update(dict(PostprocessAnalysis._defaultFuncs))
        return funcs
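
    # Sketch of a minimal functor file and how it is loaded (the file name, column and
    # functor entries below are examples, not a file shipped with the pipeline):
    #
    #     # transforms.yaml
    #     funcs:
    #         psfMag:
    #             functor: Mag
    #             args: base_PsfFlux
    #     flags:
    #         - base_PixelFlags_flag
    #
    #     funcs = CompositeFunctor.from_file('transforms.yaml')
    #     df = funcs(parq)   # parq: a deepCoadd_obj ParquetTable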

    def getAnalysis(self, parq, funcs=None, filt=None):
        # Avoids disk access if funcs is passed in
        if funcs is None:
            funcs = self.getFunctors()
        analysis = PostprocessAnalysis(parq, funcs, filt=filt)
        return analysis

    def transform(self, filt, parq, funcs, dataId):
        analysis = self.getAnalysis(parq, funcs=funcs, filt=filt)
        df = analysis.df
        if dataId is not None:
            for key, value in dataId.items():
                df[key] = value

        return pipeBase.Struct(
            df=df,
            analysis=analysis
        )

    def write(self, df, parqRef):
        parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass


class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
    filterMap = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc=("Dictionary mapping full filter name to short one for column name munging. "
             "These filters determine the output columns no matter what filters the "
             "input data actually contain.")
    )
    camelCase = pexConfig.Field(
        dtype=bool,
        doc=("Write per-filter columns names with camelCase, else underscore. "
             "For example: gPsfFlux instead of g_PsfFlux.")
    )
    multilevelOutput = pexConfig.Field(
        dtype=bool,
        doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
             "and name-munged (False).")
    )


class TransformObjectCatalogTask(TransformCatalogBaseTask):
    """Compute Flattened Object Table as defined in the DPDD

    Do the same set of postprocessing calculations on all bands.

    This is identical to `TransformCatalogBaseTask`, except that it does the
    specified functor calculations for all filters present in the
    input `deepCoadd_obj` table.  Any specific `"filt"` keywords specified
    by the YAML file will be superseded.
    """
    _DefaultName = "transformObjectCatalog"
    ConfigClass = TransformObjectCatalogConfig

    inputDataset = 'deepCoadd_obj'
    outputDataset = 'objectTable'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               ContainerClass=CoaddDataIdContainer,
                               help="data ID, e.g. --id tract=12345 patch=1,2")
        return parser

    def run(self, parq, funcs=None, dataId=None):
        dfDict = {}
        analysisDict = {}
        templateDf = pd.DataFrame()
        # Perform transform for data of filters that exist in parq and are
        # specified in config.filterMap
        for filt in parq.columnLevelNames['filter']:
            if filt not in self.config.filterMap:
                self.log.info("Ignoring %s data in the input", filt)
                continue
            self.log.info("Transforming the catalog of filter %s", filt)
            result = self.transform(filt, parq, funcs, dataId)
            dfDict[filt] = result.df
            analysisDict[filt] = result.analysis
            if templateDf.empty:
                templateDf = result.df

        # Fill NaNs in columns of other wanted filters
        for filt in self.config.filterMap:
            if filt not in dfDict:
                self.log.info("Adding empty columns for filter %s", filt)
                dfDict[filt] = pd.DataFrame().reindex_like(templateDf)

        # This makes a multilevel column index, with filter as the first level
        df = pd.concat(dfDict, axis=1, names=['filter', 'column'])

        if not self.config.multilevelOutput:
            noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
            if dataId is not None:
                noDupCols += list(dataId.keys())
            df = flattenFilters(df, self.config.filterMap, noDupCols=noDupCols,
                                camelCase=self.config.camelCase)

        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df


class TractObjectDataIdContainer(CoaddDataIdContainer):

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList

        Generate a list of data references given tract and/or patch.
        This was adapted from `TractQADataIdContainer`, which was
        `TractDataIdContainer` modified to not require "filter".
        Only existing dataRefs are returned.
        """
        def getPatchRefList(tract):
            return [namespace.butler.dataRef(datasetType=self.datasetType,
                                             tract=tract.getId(),
                                             patch="%d,%d" % patch.getIndex())
                    for patch in tract]

        tractRefs = defaultdict(list)  # Data references for each tract
        for dataId in self.idList:
            skymap = self.getSkymap(namespace)

            if "tract" in dataId:
                tractId = dataId["tract"]
                if "patch" in dataId:
                    tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
                                                                       tract=tractId,
                                                                       patch=dataId['patch']))
                else:
                    tractRefs[tractId] += getPatchRefList(skymap[tractId])
            else:
                tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
                                 for tract in skymap)

        outputRefList = []
        for tractRefList in tractRefs.values():
            existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
            outputRefList.append(existingRefs)

        self.refList = outputRefList
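
# Shape of the resulting reference list (illustrative; tract/patch values are made up):
# ``self.refList`` becomes a list with one entry per tract, each entry being the list of
# existing per-patch dataRefs, e.g.
#     [[ref(tract=9813, patch='1,1'), ref(tract=9813, patch='1,2'), ...],   # tract 9813
#      [ref(tract=9615, patch='0,0'), ...]]                                 # tract 9615
# so each task invocation receives one tract's worth of patch references.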


class ConsolidateObjectTableConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )


class ConsolidateObjectTableTask(CmdLineTask):
    """Write patch-merged source tables to a tract-level parquet file.
    """
    _DefaultName = "consolidateObjectTable"
    ConfigClass = ConsolidateObjectTableConfig

    inputDataset = 'objectTable'
    outputDataset = 'objectTable_tract'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id tract=12345",
                               ContainerClass=TractObjectDataIdContainer)
        return parser

    def runDataRef(self, patchRefList):
        df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
        patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass