import functools
import pandas as pd
from collections import defaultdict

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
from lsst.pipe.base import CmdLineTask, ArgumentParser
from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer

from .parquetTable import ParquetTable
from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
from .functors import CompositeFunctor, RAColumn, DecColumn, Column
def flattenFilters(df, filterDict, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
    """Flatten a dataframe with a multilevel column index.
    """
    newDf = pd.DataFrame()
    for filt, filtShort in filterDict.items():
        subdf = df[filt]
        columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
        newColumns = {c: columnFormat.format(filtShort, c)
                      for c in subdf.columns if c not in noDupCols}
        cols = list(newColumns.keys())
        newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)

    newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
    return newDf
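# Illustrative sketch (not part of the pipeline) of how `flattenFilters` munges a
# two-level ('filter', 'column') frame into flat, band-prefixed column names.
# The column and filter names below are made up for the example.
#
#     >>> columns = pd.MultiIndex.from_tuples(
#     ...     [('HSC-G', 'coord_ra'), ('HSC-G', 'coord_dec'), ('HSC-G', 'PsfFlux'),
#     ...      ('HSC-R', 'coord_ra'), ('HSC-R', 'coord_dec'), ('HSC-R', 'PsfFlux')],
#     ...     names=('filter', 'column'))
#     >>> df = pd.DataFrame([[0., 0., 1., 0., 0., 2.]], columns=columns)
#     >>> flattenFilters(df, {'HSC-G': 'g', 'HSC-R': 'r'}, camelCase=True).columns.tolist()
#     ['coord_ra', 'coord_dec', 'gPsfFlux', 'rPsfFlux']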
class WriteObjectTableConfig(pexConfig.Config):
    priorityList = pexConfig.ListField(
        dtype=str,
        doc="Priority-ordered list of bands for the merge."
    )
    engine = pexConfig.Field(
        dtype=str,
        doc="Parquet engine for writing (pyarrow or fastparquet)"
    )
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if len(self.priorityList) == 0:
            raise RuntimeError("No priority list provided")
76 """Write filter-merged source tables to parquet 78 _DefaultName =
"writeObjectTable" 79 ConfigClass = WriteObjectTableConfig
80 RunnerClass = MergeSourcesRunner
83 inputDatasets = (
'forced_src',
'meas',
'ref')
88 def __init__(self, butler=None, schema=None, **kwargs):
92 CmdLineTask.__init__(self, **kwargs)
    def runDataRef(self, patchRefList):
        """!
        @brief Merge coadd sources from multiple bands. Calls @ref `run` which must be defined in
        subclasses that inherit from MergeSourcesTask.
        @param[in] patchRefList list of data references for each filter
        """
        catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
        dataId = patchRefList[0].dataId
        mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
        self.write(patchRefList[0], mergedCatalog)
    @classmethod
    def _makeArgumentParser(cls):
        """Create a suitable ArgumentParser.

        We will use the ArgumentParser to get a list of data
        references for patches; the RunnerClass will sort them into lists
        of data references for the same patch.

        References first of self.inputDatasets, rather than
        self.inputDataset
        """
        return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])

    def readCatalog(self, patchRef):
        """Read input catalogs

        Read all the input datasets given by the 'inputDatasets'
        attribute.

        Parameters
        ----------
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch

        Returns
        -------
        Tuple consisting of filter name and a dict of catalogs, keyed by
        dataset name
        """
        filterName = patchRef.dataId["filter"]
        catalogDict = {}
        for dataset in self.inputDatasets:
            catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
            self.log.info("Read %d sources from %s for filter %s: %s" %
                          (len(catalog), dataset, filterName, patchRef.dataId))
            catalogDict[dataset] = catalog
        return filterName, catalogDict
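    # The `catalogs` argument that `runDataRef` assembles from these return values is
    # therefore a nested mapping (filter names below are illustrative), e.g.:
    #
    #     {'HSC-G': {'forced_src': <SourceCatalog>, 'meas': <SourceCatalog>, 'ref': <SourceCatalog>},
    #      'HSC-R': {...}}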
    def run(self, catalogs, tract, patch):
        """Merge multiple catalogs.

        Parameters
        ----------
        catalogs : `dict`
            Mapping from filter names to dict of catalogs.
        tract : `int`
            tractId to use for the tractId column
        patch : `str`
            patchId to use for the patchId column

        Returns
        -------
        catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
            Merged dataframe, with each column prefixed by
            `filter_tag(filt)`, wrapped in the parquet writer shim class.
        """
        dfs = []
        for filt, tableDict in catalogs.items():
            for dataset, table in tableDict.items():
                # Convert afw table to a pandas DataFrame, indexed by source id
                df = table.asAstropy().to_pandas().set_index('id', drop=True)

                # Sort columns by name, to ensure matching schema among patches
                df = df.reindex(sorted(df.columns), axis=1)
                df['tractId'] = tract
                df['patchId'] = patch

                # Make columns a 3-level MultiIndex
                df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                       names=('dataset', 'filter', 'column'))
                dfs.append(df)

        catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
        return ParquetTable(dataFrame=catalog)
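    # Illustrative shape of the merged result (data ID and column names are made up):
    # the ParquetTable returned above wraps a dataframe whose columns carry a 3-level
    # ('dataset', 'filter', 'column') MultiIndex, e.g.
    #
    #     >>> merged = task.run(catalogs, tract=9813, patch='4,4')
    #     >>> merged.toDataFrame().columns[:2]
    #     MultiIndex([('meas', 'HSC-G', 'base_PsfFlux_instFlux'),
    #                 ('meas', 'HSC-G', 'base_PsfFlux_instFluxErr')],
    #                names=['dataset', 'filter', 'column'])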
    def write(self, patchRef, catalog):
        """Write the output.

        Parameters
        ----------
        catalog : `ParquetTable`
            Catalog to write
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch
        """
        patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
        # "filter" is not part of the data ID for the merged dataset, so drop it
        # from the log message.
        mergeDataId = patchRef.dataId.copy()
        del mergeDataId["filter"]
        self.log.info("Wrote merged catalog: %s" % (mergeDataId,))
199 """No metadata to write, and not sure how to write it for a list of dataRefs. 204 class PostprocessAnalysis(object):
205 """Calculate columns from ParquetTable 207 This object manages and organizes an arbitrary set of computations 208 on a catalog. The catalog is defined by a 209 `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a 210 `deepCoadd_obj` dataset, and the computations are defined by a collection 211 of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently, 212 a `CompositeFunctor`). 214 After the object is initialized, accessing the `.df` attribute (which 215 holds the `pandas.DataFrame` containing the results of the calculations) triggers 216 computation of said dataframe. 218 One of the conveniences of using this object is the ability to define a desired common 219 filter for all functors. This enables the same functor collection to be passed to 220 several different `PostprocessAnalysis` objects without having to change the original 221 functor collection, since the `filt` keyword argument of this object triggers an 222 overwrite of the `filt` property for all functors in the collection. 224 This object also allows a list of refFlags to be passed, and defines a set of default 225 refFlags that are always included even if not requested. 227 If a list of `ParquetTable` object is passed, rather than a single one, then the 228 calculations will be mapped over all the input catalogs. In principle, it should 229 be straightforward to parallelize this activity, but initial tests have failed 230 (see TODO in code comments). 234 parq : `lsst.pipe.tasks.ParquetTable` (or list of such) 235 Source catalog(s) for computation 237 functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor` 238 Computations to do (functors that act on `parq`). 239 If a dict, the output 240 DataFrame will have columns keyed accordingly. 241 If a list, the column keys will come from the 242 `.shortname` attribute of each functor. 244 filt : `str` (optional) 245 Filter in which to calculate. If provided, 246 this will overwrite any existing `.filt` attribute 247 of the provided functors. 249 flags : `list` (optional) 250 List of flags (per-band) to include in output table. 252 refFlags : `list` (optional) 253 List of refFlags (only reference band) to include in output table. 257 _defaultRefFlags = (
'calib_psf_used',
'detect_isPrimary')
258 _defaultFuncs = ((
'coord_ra', RAColumn()),
259 (
'coord_dec', DecColumn()))
    def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
        self.parq = parq
        self.functors = functors

        self.filt = filt
        self.flags = list(flags) if flags is not None else []
        self.refFlags = list(self._defaultRefFlags)
        if refFlags is not None:
            self.refFlags += list(refFlags)

        self._df = None
    @property
    def func(self):
        additionalFuncs = dict(self._defaultFuncs)
        additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
        additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})

        if isinstance(self.functors, CompositeFunctor):
            func = self.functors
        else:
            func = CompositeFunctor(self.functors)

        func.funcDict.update(additionalFuncs)
        func.filt = self.filt

        return func

    @property
    def noDupCols(self):
        return [name for name, func in self.func.funcDict.items()
                if func.noDup or func.dataset == 'ref']
    @property
    def df(self):
        if self._df is None:
            self.compute()
        return self._df

    def compute(self, dropna=False, pool=None):
        # Map over multiple parquet tables if a list/tuple was given
        if type(self.parq) in (list, tuple):
            if pool is None:
                dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
            else:
                # TODO: pool-based parallelization has not worked in initial tests
                dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
            self._df = pd.concat(dflist)
        else:
            self._df = self.func(self.parq, dropna=dropna)

        return self._df
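    # Illustrative usage sketch (the data ID, file name, and flag choice are examples,
    # not defaults of this module):
    #
    #     >>> parq = butler.get('deepCoadd_obj', tract=9813, patch='4,4')
    #     >>> funcs = CompositeFunctor.from_file('myFunctors.yaml')
    #     >>> analysis = PostprocessAnalysis(parq, funcs, filt='HSC-G',
    #     ...                                refFlags=['merge_measurement_i'])
    #     >>> df = analysis.df    # first access triggers compute()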
class TransformCatalogBaseConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
    functorFile = pexConfig.Field(
        dtype=str,
        doc='Path to YAML file specifying functors to be computed',
    )
333 """Base class for transforming/standardizing a catalog 335 by applying functors that convert units and apply calibrations. 336 The purpose of this task is to perform a set of computations on 337 an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the 338 results to a new dataset (which needs to be declared in an `outputDataset` 341 The calculations to be performed are defined in a YAML file that specifies 342 a set of functors to be computed, provided as 343 a `--functorFile` config parameter. An example of such a YAML file 368 - base_InputCount_value 371 functor: DeconvolvedMoments 376 - merge_measurement_i 377 - merge_measurement_r 378 - merge_measurement_z 379 - merge_measurement_y 380 - merge_measurement_g 381 - base_PixelFlags_flag_inexact_psfCenter 384 The names for each entry under "func" will become the names of columns in the 385 output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`. 386 Positional arguments to be passed to each functor are in the `args` list, 387 and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`, 388 `'dataset'`) are treated as keyword arguments to be passed to the functor initialization. 390 The "refFlags" entry is shortcut for a bunch of `Column` functors with the original column and 391 taken from the `'ref'` dataset. 393 The "flags" entry will be expanded out per band. 395 Note, if `'filter'` is provided as part of the `dataId` when running this task (even though 396 `deepCoadd_obj` does not use `'filter'`), then this will override the `filt` kwargs 397 provided in the YAML file, and the calculations will be done in that filter. 399 This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object 400 to organize and excecute the calculations. 404 def _DefaultName(self):
405 raise NotImplementedError(
'Subclass must define "_DefaultName" attribute')
409 raise NotImplementedError(
'Subclass must define "outputDataset" attribute')
413 raise NotImplementedError(
'Subclass must define "inputDataset" attribute')
417 raise NotImplementedError(
'Subclass must define "ConfigClass" attribute')
    def runDataRef(self, patchRef):
        parq = patchRef.get()
        funcs = self.getFunctors()
        dataId = patchRef.dataId

        self.log.info("Transforming/standardizing the catalog of %s", dataId)
        df = self.run(parq, funcs=funcs, dataId=dataId)
        self.write(df, patchRef)
        return df
    def run(self, parq, funcs=None, dataId=None):
        """Do postprocessing calculations

        Takes a `ParquetTable` object and dataId,
        returns a dataframe with results of postprocessing calculations.

        Parameters
        ----------
        parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
            ParquetTable from which calculations are done.
        funcs : `lsst.pipe.tasks.functors.Functors`
            Functors to apply to the table's columns
        dataId : dict, optional
            Used to add a `patchId` column to the output dataframe.

        Returns
        -------
        `pandas.DataFrame`
        """
        filt = dataId.get('filter', None)
        return self.transform(filt, parq, funcs, dataId).df
    def getFunctors(self):
        funcs = CompositeFunctor.from_file(self.config.functorFile)
        funcs.update(dict(PostprocessAnalysis._defaultFuncs))
        return funcs

    def getAnalysis(self, parq, funcs=None, filt=None):
        # Avoid the disk access if funcs is passed in directly
        if funcs is None:
            funcs = self.getFunctors()
        analysis = PostprocessAnalysis(parq, funcs, filt=filt)
        return analysis

    def transform(self, filt, parq, funcs, dataId):
        analysis = self.getAnalysis(parq, funcs=funcs, filt=filt)
        df = analysis.df
        if dataId is not None:
            for key, value in dataId.items():
                df[key] = value

        return pipeBase.Struct(
            df=df,
            analysis=analysis
        )
479 """No metadata to write. 484 class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
    filterMap = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        doc=("Dictionary mapping full filter name to short one for column name munging. "
             "These filters determine the output columns no matter what filters the "
             "input data actually contain.")
    )
    camelCase = pexConfig.Field(
        dtype=bool,
        doc=("Write per-filter column names with camelCase, else underscore. "
             "For example: gPsfFlux instead of g_PsfFlux.")
    )
    multilevelOutput = pexConfig.Field(
        dtype=bool,
        doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
             "and name-munged (False).")
    )
508 """Compute Flatted Object Table as defined in the DPDD 510 Do the same set of postprocessing calculations on all bands 512 This is identical to `TransformCatalogBaseTask`, except for that it does the 513 specified functor calculations for all filters present in the 514 input `deepCoadd_obj` table. Any specific `"filt"` keywords specified 515 by the YAML file will be superceded. 517 _DefaultName =
"transformObjectCatalog" 518 ConfigClass = TransformObjectCatalogConfig
520 inputDataset =
'deepCoadd_obj' 521 outputDataset =
'objectTable' 524 def _makeArgumentParser(cls):
527 ContainerClass=CoaddDataIdContainer,
528 help=
"data ID, e.g. --id tract=12345 patch=1,2")
    def run(self, parq, funcs=None, dataId=None):
        dfDict = {}
        analysisDict = {}
        templateDf = pd.DataFrame()

        # Perform the calculations for each filter present in the input table
        # that is also listed in config.filterMap
        for filt in parq.columnLevelNames['filter']:
            if filt not in self.config.filterMap:
                self.log.info("Ignoring %s data in the input", filt)
                continue
            self.log.info("Transforming the catalog of filter %s", filt)
            result = self.transform(filt, parq, funcs, dataId)
            dfDict[filt] = result.df
            analysisDict[filt] = result.analysis
            if templateDf.empty:
                templateDf = result.df

        # Fill NaNs in columns of filters requested in filterMap but missing from the input
        for filt in self.config.filterMap:
            if filt not in dfDict:
                self.log.info("Adding empty columns for filter %s", filt)
                dfDict[filt] = pd.DataFrame().reindex_like(templateDf)

        # This makes a multilevel column index, with filter as the first level
        df = pd.concat(dfDict, axis=1, names=['filter', 'column'])

        if not self.config.multilevelOutput:
            noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
            if dataId is not None:
                noDupCols += list(dataId.keys())
            df = flattenFilters(df, self.config.filterMap, noDupCols=noDupCols,
                                camelCase=self.config.camelCase)

        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df
571 """Make self.refList from self.idList 573 Generate a list of data references given tract and/or patch. 574 This was adapted from `TractQADataIdContainer`, which was 575 `TractDataIdContainer` modifie to not require "filter". 576 Only existing dataRefs are returned. 578 def getPatchRefList(tract):
579 return [namespace.butler.dataRef(datasetType=self.datasetType,
581 patch=
"%d,%d" % patch.getIndex())
for patch
in tract]
583 tractRefs = defaultdict(list)
584 for dataId
in self.idList:
587 if "tract" in dataId:
588 tractId = dataId[
"tract"]
589 if "patch" in dataId:
590 tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
592 patch=dataId[
'patch']))
594 tractRefs[tractId] += getPatchRefList(skymap[tractId])
596 tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
599 for tractRefList
in tractRefs.values():
600 existingRefs = [ref
for ref
in tractRefList
if ref.datasetExists()]
601 outputRefList.append(existingRefs)
class ConsolidateObjectTableConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )


class ConsolidateObjectTableTask(CmdLineTask):
    """Write patch-merged source tables to a tract-level parquet file
    """
    _DefaultName = "consolidateObjectTable"
    ConfigClass = ConsolidateObjectTableConfig

    inputDataset = 'objectTable'
    outputDataset = 'objectTable_tract'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id tract=12345",
                               ContainerClass=TractObjectDataIdContainer)
        return parser
    def runDataRef(self, patchRefList):
        df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
        patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass