Coverage for python/lsst/pipe/tasks/postprocess.py: 28%
# This file is part of pipe_tasks
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import functools
import pandas as pd
from collections import defaultdict

import lsst.geom
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.daf.base as dafBase
from lsst.pipe.base import connectionTypes
import lsst.afw.table as afwTable
from lsst.meas.base import SingleFrameMeasurementTask
from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer
from lsst.daf.butler import DeferredDatasetHandle

from .parquetTable import ParquetTable
from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
from .functors import CompositeFunctor, RAColumn, DecColumn, Column


def flattenFilters(df, noDupCols=['coord_ra', 'coord_dec'], camelCase=False, inputBands=None):
    """Flattens a dataframe with multilevel column index
    """
    newDf = pd.DataFrame()
    # band is the level 0 index
    dfBands = df.columns.unique(level=0).values
    for band in dfBands:
        subdf = df[band]
        columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
        newColumns = {c: columnFormat.format(band, c)
                      for c in subdf.columns if c not in noDupCols}
        cols = list(newColumns.keys())
        newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)

    # Band must be present in the input and output or else column is all NaN:
    presentBands = dfBands if inputBands is None else list(set(inputBands).intersection(dfBands))
    # Get the unexploded columns from any present band's partition
    noDupDf = df[presentBands[0]][noDupCols]
    newDf = pd.concat([noDupDf, newDf], axis=1)
    return newDf
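

# Illustrative sketch (not part of the pipeline): a toy demonstration of flattenFilters on a
# two-band DataFrame. The 'PsFlux' column name and band labels are invented for the example.
def _flattenFiltersExample():
    columns = pd.MultiIndex.from_tuples(
        [('g', 'PsFlux'), ('g', 'coord_ra'), ('r', 'PsFlux'), ('r', 'coord_ra')],
        names=('band', 'column'))
    df = pd.DataFrame([[1.0, 10.0, 2.0, 10.0]], columns=columns)
    # Yields a flat table with columns ['coord_ra', 'gPsFlux', 'rPsFlux'].
    return flattenFilters(df, noDupCols=['coord_ra'], camelCase=True)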


class WriteObjectTableConnections(pipeBase.PipelineTaskConnections,
                                  defaultTemplates={"coaddName": "deep"},
                                  dimensions=("tract", "patch", "skymap")):
    inputCatalogMeas = connectionTypes.Input(
        doc="Catalog of source measurements on the deepCoadd.",
        dimensions=("tract", "patch", "band", "skymap"),
        storageClass="SourceCatalog",
        name="{coaddName}Coadd_meas",
        multiple=True
    )
    inputCatalogForcedSrc = connectionTypes.Input(
        doc="Catalog of forced measurements (shape and position parameters held fixed) on the deepCoadd.",
        dimensions=("tract", "patch", "band", "skymap"),
        storageClass="SourceCatalog",
        name="{coaddName}Coadd_forced_src",
        multiple=True
    )
    inputCatalogRef = connectionTypes.Input(
        doc="Catalog marking the primary detection (which band provides a good shape and position) "
            "for each detection in deepCoadd_mergeDet.",
        dimensions=("tract", "patch", "skymap"),
        storageClass="SourceCatalog",
        name="{coaddName}Coadd_ref"
    )
    outputCatalog = connectionTypes.Output(
        doc="A vertical concatenation of the deepCoadd_{ref|meas|forced_src} catalogs, "
            "stored as a DataFrame with a multi-level column index per-patch.",
        dimensions=("tract", "patch", "skymap"),
        storageClass="DataFrame",
        name="{coaddName}Coadd_obj"
    )


class WriteObjectTableConfig(pipeBase.PipelineTaskConfig,
                             pipelineConnections=WriteObjectTableConnections):
    engine = pexConfig.Field(
        dtype=str,
        default="pyarrow",
        doc="Parquet engine for writing (pyarrow or fastparquet)"
    )
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )


class WriteObjectTableTask(CmdLineTask, pipeBase.PipelineTask):
    """Write filter-merged source tables to parquet
    """
    _DefaultName = "writeObjectTable"
    ConfigClass = WriteObjectTableConfig
    RunnerClass = MergeSourcesRunner

    # Names of table datasets to be merged
    inputDatasets = ('forced_src', 'meas', 'ref')

    # Tag of output dataset written by `MergeSourcesTask.write`
    outputDataset = 'obj'

    def __init__(self, butler=None, schema=None, **kwargs):
        # It is a shame that this class can't use the default init for CmdLineTask,
        # but to do so would require its own special task runner, which is many
        # more lines of specialization, so this is how it is for now.
        super().__init__(**kwargs)

    def runDataRef(self, patchRefList):
        """!
        @brief Merge coadd sources from multiple bands. Calls @ref `run` which must be defined in
        subclasses that inherit from MergeSourcesTask.
        @param[in] patchRefList list of data references for each filter
        """
        catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
        dataId = patchRefList[0].dataId
        mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
        self.write(patchRefList[0], ParquetTable(dataFrame=mergedCatalog))

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)

        measDict = {ref.dataId['band']: {'meas': cat} for ref, cat in
                    zip(inputRefs.inputCatalogMeas, inputs['inputCatalogMeas'])}
        forcedSourceDict = {ref.dataId['band']: {'forced_src': cat} for ref, cat in
                            zip(inputRefs.inputCatalogForcedSrc, inputs['inputCatalogForcedSrc'])}

        catalogs = {}
        for band in measDict.keys():
            catalogs[band] = {'meas': measDict[band]['meas'],
                              'forced_src': forcedSourceDict[band]['forced_src'],
                              'ref': inputs['inputCatalogRef']}
        dataId = butlerQC.quantum.dataId
        df = self.run(catalogs=catalogs, tract=dataId['tract'], patch=dataId['patch'])
        outputs = pipeBase.Struct(outputCatalog=df)
        butlerQC.put(outputs, outputRefs)

    @classmethod
    def _makeArgumentParser(cls):
        """Create a suitable ArgumentParser.

        We will use the ArgumentParser to get a list of data
        references for patches; the RunnerClass will sort them into lists
        of data references for the same patch.

        References the first entry of ``self.inputDatasets``, rather than
        ``self.inputDataset``.
        """
        return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])

    def readCatalog(self, patchRef):
        """Read input catalogs

        Read all the input datasets given by the 'inputDatasets'
        attribute.

        Parameters
        ----------
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch

        Returns
        -------
        Tuple consisting of band name and a dict of catalogs, keyed by
        dataset name
        """
        band = patchRef.get(self.config.coaddName + "Coadd_filterLabel", immediate=True).bandLabel
        catalogDict = {}
        for dataset in self.inputDatasets:
            catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
            self.log.info("Read %d sources from %s for band %s: %s" %
                          (len(catalog), dataset, band, patchRef.dataId))
            catalogDict[dataset] = catalog
        return band, catalogDict

    def run(self, catalogs, tract, patch):
        """Merge multiple catalogs.

        Parameters
        ----------
        catalogs : `dict`
            Mapping from filter names to dict of catalogs.
        tract : `int`
            tractId to use for the tractId column
        patch : `str`
            patchId to use for the patchId column

        Returns
        -------
        catalog : `pandas.DataFrame`
            Merged dataframe
        """

        dfs = []
        for filt, tableDict in catalogs.items():
            for dataset, table in tableDict.items():
                # Convert afwTable to pandas DataFrame
                df = table.asAstropy().to_pandas().set_index('id', drop=True)

                # Sort columns by name, to ensure matching schema among patches
                df = df.reindex(sorted(df.columns), axis=1)
                df['tractId'] = tract
                df['patchId'] = patch

                # Make columns a 3-level MultiIndex
                df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                       names=('dataset', 'band', 'column'))
                dfs.append(df)

        catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
        return catalog
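
    # Note (illustrative, not used by the pipeline): the merged catalog returned by `run`
    # has a three-level (dataset, band, column) column index, so it can be sliced by any
    # level, e.g.
    #
    #     measG = catalog['meas']['g']   # all measurement columns for band 'g'
    #     refIds = catalog['ref'].index  # object ids shared by all datasets
    #
    # The band label 'g' here is hypothetical; the available bands depend on the input data.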

    def write(self, patchRef, catalog):
        """Write the output.

        Parameters
        ----------
        catalog : `ParquetTable`
            Catalog to write
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch
        """
        patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
        # since the filter isn't actually part of the data ID for the dataset we're saving,
        # it's confusing to see it in the log message, even if the butler simply ignores it.
        mergeDataId = patchRef.dataId.copy()
        del mergeDataId["filter"]
        self.log.info("Wrote merged catalog: %s" % (mergeDataId,))

    def writeMetadata(self, dataRefList):
        """No metadata to write, and not sure how to write it for a list of dataRefs.
        """
        pass


class WriteSourceTableConnections(pipeBase.PipelineTaskConnections,
                                  dimensions=("instrument", "visit", "detector")):

    catalog = connectionTypes.Input(
        doc="Input full-depth catalog of sources produced by CalibrateTask",
        name="src",
        storageClass="SourceCatalog",
        dimensions=("instrument", "visit", "detector")
    )
    outputCatalog = connectionTypes.Output(
        doc="Catalog of sources, `src` in Parquet format",
        name="source",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector")
    )


class WriteSourceTableConfig(pipeBase.PipelineTaskConfig,
                             pipelineConnections=WriteSourceTableConnections):
    doApplyExternalPhotoCalib = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Add local photoCalib columns from the calexp.photoCalib? Should only be set True if "
             "generating Source Tables from older src tables which do not already have local calib columns")
    )
    doApplyExternalSkyWcs = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Add local WCS columns from the calexp.wcs? Should only be set True if "
             "generating Source Tables from older src tables which do not already have local calib columns")
    )


class WriteSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
    """Write source table to parquet
    """
    _DefaultName = "writeSourceTable"
    ConfigClass = WriteSourceTableConfig

    def runDataRef(self, dataRef):
        src = dataRef.get('src')
        if self.config.doApplyExternalPhotoCalib or self.config.doApplyExternalSkyWcs:
            src = self.addCalibColumns(src, dataRef)

        ccdVisitId = dataRef.get('ccdExposureId')
        result = self.run(src, ccdVisitId=ccdVisitId)
        dataRef.put(result.table, 'source')

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        inputs['ccdVisitId'] = butlerQC.quantum.dataId.pack("visit_detector")
        result = self.run(**inputs).table
        outputs = pipeBase.Struct(outputCatalog=result.toDataFrame())
        butlerQC.put(outputs, outputRefs)

    def run(self, catalog, ccdVisitId=None):
        """Convert `src` catalog to parquet

        Parameters
        ----------
        catalog : `afwTable.SourceCatalog`
            catalog to be converted
        ccdVisitId : `int`
            ccdVisitId to be added as a column

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            ``table``
                `ParquetTable` version of the input catalog
        """
        self.log.info("Generating parquet table from src catalog %s", ccdVisitId)
        df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
        df['ccdVisitId'] = ccdVisitId
        return pipeBase.Struct(table=ParquetTable(dataFrame=df))

    def addCalibColumns(self, catalog, dataRef):
        """Add columns with local calibration evaluated at each centroid
        for backwards compatibility with old repos.

        This exists for the purpose of converting old src catalogs
        (which don't have the expected local calib columns) to Source Tables.

        Parameters
        ----------
        catalog : `afwTable.SourceCatalog`
            catalog to which calib columns will be added
        dataRef : `lsst.daf.persistence.ButlerDataRef`
            for fetching the calibs from disk.

        Returns
        -------
        newCat : `afwTable.SourceCatalog`
            Source Catalog with requested local calib columns
        """
        mapper = afwTable.SchemaMapper(catalog.schema)
        measureConfig = SingleFrameMeasurementTask.ConfigClass()
        measureConfig.doReplaceWithNoise = False

        # Just need the WCS or the PhotoCalib attached to an exposure
        exposure = dataRef.get('calexp_sub',
                               bbox=lsst.geom.Box2I(lsst.geom.Point2I(0, 0), lsst.geom.Point2I(0, 0)))

        mapper = afwTable.SchemaMapper(catalog.schema)
        mapper.addMinimalSchema(catalog.schema, True)
        schema = mapper.getOutputSchema()

        exposureIdInfo = dataRef.get("expIdInfo")
        measureConfig.plugins.names = []
        if self.config.doApplyExternalSkyWcs:
            plugin = 'base_LocalWcs'
            if plugin in schema:
                raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalSkyWcs=False")
            else:
                measureConfig.plugins.names.add(plugin)

        if self.config.doApplyExternalPhotoCalib:
            plugin = 'base_LocalPhotoCalib'
            if plugin in schema:
                raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalPhotoCalib=False")
            else:
                measureConfig.plugins.names.add(plugin)

        measurement = SingleFrameMeasurementTask(config=measureConfig, schema=schema)
        newCat = afwTable.SourceCatalog(schema)
        newCat.extend(catalog, mapper=mapper)
        measurement.run(measCat=newCat, exposure=exposure, exposureId=exposureIdInfo.expId)
        return newCat

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", 'src',
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser


class PostprocessAnalysis(object):
    """Calculate columns from ParquetTable

    This object manages and organizes an arbitrary set of computations
    on a catalog. The catalog is defined by a
    `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
    `deepCoadd_obj` dataset, and the computations are defined by a collection
    of `lsst.pipe.tasks.functors.Functor` objects (or, equivalently,
    a `CompositeFunctor`).

    After the object is initialized, accessing the `.df` attribute (which
    holds the `pandas.DataFrame` containing the results of the calculations) triggers
    computation of said dataframe.

    One of the conveniences of using this object is the ability to define a desired common
    filter for all functors. This enables the same functor collection to be passed to
    several different `PostprocessAnalysis` objects without having to change the original
    functor collection, since the `filt` keyword argument of this object triggers an
    overwrite of the `filt` property for all functors in the collection.

    This object also allows a list of refFlags to be passed, and defines a set of default
    refFlags that are always included even if not requested.

    If a list of `ParquetTable` objects is passed, rather than a single one, then the
    calculations will be mapped over all the input catalogs. In principle, it should
    be straightforward to parallelize this activity, but initial tests have failed
    (see TODO in code comments).

    Parameters
    ----------
    parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
        Source catalog(s) for computation

    functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
        Computations to do (functors that act on `parq`).
        If a dict, the output
        DataFrame will have columns keyed accordingly.
        If a list, the column keys will come from the
        `.shortname` attribute of each functor.

    filt : `str` (optional)
        Filter in which to calculate. If provided,
        this will overwrite any existing `.filt` attribute
        of the provided functors.

    flags : `list` (optional)
        List of flags (per-band) to include in output table.

    refFlags : `list` (optional)
        List of refFlags (only reference band) to include in output table.
    """
    _defaultRefFlags = []
    _defaultFuncs = (('coord_ra', RAColumn()),
                     ('coord_dec', DecColumn()))

    def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
        self.parq = parq
        self.functors = functors

        self.filt = filt
        self.flags = list(flags) if flags is not None else []
        self.refFlags = list(self._defaultRefFlags)
        if refFlags is not None:
            self.refFlags += list(refFlags)

        self._df = None

    @property
    def defaultFuncs(self):
        funcs = dict(self._defaultFuncs)
        return funcs

    @property
    def func(self):
        additionalFuncs = self.defaultFuncs
        additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
        additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})

        if isinstance(self.functors, CompositeFunctor):
            func = self.functors
        else:
            func = CompositeFunctor(self.functors)

        func.funcDict.update(additionalFuncs)
        func.filt = self.filt

        return func

    @property
    def noDupCols(self):
        return [name for name, func in self.func.funcDict.items() if func.noDup or func.dataset == 'ref']

    @property
    def df(self):
        if self._df is None:
            self.compute()
        return self._df

    def compute(self, dropna=False, pool=None):
        # map over multiple parquet tables
        if type(self.parq) in (list, tuple):
            if pool is None:
                dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
            else:
                # TODO: Figure out why this doesn't work (pyarrow pickling issues?)
                dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
            self._df = pd.concat(dflist)
        else:
            self._df = self.func(self.parq, dropna=dropna)

        return self._df
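

# Illustrative sketch (not part of the pipeline): a minimal, hypothetical use of
# PostprocessAnalysis. `parq` would be a ParquetTable (e.g. one deepCoadd_obj patch);
# the functor keys and the 'base_PsfFlux_instFlux' column name are invented for the example.
def _postprocessAnalysisExample(parq):
    funcs = {'ra': RAColumn(),
             'dec': DecColumn(),
             'psfFlux': Column('base_PsfFlux_instFlux', dataset='meas')}
    analysis = PostprocessAnalysis(parq, funcs, filt='g', refFlags=['detect_isPrimary'])
    # Accessing .df triggers the computation and returns a pandas.DataFrame whose
    # columns are the functor keys plus the requested flags.
    return analysis.df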


class TransformCatalogBaseConnections(pipeBase.PipelineTaskConnections,
                                      dimensions=()):
    """Expected Connections for subclasses of TransformCatalogBaseTask.

    Must be subclassed.
    """
    inputCatalog = connectionTypes.Input(
        name="",
        storageClass="DataFrame",
    )
    outputCatalog = connectionTypes.Output(
        name="",
        storageClass="DataFrame",
    )


class TransformCatalogBaseConfig(pipeBase.PipelineTaskConfig,
                                 pipelineConnections=TransformCatalogBaseConnections):
    functorFile = pexConfig.Field(
        dtype=str,
        doc='Path to YAML file specifying functors to be computed',
        default=None,
        optional=True
    )


class TransformCatalogBaseTask(CmdLineTask, pipeBase.PipelineTask):
    """Base class for transforming/standardizing a catalog

    by applying functors that convert units and apply calibrations.
    The purpose of this task is to perform a set of computations on
    an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
    results to a new dataset (which needs to be declared in an `outputDataset`
    attribute).

    The calculations to be performed are defined in a YAML file that specifies
    a set of functors to be computed, provided as
    a `--functorFile` config parameter. An example of such a YAML file
    is the following:

        funcs:
            psfMag:
                functor: Mag
                args:
                    - base_PsfFlux
                filt: HSC-G
                dataset: meas
            cmodel_magDiff:
                functor: MagDiff
                args:
                    - modelfit_CModel
                    - base_PsfFlux
                filt: HSC-G
            gauss_magDiff:
                functor: MagDiff
                args:
                    - base_GaussianFlux
                    - base_PsfFlux
                filt: HSC-G
            count:
                functor: Column
                args:
                    - base_InputCount_value
                filt: HSC-G
            deconvolved_moments:
                functor: DeconvolvedMoments
                filt: HSC-G
                dataset: forced_src
        refFlags:
            - calib_psfUsed
            - merge_measurement_i
            - merge_measurement_r
            - merge_measurement_z
            - merge_measurement_y
            - merge_measurement_g
            - base_PixelFlags_flag_inexact_psfCenter
            - detect_isPrimary

    The names for each entry under "funcs" will become the names of columns in the
    output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`.
    Positional arguments to be passed to each functor are in the `args` list,
    and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
    `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.

    The "refFlags" entry is a shortcut for a set of `Column` functors taken from the
    `'ref'` dataset, keeping their original column names.

    The "flags" entry will be expanded out per band.

    This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
    to organize and execute the calculations.
    """

    @property
    def _DefaultName(self):
        raise NotImplementedError('Subclass must define "_DefaultName" attribute')

    @property
    def outputDataset(self):
        raise NotImplementedError('Subclass must define "outputDataset" attribute')

    @property
    def inputDataset(self):
        raise NotImplementedError('Subclass must define "inputDataset" attribute')

    @property
    def ConfigClass(self):
        raise NotImplementedError('Subclass must define "ConfigClass" attribute')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.config.functorFile:
            self.log.info('Loading transform functor definitions from %s',
                          self.config.functorFile)
            self.funcs = CompositeFunctor.from_file(self.config.functorFile)
            self.funcs.update(dict(PostprocessAnalysis._defaultFuncs))
        else:
            self.funcs = None

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        if self.funcs is None:
            raise ValueError("config.functorFile is None. "
                             "Must be a valid path to yaml in order to run Task as a PipelineTask.")
        result = self.run(parq=inputs['inputCatalog'], funcs=self.funcs,
                          dataId=outputRefs.outputCatalog.dataId.full)
        outputs = pipeBase.Struct(outputCatalog=result)
        butlerQC.put(outputs, outputRefs)

    def runDataRef(self, dataRef):
        parq = dataRef.get()
        if self.funcs is None:
            raise ValueError("config.functorFile is None. "
                             "Must be a valid path to yaml in order to run as a CommandlineTask.")
        df = self.run(parq, funcs=self.funcs, dataId=dataRef.dataId)
        self.write(df, dataRef)
        return df

    def run(self, parq, funcs=None, dataId=None, band=None):
        """Do postprocessing calculations

        Takes a `ParquetTable` object and dataId,
        returns a dataframe with results of postprocessing calculations.

        Parameters
        ----------
        parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
            ParquetTable from which calculations are done.
        funcs : `lsst.pipe.tasks.functors.Functors`
            Functors to apply to the table's columns
        dataId : `dict`, optional
            Used to add a `patchId` column to the output dataframe.
        band : `str`, optional
            Filter band that is being processed.

        Returns
        -------
        `pandas.DataFrame`
        """
        self.log.info("Transforming/standardizing the source table dataId: %s", dataId)

        df = self.transform(band, parq, funcs, dataId).df
        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df

    def getFunctors(self):
        return self.funcs

    def getAnalysis(self, parq, funcs=None, band=None):
        if funcs is None:
            funcs = self.funcs
        analysis = PostprocessAnalysis(parq, funcs, filt=band)
        return analysis

    def transform(self, band, parq, funcs, dataId):
        analysis = self.getAnalysis(parq, funcs=funcs, band=band)
        df = analysis.df
        if dataId is not None:
            for key, value in dataId.items():
                df[str(key)] = value

        return pipeBase.Struct(
            df=df,
            analysis=analysis
        )

    def write(self, df, parqRef):
        parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass
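

# Illustrative sketch (not part of the pipeline): a minimal, hypothetical Gen2-style use of a
# TransformCatalogBaseTask subclass. TransformSourceTableTask/Config are defined later in this
# module; the functor file name is invented and must follow the YAML format documented in the
# TransformCatalogBaseTask docstring above.
def _transformCatalogExample(dataRef):
    config = TransformSourceTableConfig()
    config.functorFile = 'myFunctors.yaml'  # hypothetical path
    task = TransformSourceTableTask(config=config)
    # runDataRef reads the input ParquetTable, applies the functors, and writes the
    # transformed table to the task's outputDataset ('sourceTable').
    return task.runDataRef(dataRef)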


class TransformObjectCatalogConnections(pipeBase.PipelineTaskConnections,
                                        defaultTemplates={"coaddName": "deep"},
                                        dimensions=("tract", "patch", "skymap")):
    inputCatalog = connectionTypes.Input(
        doc="The vertical concatenation of the deepCoadd_{ref|meas|forced_src} catalogs, "
            "stored as a DataFrame with a multi-level column index per-patch.",
        dimensions=("tract", "patch", "skymap"),
        storageClass="DataFrame",
        name="{coaddName}Coadd_obj",
        deferLoad=True,
    )
    outputCatalog = connectionTypes.Output(
        doc="Per-Patch Object Table of columns transformed from the deepCoadd_obj table per the standard "
            "data model.",
        dimensions=("tract", "patch", "skymap"),
        storageClass="DataFrame",
        name="objectTable"
    )


class TransformObjectCatalogConfig(TransformCatalogBaseConfig,
                                   pipelineConnections=TransformObjectCatalogConnections):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
    # TODO: remove in DM-27177
    filterMap = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc=("Dictionary mapping full filter name to short one for column name munging. "
             "These filters determine the output columns no matter what filters the "
             "input data actually contain."),
        deprecated=("Coadds are now identified by the band, so this transform is unused. "
                    "Will be removed after v22.")
    )
    outputBands = pexConfig.ListField(
        dtype=str,
        default=None,
        optional=True,
        doc=("These bands and only these bands will appear in the output,"
             " NaN-filled if the input does not include them."
             " If None, then use all bands found in the input.")
    )
    camelCase = pexConfig.Field(
        dtype=bool,
        default=True,
        doc=("Write per-band column names with camelCase, else underscore. "
             "For example: gPsFlux instead of g_PsFlux.")
    )
    multilevelOutput = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
             "and name-munged (False).")
    )


class TransformObjectCatalogTask(TransformCatalogBaseTask):
    """Produce a flattened Object Table to match the format specified in
    sdm_schemas.

    Do the same set of postprocessing calculations on all bands.

    This is identical to `TransformCatalogBaseTask`, except that it does the
    specified functor calculations for all filters present in the
    input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
    by the YAML file will be superseded.
    """
    _DefaultName = "transformObjectCatalog"
    ConfigClass = TransformObjectCatalogConfig

    # Used by Gen 2 runDataRef only:
    inputDataset = 'deepCoadd_obj'
    outputDataset = 'objectTable'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               ContainerClass=CoaddDataIdContainer,
                               help="data ID, e.g. --id tract=12345 patch=1,2")
        return parser

    def run(self, parq, funcs=None, dataId=None, band=None):
        # NOTE: band kwarg is ignored here.
        dfDict = {}
        analysisDict = {}
        templateDf = pd.DataFrame()

        if isinstance(parq, DeferredDatasetHandle):
            columns = parq.get(component='columns')
            inputBands = columns.unique(level=1).values
        else:
            inputBands = parq.columnLevelNames['band']

        outputBands = self.config.outputBands if self.config.outputBands else inputBands

        # Perform transform for data of filters that exist in parq.
        for inputBand in inputBands:
            if inputBand not in outputBands:
                self.log.info("Ignoring %s band data in the input", inputBand)
                continue
            self.log.info("Transforming the catalog of band %s", inputBand)
            result = self.transform(inputBand, parq, funcs, dataId)
            dfDict[inputBand] = result.df
            analysisDict[inputBand] = result.analysis
            if templateDf.empty:
                templateDf = result.df

        # Fill NaNs in columns of other wanted bands
        for filt in outputBands:
            if filt not in dfDict:
                self.log.info("Adding empty columns for band %s", filt)
                dfDict[filt] = pd.DataFrame().reindex_like(templateDf)

        # This makes a multilevel column index, with band as first level
        df = pd.concat(dfDict, axis=1, names=['band', 'column'])

        if not self.config.multilevelOutput:
            noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
            if dataId is not None:
                noDupCols += list(dataId.keys())
            df = flattenFilters(df, noDupCols=noDupCols, camelCase=self.config.camelCase,
                                inputBands=inputBands)

        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df
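

# Illustrative sketch (not part of the pipeline): hypothetical config overrides controlling the
# flattened output of TransformObjectCatalogTask. The functor file path is invented.
def _transformObjectCatalogConfigExample():
    config = TransformObjectCatalogConfig()
    config.functorFile = 'Object.yaml'    # hypothetical path to the functor definitions
    config.outputBands = ['g', 'r', 'i']  # bands missing from the input are NaN-filled
    config.camelCase = True               # e.g. gPsFlux rather than g_PsFlux
    config.multilevelOutput = False       # flatten the (band, column) index with flattenFilters
    return config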


class TractObjectDataIdContainer(CoaddDataIdContainer):

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList

        Generate a list of data references given tract and/or patch.
        This was adapted from `TractQADataIdContainer`, which was
        `TractDataIdContainer` modified to not require "filter".
        Only existing dataRefs are returned.
        """
        def getPatchRefList(tract):
            return [namespace.butler.dataRef(datasetType=self.datasetType,
                                             tract=tract.getId(),
                                             patch="%d,%d" % patch.getIndex()) for patch in tract]

        tractRefs = defaultdict(list)  # Data references for each tract
        for dataId in self.idList:
            skymap = self.getSkymap(namespace)

            if "tract" in dataId:
                tractId = dataId["tract"]
                if "patch" in dataId:
                    tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
                                                                       tract=tractId,
                                                                       patch=dataId['patch']))
                else:
                    tractRefs[tractId] += getPatchRefList(skymap[tractId])
            else:
                tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
                                 for tract in skymap)
        outputRefList = []
        for tractRefList in tractRefs.values():
            existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
            outputRefList.append(existingRefs)

        self.refList = outputRefList


class ConsolidateObjectTableConnections(pipeBase.PipelineTaskConnections,
                                        dimensions=("tract", "skymap")):
    inputCatalogs = connectionTypes.Input(
        doc="Per-Patch objectTables conforming to the standard data model.",
        name="objectTable",
        storageClass="DataFrame",
        dimensions=("tract", "patch", "skymap"),
        multiple=True,
    )
    outputCatalog = connectionTypes.Output(
        doc="Per-tract vertical concatenation of the input objectTables",
        name="objectTable_tract",
        storageClass="DataFrame",
        dimensions=("tract", "skymap"),
    )


class ConsolidateObjectTableConfig(pipeBase.PipelineTaskConfig,
                                   pipelineConnections=ConsolidateObjectTableConnections):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )


class ConsolidateObjectTableTask(CmdLineTask, pipeBase.PipelineTask):
    """Write patch-merged source tables to a tract-level parquet file

    Concatenates the `objectTable` list into a per-tract `objectTable_tract`.
    """
    _DefaultName = "consolidateObjectTable"
    ConfigClass = ConsolidateObjectTableConfig

    inputDataset = 'objectTable'
    outputDataset = 'objectTable_tract'

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        self.log.info("Concatenating %s per-patch Object Tables",
                      len(inputs['inputCatalogs']))
        df = pd.concat(inputs['inputCatalogs'])
        butlerQC.put(pipeBase.Struct(outputCatalog=df), outputRefs)

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id tract=12345",
                               ContainerClass=TractObjectDataIdContainer)
        return parser

    def runDataRef(self, patchRefList):
        df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
        patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass


class TransformSourceTableConnections(pipeBase.PipelineTaskConnections,
                                      dimensions=("instrument", "visit", "detector")):

    inputCatalog = connectionTypes.Input(
        doc="Wide input catalog of sources produced by WriteSourceTableTask",
        name="source",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector"),
        deferLoad=True
    )
    outputCatalog = connectionTypes.Output(
        doc="Narrower, per-detector Source Table transformed and converted per a "
            "specified set of functors",
        name="sourceTable",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector")
    )


class TransformSourceTableConfig(TransformCatalogBaseConfig,
                                 pipelineConnections=TransformSourceTableConnections):
    pass


class TransformSourceTableTask(TransformCatalogBaseTask):
    """Transform/standardize a source catalog
    """
    _DefaultName = "transformSourceTable"
    ConfigClass = TransformSourceTableConfig

    inputDataset = 'source'
    outputDataset = 'sourceTable'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", datasetType=cls.inputDataset,
                               level="sensor",
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser

    def runDataRef(self, dataRef):
        """Override to specify band label to run()."""
        parq = dataRef.get()
        funcs = self.getFunctors()
        band = dataRef.get("calexp_filterLabel", immediate=True).bandLabel
        df = self.run(parq, funcs=funcs, dataId=dataRef.dataId, band=band)
        self.write(df, dataRef)
        return df


class ConsolidateVisitSummaryConnections(pipeBase.PipelineTaskConnections,
                                         dimensions=("instrument", "visit",),
                                         defaultTemplates={}):
    calexp = connectionTypes.Input(
        doc="Processed exposures used for metadata",
        name="calexp",
        storageClass="ExposureF",
        dimensions=("instrument", "visit", "detector"),
        deferLoad=True,
        multiple=True,
    )
    visitSummary = connectionTypes.Output(
        doc=("Per-visit consolidated exposure metadata. These catalogs use "
             "detector id for the id and are sorted for fast lookups of a "
             "detector."),
        name="visitSummary",
        storageClass="ExposureCatalog",
        dimensions=("instrument", "visit"),
    )


class ConsolidateVisitSummaryConfig(pipeBase.PipelineTaskConfig,
                                    pipelineConnections=ConsolidateVisitSummaryConnections):
    """Config for ConsolidateVisitSummaryTask"""
    pass


class ConsolidateVisitSummaryTask(pipeBase.PipelineTask, pipeBase.CmdLineTask):
    """Task to consolidate per-detector visit metadata.

    This task aggregates the following metadata from all the detectors in a
    single visit into an exposure catalog:
    - The visitInfo.
    - The wcs.
    - The photoCalib.
    - The physical_filter and band (if available).
    - The psf size, shape, and effective area at the center of the detector.
    - The corners of the bounding box in right ascension/declination.

    Other quantities such as Detector, Psf, ApCorrMap, and TransmissionCurve
    are not persisted here because of storage concerns, and because of their
    limited utility as summary statistics.

    Tests for this task are performed in ci_hsc_gen3.
    """
    _DefaultName = "consolidateVisitSummary"
    ConfigClass = ConsolidateVisitSummaryConfig

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", "calexp",
                               help="data ID, e.g. --id visit=12345",
                               ContainerClass=VisitDataIdContainer)
        return parser

    def writeMetadata(self, dataRef):
        """No metadata to persist, so override to remove metadata persistence.
        """
        pass

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """No config to persist, so override to remove config persistence.
        """
        pass

    def runDataRef(self, dataRefList):
        visit = dataRefList[0].dataId['visit']

        self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
                       (len(dataRefList), visit))

        expCatalog = self._combineExposureMetadata(visit, dataRefList, isGen3=False)

        dataRefList[0].put(expCatalog, 'visitSummary', visit=visit)

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        dataRefs = butlerQC.get(inputRefs.calexp)
        visit = dataRefs[0].dataId.byName()['visit']

        self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
                       (len(dataRefs), visit))

        expCatalog = self._combineExposureMetadata(visit, dataRefs)

        butlerQC.put(expCatalog, outputRefs.visitSummary)

    def _combineExposureMetadata(self, visit, dataRefs, isGen3=True):
        """Make a combined exposure catalog from a list of dataRefs.

        These dataRefs must point to exposures with wcs, summaryStats,
        and other visit metadata.

        Parameters
        ----------
        visit : `int`
            Visit identification number.
        dataRefs : `list`
            List of dataRefs in visit. May be list of
            `lsst.daf.persistence.ButlerDataRef` (Gen2) or
            `lsst.daf.butler.DeferredDatasetHandle` (Gen3).
        isGen3 : `bool`, optional
            Specifies if this is a Gen3 list of datarefs.

        Returns
        -------
        visitSummary : `lsst.afw.table.ExposureCatalog`
            Exposure catalog with per-detector summary information.
        """
        schema = self._makeVisitSummarySchema()
        cat = afwTable.ExposureCatalog(schema)
        cat.resize(len(dataRefs))

        cat['visit'] = visit

        for i, dataRef in enumerate(dataRefs):
            if isGen3:
                visitInfo = dataRef.get(component='visitInfo')
                filterLabel = dataRef.get(component='filterLabel')
                summaryStats = dataRef.get(component='summaryStats')
                detector = dataRef.get(component='detector')
                wcs = dataRef.get(component='wcs')
                photoCalib = dataRef.get(component='photoCalib')
                bbox = dataRef.get(component='bbox')
                validPolygon = dataRef.get(component='validPolygon')
            else:
                # Note that we need to read the calexp because there is
                # no magic access to the psf except through the exposure.
                gen2_read_bbox = lsst.geom.BoxI(lsst.geom.PointI(0, 0), lsst.geom.PointI(1, 1))
                exp = dataRef.get(datasetType='calexp_sub', bbox=gen2_read_bbox)
                visitInfo = exp.getInfo().getVisitInfo()
                filterLabel = dataRef.get("calexp_filterLabel")
                summaryStats = exp.getInfo().getSummaryStats()
                wcs = exp.getWcs()
                photoCalib = exp.getPhotoCalib()
                detector = exp.getDetector()
                bbox = dataRef.get(datasetType='calexp_bbox')
                validPolygon = exp.getInfo().getValidPolygon()

            rec = cat[i]
            rec.setBBox(bbox)
            rec.setVisitInfo(visitInfo)
            rec.setWcs(wcs)
            rec.setPhotoCalib(photoCalib)
            rec.setValidPolygon(validPolygon)

            rec['physical_filter'] = filterLabel.physicalLabel if filterLabel.hasPhysicalLabel() else ""
            rec['band'] = filterLabel.bandLabel if filterLabel.hasBandLabel() else ""
            rec.setId(detector.getId())
            rec['psfSigma'] = summaryStats.psfSigma
            rec['psfIxx'] = summaryStats.psfIxx
            rec['psfIyy'] = summaryStats.psfIyy
            rec['psfIxy'] = summaryStats.psfIxy
            rec['psfArea'] = summaryStats.psfArea
            rec['raCorners'][:] = summaryStats.raCorners
            rec['decCorners'][:] = summaryStats.decCorners
            rec['ra'] = summaryStats.ra
            rec['decl'] = summaryStats.decl
            rec['zenithDistance'] = summaryStats.zenithDistance
            rec['zeroPoint'] = summaryStats.zeroPoint
            rec['skyBg'] = summaryStats.skyBg
            rec['skyNoise'] = summaryStats.skyNoise
            rec['meanVar'] = summaryStats.meanVar

        metadata = dafBase.PropertyList()
        metadata.add("COMMENT", "Catalog id is detector id, sorted.")
        # We are looping over existing datarefs, so the following is true
        metadata.add("COMMENT", "Only detectors with data have entries.")
        cat.setMetadata(metadata)

        cat.sort()
        return cat

    def _makeVisitSummarySchema(self):
        """Make the schema for the visitSummary catalog."""
        schema = afwTable.ExposureTable.makeMinimalSchema()
        schema.addField('visit', type='I', doc='Visit number')
        schema.addField('physical_filter', type='String', size=32, doc='Physical filter')
        schema.addField('band', type='String', size=32, doc='Name of band')
        schema.addField('psfSigma', type='F',
                        doc='PSF model second-moments determinant radius (center of chip) (pixel)')
        schema.addField('psfArea', type='F',
                        doc='PSF model effective area (center of chip) (pixel**2)')
        schema.addField('psfIxx', type='F',
                        doc='PSF model Ixx (center of chip) (pixel**2)')
        schema.addField('psfIyy', type='F',
                        doc='PSF model Iyy (center of chip) (pixel**2)')
        schema.addField('psfIxy', type='F',
                        doc='PSF model Ixy (center of chip) (pixel**2)')
        schema.addField('raCorners', type='ArrayD', size=4,
                        doc='Right Ascension of bounding box corners (degrees)')
        schema.addField('decCorners', type='ArrayD', size=4,
                        doc='Declination of bounding box corners (degrees)')
        schema.addField('ra', type='D',
                        doc='Right Ascension of bounding box center (degrees)')
        schema.addField('decl', type='D',
                        doc='Declination of bounding box center (degrees)')
        schema.addField('zenithDistance', type='F',
                        doc='Zenith distance of bounding box center (degrees)')
        schema.addField('zeroPoint', type='F',
                        doc='Mean zeropoint in detector (mag)')
        schema.addField('skyBg', type='F',
                        doc='Average sky background (ADU)')
        schema.addField('skyNoise', type='F',
                        doc='Average sky noise (ADU)')
        schema.addField('meanVar', type='F',
                        doc='Mean variance of the weight plane (ADU**2)')

        return schema
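

# Illustrative sketch (not part of the pipeline): reading a visitSummary produced by
# ConsolidateVisitSummaryTask with a Gen3 butler and looking up one detector. The instrument
# name, visit, and detector id are hypothetical.
def _readVisitSummaryExample(butler, visit=12345, detector=42):
    visitSummary = butler.get('visitSummary', instrument='HSC', visit=visit)
    # The catalog id is the detector id and the catalog is sorted, so find() works.
    row = visitSummary.find(detector)
    return row['psfSigma'], row['zeroPoint'], row.getWcs()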


class VisitDataIdContainer(DataIdContainer):
    """DataIdContainer that groups sensor-level IDs by visit.
    """

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList

        Generate a list of data references grouped by visit.

        Parameters
        ----------
        namespace : `argparse.Namespace`
            Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments
        """
        # Group by visits
        visitRefs = defaultdict(list)
        for dataId in self.idList:
            if "visit" in dataId:
                visitId = dataId["visit"]
                # Append all data references matching this dataId to the visit's list
                subset = namespace.butler.subset(self.datasetType, dataId=dataId)
                visitRefs[visitId].extend([dataRef for dataRef in subset])

        outputRefList = []
        for refList in visitRefs.values():
            existingRefs = [ref for ref in refList if ref.datasetExists()]
            if existingRefs:
                outputRefList.append(existingRefs)

        self.refList = outputRefList


class ConsolidateSourceTableConnections(pipeBase.PipelineTaskConnections,
                                        dimensions=("instrument", "visit")):
    inputCatalogs = connectionTypes.Input(
        doc="Input per-detector Source Tables",
        name="sourceTable",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector"),
        multiple=True
    )
    outputCatalog = connectionTypes.Output(
        doc="Per-visit concatenation of Source Table",
        name="sourceTable_visit",
        storageClass="DataFrame",
        dimensions=("instrument", "visit")
    )


class ConsolidateSourceTableConfig(pipeBase.PipelineTaskConfig,
                                   pipelineConnections=ConsolidateSourceTableConnections):
    pass


class ConsolidateSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
    """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`
    """
    _DefaultName = 'consolidateSourceTable'
    ConfigClass = ConsolidateSourceTableConfig

    inputDataset = 'sourceTable'
    outputDataset = 'sourceTable_visit'

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        self.log.info("Concatenating %s per-detector Source Tables",
                      len(inputs['inputCatalogs']))
        df = pd.concat(inputs['inputCatalogs'])
        butlerQC.put(pipeBase.Struct(outputCatalog=df), outputRefs)

    def runDataRef(self, dataRefList):
        self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
        df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
        dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id visit=12345",
                               ContainerClass=VisitDataIdContainer)
        return parser

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """No config to write.
        """
        pass