# This file is part of pipe_tasks
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import functools

import pandas as pd
import numpy as np
from collections import defaultdict

import lsst.geom
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
from lsst.pipe.base import connectionTypes
import lsst.afw.table as afwTable
from lsst.meas.base import SingleFrameMeasurementTask
from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer

from .parquetTable import ParquetTable
from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
from .functors import CompositeFunctor, RAColumn, DecColumn, Column


def flattenFilters(df, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
    """Flatten a dataframe with a multilevel column index.
    """
    newDf = pd.DataFrame()
    for band in set(df.columns.to_frame()['band']):
        subdf = df[band]
        columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
        newColumns = {c: columnFormat.format(band, c)
                      for c in subdf.columns if c not in noDupCols}
        cols = list(newColumns.keys())
        newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)

    newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
    return newDf
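
# A minimal sketch (illustrative, not part of the pipeline) of what
# `flattenFilters` does; the column names below are hypothetical. The input
# must carry a column MultiIndex with a 'band' level:
#
#   >>> cols = pd.MultiIndex.from_tuples(
#   ...     [('g', 'PsFlux'), ('r', 'PsFlux')], names=('band', 'column'))
#   >>> df = pd.DataFrame([[1.0, 2.0]], columns=cols)
#   >>> sorted(flattenFilters(df, noDupCols=[], camelCase=True).columns)
#   ['gPsFlux', 'rPsFlux']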


class WriteObjectTableConfig(pexConfig.Config):
    engine = pexConfig.Field(
        dtype=str,
        default="pyarrow",
        doc="Parquet engine for writing (pyarrow or fastparquet)"
    )
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )


class WriteObjectTableTask(CmdLineTask):
    """Write filter-merged source tables to parquet.
    """
    _DefaultName = "writeObjectTable"
    ConfigClass = WriteObjectTableConfig
    RunnerClass = MergeSourcesRunner

    # Names of table datasets to be merged
    inputDatasets = ('forced_src', 'meas', 'ref')

    # Tag of output dataset written by `MergeSourcesTask.write`
    outputDataset = 'obj'

    def __init__(self, butler=None, schema=None, **kwargs):
        # It is a shame that this class can't use the default init for
        # CmdLineTask, but to do so would require its own special task
        # runner, which is many more lines of specialization, so this is
        # how it is for now.
        CmdLineTask.__init__(self, **kwargs)

    def runDataRef(self, patchRefList):
        """Merge coadd sources from multiple bands.

        Calls `run` on the catalogs read from ``patchRefList``.

        Parameters
        ----------
        patchRefList : `list` of `lsst.daf.persistence.ButlerDataRef`
            List of data references, one for each filter.
        """
        catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
        dataId = patchRefList[0].dataId
        mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
        self.write(patchRefList[0], mergedCatalog)

    @classmethod
    def _makeArgumentParser(cls):
        """Create a suitable ArgumentParser.

        We will use the ArgumentParser to get a list of data
        references for patches; the RunnerClass will sort them into lists
        of data references for the same patch.

        References the first of ``self.inputDatasets``, rather than
        ``self.inputDataset``.
        """
        return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])

    def readCatalog(self, patchRef):
        """Read input catalogs.

        Read all the input datasets given by the 'inputDatasets'
        attribute.

        Parameters
        ----------
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch.

        Returns
        -------
        Tuple consisting of band name and a dict of catalogs, keyed by
        dataset name.
        """
        band = patchRef.get(self.config.coaddName + "Coadd_filterLabel", immediate=True).bandLabel
        catalogDict = {}
        for dataset in self.inputDatasets:
            catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
            self.log.info("Read %d sources from %s for band %s: %s",
                          len(catalog), dataset, band, patchRef.dataId)
            catalogDict[dataset] = catalog
        return band, catalogDict

    def run(self, catalogs, tract, patch):
        """Merge multiple catalogs.

        Parameters
        ----------
        catalogs : `dict`
            Mapping from filter names to dict of catalogs.
        tract : `int`
            tractId to use for the tractId column.
        patch : `str`
            patchId to use for the patchId column.

        Returns
        -------
        catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
            Merged dataframe, with each column prefixed by
            `filter_tag(filt)`, wrapped in the parquet writer shim class.
        """
        dfs = []
        for filt, tableDict in catalogs.items():
            for dataset, table in tableDict.items():
                # Convert afwTable to pandas DataFrame
                df = table.asAstropy().to_pandas().set_index('id', drop=True)

                # Sort columns by name, to ensure matching schema among patches
                df = df.reindex(sorted(df.columns), axis=1)
                df['tractId'] = tract
                df['patchId'] = patch

                # Make columns a 3-level MultiIndex
                df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                       names=('dataset', 'band', 'column'))
                dfs.append(df)

        catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
        return ParquetTable(dataFrame=catalog)

    def write(self, patchRef, catalog):
        """Write the output.

        Parameters
        ----------
        catalog : `ParquetTable`
            Catalog to write.
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch.
        """
        patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
        # since the filter isn't actually part of the data ID for the dataset
        # we're saving, it's confusing to see it in the log message, even if
        # the butler simply ignores it.
        mergeDataId = patchRef.dataId.copy()
        del mergeDataId["filter"]
        self.log.info("Wrote merged catalog: %s", mergeDataId)

    def writeMetadata(self, dataRefList):
        """No metadata to write, and not sure how to write it for a list of
        dataRefs.
        """
        pass
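
# Illustrative sketch (not part of the pipeline) of the 3-level
# (dataset, band, column) MultiIndex produced by WriteObjectTableTask.run;
# the column name below is hypothetical:
#
#   >>> idx = pd.MultiIndex.from_tuples(
#   ...     [('meas', 'g', 'base_PsfFlux_instFlux')],
#   ...     names=('dataset', 'band', 'column'))
#   >>> obj = pd.DataFrame([[1.0]], columns=idx)
#   >>> obj['meas']['g'].columns.tolist()
#   ['base_PsfFlux_instFlux']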


class WriteSourceTableConnections(pipeBase.PipelineTaskConnections,
                                  dimensions=("instrument", "visit", "detector")):

    catalog = connectionTypes.Input(
        doc="Input full-depth catalog of sources produced by CalibrateTask",
        name="src",
        storageClass="SourceCatalog",
        dimensions=("instrument", "visit", "detector")
    )
    outputCatalog = connectionTypes.Output(
        doc="Catalog of sources, `src` in Parquet format",
        name="source",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector")
    )


class WriteSourceTableConfig(pipeBase.PipelineTaskConfig,
                             pipelineConnections=WriteSourceTableConnections):
    doApplyExternalPhotoCalib = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Add local photoCalib columns from the calexp.photoCalib? Should only be set True when "
             "generating Source Tables from older src tables which do not already have local calib columns")
    )
    doApplyExternalSkyWcs = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Add local WCS columns from the calexp.wcs? Should only be set True when "
             "generating Source Tables from older src tables which do not already have local calib columns")
    )


class WriteSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
    """Write source table to parquet.
    """
    _DefaultName = "writeSourceTable"
    ConfigClass = WriteSourceTableConfig

    def runDataRef(self, dataRef):
        src = dataRef.get('src')
        if self.config.doApplyExternalPhotoCalib or self.config.doApplyExternalSkyWcs:
            src = self.addCalibColumns(src, dataRef)

        ccdVisitId = dataRef.get('ccdExposureId')
        result = self.run(src, ccdVisitId=ccdVisitId)
        dataRef.put(result.table, 'source')

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        inputs['ccdVisitId'] = butlerQC.quantum.dataId.pack("visit_detector")
        result = self.run(**inputs).table
        outputs = pipeBase.Struct(outputCatalog=result.toDataFrame())
        butlerQC.put(outputs, outputRefs)

    def run(self, catalog, ccdVisitId=None):
        """Convert `src` catalog to parquet.

        Parameters
        ----------
        catalog : `afwTable.SourceCatalog`
            Catalog to be converted.
        ccdVisitId : `int`
            ccdVisitId to be added as a column.

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            ``table``
                `ParquetTable` version of the input catalog.
        """
        self.log.info("Generating parquet table from src catalog %s", ccdVisitId)
        df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
        df['ccdVisitId'] = ccdVisitId
        return pipeBase.Struct(table=ParquetTable(dataFrame=df))

    def addCalibColumns(self, catalog, dataRef):
        """Add columns with local calibration evaluated at each centroid,
        for backwards compatibility with old repos.

        This exists for the purpose of converting old src catalogs
        (which don't have the expected local calib columns) to Source Tables.

        Parameters
        ----------
        catalog : `afwTable.SourceCatalog`
            Catalog to which calib columns will be added.
        dataRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for fetching the calibs from disk.

        Returns
        -------
        newCat : `afwTable.SourceCatalog`
            Source Catalog with requested local calib columns.
        """
        mapper = afwTable.SchemaMapper(catalog.schema)
        measureConfig = SingleFrameMeasurementTask.ConfigClass()
        measureConfig.doReplaceWithNoise = False

        # Just need the WCS or the PhotoCalib attached to an exposure
        exposure = dataRef.get('calexp_sub',
                               bbox=lsst.geom.Box2I(lsst.geom.Point2I(0, 0), lsst.geom.Point2I(0, 0)))

        mapper = afwTable.SchemaMapper(catalog.schema)
        mapper.addMinimalSchema(catalog.schema, True)
        schema = mapper.getOutputSchema()

        exposureIdInfo = dataRef.get("expIdInfo")
        measureConfig.plugins.names = []
        if self.config.doApplyExternalSkyWcs:
            plugin = 'base_LocalWcs'
            if plugin in schema:
                raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalSkyWcs=False")
            else:
                measureConfig.plugins.names.add(plugin)

        if self.config.doApplyExternalPhotoCalib:
            plugin = 'base_LocalPhotoCalib'
            if plugin in schema:
                raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalPhotoCalib=False")
            else:
                measureConfig.plugins.names.add(plugin)

        measurement = SingleFrameMeasurementTask(config=measureConfig, schema=schema)
        newCat = afwTable.SourceCatalog(schema)
        newCat.extend(catalog, mapper=mapper)
        measurement.run(measCat=newCat, exposure=exposure, exposureId=exposureIdInfo.expId)
        return newCat

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", 'src',
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser
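
# Hedged usage sketch for WriteSourceTableTask.run; the `src` catalog and
# visit id below are hypothetical placeholders, so the lines are skipped:
#
#   >>> task = WriteSourceTableTask()                      # doctest: +SKIP
#   >>> result = task.run(src, ccdVisitId=1234)            # doctest: +SKIP
#   >>> result.table.toDataFrame()['ccdVisitId'].unique()  # doctest: +SKIP
#   array([1234])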


class PostprocessAnalysis(object):
    """Calculate columns from a ParquetTable.

    This object manages and organizes an arbitrary set of computations
    on a catalog. The catalog is defined by a
    `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
    `deepCoadd_obj` dataset, and the computations are defined by a collection
    of `lsst.pipe.tasks.functor.Functor` objects (or, equivalently,
    a `CompositeFunctor`).

    After the object is initialized, accessing the `.df` attribute (which
    holds the `pandas.DataFrame` containing the results of the calculations)
    triggers computation of said dataframe.

    One of the conveniences of using this object is the ability to define a
    desired common filter for all functors. This enables the same functor
    collection to be passed to several different `PostprocessAnalysis` objects
    without having to change the original functor collection, since the `filt`
    keyword argument of this object triggers an overwrite of the `filt`
    property for all functors in the collection.

    This object also allows a list of refFlags to be passed, and defines a
    set of default refFlags that are always included even if not requested.

    If a list of `ParquetTable` objects is passed, rather than a single one,
    then the calculations will be mapped over all the input catalogs. In
    principle, it should be straightforward to parallelize this activity, but
    initial tests have failed (see TODO in code comments).

    Parameters
    ----------
    parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
        Source catalog(s) for computation.
    functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
        Computations to do (functors that act on `parq`).
        If a dict, the output DataFrame will have columns keyed accordingly.
        If a list, the column keys will come from the `.shortname` attribute
        of each functor.
    filt : `str`, optional
        Filter in which to calculate. If provided, this will overwrite any
        existing `.filt` attribute of the provided functors.
    flags : `list`, optional
        List of flags (per-band) to include in output table.
    refFlags : `list`, optional
        List of refFlags (only reference band) to include in output table.
    """
    _defaultRefFlags = []
    _defaultFuncs = (('coord_ra', RAColumn()),
                     ('coord_dec', DecColumn()))

    def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
        self.parq = parq
        self.functors = functors

        self.filt = filt
        self.flags = list(flags) if flags is not None else []
        self.refFlags = list(self._defaultRefFlags)
        if refFlags is not None:
            self.refFlags += list(refFlags)

        self._df = None

    @property
    def defaultFuncs(self):
        funcs = dict(self._defaultFuncs)
        return funcs

    @property
    def func(self):
        additionalFuncs = self.defaultFuncs
        additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
        additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})

        if isinstance(self.functors, CompositeFunctor):
            func = self.functors
        else:
            func = CompositeFunctor(self.functors)

        func.funcDict.update(additionalFuncs)
        func.filt = self.filt

        return func

    @property
    def noDupCols(self):
        return [name for name, func in self.func.funcDict.items() if func.noDup or func.dataset == 'ref']

    @property
    def df(self):
        if self._df is None:
            self.compute()
        return self._df

    def compute(self, dropna=False, pool=None):
        # map over multiple parquet tables
        if type(self.parq) in (list, tuple):
            if pool is None:
                dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
            else:
                # TODO: Figure out why this doesn't work (pyarrow pickling issues?)
                dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
            self._df = pd.concat(dflist)
        else:
            self._df = self.func(self.parq, dropna=dropna)

        return self._df
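
# Hedged usage sketch for PostprocessAnalysis; the file name and flag below
# are hypothetical, and reading a real dataset needs an actual parquet file:
#
#   >>> parq = ParquetTable(filename='deepCoadd_obj.parq')    # doctest: +SKIP
#   >>> funcs = {'ra': RAColumn(), 'dec': DecColumn()}
#   >>> analysis = PostprocessAnalysis(parq, funcs, filt='g',
#   ...                                refFlags=['detect_isPrimary'])
#   >>> df = analysis.df  # accessing .df triggers compute()  # doctest: +SKIP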


class TransformCatalogBaseConnections(pipeBase.PipelineTaskConnections,
                                      dimensions=()):
    """Expected Connections for subclasses of TransformCatalogBaseTask.

    Must be subclassed.
    """
    inputCatalog = connectionTypes.Input(
        name="",
        storageClass="DataFrame",
    )
    outputCatalog = connectionTypes.Output(
        name="",
        storageClass="DataFrame",
    )


class TransformCatalogBaseConfig(pipeBase.PipelineTaskConfig,
                                 pipelineConnections=TransformCatalogBaseConnections):
    functorFile = pexConfig.Field(
        dtype=str,
        doc='Path to YAML file specifying functors to be computed',
        default=None,
        optional=True
    )


class TransformCatalogBaseTask(CmdLineTask, pipeBase.PipelineTask):
    """Base class for transforming/standardizing a catalog by applying
    functors that convert units and apply calibrations.

    The purpose of this task is to perform a set of computations on an input
    `ParquetTable` dataset (such as `deepCoadd_obj`) and write the results to
    a new dataset (which needs to be declared in an `outputDataset`
    attribute).

    The calculations to be performed are defined in a YAML file that
    specifies a set of functors to be computed, provided as the
    `functorFile` config parameter. An example of such a YAML file
    is the following:

        funcs:
            psfMag:
                functor: Mag
                args:
                    - base_PsfFlux
                filt: HSC-G
                dataset: meas
            cmodel_magDiff:
                functor: MagDiff
                args:
                    - modelfit_CModel
                    - base_PsfFlux
                filt: HSC-G
            gauss_magDiff:
                functor: MagDiff
                args:
                    - base_GaussianFlux
                    - base_PsfFlux
                filt: HSC-G
            count:
                functor: Column
                args:
                    - base_InputCount_value
                filt: HSC-G
            deconvolved_moments:
                functor: DeconvolvedMoments
                filt: HSC-G
                dataset: forced_src
        refFlags:
            - calib_psfUsed
            - merge_measurement_i
            - merge_measurement_r
            - merge_measurement_z
            - merge_measurement_y
            - merge_measurement_g
            - base_PixelFlags_flag_inexact_psfCenter
            - detect_isPrimary

    The names for each entry under "funcs" will become the names of columns
    in the output dataset. All the functors referenced are defined in
    `lsst.pipe.tasks.functors`. Positional arguments to be passed to each
    functor are in the `args` list, and any additional entries for each
    column other than "functor" or "args" (e.g., `'filt'`, `'dataset'`) are
    treated as keyword arguments to be passed to the functor initialization.

    The "refFlags" entry is a shortcut for a set of `Column` functors that
    keep the original column name and are taken from the `'ref'` dataset.

    The "flags" entry is expanded out per band.

    This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis`
    object to organize and execute the calculations.
    """
    @property
    def _DefaultName(self):
        raise NotImplementedError('Subclass must define "_DefaultName" attribute')

    @property
    def outputDataset(self):
        raise NotImplementedError('Subclass must define "outputDataset" attribute')

    @property
    def inputDataset(self):
        raise NotImplementedError('Subclass must define "inputDataset" attribute')

    @property
    def ConfigClass(self):
        raise NotImplementedError('Subclass must define "ConfigClass" attribute')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.config.functorFile:
            self.log.info('Loading transform functor definitions from %s',
                          self.config.functorFile)
            self.funcs = CompositeFunctor.from_file(self.config.functorFile)
            self.funcs.update(dict(PostprocessAnalysis._defaultFuncs))
        else:
            self.funcs = None

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        if self.funcs is None:
            raise ValueError("config.functorFile is None. "
                             "Must be a valid path to yaml in order to run as a PipelineTask.")
        result = self.run(parq=inputs['inputCatalog'], funcs=self.funcs,
                          dataId=outputRefs.outputCatalog.dataId.full)
        outputs = pipeBase.Struct(outputCatalog=result)
        butlerQC.put(outputs, outputRefs)

    def runDataRef(self, dataRef):
        parq = dataRef.get()
        if self.funcs is None:
            raise ValueError("config.functorFile is None. "
                             "Must be a valid path to yaml in order to run as a CmdLineTask.")
        df = self.run(parq, funcs=self.funcs, dataId=dataRef.dataId)
        self.write(df, dataRef)
        return df

    def run(self, parq, funcs=None, dataId=None, band=None):
        """Do postprocessing calculations.

        Takes a `ParquetTable` object and dataId,
        returns a dataframe with results of postprocessing calculations.

        Parameters
        ----------
        parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
            ParquetTable from which calculations are done.
        funcs : `lsst.pipe.tasks.functors.Functors`
            Functors to apply to the table's columns.
        dataId : `dict`, optional
            Used to add a `patchId` column to the output dataframe.
        band : `str`, optional
            Filter band that is being processed.

        Returns
        -------
        df : `pandas.DataFrame`
        """
        self.log.info("Transforming/standardizing the source table dataId: %s", dataId)

        df = self.transform(band, parq, funcs, dataId).df
        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df

    def getFunctors(self):
        return self.funcs

    def getAnalysis(self, parq, funcs=None, band=None):
        if funcs is None:
            funcs = self.funcs
        analysis = PostprocessAnalysis(parq, funcs, filt=band)
        return analysis

    def transform(self, band, parq, funcs, dataId):
        analysis = self.getAnalysis(parq, funcs=funcs, band=band)
        df = analysis.df
        if dataId is not None:
            for key, value in dataId.items():
                df[key] = value

        return pipeBase.Struct(
            df=df,
            analysis=analysis
        )

    def write(self, df, parqRef):
        parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass
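
# A minimal subclassing sketch (hypothetical names) showing the attributes a
# concrete subclass of TransformCatalogBaseTask must define; compare the real
# TransformSourceTableTask below:
#
#   class TransformFooCatalogTask(TransformCatalogBaseTask):
#       _DefaultName = "transformFooCatalog"
#       ConfigClass = TransformCatalogBaseConfig
#       inputDataset = 'foo'
#       outputDataset = 'fooTable'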


class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
    # TODO: remove in DM-27177
    filterMap = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc=("Dictionary mapping full filter name to short one for column name munging. "
             "These filters determine the output columns no matter what filters the "
             "input data actually contain."),
        deprecated=("Coadds are now identified by the band, so this transform is unused. "
                    "Will be removed after v22.")
    )
    outputBands = pexConfig.ListField(
        dtype=str,
        default=None,
        optional=True,
        doc=("These bands and only these bands will appear in the output,"
             " NaN-filled if the input does not include them."
             " If None, then use all bands found in the input.")
    )
    camelCase = pexConfig.Field(
        dtype=bool,
        default=True,
        doc=("Write per-band column names with camelCase, else underscore. "
             "For example: gPsFlux instead of g_PsFlux.")
    )
    multilevelOutput = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
             "and name-munged (False).")
    )


class TransformObjectCatalogTask(TransformCatalogBaseTask):
    """Produce a flattened Object Table to match the format specified in
    sdm_schemas.

    Do the same set of postprocessing calculations on all bands.

    This is identical to `TransformCatalogBaseTask`, except that it does the
    specified functor calculations for all filters present in the
    input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
    by the YAML file will be superseded.
    """
    _DefaultName = "transformObjectCatalog"
    ConfigClass = TransformObjectCatalogConfig

    inputDataset = 'deepCoadd_obj'
    outputDataset = 'objectTable'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               ContainerClass=CoaddDataIdContainer,
                               help="data ID, e.g. --id tract=12345 patch=1,2")
        return parser

    def run(self, parq, funcs=None, dataId=None, band=None):
        # NOTE: band kwarg is ignored here.
        dfDict = {}
        analysisDict = {}
        templateDf = pd.DataFrame()
        outputBands = parq.columnLevelNames['band'] if self.config.outputBands is None else \
            self.config.outputBands

        # Perform transform for data of filters that exist in parq.
        for inputBand in parq.columnLevelNames['band']:
            if inputBand not in outputBands:
                self.log.info("Ignoring %s band data in the input", inputBand)
                continue
            self.log.info("Transforming the catalog of band %s", inputBand)
            result = self.transform(inputBand, parq, funcs, dataId)
            dfDict[inputBand] = result.df
            analysisDict[inputBand] = result.analysis
            if templateDf.empty:
                templateDf = result.df

        # Fill NaNs in columns of other wanted bands
        for filt in outputBands:
            if filt not in dfDict:
                self.log.info("Adding empty columns for band %s", filt)
                dfDict[filt] = pd.DataFrame().reindex_like(templateDf)

        # This makes a multilevel column index, with band as first level
        df = pd.concat(dfDict, axis=1, names=['band', 'column'])

        if not self.config.multilevelOutput:
            noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
            if dataId is not None:
                noDupCols += list(dataId.keys())
            df = flattenFilters(df, noDupCols=noDupCols, camelCase=self.config.camelCase)

        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df


class TractObjectDataIdContainer(CoaddDataIdContainer):

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList.

        Generate a list of data references given tract and/or patch.
        This was adapted from `TractQADataIdContainer`, which was
        `TractDataIdContainer` modified to not require "filter".
        Only existing dataRefs are returned.
        """
        def getPatchRefList(tract):
            return [namespace.butler.dataRef(datasetType=self.datasetType,
                                             tract=tract.getId(),
                                             patch="%d,%d" % patch.getIndex()) for patch in tract]

        tractRefs = defaultdict(list)  # Data references for each tract
        for dataId in self.idList:
            skymap = self.getSkymap(namespace)

            if "tract" in dataId:
                tractId = dataId["tract"]
                if "patch" in dataId:
                    tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
                                                                       tract=tractId,
                                                                       patch=dataId['patch']))
                else:
                    tractRefs[tractId] += getPatchRefList(skymap[tractId])
            else:
                tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
                                 for tract in skymap)

        outputRefList = []
        for tractRefList in tractRefs.values():
            existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
            outputRefList.append(existingRefs)

        self.refList = outputRefList


class ConsolidateObjectTableConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )


class ConsolidateObjectTableTask(CmdLineTask):
    """Write patch-merged source tables to a tract-level parquet file.
    """
    _DefaultName = "consolidateObjectTable"
    ConfigClass = ConsolidateObjectTableConfig

    inputDataset = 'objectTable'
    outputDataset = 'objectTable_tract'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id tract=12345",
                               ContainerClass=TractObjectDataIdContainer)
        return parser

    def runDataRef(self, patchRefList):
        df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
        patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass


class TransformSourceTableConnections(pipeBase.PipelineTaskConnections,
                                      dimensions=("instrument", "visit", "detector")):

    inputCatalog = connectionTypes.Input(
        doc="Wide input catalog of sources produced by WriteSourceTableTask",
        name="source",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector"),
        deferLoad=True
    )
    outputCatalog = connectionTypes.Output(
        doc="Narrower, per-detector Source Table transformed and converted per a "
            "specified set of functors",
        name="sourceTable",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector")
    )


class TransformSourceTableConfig(TransformCatalogBaseConfig,
                                 pipelineConnections=TransformSourceTableConnections):
    pass


class TransformSourceTableTask(TransformCatalogBaseTask):
    """Transform/standardize a source catalog.
    """
    _DefaultName = "transformSourceTable"
    ConfigClass = TransformSourceTableConfig

    inputDataset = 'source'
    outputDataset = 'sourceTable'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", datasetType=cls.inputDataset,
                               level="sensor",
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser

    def runDataRef(self, dataRef):
        """Override to specify band label to run()."""
        parq = dataRef.get()
        funcs = self.getFunctors()
        band = dataRef.get("calexp_filterLabel", immediate=True).bandLabel
        df = self.run(parq, funcs=funcs, dataId=dataRef.dataId, band=band)
        self.write(df, dataRef)
        return df


class ConsolidateVisitSummaryConnections(pipeBase.PipelineTaskConnections,
                                         dimensions=("instrument", "visit",),
                                         defaultTemplates={}):
    calexp = connectionTypes.Input(
        doc="Processed exposures used for metadata",
        name="calexp",
        storageClass="ExposureF",
        dimensions=("instrument", "visit", "detector"),
        deferLoad=True,
        multiple=True,
    )
    visitSummary = connectionTypes.Output(
        doc="Consolidated visit-level exposure metadata",
        name="visitSummary",
        storageClass="ExposureCatalog",
        dimensions=("instrument", "visit"),
    )


class ConsolidateVisitSummaryConfig(pipeBase.PipelineTaskConfig,
                                    pipelineConnections=ConsolidateVisitSummaryConnections):
    """Config for ConsolidateVisitSummaryTask"""
    pass


class ConsolidateVisitSummaryTask(pipeBase.PipelineTask, pipeBase.CmdLineTask):
    """Task to consolidate per-detector visit metadata.

    This task aggregates the following metadata from all the detectors in a
    single visit into an exposure catalog:
    - The visitInfo.
    - The wcs.
    - The photoCalib.
    - The physical_filter and band (if available).
    - The psf size, shape, and effective area at the center of the detector.
    - The corners of the bounding box in right ascension/declination.

    Other quantities such as Psf, ApCorrMap, and TransmissionCurve are not
    persisted here because of storage concerns, and because of their limited
    utility as summary statistics.

    Tests for this task are performed in ci_hsc_gen3.
    """
    _DefaultName = "consolidateVisitSummary"
    ConfigClass = ConsolidateVisitSummaryConfig

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", "calexp",
                               help="data ID, e.g. --id visit=12345",
                               ContainerClass=VisitDataIdContainer)
        return parser

    def writeMetadata(self, dataRef):
        """No metadata to persist, so override to remove metadata persistence.
        """
        pass

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """No config to persist, so override to remove config persistence.
        """
        pass

    def runDataRef(self, dataRefList):
        visit = dataRefList[0].dataId['visit']

        self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)",
                       len(dataRefList), visit)

        expCatalog = self._combineExposureMetadata(visit, dataRefList, isGen3=False)

        dataRefList[0].put(expCatalog, 'visitSummary', visit=visit)

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        dataRefs = butlerQC.get(inputRefs.calexp)
        visit = dataRefs[0].dataId.byName()['visit']

        self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)",
                       len(dataRefs), visit)

        expCatalog = self._combineExposureMetadata(visit, dataRefs)

        butlerQC.put(expCatalog, outputRefs.visitSummary)

    def _combineExposureMetadata(self, visit, dataRefs, isGen3=True):
        """Make a combined exposure catalog from a list of dataRefs.

        Parameters
        ----------
        visit : `int`
            Visit identification number.
        dataRefs : `list`
            List of calexp dataRefs in visit. May be list of
            `lsst.daf.persistence.ButlerDataRef` (Gen2) or
            `lsst.daf.butler.DeferredDatasetHandle` (Gen3).
        isGen3 : `bool`, optional
            Specifies if this is a Gen3 list of datarefs.

        Returns
        -------
        visitSummary : `lsst.afw.table.ExposureCatalog`
            Exposure catalog with per-detector summary information.
        """
        schema = afwTable.ExposureTable.makeMinimalSchema()
        schema.addField('visit', type='I', doc='Visit number')
        schema.addField('detector_id', type='I', doc='Detector number')
        schema.addField('physical_filter', type='String', size=32, doc='Physical filter')
        schema.addField('band', type='String', size=32, doc='Name of band')
        schema.addField('psfSigma', type='F',
                        doc='PSF model second-moments determinant radius (center of chip) (pixel)')
        schema.addField('psfArea', type='F',
                        doc='PSF model effective area (center of chip) (pixel**2)')
        schema.addField('psfIxx', type='F',
                        doc='PSF model Ixx (center of chip) (pixel**2)')
        schema.addField('psfIyy', type='F',
                        doc='PSF model Iyy (center of chip) (pixel**2)')
        schema.addField('psfIxy', type='F',
                        doc='PSF model Ixy (center of chip) (pixel**2)')
        schema.addField('raCorners', type='ArrayD', size=4,
                        doc='Right Ascension of bounding box corners (degrees)')
        schema.addField('decCorners', type='ArrayD', size=4,
                        doc='Declination of bounding box corners (degrees)')

        cat = afwTable.ExposureCatalog(schema)
        cat.resize(len(dataRefs))

        cat['visit'] = visit

        for i, dataRef in enumerate(dataRefs):
            if isGen3:
                visitInfo = dataRef.get(component='visitInfo')
                filterLabel = dataRef.get(component='filterLabel')
                psf = dataRef.get(component='psf')
                wcs = dataRef.get(component='wcs')
                photoCalib = dataRef.get(component='photoCalib')
                detector = dataRef.get(component='detector')
                bbox = dataRef.get(component='bbox')
                validPolygon = dataRef.get(component='validPolygon')
            else:
                # Note that we need to read the calexp because there is
                # no magic access to the psf except through the exposure.
                gen2_read_bbox = lsst.geom.BoxI(lsst.geom.PointI(0, 0), lsst.geom.PointI(1, 1))
                exp = dataRef.get(datasetType='calexp_sub', bbox=gen2_read_bbox)
                visitInfo = exp.getInfo().getVisitInfo()
                filterLabel = dataRef.get("calexp_filterLabel")
                psf = exp.getPsf()
                wcs = exp.getWcs()
                photoCalib = exp.getPhotoCalib()
                detector = exp.getDetector()
                bbox = dataRef.get(datasetType='calexp_bbox')
                validPolygon = exp.getInfo().getValidPolygon()

            rec = cat[i]
            rec.setBBox(bbox)
            rec.setVisitInfo(visitInfo)
            rec.setWcs(wcs)
            rec.setPhotoCalib(photoCalib)
            rec.setDetector(detector)
            rec.setValidPolygon(validPolygon)

            rec['physical_filter'] = filterLabel.physicalLabel if filterLabel.hasPhysicalLabel() else ""
            rec['band'] = filterLabel.bandLabel if filterLabel.hasBandLabel() else ""
            rec['detector_id'] = detector.getId()
            shape = psf.computeShape(bbox.getCenter())
            rec['psfSigma'] = shape.getDeterminantRadius()
            rec['psfIxx'] = shape.getIxx()
            rec['psfIyy'] = shape.getIyy()
            rec['psfIxy'] = shape.getIxy()
            im = psf.computeKernelImage(bbox.getCenter())
            # The calculation of effective psf area is taken from
            # meas_base/src/PsfFlux.cc#L112. See
            # https://github.com/lsst/meas_base/blob/
            # 750bffe6620e565bda731add1509507f5c40c8bb/src/PsfFlux.cc#L112
            rec['psfArea'] = np.sum(im.array)/np.sum(im.array**2.)

            sph_pts = wcs.pixelToSky(lsst.geom.Box2D(bbox).getCorners())
            rec['raCorners'][:] = [sph.getRa().asDegrees() for sph in sph_pts]
            rec['decCorners'][:] = [sph.getDec().asDegrees() for sph in sph_pts]

        return cat
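
# Hedged sketch of reading back the visitSummary catalog produced above; the
# Gen3 butler call, data ID, and visit number are illustrative only:
#
#   >>> cat = butler.get('visitSummary', visit=12345)  # doctest: +SKIP
#   >>> cat[0]['psfSigma'], cat[0]['raCorners']        # doctest: +SKIP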


class VisitDataIdContainer(DataIdContainer):
    """DataIdContainer that groups sensor-level ids by visit.
    """

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList.

        Generate a list of data references grouped by visit.

        Parameters
        ----------
        namespace : `argparse.Namespace`
            Namespace used by `lsst.pipe.base.CmdLineTask` to parse command
            line arguments.
        """
        # Group by visits
        visitRefs = defaultdict(list)
        for dataId in self.idList:
            if "visit" in dataId:
                visitId = dataId["visit"]
                # Append all dataRefs in the subset matching this dataId
                subset = namespace.butler.subset(self.datasetType, dataId=dataId)
                visitRefs[visitId].extend([dataRef for dataRef in subset])

        outputRefList = []
        for refList in visitRefs.values():
            existingRefs = [ref for ref in refList if ref.datasetExists()]
            if existingRefs:
                outputRefList.append(existingRefs)

        self.refList = outputRefList


class ConsolidateSourceTableConnections(pipeBase.PipelineTaskConnections,
                                        dimensions=("instrument", "visit")):
    inputCatalogs = connectionTypes.Input(
        doc="Input per-detector Source Tables",
        name="sourceTable",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector"),
        multiple=True
    )
    outputCatalog = connectionTypes.Output(
        doc="Per-visit concatenation of Source Table",
        name="sourceTable_visit",
        storageClass="DataFrame",
        dimensions=("instrument", "visit")
    )


class ConsolidateSourceTableConfig(pipeBase.PipelineTaskConfig,
                                   pipelineConnections=ConsolidateSourceTableConnections):
    pass


class ConsolidateSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
    """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`.
    """
    _DefaultName = 'consolidateSourceTable'
    ConfigClass = ConsolidateSourceTableConfig

    inputDataset = 'sourceTable'
    outputDataset = 'sourceTable_visit'

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        self.log.info("Concatenating %s per-detector Source Tables",
                      len(inputs['inputCatalogs']))
        df = pd.concat(inputs['inputCatalogs'])
        butlerQC.put(pipeBase.Struct(outputCatalog=df), outputRefs)

    def runDataRef(self, dataRefList):
        self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
        df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
        dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id visit=12345",
                               ContainerClass=VisitDataIdContainer)
        return parser

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """No config to write.
        """
        pass