# Coverage for python/lsst/pipe/tasks/postprocess.py : 28%

# This file is part of pipe_tasks
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import functools
import pandas as pd
import numpy as np
from collections import defaultdict

import lsst.geom
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
import lsst.daf.base as dafBase
from lsst.pipe.base import connectionTypes
import lsst.afw.table as afwTable
from lsst.meas.base import SingleFrameMeasurementTask
from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer

from .parquetTable import ParquetTable
from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
from .functors import CompositeFunctor, RAColumn, DecColumn, Column


def flattenFilters(df, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
    """Flattens a dataframe with a multilevel column index.
    """
    newDf = pd.DataFrame()
    for band in set(df.columns.to_frame()['band']):
        subdf = df[band]
        columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
        newColumns = {c: columnFormat.format(band, c)
                      for c in subdf.columns if c not in noDupCols}
        cols = list(newColumns.keys())
        newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)

    newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
    return newDf
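

# The sketch below is illustrative only and is not used by any Task in this
# module: it shows, on a toy band-keyed DataFrame with hypothetical column
# names, how flattenFilters renames per-band columns (e.g. gPsFlux vs.
# g_PsFlux) while leaving the noDupCols untouched.
def _flattenFiltersExample():
    bands = ['g', 'r']
    perBand = pd.DataFrame({'coord_ra': [1.0, 2.0],
                            'coord_dec': [3.0, 4.0],
                            'PsFlux': [10.0, 20.0]})
    df = pd.concat({band: perBand for band in bands}, axis=1,
                   names=['band', 'column'])
    # Returns a flat table with columns such as coord_ra, coord_dec,
    # gPsFlux, rPsFlux
    return flattenFilters(df, camelCase=True)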


class WriteObjectTableConfig(pexConfig.Config):
    engine = pexConfig.Field(
        dtype=str,
        default="pyarrow",
        doc="Parquet engine for writing (pyarrow or fastparquet)"
    )
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )


class WriteObjectTableTask(CmdLineTask):
    """Write filter-merged source tables to parquet.
    """
    _DefaultName = "writeObjectTable"
    ConfigClass = WriteObjectTableConfig
    RunnerClass = MergeSourcesRunner

    # Names of table datasets to be merged
    inputDatasets = ('forced_src', 'meas', 'ref')

    # Tag of output dataset written by `MergeSourcesTask.write`
    outputDataset = 'obj'

    def __init__(self, butler=None, schema=None, **kwargs):
        # It is a shame that this class can't use the default init for CmdLineTask,
        # but to do so would require its own special task runner, which is many
        # more lines of specialization, so this is how it is for now.
        CmdLineTask.__init__(self, **kwargs)

    def runDataRef(self, patchRefList):
        """!
        @brief Merge coadd sources from multiple bands. Calls @ref `run`, which must be defined in
        subclasses that inherit from MergeSourcesTask.
        @param[in] patchRefList list of data references for each filter
        """
        catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
        dataId = patchRefList[0].dataId
        mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
        self.write(patchRefList[0], mergedCatalog)

    @classmethod
    def _makeArgumentParser(cls):
        """Create a suitable ArgumentParser.

        We will use the ArgumentParser to get a list of data
        references for patches; the RunnerClass will sort them into lists
        of data references for the same patch.

        References the first of self.inputDatasets, rather than
        self.inputDataset.
        """
        return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])

    def readCatalog(self, patchRef):
        """Read input catalogs.

        Read all the input datasets given by the 'inputDatasets'
        attribute.

        Parameters
        ----------
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch.

        Returns
        -------
        Tuple consisting of band name and a dict of catalogs, keyed by
        dataset name.
        """
        band = patchRef.get(self.config.coaddName + "Coadd_filterLabel", immediate=True).bandLabel
        catalogDict = {}
        for dataset in self.inputDatasets:
            catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
            self.log.info("Read %d sources from %s for band %s: %s" %
                          (len(catalog), dataset, band, patchRef.dataId))
            catalogDict[dataset] = catalog
        return band, catalogDict

    def run(self, catalogs, tract, patch):
        """Merge multiple catalogs.

        Parameters
        ----------
        catalogs : `dict`
            Mapping from filter names to dict of catalogs.
        tract : `int`
            tractId to use for the tractId column.
        patch : `str`
            patchId to use for the patchId column.

        Returns
        -------
        catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
            Merged dataframe, with each column prefixed by
            `filter_tag(filt)`, wrapped in the parquet writer shim class.
        """
        dfs = []
        for filt, tableDict in catalogs.items():
            for dataset, table in tableDict.items():
                # Convert afwTable to pandas DataFrame
                df = table.asAstropy().to_pandas().set_index('id', drop=True)

                # Sort columns by name, to ensure matching schema among patches
                df = df.reindex(sorted(df.columns), axis=1)
                df['tractId'] = tract
                df['patchId'] = patch

                # Make columns a 3-level MultiIndex
                df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                       names=('dataset', 'band', 'column'))
                dfs.append(df)

        catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
        return ParquetTable(dataFrame=catalog)

    def write(self, patchRef, catalog):
        """Write the output.

        Parameters
        ----------
        catalog : `ParquetTable`
            Catalog to write.
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch.
        """
        patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
        # since the filter isn't actually part of the data ID for the dataset we're saving,
        # it's confusing to see it in the log message, even if the butler simply ignores it.
        mergeDataId = patchRef.dataId.copy()
        del mergeDataId["filter"]
        self.log.info("Wrote merged catalog: %s" % (mergeDataId,))

    def writeMetadata(self, dataRefList):
        """No metadata to write, and not sure how to write it for a list of dataRefs.
        """
        pass
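

# The sketch below is illustrative only (hypothetical column names, not part
# of the pipeline): it builds a tiny DataFrame with the same 3-level
# (dataset, band, column) MultiIndex that WriteObjectTableTask.run assembles
# before wrapping the merged table in a ParquetTable.
def _objectTableMultiIndexExample():
    df = pd.DataFrame({'base_PsfFlux_instFlux': [1.0], 'tractId': [0]})
    df.columns = pd.MultiIndex.from_tuples(
        [('meas', 'g', c) for c in df.columns],
        names=('dataset', 'band', 'column'))
    # Selecting df['meas']['g'] recovers the per-dataset, per-band columns.
    return df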


class WriteSourceTableConnections(pipeBase.PipelineTaskConnections,
                                  dimensions=("instrument", "visit", "detector")):

    catalog = connectionTypes.Input(
        doc="Input full-depth catalog of sources produced by CalibrateTask",
        name="src",
        storageClass="SourceCatalog",
        dimensions=("instrument", "visit", "detector")
    )
    outputCatalog = connectionTypes.Output(
        doc="Catalog of sources, `src` in Parquet format",
        name="source",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector")
    )


class WriteSourceTableConfig(pipeBase.PipelineTaskConfig,
                             pipelineConnections=WriteSourceTableConnections):
    doApplyExternalPhotoCalib = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Add local photoCalib columns from the calexp.photoCalib? Should only be set True if "
             "generating Source Tables from older src tables which do not already have local calib columns")
    )
    doApplyExternalSkyWcs = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Add local WCS columns from the calexp.wcs? Should only be set True if "
             "generating Source Tables from older src tables which do not already have local calib columns")
    )


class WriteSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
    """Write source table to parquet.
    """
    _DefaultName = "writeSourceTable"
    ConfigClass = WriteSourceTableConfig

    def runDataRef(self, dataRef):
        src = dataRef.get('src')
        if self.config.doApplyExternalPhotoCalib or self.config.doApplyExternalSkyWcs:
            src = self.addCalibColumns(src, dataRef)

        ccdVisitId = dataRef.get('ccdExposureId')
        result = self.run(src, ccdVisitId=ccdVisitId)
        dataRef.put(result.table, 'source')

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        inputs['ccdVisitId'] = butlerQC.quantum.dataId.pack("visit_detector")
        result = self.run(**inputs).table
        outputs = pipeBase.Struct(outputCatalog=result.toDataFrame())
        butlerQC.put(outputs, outputRefs)

    def run(self, catalog, ccdVisitId=None):
        """Convert `src` catalog to parquet.

        Parameters
        ----------
        catalog : `afwTable.SourceCatalog`
            Catalog to be converted.
        ccdVisitId : `int`
            ccdVisitId to be added as a column.

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            ``table``
                `ParquetTable` version of the input catalog.
        """
        self.log.info("Generating parquet table from src catalog %s", ccdVisitId)
        df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
        df['ccdVisitId'] = ccdVisitId
        return pipeBase.Struct(table=ParquetTable(dataFrame=df))

    def addCalibColumns(self, catalog, dataRef):
        """Add columns with local calibration evaluated at each centroid,
        for backwards compatibility with old repos.

        This exists for the purpose of converting old src catalogs
        (which don't have the expected local calib columns) to Source Tables.

        Parameters
        ----------
        catalog : `afwTable.SourceCatalog`
            Catalog to which calib columns will be added.
        dataRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for fetching the calibs from disk.

        Returns
        -------
        newCat : `afwTable.SourceCatalog`
            Source Catalog with requested local calib columns.
        """
        mapper = afwTable.SchemaMapper(catalog.schema)
        measureConfig = SingleFrameMeasurementTask.ConfigClass()
        measureConfig.doReplaceWithNoise = False

        # Just need the WCS or the PhotoCalib attached to an exposure
        exposure = dataRef.get('calexp_sub',
                               bbox=lsst.geom.Box2I(lsst.geom.Point2I(0, 0), lsst.geom.Point2I(0, 0)))

        mapper = afwTable.SchemaMapper(catalog.schema)
        mapper.addMinimalSchema(catalog.schema, True)
        schema = mapper.getOutputSchema()

        exposureIdInfo = dataRef.get("expIdInfo")
        measureConfig.plugins.names = []
        if self.config.doApplyExternalSkyWcs:
            plugin = 'base_LocalWcs'
            if plugin in schema:
                raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalSkyWcs=False")
            else:
                measureConfig.plugins.names.add(plugin)

        if self.config.doApplyExternalPhotoCalib:
            plugin = 'base_LocalPhotoCalib'
            if plugin in schema:
                raise RuntimeError(f"{plugin} already in src catalog. Set doApplyExternalPhotoCalib=False")
            else:
                measureConfig.plugins.names.add(plugin)

        measurement = SingleFrameMeasurementTask(config=measureConfig, schema=schema)
        newCat = afwTable.SourceCatalog(schema)
        newCat.extend(catalog, mapper=mapper)
        measurement.run(measCat=newCat, exposure=exposure, exposureId=exposureIdInfo.expId)
        return newCat

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", 'src',
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser
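

# The sketch below is illustrative only (Gen2 butler, hypothetical dataId,
# not part of the pipeline): it shows how WriteSourceTableTask.run turns a
# single `src` catalog into a ParquetTable with a ccdVisitId column, outside
# of the usual CmdLineTask or PipelineTask drivers.
def _writeSourceTableExample(butler, dataId):
    task = WriteSourceTableTask()
    src = butler.get('src', dataId=dataId)
    ccdVisitId = butler.get('ccdExposureId', dataId=dataId)
    # result.table is a ParquetTable wrapping the converted DataFrame
    result = task.run(src, ccdVisitId=ccdVisitId)
    return result.table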


class PostprocessAnalysis(object):
    """Calculate columns from a ParquetTable.

    This object manages and organizes an arbitrary set of computations
    on a catalog. The catalog is defined by a
    `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
    `deepCoadd_obj` dataset, and the computations are defined by a collection
    of `lsst.pipe.tasks.functors.Functor` objects (or, equivalently,
    a `CompositeFunctor`).

    After the object is initialized, accessing the `.df` attribute (which
    holds the `pandas.DataFrame` containing the results of the calculations)
    triggers computation of said dataframe.

    One of the conveniences of using this object is the ability to define a
    desired common filter for all functors. This enables the same functor
    collection to be passed to several different `PostprocessAnalysis` objects
    without having to change the original functor collection, since the `filt`
    keyword argument of this object triggers an overwrite of the `filt`
    property for all functors in the collection.

    This object also allows a list of refFlags to be passed, and defines a
    set of default refFlags that are always included even if not requested.

    If a list of `ParquetTable` objects is passed, rather than a single one,
    then the calculations will be mapped over all the input catalogs. In
    principle, it should be straightforward to parallelize this activity, but
    initial tests have failed (see TODO in code comments).

    Parameters
    ----------
    parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
        Source catalog(s) for computation.

    functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
        Computations to do (functors that act on `parq`).
        If a dict, the output
        DataFrame will have columns keyed accordingly.
        If a list, the column keys will come from the
        `.shortname` attribute of each functor.

    filt : `str` (optional)
        Filter in which to calculate. If provided,
        this will overwrite any existing `.filt` attribute
        of the provided functors.

    flags : `list` (optional)
        List of flags (per-band) to include in output table.

    refFlags : `list` (optional)
        List of refFlags (only reference band) to include in output table.
    """
    _defaultRefFlags = []
    _defaultFuncs = (('coord_ra', RAColumn()),
                     ('coord_dec', DecColumn()))

    def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
        self.parq = parq
        self.functors = functors

        self.filt = filt
        self.flags = list(flags) if flags is not None else []
        self.refFlags = list(self._defaultRefFlags)
        if refFlags is not None:
            self.refFlags += list(refFlags)

        self._df = None

    @property
    def defaultFuncs(self):
        funcs = dict(self._defaultFuncs)
        return funcs

    @property
    def func(self):
        additionalFuncs = self.defaultFuncs
        additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
        additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})

        if isinstance(self.functors, CompositeFunctor):
            func = self.functors
        else:
            func = CompositeFunctor(self.functors)

        func.funcDict.update(additionalFuncs)
        func.filt = self.filt

        return func

    @property
    def noDupCols(self):
        return [name for name, func in self.func.funcDict.items() if func.noDup or func.dataset == 'ref']

    @property
    def df(self):
        if self._df is None:
            self.compute()
        return self._df

    def compute(self, dropna=False, pool=None):
        # map over multiple parquet tables
        if type(self.parq) in (list, tuple):
            if pool is None:
                dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
            else:
                # TODO: Figure out why this doesn't work (pyarrow pickling issues?)
                dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
            self._df = pd.concat(dflist)
        else:
            self._df = self.func(self.parq, dropna=dropna)

        return self._df
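

# The sketch below is illustrative only: given a deepCoadd_obj-style
# ParquetTable (with 'ref' and 'meas' datasets), it computes the default
# coordinate columns plus one reference flag in a chosen band.  The flag name
# and band are hypothetical.
def _postprocessAnalysisExample(parq):
    funcs = {'ra': RAColumn(), 'dec': DecColumn()}
    analysis = PostprocessAnalysis(parq, funcs, filt='i',
                                   refFlags=['detect_isPrimary'])
    # Accessing .df triggers the computation and returns a pandas.DataFrame
    return analysis.df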


class TransformCatalogBaseConnections(pipeBase.PipelineTaskConnections,
                                      dimensions=()):
    """Expected Connections for subclasses of TransformCatalogBaseTask.

    Must be subclassed.
    """
    inputCatalog = connectionTypes.Input(
        name="",
        storageClass="DataFrame",
    )
    outputCatalog = connectionTypes.Output(
        name="",
        storageClass="DataFrame",
    )


class TransformCatalogBaseConfig(pipeBase.PipelineTaskConfig,
                                 pipelineConnections=TransformCatalogBaseConnections):
    functorFile = pexConfig.Field(
        dtype=str,
        doc='Path to YAML file specifying functors to be computed',
        default=None,
        optional=True
    )


class TransformCatalogBaseTask(CmdLineTask, pipeBase.PipelineTask):
    """Base class for transforming/standardizing a catalog by applying functors
    that convert units and apply calibrations.

    The purpose of this task is to perform a set of computations on
    an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
    results to a new dataset (which needs to be declared in an `outputDataset`
    attribute).

    The calculations to be performed are defined in a YAML file that specifies
    a set of functors to be computed, provided as
    a `--functorFile` config parameter. An example of such a YAML file
    is the following:

        funcs:
            psfMag:
                functor: Mag
                args:
                    - base_PsfFlux
                filt: HSC-G
                dataset: meas
            cmodel_magDiff:
                functor: MagDiff
                args:
                    - modelfit_CModel
                    - base_PsfFlux
                filt: HSC-G
            gauss_magDiff:
                functor: MagDiff
                args:
                    - base_GaussianFlux
                    - base_PsfFlux
                filt: HSC-G
            count:
                functor: Column
                args:
                    - base_InputCount_value
                filt: HSC-G
            deconvolved_moments:
                functor: DeconvolvedMoments
                filt: HSC-G
                dataset: forced_src
        refFlags:
            - calib_psfUsed
            - merge_measurement_i
            - merge_measurement_r
            - merge_measurement_z
            - merge_measurement_y
            - merge_measurement_g
            - base_PixelFlags_flag_inexact_psfCenter
            - detect_isPrimary

    The names for each entry under "funcs" will become the names of columns in
    the output dataset. All the functors referenced are defined in
    `lsst.pipe.tasks.functors`. Positional arguments to be passed to each
    functor are in the `args` list, and any additional entries for each column
    other than "functor" or "args" (e.g., `'filt'`, `'dataset'`) are treated as
    keyword arguments to be passed to the functor initialization.

    The "refFlags" entry is a shortcut for a set of `Column` functors with the
    original column names, taken from the `'ref'` dataset.

    The "flags" entry will be expanded out per band.

    This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
    to organize and execute the calculations.
    """
    @property
    def _DefaultName(self):
        raise NotImplementedError('Subclass must define "_DefaultName" attribute')

    @property
    def outputDataset(self):
        raise NotImplementedError('Subclass must define "outputDataset" attribute')

    @property
    def inputDataset(self):
        raise NotImplementedError('Subclass must define "inputDataset" attribute')

    @property
    def ConfigClass(self):
        raise NotImplementedError('Subclass must define "ConfigClass" attribute')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.config.functorFile:
            self.log.info('Loading transform functor definitions from %s',
                          self.config.functorFile)
            self.funcs = CompositeFunctor.from_file(self.config.functorFile)
            self.funcs.update(dict(PostprocessAnalysis._defaultFuncs))
        else:
            self.funcs = None

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        if self.funcs is None:
            raise ValueError("config.functorFile is None. "
                             "Must be a valid path to yaml in order to run Task as a PipelineTask.")
        result = self.run(parq=inputs['inputCatalog'], funcs=self.funcs,
                          dataId=outputRefs.outputCatalog.dataId.full)
        outputs = pipeBase.Struct(outputCatalog=result)
        butlerQC.put(outputs, outputRefs)

    def runDataRef(self, dataRef):
        parq = dataRef.get()
        if self.funcs is None:
            raise ValueError("config.functorFile is None. "
                             "Must be a valid path to yaml in order to run as a CommandlineTask.")
        df = self.run(parq, funcs=self.funcs, dataId=dataRef.dataId)
        self.write(df, dataRef)
        return df

    def run(self, parq, funcs=None, dataId=None, band=None):
        """Do postprocessing calculations.

        Takes a `ParquetTable` object and dataId,
        returns a dataframe with results of postprocessing calculations.

        Parameters
        ----------
        parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
            ParquetTable from which calculations are done.
        funcs : `lsst.pipe.tasks.functors.Functors`
            Functors to apply to the table's columns.
        dataId : `dict`, optional
            Used to add a `patchId` column to the output dataframe.
        band : `str`, optional
            Filter band that is being processed.

        Returns
        -------
        `pandas.DataFrame`
        """
        self.log.info("Transforming/standardizing the source table dataId: %s", dataId)

        df = self.transform(band, parq, funcs, dataId).df
        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df

    def getFunctors(self):
        return self.funcs

    def getAnalysis(self, parq, funcs=None, band=None):
        if funcs is None:
            funcs = self.funcs
        analysis = PostprocessAnalysis(parq, funcs, filt=band)
        return analysis

    def transform(self, band, parq, funcs, dataId):
        analysis = self.getAnalysis(parq, funcs=funcs, band=band)
        df = analysis.df
        if dataId is not None:
            for key, value in dataId.items():
                df[key] = value

        return pipeBase.Struct(
            df=df,
            analysis=analysis
        )

    def write(self, df, parqRef):
        parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass
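

# The sketch below is illustrative only: it builds, directly in code, a small
# functor collection equivalent to a minimal functorFile, using the functors
# imported at the top of this module.  The flag column name is hypothetical.
def _functorCollectionExample():
    funcs = CompositeFunctor({'ra': RAColumn(),
                              'dec': DecColumn(),
                              'detect_isPrimary': Column('detect_isPrimary',
                                                         dataset='ref')})
    # A TransformCatalogBaseTask subclass would apply these to a ParquetTable
    # via PostprocessAnalysis, e.g. PostprocessAnalysis(parq, funcs).df
    return funcs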


class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
    # TODO: remove in DM-27177
    filterMap = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc=("Dictionary mapping full filter name to short one for column name munging. "
             "These filters determine the output columns no matter what filters the "
             "input data actually contain."),
        deprecated=("Coadds are now identified by the band, so this transform is unused. "
                    "Will be removed after v22.")
    )
    outputBands = pexConfig.ListField(
        dtype=str,
        default=None,
        optional=True,
        doc=("These bands and only these bands will appear in the output,"
             " NaN-filled if the input does not include them."
             " If None, then use all bands found in the input.")
    )
    camelCase = pexConfig.Field(
        dtype=bool,
        default=True,
        doc=("Write per-band column names with camelCase, else underscore. "
             "For example: gPsFlux instead of g_PsFlux.")
    )
    multilevelOutput = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
             "and name-munged (False).")
    )


class TransformObjectCatalogTask(TransformCatalogBaseTask):
    """Produce a flattened Object Table to match the format specified in
    sdm_schemas.

    Do the same set of postprocessing calculations on all bands.

    This is identical to `TransformCatalogBaseTask`, except that it performs
    the specified functor calculations for all filters present in the
    input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
    by the YAML file will be superseded.
    """
    _DefaultName = "transformObjectCatalog"
    ConfigClass = TransformObjectCatalogConfig

    inputDataset = 'deepCoadd_obj'
    outputDataset = 'objectTable'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               ContainerClass=CoaddDataIdContainer,
                               help="data ID, e.g. --id tract=12345 patch=1,2")
        return parser

    def run(self, parq, funcs=None, dataId=None, band=None):
        # NOTE: band kwarg is ignored here.
        dfDict = {}
        analysisDict = {}
        templateDf = pd.DataFrame()
        outputBands = parq.columnLevelNames['band'] if self.config.outputBands is None else \
            self.config.outputBands

        # Perform transform for data of filters that exist in parq.
        for inputBand in parq.columnLevelNames['band']:
            if inputBand not in outputBands:
                self.log.info("Ignoring %s band data in the input", inputBand)
                continue
            self.log.info("Transforming the catalog of band %s", inputBand)
            result = self.transform(inputBand, parq, funcs, dataId)
            dfDict[inputBand] = result.df
            analysisDict[inputBand] = result.analysis
            if templateDf.empty:
                templateDf = result.df

        # Fill NaNs in columns of other wanted bands
        for filt in outputBands:
            if filt not in dfDict:
                self.log.info("Adding empty columns for band %s", filt)
                dfDict[filt] = pd.DataFrame().reindex_like(templateDf)

        # This makes a multilevel column index, with band as first level
        df = pd.concat(dfDict, axis=1, names=['band', 'column'])

        if not self.config.multilevelOutput:
            noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
            if dataId is not None:
                noDupCols += list(dataId.keys())
            df = flattenFilters(df, noDupCols=noDupCols, camelCase=self.config.camelCase)

        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df
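

# The sketch below is illustrative only (hypothetical band list and functor
# file path): it shows the config choices that control the shape of the
# output Object Table, i.e. which bands appear and whether columns are
# flattened with camelCase names or kept as a multilevel index.
def _transformObjectCatalogConfigExample():
    config = TransformObjectCatalogConfig()
    config.functorFile = 'Object.yaml'
    config.outputBands = ['g', 'r', 'i']
    config.camelCase = True          # gPsFlux rather than g_PsFlux
    config.multilevelOutput = False  # flat, name-munged columns
    return config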


class TractObjectDataIdContainer(CoaddDataIdContainer):

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList.

        Generate a list of data references given tract and/or patch.
        This was adapted from `TractQADataIdContainer`, which was
        `TractDataIdContainer` modified to not require "filter".
        Only existing dataRefs are returned.
        """
        def getPatchRefList(tract):
            return [namespace.butler.dataRef(datasetType=self.datasetType,
                                             tract=tract.getId(),
                                             patch="%d,%d" % patch.getIndex()) for patch in tract]

        tractRefs = defaultdict(list)  # Data references for each tract
        for dataId in self.idList:
            skymap = self.getSkymap(namespace)

            if "tract" in dataId:
                tractId = dataId["tract"]
                if "patch" in dataId:
                    tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
                                                                       tract=tractId,
                                                                       patch=dataId['patch']))
                else:
                    tractRefs[tractId] += getPatchRefList(skymap[tractId])
            else:
                tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
                                 for tract in skymap)
        outputRefList = []
        for tractRefList in tractRefs.values():
            existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
            outputRefList.append(existingRefs)

        self.refList = outputRefList


class ConsolidateObjectTableConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )


class ConsolidateObjectTableTask(CmdLineTask):
    """Write patch-merged source tables to a tract-level parquet file.
    """
    _DefaultName = "consolidateObjectTable"
    ConfigClass = ConsolidateObjectTableConfig

    inputDataset = 'objectTable'
    outputDataset = 'objectTable_tract'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id tract=12345",
                               ContainerClass=TractObjectDataIdContainer)
        return parser

    def runDataRef(self, patchRefList):
        df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
        patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass


class TransformSourceTableConnections(pipeBase.PipelineTaskConnections,
                                      dimensions=("instrument", "visit", "detector")):

    inputCatalog = connectionTypes.Input(
        doc="Wide input catalog of sources produced by WriteSourceTableTask",
        name="source",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector"),
        deferLoad=True
    )
    outputCatalog = connectionTypes.Output(
        doc="Narrower, per-detector Source Table transformed and converted per a "
            "specified set of functors",
        name="sourceTable",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector")
    )


class TransformSourceTableConfig(TransformCatalogBaseConfig,
                                 pipelineConnections=TransformSourceTableConnections):
    pass


class TransformSourceTableTask(TransformCatalogBaseTask):
    """Transform/standardize a source catalog.
    """
    _DefaultName = "transformSourceTable"
    ConfigClass = TransformSourceTableConfig

    inputDataset = 'source'
    outputDataset = 'sourceTable'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", datasetType=cls.inputDataset,
                               level="sensor",
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser

    def runDataRef(self, dataRef):
        """Override to specify band label to run()."""
        parq = dataRef.get()
        funcs = self.getFunctors()
        band = dataRef.get("calexp_filterLabel", immediate=True).bandLabel
        df = self.run(parq, funcs=funcs, dataId=dataRef.dataId, band=band)
        self.write(df, dataRef)
        return df


class ConsolidateVisitSummaryConnections(pipeBase.PipelineTaskConnections,
                                         dimensions=("instrument", "visit",),
                                         defaultTemplates={}):
    calexp = connectionTypes.Input(
        doc="Processed exposures used for metadata",
        name="calexp",
        storageClass="ExposureF",
        dimensions=("instrument", "visit", "detector"),
        deferLoad=True,
        multiple=True,
    )
    visitSummary = connectionTypes.Output(
        doc=("Per-visit consolidated exposure metadata. These catalogs use "
             "detector id for the id and are sorted for fast lookups of a "
             "detector."),
        name="visitSummary",
        storageClass="ExposureCatalog",
        dimensions=("instrument", "visit"),
    )


class ConsolidateVisitSummaryConfig(pipeBase.PipelineTaskConfig,
                                    pipelineConnections=ConsolidateVisitSummaryConnections):
    """Config for ConsolidateVisitSummaryTask"""
    pass


class ConsolidateVisitSummaryTask(pipeBase.PipelineTask, pipeBase.CmdLineTask):
    """Task to consolidate per-detector visit metadata.

    This task aggregates the following metadata from all the detectors in a
    single visit into an exposure catalog:
    - The visitInfo.
    - The wcs.
    - The photoCalib.
    - The physical_filter and band (if available).
    - The psf size, shape, and effective area at the center of the detector.
    - The corners of the bounding box in right ascension/declination.

    Other quantities such as Psf, ApCorrMap, and TransmissionCurve are not
    persisted here because of storage concerns, and because of their limited
    utility as summary statistics.

    Tests for this task are performed in ci_hsc_gen3.
    """
    _DefaultName = "consolidateVisitSummary"
    ConfigClass = ConsolidateVisitSummaryConfig

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", "calexp",
                               help="data ID, e.g. --id visit=12345",
                               ContainerClass=VisitDataIdContainer)
        return parser

    def writeMetadata(self, dataRef):
        """No metadata to persist, so override to remove metadata persistence.
        """
        pass

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """No config to persist, so override to remove config persistence.
        """
        pass

    def runDataRef(self, dataRefList):
        visit = dataRefList[0].dataId['visit']

        self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
                       (len(dataRefList), visit))

        expCatalog = self._combineExposureMetadata(visit, dataRefList, isGen3=False)

        dataRefList[0].put(expCatalog, 'visitSummary', visit=visit)

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        dataRefs = butlerQC.get(inputRefs.calexp)
        visit = dataRefs[0].dataId.byName()['visit']

        self.log.debug("Concatenating metadata from %d per-detector calexps (visit %d)" %
                       (len(dataRefs), visit))

        expCatalog = self._combineExposureMetadata(visit, dataRefs)

        butlerQC.put(expCatalog, outputRefs.visitSummary)

    def _combineExposureMetadata(self, visit, dataRefs, isGen3=True):
        """Make a combined exposure catalog from a list of dataRefs.

        Parameters
        ----------
        visit : `int`
            Visit identification number.
        dataRefs : `list`
            List of calexp dataRefs in visit. May be list of
            `lsst.daf.persistence.ButlerDataRef` (Gen2) or
            `lsst.daf.butler.DeferredDatasetHandle` (Gen3).
        isGen3 : `bool`, optional
            Specifies if this is a Gen3 list of datarefs.

        Returns
        -------
        visitSummary : `lsst.afw.table.ExposureCatalog`
            Exposure catalog with per-detector summary information.
        """
        schema = afwTable.ExposureTable.makeMinimalSchema()
        schema.addField('visit', type='I', doc='Visit number')
        schema.addField('physical_filter', type='String', size=32, doc='Physical filter')
        schema.addField('band', type='String', size=32, doc='Name of band')
        schema.addField('psfSigma', type='F',
                        doc='PSF model second-moments determinant radius (center of chip) (pixel)')
        schema.addField('psfArea', type='F',
                        doc='PSF model effective area (center of chip) (pixel**2)')
        schema.addField('psfIxx', type='F',
                        doc='PSF model Ixx (center of chip) (pixel**2)')
        schema.addField('psfIyy', type='F',
                        doc='PSF model Iyy (center of chip) (pixel**2)')
        schema.addField('psfIxy', type='F',
                        doc='PSF model Ixy (center of chip) (pixel**2)')
        schema.addField('raCorners', type='ArrayD', size=4,
                        doc='Right Ascension of bounding box corners (degrees)')
        schema.addField('decCorners', type='ArrayD', size=4,
                        doc='Declination of bounding box corners (degrees)')

        cat = afwTable.ExposureCatalog(schema)
        cat.resize(len(dataRefs))

        cat['visit'] = visit

        for i, dataRef in enumerate(dataRefs):
            if isGen3:
                visitInfo = dataRef.get(component='visitInfo')
                filterLabel = dataRef.get(component='filterLabel')
                psf = dataRef.get(component='psf')
                wcs = dataRef.get(component='wcs')
                photoCalib = dataRef.get(component='photoCalib')
                detector = dataRef.get(component='detector')
                bbox = dataRef.get(component='bbox')
                validPolygon = dataRef.get(component='validPolygon')
            else:
                # Note that we need to read the calexp because there is
                # no magic access to the psf except through the exposure.
                gen2_read_bbox = lsst.geom.BoxI(lsst.geom.PointI(0, 0), lsst.geom.PointI(1, 1))
                exp = dataRef.get(datasetType='calexp_sub', bbox=gen2_read_bbox)
                visitInfo = exp.getInfo().getVisitInfo()
                filterLabel = dataRef.get("calexp_filterLabel")
                psf = exp.getPsf()
                wcs = exp.getWcs()
                photoCalib = exp.getPhotoCalib()
                detector = exp.getDetector()
                bbox = dataRef.get(datasetType='calexp_bbox')
                validPolygon = exp.getInfo().getValidPolygon()

            rec = cat[i]
            rec.setBBox(bbox)
            rec.setVisitInfo(visitInfo)
            rec.setWcs(wcs)
            rec.setPhotoCalib(photoCalib)
            rec.setDetector(detector)
            rec.setValidPolygon(validPolygon)

            rec['physical_filter'] = filterLabel.physicalLabel if filterLabel.hasPhysicalLabel() else ""
            rec['band'] = filterLabel.bandLabel if filterLabel.hasBandLabel() else ""
            rec.setId(detector.getId())
            shape = psf.computeShape(bbox.getCenter())
            rec['psfSigma'] = shape.getDeterminantRadius()
            rec['psfIxx'] = shape.getIxx()
            rec['psfIyy'] = shape.getIyy()
            rec['psfIxy'] = shape.getIxy()
            im = psf.computeKernelImage(bbox.getCenter())
            # The calculation of effective psf area is taken from
            # meas_base/src/PsfFlux.cc#L112. See
            # https://github.com/lsst/meas_base/blob/
            # 750bffe6620e565bda731add1509507f5c40c8bb/src/PsfFlux.cc#L112
            rec['psfArea'] = np.sum(im.array)/np.sum(im.array**2.)

            sph_pts = wcs.pixelToSky(lsst.geom.Box2D(bbox).getCorners())
            rec['raCorners'][:] = [sph.getRa().asDegrees() for sph in sph_pts]
            rec['decCorners'][:] = [sph.getDec().asDegrees() for sph in sph_pts]

        metadata = dafBase.PropertyList()
        metadata.add("COMMENT", "Catalog id is detector id, sorted.")
        # We are looping over existing datarefs, so the following is true
        metadata.add("COMMENT", "Only detectors with data have entries.")
        cat.setMetadata(metadata)

        cat.sort()
        return cat
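

# The sketch below is illustrative only (hypothetical detector id): because
# the visitSummary ExposureCatalog is sorted by detector id, a per-detector
# summary record can be looked up directly and its summary columns read off.
def _visitSummaryLookupExample(visitSummary, detectorId=0):
    rec = visitSummary.find(detectorId)
    return {'psfSigma': rec['psfSigma'],
            'band': rec['band'],
            'raCorners': list(rec['raCorners'])}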


class VisitDataIdContainer(DataIdContainer):
    """DataIdContainer that groups sensor-level IDs by visit.
    """

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList.

        Generate a list of data references grouped by visit.

        Parameters
        ----------
        namespace : `argparse.Namespace`
            Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments.
        """
        # Group by visits
        visitRefs = defaultdict(list)
        for dataId in self.idList:
            if "visit" in dataId:
                visitId = dataId["visit"]
                # append all data references in this subset to the visit's list
                subset = namespace.butler.subset(self.datasetType, dataId=dataId)
                visitRefs[visitId].extend([dataRef for dataRef in subset])

        outputRefList = []
        for refList in visitRefs.values():
            existingRefs = [ref for ref in refList if ref.datasetExists()]
            if existingRefs:
                outputRefList.append(existingRefs)

        self.refList = outputRefList


class ConsolidateSourceTableConnections(pipeBase.PipelineTaskConnections,
                                        dimensions=("instrument", "visit")):
    inputCatalogs = connectionTypes.Input(
        doc="Input per-detector Source Tables",
        name="sourceTable",
        storageClass="DataFrame",
        dimensions=("instrument", "visit", "detector"),
        multiple=True
    )
    outputCatalog = connectionTypes.Output(
        doc="Per-visit concatenation of Source Table",
        name="sourceTable_visit",
        storageClass="DataFrame",
        dimensions=("instrument", "visit")
    )


class ConsolidateSourceTableConfig(pipeBase.PipelineTaskConfig,
                                   pipelineConnections=ConsolidateSourceTableConnections):
    pass


class ConsolidateSourceTableTask(CmdLineTask, pipeBase.PipelineTask):
    """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`.
    """
    _DefaultName = 'consolidateSourceTable'
    ConfigClass = ConsolidateSourceTableConfig

    inputDataset = 'sourceTable'
    outputDataset = 'sourceTable_visit'

    def runQuantum(self, butlerQC, inputRefs, outputRefs):
        inputs = butlerQC.get(inputRefs)
        self.log.info("Concatenating %s per-detector Source Tables",
                      len(inputs['inputCatalogs']))
        df = pd.concat(inputs['inputCatalogs'])
        butlerQC.put(pipeBase.Struct(outputCatalog=df), outputRefs)

    def runDataRef(self, dataRefList):
        self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
        df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
        dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id visit=12345",
                               ContainerClass=VisitDataIdContainer)
        return parser

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """No config to write.
        """
        pass