Coverage for python/lsst/pipe/tasks/postprocess.py : 28%

# This file is part of pipe_tasks
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import functools
import pandas as pd
from collections import defaultdict

import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
from lsst.pipe.base import CmdLineTask, ArgumentParser, DataIdContainer
from lsst.coadd.utils.coaddDataIdContainer import CoaddDataIdContainer

from .parquetTable import ParquetTable
from .multiBandUtils import makeMergeArgumentParser, MergeSourcesRunner
from .functors import CompositeFunctor, RAColumn, DecColumn, Column


def flattenFilters(df, filterDict, noDupCols=['coord_ra', 'coord_dec'], camelCase=False):
    """Flattens a dataframe with multilevel column index.
    """
    newDf = pd.DataFrame()
    for filt, filtShort in filterDict.items():
        subdf = df[filt]
        columnFormat = '{0}{1}' if camelCase else '{0}_{1}'
        newColumns = {c: columnFormat.format(filtShort, c)
                      for c in subdf.columns if c not in noDupCols}
        cols = list(newColumns.keys())
        newDf = pd.concat([newDf, subdf[cols].rename(columns=newColumns)], axis=1)

    newDf = pd.concat([subdf[noDupCols], newDf], axis=1)
    return newDf
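

# A minimal, self-contained sketch (not part of the pipeline) of what
# ``flattenFilters`` does, using a toy two-filter DataFrame. The filter names
# and the short-name mapping are hypothetical; in production the mapping comes
# from ``TransformObjectCatalogConfig.filterMap``.
def _exampleFlattenFilters():
    # Toy filter-level MultiIndex table: two filters, one measurement column
    # each, plus shared coordinate columns that should not be duplicated.
    columns = pd.MultiIndex.from_product(
        [["HSC-G", "HSC-R"], ["coord_ra", "coord_dec", "psfMag"]],
        names=["filter", "column"])
    df = pd.DataFrame([[10.0, -5.0, 21.2, 10.0, -5.0, 20.9]], columns=columns)

    filterDict = {"HSC-G": "g", "HSC-R": "r"}  # hypothetical short names
    flat = flattenFilters(df, filterDict, camelCase=True)
    # flat.columns.tolist() == ['coord_ra', 'coord_dec', 'gpsfMag', 'rpsfMag']
    return flat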


class WriteObjectTableConfig(pexConfig.Config):
    priorityList = pexConfig.ListField(
        dtype=str,
        default=[],
        doc="Priority-ordered list of bands for the merge."
    )
    engine = pexConfig.Field(
        dtype=str,
        default="pyarrow",
        doc="Parquet engine for writing (pyarrow or fastparquet)"
    )
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )

    def validate(self):
        pexConfig.Config.validate(self)
        if len(self.priorityList) == 0:
            raise RuntimeError("No priority list provided")


class WriteObjectTableTask(CmdLineTask):
    """Write filter-merged source tables to parquet.
    """
    _DefaultName = "writeObjectTable"
    ConfigClass = WriteObjectTableConfig
    RunnerClass = MergeSourcesRunner

    # Names of table datasets to be merged
    inputDatasets = ('forced_src', 'meas', 'ref')

    # Tag of output dataset written by `MergeSourcesTask.write`
    outputDataset = 'obj'

    def __init__(self, butler=None, schema=None, **kwargs):
        # It is a shame that this class can't use the default init for
        # CmdLineTask, but doing so would require its own special task runner,
        # which is many more lines of specialization, so this is how it is for
        # now.
        CmdLineTask.__init__(self, **kwargs)

    def runDataRef(self, patchRefList):
        """!
        @brief Merge coadd sources from multiple bands. Calls @ref `run`, which
        must be defined in subclasses that inherit from MergeSourcesTask.
        @param[in] patchRefList list of data references for each filter
        """
        catalogs = dict(self.readCatalog(patchRef) for patchRef in patchRefList)
        dataId = patchRefList[0].dataId
        mergedCatalog = self.run(catalogs, tract=dataId['tract'], patch=dataId['patch'])
        self.write(patchRefList[0], mergedCatalog)

    @classmethod
    def _makeArgumentParser(cls):
        """Create a suitable ArgumentParser.

        We will use the ArgumentParser to get a list of data
        references for patches; the RunnerClass will sort them into lists
        of data references for the same patch.

        References the first of self.inputDatasets, rather than
        self.inputDataset.
        """
        return makeMergeArgumentParser(cls._DefaultName, cls.inputDatasets[0])

    def readCatalog(self, patchRef):
        """Read input catalogs.

        Read all the input datasets given by the 'inputDatasets'
        attribute.

        Parameters
        ----------
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch

        Returns
        -------
        Tuple consisting of filter name and a dict of catalogs, keyed by
        dataset name
        """
        filterName = patchRef.dataId["filter"]
        catalogDict = {}
        for dataset in self.inputDatasets:
            catalog = patchRef.get(self.config.coaddName + "Coadd_" + dataset, immediate=True)
            self.log.info("Read %d sources from %s for filter %s: %s" %
                          (len(catalog), dataset, filterName, patchRef.dataId))
            catalogDict[dataset] = catalog
        return filterName, catalogDict

    def run(self, catalogs, tract, patch):
        """Merge multiple catalogs.

        Parameters
        ----------
        catalogs : `dict`
            Mapping from filter names to dict of catalogs.
        tract : `int`
            tractId to use for the tractId column
        patch : `str`
            patchId to use for the patchId column

        Returns
        -------
        catalog : `lsst.pipe.tasks.parquetTable.ParquetTable`
            Merged dataframe, with each column prefixed by
            `filter_tag(filt)`, wrapped in the parquet writer shim class.
        """
        dfs = []
        for filt, tableDict in catalogs.items():
            for dataset, table in tableDict.items():
                # Convert afwTable to pandas DataFrame
                df = table.asAstropy().to_pandas().set_index('id', drop=True)

                # Sort columns by name, to ensure matching schema among patches
                df = df.reindex(sorted(df.columns), axis=1)
                df['tractId'] = tract
                df['patchId'] = patch

                # Make columns a 3-level MultiIndex
                df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                       names=('dataset', 'filter', 'column'))
                dfs.append(df)

        catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
        return ParquetTable(dataFrame=catalog)

    def write(self, patchRef, catalog):
        """Write the output.

        Parameters
        ----------
        catalog : `ParquetTable`
            Catalog to write
        patchRef : `lsst.daf.persistence.ButlerDataRef`
            Data reference for patch
        """
        patchRef.put(catalog, self.config.coaddName + "Coadd_" + self.outputDataset)
        # since the filter isn't actually part of the data ID for the dataset we're saving,
        # it's confusing to see it in the log message, even if the butler simply ignores it.
        mergeDataId = patchRef.dataId.copy()
        del mergeDataId["filter"]
        self.log.info("Wrote merged catalog: %s" % (mergeDataId,))

    def writeMetadata(self, dataRefList):
        """No metadata to write, and not sure how to write it for a list of dataRefs.
        """
        pass
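

# A pandas-only sketch (not used by the pipeline) of the merge that
# ``WriteObjectTableTask.run`` performs: per-dataset, per-filter tables sharing
# an ``id`` index get a 3-level column MultiIndex and are joined on that index.
# The tiny frames, filter names, and column name below are hypothetical
# stand-ins for the real afw catalogs.
def _exampleObjectMerge():
    dfs = []
    for filt in ("HSC-G", "HSC-R"):
        for dataset in ("meas", "ref"):
            df = pd.DataFrame({"base_PsfFlux_instFlux": [1.0, 2.0]},
                              index=pd.Index([11, 12], name="id"))
            df.columns = pd.MultiIndex.from_tuples(
                [(dataset, filt, c) for c in df.columns],
                names=("dataset", "filter", "column"))
            dfs.append(df)
    # Columns are distinct across (dataset, filter), so the joins never collide.
    merged = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
    return merged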


class WriteSourceTableConfig(pexConfig.Config):
    pass


class WriteSourceTableTask(CmdLineTask):
    """Write source table to parquet.
    """
    _DefaultName = "writeSourceTable"
    ConfigClass = WriteSourceTableConfig

    def runDataRef(self, dataRef):
        src = dataRef.get('src')
        ccdVisitId = dataRef.get('ccdExposureId')
        result = self.run(src, ccdVisitId=ccdVisitId)
        dataRef.put(result.table, 'source')

    def run(self, catalog, ccdVisitId=None):
        """Convert `src` catalog to parquet.

        Parameters
        ----------
        catalog : `lsst.afw.table.SourceCatalog`
            catalog to be converted
        ccdVisitId : `int`
            ccdVisitId to be added as a column

        Returns
        -------
        result : `lsst.pipe.base.Struct`
            ``table``
                `ParquetTable` version of the input catalog
        """
        self.log.info("Generating parquet table from src catalog")
        df = catalog.asAstropy().to_pandas().set_index('id', drop=True)
        df['ccdVisitId'] = ccdVisitId
        return pipeBase.Struct(table=ParquetTable(dataFrame=df))

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """No config to write.
        """
        pass

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", 'src',
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser


class PostprocessAnalysis(object):
    """Calculate columns from ParquetTable.

    This object manages and organizes an arbitrary set of computations
    on a catalog. The catalog is defined by a
    `lsst.pipe.tasks.parquetTable.ParquetTable` object (or list thereof), such as a
    `deepCoadd_obj` dataset, and the computations are defined by a collection
    of `lsst.pipe.tasks.functors.Functor` objects (or, equivalently,
    a `CompositeFunctor`).

    After the object is initialized, accessing the `.df` attribute (which
    holds the `pandas.DataFrame` containing the results of the calculations)
    triggers computation of said dataframe.

    One of the conveniences of using this object is the ability to define a
    desired common filter for all functors. This enables the same functor
    collection to be passed to several different `PostprocessAnalysis` objects
    without having to change the original functor collection, since the `filt`
    keyword argument of this object triggers an overwrite of the `filt`
    property for all functors in the collection.

    This object also allows a list of refFlags to be passed, and defines a set
    of default refFlags that are always included even if not requested.

    If a list of `ParquetTable` objects is passed, rather than a single one,
    then the calculations will be mapped over all the input catalogs. In
    principle, it should be straightforward to parallelize this activity, but
    initial tests have failed (see TODO in code comments).

    Parameters
    ----------
    parq : `lsst.pipe.tasks.ParquetTable` (or list of such)
        Source catalog(s) for computation

    functors : `list`, `dict`, or `lsst.pipe.tasks.functors.CompositeFunctor`
        Computations to do (functors that act on `parq`).
        If a dict, the output
        DataFrame will have columns keyed accordingly.
        If a list, the column keys will come from the
        `.shortname` attribute of each functor.

    filt : `str` (optional)
        Filter in which to calculate. If provided,
        this will overwrite any existing `.filt` attribute
        of the provided functors.

    flags : `list` (optional)
        List of flags (per-band) to include in output table.

    refFlags : `list` (optional)
        List of refFlags (only reference band) to include in output table.
    """
    _defaultRefFlags = []
    _defaultFuncs = (('coord_ra', RAColumn()),
                     ('coord_dec', DecColumn()))

    def __init__(self, parq, functors, filt=None, flags=None, refFlags=None):
        self.parq = parq
        self.functors = functors

        self.filt = filt
        self.flags = list(flags) if flags is not None else []
        self.refFlags = list(self._defaultRefFlags)
        if refFlags is not None:
            self.refFlags += list(refFlags)

        self._df = None

    @property
    def defaultFuncs(self):
        funcs = dict(self._defaultFuncs)
        return funcs

    @property
    def func(self):
        additionalFuncs = self.defaultFuncs
        additionalFuncs.update({flag: Column(flag, dataset='ref') for flag in self.refFlags})
        additionalFuncs.update({flag: Column(flag, dataset='meas') for flag in self.flags})

        if isinstance(self.functors, CompositeFunctor):
            func = self.functors
        else:
            func = CompositeFunctor(self.functors)

        func.funcDict.update(additionalFuncs)
        func.filt = self.filt

        return func

    @property
    def noDupCols(self):
        return [name for name, func in self.func.funcDict.items() if func.noDup or func.dataset == 'ref']

    @property
    def df(self):
        if self._df is None:
            self.compute()
        return self._df

    def compute(self, dropna=False, pool=None):
        # map over multiple parquet tables
        if type(self.parq) in (list, tuple):
            if pool is None:
                dflist = [self.func(parq, dropna=dropna) for parq in self.parq]
            else:
                # TODO: Figure out why this doesn't work (pyarrow pickling issues?)
                dflist = pool.map(functools.partial(self.func, dropna=dropna), self.parq)
            self._df = pd.concat(dflist)
        else:
            self._df = self.func(self.parq, dropna=dropna)

        return self._df
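

# A hedged usage sketch (not exercised by the pipeline) of PostprocessAnalysis:
# build a functor collection from the objects imported above and evaluate it
# against an existing deepCoadd_obj parquet file. The file path, filter name,
# column, and flag choices are hypothetical placeholders; running this requires
# the LSST stack and such a file on disk.
def _examplePostprocessAnalysis():
    parq = ParquetTable(filename="/path/to/deepCoadd_obj.parq")  # hypothetical path
    funcs = {'ra': RAColumn(),
             'dec': DecColumn(),
             'inputCount': Column('base_InputCount_value', dataset='meas')}
    analysis = PostprocessAnalysis(parq, funcs, filt='HSC-G',
                                   refFlags=['detect_isPrimary'])
    # Accessing .df triggers the computation and returns a pandas.DataFrame.
    return analysis.df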


class TransformCatalogBaseConfig(pexConfig.Config):
    functorFile = pexConfig.Field(
        dtype=str,
        doc='Path to YAML file specifying functors to be computed',
        default=None,
        optional=True
    )


class TransformCatalogBaseTask(CmdLineTask):
    """Base class for transforming/standardizing a catalog
    by applying functors that convert units and apply calibrations.

    The purpose of this task is to perform a set of computations on
    an input `ParquetTable` dataset (such as `deepCoadd_obj`) and write the
    results to a new dataset (which needs to be declared in an `outputDataset`
    attribute).

    The calculations to be performed are defined in a YAML file that specifies
    a set of functors to be computed, provided as
    a `--functorFile` config parameter. An example of such a YAML file
    is the following:

        funcs:
            psfMag:
                functor: Mag
                args:
                    - base_PsfFlux
                filt: HSC-G
                dataset: meas
            cmodel_magDiff:
                functor: MagDiff
                args:
                    - modelfit_CModel
                    - base_PsfFlux
                filt: HSC-G
            gauss_magDiff:
                functor: MagDiff
                args:
                    - base_GaussianFlux
                    - base_PsfFlux
                filt: HSC-G
            count:
                functor: Column
                args:
                    - base_InputCount_value
                filt: HSC-G
            deconvolved_moments:
                functor: DeconvolvedMoments
                filt: HSC-G
                dataset: forced_src
        refFlags:
            - calib_psfUsed
            - merge_measurement_i
            - merge_measurement_r
            - merge_measurement_z
            - merge_measurement_y
            - merge_measurement_g
            - base_PixelFlags_flag_inexact_psfCenter
            - detect_isPrimary

    The names for each entry under "funcs" will become the names of columns in the
    output dataset. All the functors referenced are defined in `lsst.pipe.tasks.functors`.
    Positional arguments to be passed to each functor are in the `args` list,
    and any additional entries for each column other than "functor" or "args" (e.g., `'filt'`,
    `'dataset'`) are treated as keyword arguments to be passed to the functor initialization.

    The "refFlags" entry is a shortcut for a set of `Column` functors that keep the original
    column name and are taken from the `'ref'` dataset.

    The "flags" entry will be expanded out per band.

    Note, if `'filter'` is provided as part of the `dataId` when running this task (even though
    `deepCoadd_obj` does not use `'filter'`), then this will override the `filt` kwargs
    provided in the YAML file, and the calculations will be done in that filter.

    This task uses the `lsst.pipe.tasks.postprocess.PostprocessAnalysis` object
    to organize and execute the calculations.
    """
    @property
    def _DefaultName(self):
        raise NotImplementedError('Subclass must define "_DefaultName" attribute')

    @property
    def outputDataset(self):
        raise NotImplementedError('Subclass must define "outputDataset" attribute')

    @property
    def inputDataset(self):
        raise NotImplementedError('Subclass must define "inputDataset" attribute')

    @property
    def ConfigClass(self):
        raise NotImplementedError('Subclass must define "ConfigClass" attribute')

    def runDataRef(self, dataRef):
        parq = dataRef.get()
        funcs = self.getFunctors()
        df = self.run(parq, funcs=funcs, dataId=dataRef.dataId)
        self.write(df, dataRef)
        return df

    def run(self, parq, funcs=None, dataId=None):
        """Do postprocessing calculations.

        Takes a `ParquetTable` object and dataId,
        returns a dataframe with results of postprocessing calculations.

        Parameters
        ----------
        parq : `lsst.pipe.tasks.parquetTable.ParquetTable`
            ParquetTable from which calculations are done.
        funcs : `lsst.pipe.tasks.functors.Functors`
            Functors to apply to the table's columns
        dataId : dict, optional
            Used to add a `patchId` column to the output dataframe.

        Returns
        -------
        `pandas.DataFrame`
        """
        self.log.info("Transforming/standardizing the source table dataId: %s", dataId)

        filt = dataId.get('filter', None)
        df = self.transform(filt, parq, funcs, dataId).df
        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df

    def getFunctors(self):
        funcs = CompositeFunctor.from_file(self.config.functorFile)
        funcs.update(dict(PostprocessAnalysis._defaultFuncs))
        return funcs

    def getAnalysis(self, parq, funcs=None, filt=None):
        # Avoids disk access if funcs is passed
        if funcs is None:
            funcs = self.getFunctors()
        analysis = PostprocessAnalysis(parq, funcs, filt=filt)
        return analysis

    def transform(self, filt, parq, funcs, dataId):
        analysis = self.getAnalysis(parq, funcs=funcs, filt=filt)
        df = analysis.df
        if dataId is not None:
            for key, value in dataId.items():
                df[key] = value

        return pipeBase.Struct(
            df=df,
            analysis=analysis
        )

    def write(self, df, parqRef):
        parqRef.put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass
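

# A hedged sketch of loading a functor YAML file by hand, of the shape shown in
# the TransformCatalogBaseTask docstring. It assumes CompositeFunctor.from_file
# accepts that documented layout, which is the same entry point getFunctors()
# uses. The file path and its contents are hypothetical.
def _exampleLoadFunctorFile():
    yaml_text = """\
funcs:
    count:
        functor: Column
        args:
            - base_InputCount_value
        filt: HSC-G
refFlags:
    - detect_isPrimary
"""
    with open("/tmp/functors.yaml", "w") as f:  # hypothetical location
        f.write(yaml_text)
    funcs = CompositeFunctor.from_file("/tmp/functors.yaml")
    # Mirror getFunctors(): always include the default coordinate functors.
    funcs.update(dict(PostprocessAnalysis._defaultFuncs))
    return funcs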


class TransformObjectCatalogConfig(TransformCatalogBaseConfig):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )
    filterMap = pexConfig.DictField(
        keytype=str,
        itemtype=str,
        default={},
        doc=("Dictionary mapping full filter name to short one for column name munging. "
             "These filters determine the output columns no matter what filters the "
             "input data actually contain.")
    )
    camelCase = pexConfig.Field(
        dtype=bool,
        default=True,
        doc=("Write per-filter column names in camelCase, else with underscores. "
             "For example: gPsfFlux instead of g_PsfFlux.")
    )
    multilevelOutput = pexConfig.Field(
        dtype=bool,
        default=False,
        doc=("Whether results dataframe should have a multilevel column index (True) or be flat "
             "and name-munged (False).")
    )


class TransformObjectCatalogTask(TransformCatalogBaseTask):
    """Compute Flattened Object Table as defined in the DPDD.

    Do the same set of postprocessing calculations on all bands.

    This is identical to `TransformCatalogBaseTask`, except that it performs
    the specified functor calculations for all filters present in the
    input `deepCoadd_obj` table. Any specific `"filt"` keywords specified
    by the YAML file will be superseded.
    """
    _DefaultName = "transformObjectCatalog"
    ConfigClass = TransformObjectCatalogConfig

    inputDataset = 'deepCoadd_obj'
    outputDataset = 'objectTable'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", cls.inputDataset,
                               ContainerClass=CoaddDataIdContainer,
                               help="data ID, e.g. --id tract=12345 patch=1,2")
        return parser

    def run(self, parq, funcs=None, dataId=None):
        dfDict = {}
        analysisDict = {}
        templateDf = pd.DataFrame()
        # Perform transform for data of filters that exist in parq and are
        # specified in config.filterMap
        for filt in parq.columnLevelNames['filter']:
            if filt not in self.config.filterMap:
                self.log.info("Ignoring %s data in the input", filt)
                continue
            self.log.info("Transforming the catalog of filter %s", filt)
            result = self.transform(filt, parq, funcs, dataId)
            dfDict[filt] = result.df
            analysisDict[filt] = result.analysis
            if templateDf.empty:
                templateDf = result.df

        # Fill NaNs in columns of other wanted filters
        for filt in self.config.filterMap:
            if filt not in dfDict:
                self.log.info("Adding empty columns for filter %s", filt)
                dfDict[filt] = pd.DataFrame().reindex_like(templateDf)

        # This makes a multilevel column index, with filter as first level
        df = pd.concat(dfDict, axis=1, names=['filter', 'column'])

        if not self.config.multilevelOutput:
            noDupCols = list(set.union(*[set(v.noDupCols) for v in analysisDict.values()]))
            if dataId is not None:
                noDupCols += list(dataId.keys())
            df = flattenFilters(df, self.config.filterMap, noDupCols=noDupCols,
                                camelCase=self.config.camelCase)

        self.log.info("Made a table of %d columns and %d rows", len(df.columns), len(df))
        return df
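

# A pandas-only sketch (not used by the pipeline) of the final assembly step in
# TransformObjectCatalogTask.run: per-filter result frames are stacked into a
# filter-level column MultiIndex, with all-NaN frames filling in filters that
# are in filterMap but absent from the input. Filter names and columns here are
# hypothetical; flattenFilters would then produce the flat, name-munged table
# when multilevelOutput is False.
def _exampleAssembleObjectTable():
    # One filter actually present in the data...
    templateDf = pd.DataFrame({"coord_ra": [10.0], "coord_dec": [-5.0], "psfMag": [21.2]})
    dfDict = {"HSC-G": templateDf}
    # ...and an empty, same-shaped frame for a wanted-but-missing filter,
    # built the same way run() does with reindex_like.
    dfDict["HSC-R"] = pd.DataFrame().reindex_like(templateDf)
    # Stack into a column MultiIndex with filter as the first level.
    df = pd.concat(dfDict, axis=1, names=["filter", "column"])
    return df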


class TractObjectDataIdContainer(CoaddDataIdContainer):

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList

        Generate a list of data references given tract and/or patch.
        This was adapted from `TractQADataIdContainer`, which was
        `TractDataIdContainer` modified to not require "filter".
        Only existing dataRefs are returned.
        """
        def getPatchRefList(tract):
            return [namespace.butler.dataRef(datasetType=self.datasetType,
                                             tract=tract.getId(),
                                             patch="%d,%d" % patch.getIndex()) for patch in tract]

        tractRefs = defaultdict(list)  # Data references for each tract
        for dataId in self.idList:
            skymap = self.getSkymap(namespace)

            if "tract" in dataId:
                tractId = dataId["tract"]
                if "patch" in dataId:
                    tractRefs[tractId].append(namespace.butler.dataRef(datasetType=self.datasetType,
                                                                       tract=tractId,
                                                                       patch=dataId['patch']))
                else:
                    tractRefs[tractId] += getPatchRefList(skymap[tractId])
            else:
                tractRefs = dict((tract.getId(), tractRefs.get(tract.getId(), []) + getPatchRefList(tract))
                                 for tract in skymap)
        outputRefList = []
        for tractRefList in tractRefs.values():
            existingRefs = [ref for ref in tractRefList if ref.datasetExists()]
            outputRefList.append(existingRefs)

        self.refList = outputRefList


class ConsolidateObjectTableConfig(pexConfig.Config):
    coaddName = pexConfig.Field(
        dtype=str,
        default="deep",
        doc="Name of coadd"
    )


class ConsolidateObjectTableTask(CmdLineTask):
    """Write patch-merged source tables to a tract-level parquet file.
    """
    _DefaultName = "consolidateObjectTable"
    ConfigClass = ConsolidateObjectTableConfig

    inputDataset = 'objectTable'
    outputDataset = 'objectTable_tract'

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id tract=12345",
                               ContainerClass=TractObjectDataIdContainer)
        return parser

    def runDataRef(self, patchRefList):
        df = pd.concat([patchRef.get().toDataFrame() for patchRef in patchRefList])
        patchRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass


class TransformSourceTableConfig(TransformCatalogBaseConfig):
    pass


class TransformSourceTableTask(TransformCatalogBaseTask):
    """Transform/standardize a source catalog.
    """
    _DefaultName = "transformSourceTable"
    ConfigClass = TransformSourceTableConfig

    inputDataset = 'source'
    outputDataset = 'sourceTable'

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument("--id", datasetType=cls.inputDataset,
                               level="sensor",
                               help="data ID, e.g. --id visit=12345 ccd=0")
        return parser


class VisitDataIdContainer(DataIdContainer):
    """DataIdContainer that groups sensor-level IDs by visit.
    """

    def makeDataRefList(self, namespace):
        """Make self.refList from self.idList

        Generate a list of data references grouped by visit.

        Parameters
        ----------
        namespace : `argparse.Namespace`
            Namespace used by `lsst.pipe.base.CmdLineTask` to parse command line arguments
        """
        def ccdDataRefList(visitId):
            """Get all possible ccds for a given visit"""
            ccds = namespace.butler.queryMetadata('src', ['ccd'], dataId={'visit': visitId})
            return [namespace.butler.dataRef(datasetType=self.datasetType,
                                             visit=visitId,
                                             ccd=ccd) for ccd in ccds]

        # Group by visits
        visitRefs = defaultdict(list)
        for dataId in self.idList:
            if "visit" in dataId:
                visitId = dataId["visit"]
                if "ccd" in dataId:
                    visitRefs[visitId].append(namespace.butler.dataRef(datasetType=self.datasetType,
                                                                       visit=visitId, ccd=dataId['ccd']))
                else:
                    visitRefs[visitId] += ccdDataRefList(visitId)

        outputRefList = []
        for refList in visitRefs.values():
            existingRefs = [ref for ref in refList if ref.datasetExists()]
            outputRefList.append(existingRefs)

        self.refList = outputRefList
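

# A butler-free sketch of the grouping pattern VisitDataIdContainer uses:
# partially-specified data IDs are collected per visit with a defaultdict,
# while fully-specified ones are appended directly. The data IDs below are
# hypothetical.
def _exampleGroupByVisit():
    idList = [{"visit": 12345, "ccd": 0}, {"visit": 12345, "ccd": 1}, {"visit": 67890}]
    visitRefs = defaultdict(list)
    for dataId in idList:
        if "visit" in dataId:
            visitRefs[dataId["visit"]].append(dataId)
    # {12345: [{'visit': 12345, 'ccd': 0}, {'visit': 12345, 'ccd': 1}], 67890: [{'visit': 67890}]}
    return dict(visitRefs)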


class ConsolidateSourceTableConfig(pexConfig.Config):
    pass


class ConsolidateSourceTableTask(CmdLineTask):
    """Concatenate `sourceTable` list into a per-visit `sourceTable_visit`.
    """
    _DefaultName = 'consolidateSourceTable'
    ConfigClass = ConsolidateSourceTableConfig

    inputDataset = 'sourceTable'
    outputDataset = 'sourceTable_visit'

    def runDataRef(self, dataRefList):
        self.log.info("Concatenating %s per-detector Source Tables", len(dataRefList))
        df = pd.concat([dataRef.get().toDataFrame() for dataRef in dataRefList])
        dataRefList[0].put(ParquetTable(dataFrame=df), self.outputDataset)

    @classmethod
    def _makeArgumentParser(cls):
        parser = ArgumentParser(name=cls._DefaultName)

        parser.add_id_argument("--id", cls.inputDataset,
                               help="data ID, e.g. --id visit=12345",
                               ContainerClass=VisitDataIdContainer)
        return parser

    def writeMetadata(self, dataRef):
        """No metadata to write.
        """
        pass

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """No config to write.
        """
        pass