Coverage for python/lsst/ctrl/mpexec/separablePipelineExecutor.py: 45%

50 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-11 10:25 +0000

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23from __future__ import annotations 

24 

25__all__ = [ 

26 "SeparablePipelineExecutor", 

27] 

28 

29 

30import datetime 

31import getpass 

32import logging 

33import math 

34import multiprocessing 

35from typing import Any, Iterable, Mapping, Protocol 

36 

37import lsst.pipe.base 

38import lsst.resources 

39from lsst.daf.butler import Butler 

40 

41from .mpGraphExecutor import MPGraphExecutor 

42from .preExecInit import PreExecInit 

43from .quantumGraphExecutor import QuantumGraphExecutor 

44from .singleQuantumExecutor import SingleQuantumExecutor 

45from .taskFactory import TaskFactory 

46 

47_LOG = logging.getLogger(__name__) 

48 

49 

50# Only way to keep black, flake8, and mypy all happy 

51_dqc = lsst.pipe.base._datasetQueryConstraints 

52 

53 

54class _GraphBuilderLike(Protocol): 

55 def makeGraph( 

56 self, 

57 pipeline: lsst.pipe.base.Pipeline | Iterable[lsst.pipe.base.pipeline.TaskDef], 

58 collections: Any, 

59 run: str, 

60 userQuery: str | None, 

61 datasetQueryConstraint: _dqc.DatasetQueryConstraintVariant = _dqc._ALL, 

62 metadata: Mapping[str, Any] | None = None, 

63 bind: Mapping[str, Any] | None = None, 

64 ) -> lsst.pipe.base.QuantumGraph: 

65 pass 

66 

67 

68class SeparablePipelineExecutor: 

69 """An executor that allows each step of pipeline execution to be 

70 run independently. 

71 

72 The executor can run any or all of the following steps: 

73 

74 * pre-execution initialization 

75 * pipeline building 

76 * quantum graph generation 

77 * quantum graph execution 

78 

79 Any of these steps can also be handed off to external code without 

80 compromising the remaining ones. 

81 

82 Parameters 

83 ---------- 

84 butler : `lsst.daf.butler.Butler` 

85 A Butler whose ``collections`` and ``run`` attributes contain the input 

86 and output collections to use for processing. 

87 clobber_output : `bool`, optional 

88 If set, the pipeline execution overwrites existing output files. 

89 Otherwise, any conflict between existing and new outputs is an error. 

90 skip_existing_in : iterable [`str`], optional 

91 If not empty, the pipeline execution searches the listed collections 

92 for existing outputs, and skips any quanta that have run to completion 

93 (or have no work to do). Otherwise, all tasks are attempted (subject 

94 to ``clobber_output``). 

95 task_factory : `lsst.pipe.base.TaskFactory`, optional 

96 A custom task factory for use in pre-execution and execution. By 

97 default, a new instance of `lsst.ctrl.mpexec.TaskFactory` is used. 

98 """ 

99 

100 def __init__( 

101 self, 

102 butler: Butler, 

103 clobber_output: bool = False, 

104 skip_existing_in: Iterable[str] | None = None, 

105 task_factory: lsst.pipe.base.TaskFactory | None = None, 

106 ): 

107 self._butler = Butler(butler=butler, collections=butler.collections, run=butler.run) 

108 if not self._butler.collections: 

109 raise ValueError("Butler must specify input collections for pipeline.") 

110 if not self._butler.run: 

111 raise ValueError("Butler must specify output run for pipeline.") 

112 

113 self._clobber_output = clobber_output 

114 self._skip_existing_in = list(skip_existing_in) if skip_existing_in else [] 

115 

116 self._task_factory = task_factory if task_factory else TaskFactory() 

117 

118 def pre_execute_qgraph( 

119 self, 

120 graph: lsst.pipe.base.QuantumGraph, 

121 register_dataset_types: bool = False, 

122 save_init_outputs: bool = True, 

123 save_versions: bool = True, 

124 ) -> None: 

125 """Run pre-execution initialization. 

126 

127 This method will be deprecated after DM-38041, to be replaced with a 

128 method that takes either a `~lsst.pipe.base.Pipeline` or a 

129 ``ResolvedPipelineGraph`` instead of a `~lsst.pipe.base.QuantumGraph`. 

130 

131 Parameters 

132 ---------- 

133 graph : `lsst.pipe.base.QuantumGraph` 

134 The quantum graph defining the pipeline and datasets to 

135 be initialized. 

136 register_dataset_types : `bool`, optional 

137 If `True`, register all output dataset types from the pipeline 

138 represented by ``graph``. 

139 save_init_outputs : `bool`, optional 

140 If `True`, create init-output datasets in this object's output run. 

141 save_versions : `bool`, optional 

142 If `True`, save a package versions dataset. 

143 """ 

144 pre_exec_init = PreExecInit(self._butler, self._task_factory, extendRun=self._clobber_output) 

145 pre_exec_init.initialize( 

146 graph=graph, 

147 saveInitOutputs=save_init_outputs, 

148 registerDatasetTypes=register_dataset_types, 

149 saveVersions=save_versions, 

150 ) 

151 

152 def make_pipeline(self, pipeline_uri: str | lsst.resources.ResourcePath) -> lsst.pipe.base.Pipeline: 

153 """Build a pipeline from pipeline and configuration information. 

154 

155 Parameters 

156 ---------- 

157 pipeline_uri : `str` or `lsst.resources.ResourcePath` 

158 URI to a file containing a pipeline definition. A URI fragment may 

159 be used to specify a subset of the pipeline, as described in 

160 :ref:`pipeline-running-intro`. 

161 

162 Returns 

163 ------- 

164 pipeline : `lsst.pipe.base.Pipeline` 

165 The fully-built pipeline. 

166 """ 

167 return lsst.pipe.base.Pipeline.from_uri(pipeline_uri) 

168 

169 def make_quantum_graph( 

170 self, pipeline: lsst.pipe.base.Pipeline, where: str = "", builder: _GraphBuilderLike | None = None 

171 ) -> lsst.pipe.base.QuantumGraph: 

172 """Build a quantum graph from a pipeline and input datasets. 

173 

174 Parameters 

175 ---------- 

176 pipeline : `lsst.pipe.base.Pipeline` 

177 The pipeline for which to generate a quantum graph. 

178 where : `str`, optional 

179 A data ID query that constrains the quanta generated. 

180 builder : `lsst.pipe.base.GraphBuilder`-like, optional 

181 A graph builder that implements a 

182 `~lsst.pipe.base.GraphBuilder.makeGraph` method. By default, a new 

183 instance of `lsst.pipe.base.GraphBuilder` is used. 

184 

185 Returns 

186 ------- 

187 graph : `lsst.pipe.base.QuantumGraph` 

188 The quantum graph for ``pipeline`` as run on the datasets 

189 identified by ``where``. 

190 

191 Notes 

192 ----- 

193 This method does no special handling of empty quantum graphs. If 

194 needed, clients can use `len` to test if the returned graph is empty. 

195 """ 

196 if not builder: 

197 builder = lsst.pipe.base.GraphBuilder( 

198 self._butler.registry, 

199 skipExistingIn=self._skip_existing_in, 

200 clobberOutputs=self._clobber_output, 

201 ) 

202 

203 metadata = { 

204 "input": self._butler.collections, 

205 "output_run": self._butler.run, 

206 "skip_existing_in": self._skip_existing_in, 

207 "skip_existing": bool(self._skip_existing_in), 

208 "data_query": where, 

209 "user": getpass.getuser(), 

210 "time": str(datetime.datetime.now()), 

211 } 

212 assert self._butler.run is not None, "Butler output run collection must be defined" 

213 graph = builder.makeGraph( 

214 pipeline, 

215 self._butler.collections, 

216 self._butler.run, 

217 userQuery=where, 

218 metadata=metadata, 

219 ) 

220 _LOG.info( 

221 "QuantumGraph contains %d quanta for %d tasks, graph ID: %r", 

222 len(graph), 

223 len(graph.taskGraph), 

224 graph.graphID, 

225 ) 

226 return graph 

227 

228 def run_pipeline( 

229 self, 

230 graph: lsst.pipe.base.QuantumGraph, 

231 fail_fast: bool = False, 

232 graph_executor: QuantumGraphExecutor | None = None, 

233 ) -> None: 

234 """Run a pipeline in the form of a prepared quantum graph. 

235 

236 Pre-execution initialization must have already been run; 

237 see `pre_execute_qgraph`. 

238 

239 Parameters 

240 ---------- 

241 graph : `lsst.pipe.base.QuantumGraph` 

242 The pipeline and datasets to execute. 

243 fail_fast : `bool`, optional 

244 If `True`, abort all (parallel) execution if any task fails (only 

245 used with the default graph executor). 

246 graph_executor : `lsst.ctrl.mpexec.QuantumGraphExecutor`, optional 

247 A custom graph executor. By default, a new instance of 

248 `lsst.ctrl.mpexec.MPGraphExecutor` is used. 

249 """ 

250 if not graph_executor: 

251 quantum_executor = SingleQuantumExecutor( 

252 self._butler, 

253 self._task_factory, 

254 skipExistingIn=self._skip_existing_in, 

255 clobberOutputs=self._clobber_output, 

256 ) 

257 graph_executor = MPGraphExecutor( 

258 numProc=math.ceil(0.8 * multiprocessing.cpu_count()), 

259 timeout=2_592_000.0, # In practice, timeout is never helpful; set to 30 days. 

260 quantumExecutor=quantum_executor, 

261 failFast=fail_fast, 

262 ) 

263 # Have to reset connection pool to avoid sharing connections with 

264 # forked processes. 

265 self._butler.registry.resetConnectionPool() 

266 

267 graph_executor.execute(graph)