Coverage for python/lsst/ctrl/mpexec/separablePipelineExecutor.py: 45%

52 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-28 10:40 +0000

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23from __future__ import annotations 

24 

25__all__ = [ 

26 "SeparablePipelineExecutor", 

27] 

28 

29 

30import datetime 

31import getpass 

32import logging 

33import math 

34import multiprocessing 

35from collections.abc import Iterable, Mapping 

36from typing import Any, Protocol 

37 

38import lsst.pipe.base 

39import lsst.resources 

40from lsst.daf.butler import Butler 

41 

42from .mpGraphExecutor import MPGraphExecutor 

43from .preExecInit import PreExecInit 

44from .quantumGraphExecutor import QuantumGraphExecutor 

45from .singleQuantumExecutor import SingleQuantumExecutor 

46from .taskFactory import TaskFactory 

47 

48_LOG = logging.getLogger(__name__) 

49 

50 

51# Only way to keep black, flake8, and mypy all happy 

52_dqc = lsst.pipe.base._datasetQueryConstraints 

53 

54 

class _GraphBuilderLike(Protocol):
    """Structural type for objects that can build a quantum graph.

    Any object with a ``makeGraph`` method matching the signature below
    satisfies this protocol; it is the type accepted by the ``builder``
    argument of `SeparablePipelineExecutor.make_quantum_graph`.
    """

    def makeGraph(
        self,
        pipeline: lsst.pipe.base.Pipeline | Iterable[lsst.pipe.base.pipeline.TaskDef],
        collections: Any,
        run: str,
        userQuery: str | None,
        datasetQueryConstraint: _dqc.DatasetQueryConstraintVariant = _dqc._ALL,
        metadata: Mapping[str, Any] | None = None,
        bind: Mapping[str, Any] | None = None,
    ) -> lsst.pipe.base.QuantumGraph:
        """Build a quantum graph (signature declaration only).

        Parameters
        ----------
        pipeline : `lsst.pipe.base.Pipeline` or iterable of \
                `lsst.pipe.base.pipeline.TaskDef`
            The pipeline to build a graph for.
        collections
            The input collections.
        run : `str`
            The output run.
        userQuery : `str` or `None`
            A user-supplied query expression.
        datasetQueryConstraint : ``DatasetQueryConstraintVariant``, optional
            Dataset query constraint; defaults to ``_ALL``.
        metadata : `~collections.abc.Mapping` [`str`, `typing.Any`], optional
            Metadata to associate with the graph.
        bind : `~collections.abc.Mapping` [`str`, `typing.Any`], optional
            Presumably identifier bindings for ``userQuery`` -- confirm
            against the concrete builder implementation.

        Returns
        -------
        graph : `lsst.pipe.base.QuantumGraph`
            The generated quantum graph.
        """
        ...

67 

68 

class SeparablePipelineExecutor:
    """An executor that allows each step of pipeline execution to be
    run independently.

    The executor can run any or all of the following steps:

        * pre-execution initialization
        * pipeline building
        * quantum graph generation
        * quantum graph execution

    Any of these steps can also be handed off to external code without
    compromising the remaining ones.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        A Butler whose ``collections`` and ``run`` attributes contain the input
        and output collections to use for processing.
    clobber_output : `bool`, optional
        If set, the pipeline execution overwrites existing output files.
        Otherwise, any conflict between existing and new outputs is an error.
    skip_existing_in : iterable [`str`], optional
        If not empty, the pipeline execution searches the listed collections
        for existing outputs, and skips any quanta that have run to completion
        (or have no work to do). Otherwise, all tasks are attempted (subject
        to ``clobber_output``). A single `str` is treated as one collection
        name, not as a sequence of characters.
    task_factory : `lsst.pipe.base.TaskFactory`, optional
        A custom task factory for use in pre-execution and execution. By
        default, a new instance of `lsst.ctrl.mpexec.TaskFactory` is used.
    resources : `~lsst.pipe.base.ExecutionResources`, optional
        The resources available to each quantum being executed.

    Raises
    ------
    ValueError
        Raised if ``butler`` does not define input collections or an
        output run.
    """

    def __init__(
        self,
        butler: Butler,
        clobber_output: bool = False,
        skip_existing_in: Iterable[str] | None = None,
        task_factory: lsst.pipe.base.TaskFactory | None = None,
        resources: lsst.pipe.base.ExecutionResources | None = None,
    ):
        # Build a separate Butler from the caller's one, carrying over its
        # collections and run.
        self._butler = Butler(butler=butler, collections=butler.collections, run=butler.run)
        if not self._butler.collections:
            raise ValueError("Butler must specify input collections for pipeline.")
        if not self._butler.run:
            raise ValueError("Butler must specify output run for pipeline.")

        self._clobber_output = clobber_output

        # A bare string is itself an iterable of one-character strings;
        # wrap it so that it is not exploded into single characters.
        if isinstance(skip_existing_in, str):
            skip_existing_in = [skip_existing_in] if skip_existing_in else []
        self._skip_existing_in = list(skip_existing_in) if skip_existing_in is not None else []

        # Compare against None rather than relying on truthiness, so that a
        # caller-supplied factory is never silently replaced.
        self._task_factory = task_factory if task_factory is not None else TaskFactory()
        self.resources = resources

    def pre_execute_qgraph(
        self,
        graph: lsst.pipe.base.QuantumGraph,
        register_dataset_types: bool = False,
        save_init_outputs: bool = True,
        save_versions: bool = True,
    ) -> None:
        """Run pre-execution initialization.

        This method will be deprecated after DM-38041, to be replaced with a
        method that takes either a `~lsst.pipe.base.Pipeline` or a
        ``ResolvedPipelineGraph`` instead of a `~lsst.pipe.base.QuantumGraph`.

        Parameters
        ----------
        graph : `lsst.pipe.base.QuantumGraph`
            The quantum graph defining the pipeline and datasets to
            be initialized.
        register_dataset_types : `bool`, optional
            If `True`, register all output dataset types from the pipeline
            represented by ``graph``.
        save_init_outputs : `bool`, optional
            If `True`, create init-output datasets in this object's output run.
        save_versions : `bool`, optional
            If `True`, save a package versions dataset.
        """
        pre_exec_init = PreExecInit(self._butler, self._task_factory, extendRun=self._clobber_output)
        pre_exec_init.initialize(
            graph=graph,
            saveInitOutputs=save_init_outputs,
            registerDatasetTypes=register_dataset_types,
            saveVersions=save_versions,
        )

    def make_pipeline(self, pipeline_uri: str | lsst.resources.ResourcePath) -> lsst.pipe.base.Pipeline:
        """Build a pipeline from pipeline and configuration information.

        Parameters
        ----------
        pipeline_uri : `str` or `lsst.resources.ResourcePath`
            URI to a file containing a pipeline definition. A URI fragment may
            be used to specify a subset of the pipeline, as described in
            :ref:`pipeline-running-intro`.

        Returns
        -------
        pipeline : `lsst.pipe.base.Pipeline`
            The fully-built pipeline.
        """
        return lsst.pipe.base.Pipeline.from_uri(pipeline_uri)

    def make_quantum_graph(
        self, pipeline: lsst.pipe.base.Pipeline, where: str = "", builder: _GraphBuilderLike | None = None
    ) -> lsst.pipe.base.QuantumGraph:
        """Build a quantum graph from a pipeline and input datasets.

        Parameters
        ----------
        pipeline : `lsst.pipe.base.Pipeline`
            The pipeline for which to generate a quantum graph.
        where : `str`, optional
            A data ID query that constrains the quanta generated.
        builder : `lsst.pipe.base.GraphBuilder`-like, optional
            A graph builder that implements a
            `~lsst.pipe.base.GraphBuilder.makeGraph` method. By default, a new
            instance of `lsst.pipe.base.GraphBuilder` is used.

        Returns
        -------
        graph : `lsst.pipe.base.QuantumGraph`
            The quantum graph for ``pipeline`` as run on the datasets
            identified by ``where``.

        Notes
        -----
        This method does no special handling of empty quantum graphs. If
        needed, clients can use `len` to test if the returned graph is empty.
        """
        # Only fall back to the default builder when none was supplied; a
        # truthiness test would discard a custom builder that evaluates falsy.
        if builder is None:
            builder = lsst.pipe.base.GraphBuilder(
                self._butler.registry,
                skipExistingIn=self._skip_existing_in,
                clobberOutputs=self._clobber_output,
            )

        # Provenance recorded alongside the generated graph.
        metadata = {
            "input": self._butler.collections,
            "output_run": self._butler.run,
            "skip_existing_in": self._skip_existing_in,
            "skip_existing": bool(self._skip_existing_in),
            "data_query": where,
            "user": getpass.getuser(),
            "time": str(datetime.datetime.now()),
        }
        # The constructor already rejects a missing run; this assertion only
        # narrows the optional type for static checkers.
        assert self._butler.run is not None, "Butler output run collection must be defined"
        graph = builder.makeGraph(
            pipeline,
            self._butler.collections,
            self._butler.run,
            userQuery=where,
            metadata=metadata,
        )
        _LOG.info(
            "QuantumGraph contains %d quanta for %d tasks, graph ID: %r",
            len(graph),
            len(graph.taskGraph),
            graph.graphID,
        )
        return graph

    def run_pipeline(
        self,
        graph: lsst.pipe.base.QuantumGraph,
        fail_fast: bool = False,
        graph_executor: QuantumGraphExecutor | None = None,
    ) -> None:
        """Run a pipeline in the form of a prepared quantum graph.

        Pre-execution initialization must have already been run;
        see `pre_execute_qgraph`.

        Parameters
        ----------
        graph : `lsst.pipe.base.QuantumGraph`
            The pipeline and datasets to execute.
        fail_fast : `bool`, optional
            If `True`, abort all (parallel) execution if any task fails (only
            used with the default graph executor).
        graph_executor : `lsst.ctrl.mpexec.QuantumGraphExecutor`, optional
            A custom graph executor. By default, a new instance of
            `lsst.ctrl.mpexec.MPGraphExecutor` is used.
        """
        # Only build the default executor when none was supplied; a
        # truthiness test would discard a custom executor that evaluates
        # falsy.
        if graph_executor is None:
            quantum_executor = SingleQuantumExecutor(
                self._butler,
                self._task_factory,
                skipExistingIn=self._skip_existing_in,
                clobberOutputs=self._clobber_output,
                resources=self.resources,
            )
            graph_executor = MPGraphExecutor(
                # Use 80% of the reported CPU count, rounded up.
                numProc=math.ceil(0.8 * multiprocessing.cpu_count()),
                timeout=2_592_000.0,  # In practice, timeout is never helpful; set to 30 days.
                quantumExecutor=quantum_executor,
                failFast=fail_fast,
            )
            # Have to reset connection pool to avoid sharing connections with
            # forked processes.
            self._butler.registry.resetConnectionPool()

        graph_executor.execute(graph)