Coverage for python/lsst/ctrl/mpexec/separablePipelineExecutor.py: 46%

49 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2023-04-20 03:34 -0700

1# This file is part of ctrl_mpexec. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22 

23from __future__ import annotations 

24 

25__all__ = [ 

26 "SeparablePipelineExecutor", 

27] 

28 

29 

30import datetime 

31import getpass 

32import logging 

33import math 

34import multiprocessing 

35from typing import Any, Iterable, Mapping, Protocol 

36 

37import lsst.pipe.base 

38import lsst.resources 

39from lsst.daf.butler import Butler 

40 

41from .mpGraphExecutor import MPGraphExecutor 

42from .preExecInit import PreExecInit 

43from .quantumGraphExecutor import QuantumGraphExecutor 

44from .singleQuantumExecutor import SingleQuantumExecutor 

45from .taskFactory import TaskFactory 

46 

47_LOG = logging.getLogger(__name__) 

48 

49 

50# Only way to keep black, flake8, and mypy all happy 

51_dqc = lsst.pipe.base._datasetQueryConstraints 

52 

53 

class _GraphBuilderLike(Protocol):
    """Structural type for objects that can build a quantum graph.

    Any object providing a compatible ``makeGraph`` method satisfies this
    protocol; it is used to annotate the ``builder`` argument of
    `SeparablePipelineExecutor.make_quantum_graph`.
    """

    def makeGraph(
        self,
        pipeline: lsst.pipe.base.Pipeline | Iterable[lsst.pipe.base.pipeline.TaskDef],
        collections: Any,
        run: str | None,
        userQuery: str | None,
        datasetQueryConstraint: _dqc.DatasetQueryConstraintVariant = _dqc._ALL,
        metadata: Mapping[str, Any] | None = None,
        resolveRefs: bool = False,
        bind: Mapping[str, Any] | None = None,
    ) -> lsst.pipe.base.QuantumGraph:
        """Build a quantum graph; signature only, no implementation here."""
        ...

67 

68 

class SeparablePipelineExecutor:
    """An executor that allows each step of pipeline execution to be
    run independently.

    The executor can run any or all of the following steps:

        * pre-execution initialization
        * pipeline building
        * quantum graph generation
        * quantum graph execution

    Any of these steps can also be handed off to external code without
    compromising the remaining ones.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        A Butler whose ``collections`` and ``run`` attributes contain the input
        and output collections to use for processing.
    clobber_output : `bool`, optional
        If set, the pipeline execution overwrites existing output files.
        Otherwise, any conflict between existing and new outputs is an error.
    skip_existing_in : iterable [`str`], optional
        If not empty, the pipeline execution searches the listed collections
        for existing outputs, and skips any quanta that have run to completion
        (or have no work to do). Otherwise, all tasks are attempted (subject
        to ``clobber_output``). A single `str` is treated as the name of one
        collection rather than being split into characters.
    task_factory : `lsst.pipe.base.TaskFactory`, optional
        A custom task factory for use in pre-execution and execution. By
        default, a new instance of `lsst.ctrl.mpexec.TaskFactory` is used.

    Raises
    ------
    ValueError
        Raised if ``butler`` does not define input collections or an
        output run.
    """

    def __init__(
        self,
        butler: Butler,
        clobber_output: bool = False,
        skip_existing_in: Iterable[str] | None = None,
        task_factory: lsst.pipe.base.TaskFactory | None = None,
    ):
        # Work on our own Butler instance, constructed from the caller's
        # one with the same collections and run.
        self._butler = Butler(butler=butler, collections=butler.collections, run=butler.run)
        if not self._butler.collections:
            raise ValueError("Butler must specify input collections for pipeline.")
        if not self._butler.run:
            raise ValueError("Butler must specify output run for pipeline.")

        self._clobber_output = clobber_output

        if not skip_existing_in:
            self._skip_existing_in: list[str] = []
        elif isinstance(skip_existing_in, str):
            # A str is itself an iterable of str; list() would break a
            # collection name into single characters.
            self._skip_existing_in = [skip_existing_in]
        else:
            self._skip_existing_in = list(skip_existing_in)

        # Compare against None so that a caller-supplied factory is never
        # discarded merely because it evaluates as false.
        self._task_factory = task_factory if task_factory is not None else TaskFactory()

    def pre_execute_qgraph(
        self,
        graph: lsst.pipe.base.QuantumGraph,
        register_dataset_types: bool = False,
        save_init_outputs: bool = True,
        save_versions: bool = True,
    ) -> None:
        """Run pre-execution initialization.

        This method will be deprecated after DM-38041, to be replaced with a
        method that takes either a `~lsst.pipe.base.Pipeline` or a
        ``ResolvedPipelineGraph`` instead of a `~lsst.pipe.base.QuantumGraph`.

        Parameters
        ----------
        graph : `lsst.pipe.base.QuantumGraph`
            The quantum graph defining the pipeline and datasets to
            be initialized.
        register_dataset_types : `bool`, optional
            If `True`, register all output dataset types from the pipeline
            represented by ``graph``.
        save_init_outputs : `bool`, optional
            If `True`, create init-output datasets in this object's output run.
        save_versions : `bool`, optional
            If `True`, save a package versions dataset.
        """
        pre_exec_init = PreExecInit(self._butler, self._task_factory, extendRun=self._clobber_output)
        pre_exec_init.initialize(
            graph=graph,
            saveInitOutputs=save_init_outputs,
            registerDatasetTypes=register_dataset_types,
            saveVersions=save_versions,
        )

    def make_pipeline(self, pipeline_uri: str | lsst.resources.ResourcePath) -> lsst.pipe.base.Pipeline:
        """Build a pipeline from pipeline and configuration information.

        Parameters
        ----------
        pipeline_uri : `str` or `lsst.resources.ResourcePath`
            URI to a file containing a pipeline definition. A URI fragment may
            be used to specify a subset of the pipeline, as described in
            :ref:`pipeline-running-intro`.

        Returns
        -------
        pipeline : `lsst.pipe.base.Pipeline`
            The fully-built pipeline.
        """
        return lsst.pipe.base.Pipeline.from_uri(pipeline_uri)

    def make_quantum_graph(
        self, pipeline: lsst.pipe.base.Pipeline, where: str = "", builder: _GraphBuilderLike | None = None
    ) -> lsst.pipe.base.QuantumGraph:
        """Build a quantum graph from a pipeline and input datasets.

        Parameters
        ----------
        pipeline : `lsst.pipe.base.Pipeline`
            The pipeline for which to generate a quantum graph.
        where : `str`, optional
            A data ID query that constrains the quanta generated.
        builder : `lsst.pipe.base.GraphBuilder`-like, optional
            A graph builder that implements a
            `~lsst.pipe.base.GraphBuilder.makeGraph` method. By default, a new
            instance of `lsst.pipe.base.GraphBuilder` is used.

        Returns
        -------
        graph : `lsst.pipe.base.QuantumGraph`
            The quantum graph for ``pipeline`` as run on the datasets
            identified by ``where``.

        Notes
        -----
        This method does no special handling of empty quantum graphs. If
        needed, clients can use `len` to test if the returned graph is empty.
        """
        # Only fall back to the default when no builder was passed at all;
        # a truthiness test would also replace a supplied builder that
        # happens to evaluate as false.
        if builder is None:
            builder = lsst.pipe.base.GraphBuilder(
                self._butler.registry,
                skipExistingIn=self._skip_existing_in,
                clobberOutputs=self._clobber_output,
            )

        # Provenance stored with the graph. "time" is the naive local time
        # at which the graph was requested.
        metadata = {
            "input": self._butler.collections,
            "output_run": self._butler.run,
            "skip_existing_in": self._skip_existing_in,
            "skip_existing": bool(self._skip_existing_in),
            "data_query": where,
            "user": getpass.getuser(),
            "time": str(datetime.datetime.now()),
        }
        graph = builder.makeGraph(
            pipeline,
            self._butler.collections,
            self._butler.run,
            userQuery=where,
            metadata=metadata,
            resolveRefs=True,
        )
        _LOG.info(
            "QuantumGraph contains %d quanta for %d tasks, graph ID: %r",
            len(graph),
            len(graph.taskGraph),
            graph.graphID,
        )
        return graph

    def run_pipeline(
        self,
        graph: lsst.pipe.base.QuantumGraph,
        fail_fast: bool = False,
        graph_executor: QuantumGraphExecutor | None = None,
    ) -> None:
        """Run a pipeline in the form of a prepared quantum graph.

        Pre-execution initialization must have already been run;
        see `pre_execute_qgraph`.

        Parameters
        ----------
        graph : `lsst.pipe.base.QuantumGraph`
            The pipeline and datasets to execute.
        fail_fast : `bool`, optional
            If `True`, abort all (parallel) execution if any task fails (only
            used with the default graph executor).
        graph_executor : `lsst.ctrl.mpexec.QuantumGraphExecutor`, optional
            A custom graph executor. By default, a new instance of
            `lsst.ctrl.mpexec.MPGraphExecutor` is used.
        """
        # Only build the default executor when none was passed; a supplied
        # executor is always used, even if it evaluates as false.
        if graph_executor is None:
            quantum_executor = SingleQuantumExecutor(
                self._butler,
                self._task_factory,
                skipExistingIn=self._skip_existing_in,
                clobberOutputs=self._clobber_output,
            )
            graph_executor = MPGraphExecutor(
                # Use roughly 80% of the available CPUs, rounded up.
                numProc=math.ceil(0.8 * multiprocessing.cpu_count()),
                timeout=2_592_000.0,  # In practice, timeout is never helpful; set to 30 days.
                quantumExecutor=quantum_executor,
                failFast=fail_fast,
            )
            # Have to reset connection pool to avoid sharing connections with
            # forked processes.
            self._butler.registry.resetConnectionPool()

        graph_executor.execute(graph)