# This file is part of ctrl_mpexec.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Module defining CmdLineFwk class and related methods.
"""

__all__ = ['CmdLineFwk']

# -------------------------------
# Imports of standard modules --
# -------------------------------
import fnmatch
import logging
import pickle
import re
import sys
import warnings
import functools
from collections import defaultdict

# -----------------------------
# Imports for other modules --
# -----------------------------
from lsst.daf.butler import Butler, DatasetRef
import lsst.log
import lsst.pex.config as pexConfig
from lsst.pipe.base import GraphBuilder, Pipeline, QuantumGraph
from .cmdLineParser import makeParser
from .dotTools import graph2dot, pipeline2dot
from .mpGraphExecutor import MPGraphExecutor
from .preExecInit import PreExecInit
from .taskFactory import TaskFactory
from . import util

# ----------------------------------
# Local non-exported definitions --
# ----------------------------------

# logging properties
_LOG_PROP = """\
log4j.rootLogger=INFO, A1
log4j.appender.A1=ConsoleAppender
log4j.appender.A1.Target=System.err
log4j.appender.A1.layout=PatternLayout
log4j.appender.A1.layout.ConversionPattern={}
"""

_LOG = logging.getLogger(__name__.partition(".")[2])
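# Note: ``__name__.partition(".")[2]`` drops the first dotted component of
# the module name (presumably ``lsst``), so the logger name matches the rest
# of the package path.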


class _FilteredStream:
    """A file-like object that filters some config fields.

    Note
    ----
    This class depends on implementation details of the
    ``Config.saveToStream`` method, in particular that that method uses a
    single call to ``write()`` to save information about a single config
    field, and that the call combines the comment string(s) for a field with
    the field path and value. This class will not work reliably on the
    "import" strings, so imports should be disabled by passing
    ``skipImports=True`` to ``saveToStream()``.
    """

    def __init__(self, pattern):
        # Ignore case unless the pattern requests NOIGNORECASE; warn when a
        # mixed-case pattern is about to be matched case-insensitively.
        mat = re.search(r"(.*):NOIGNORECASE$", pattern)

        if mat:
            pattern = mat.group(1)
            self._pattern = re.compile(fnmatch.translate(pattern))
        else:
            if pattern != pattern.lower():
                print(f"Matching \"{pattern}\" without regard to case "
                      "(append :NOIGNORECASE to prevent this)", file=sys.stdout)
            self._pattern = re.compile(fnmatch.translate(pattern), re.IGNORECASE)

    def write(self, showStr):
        # Strip off doc string line(s) and cut off at "=" for string matching
        matchStr = showStr.rstrip().split("\n")[-1].split("=")[0]
        if self._pattern.search(matchStr):
            sys.stdout.write(showStr)
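
# A hedged usage sketch: an instance can be passed as the ``stream`` argument
# of ``Config.saveToStream`` (as ``_showConfig`` below does) to print only
# the config fields whose dotted path matches the glob, e.g.:
#
#     stream = _FilteredStream("connections*")  # pattern is illustrative
#     taskDef.config.saveToStream(stream, root="config", skipImports=True)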


# ------------------------
# Exported definitions --
# ------------------------


class CmdLineFwk:
    """PipelineTask framework which executes tasks from command line.

    In addition to executing tasks, this activator provides additional
    methods for task management, such as dumping the configuration or the
    execution chain.
    """

    MP_TIMEOUT = 9999  # Default timeout (sec) for multiprocessing

    def __init__(self):
        pass

    def parseAndRun(self, argv=None):
        """Parse command line and execute all commands.

        This method is the main entry point for this class.

        Parameters
        ----------
        argv : `list` of `str`, optional
            List of command line arguments; if not specified then
            `sys.argv[1:]` is used.
        """
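        # Subcommands handled below: "build" stops after making the
        # pipeline, "qgraph" stops after making the quantum graph, and "run"
        # executes the graph; 0 is returned on success, 2 for an empty graph.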

        if argv is None:
            argv = sys.argv[1:]

        # start with parsing command line, only do partial parsing now as
        # the tasks can add more arguments later
        parser = makeParser()
        args = parser.parse_args(argv)

        # First thing to do is to setup logging.
        self.configLog(args.longlog, args.loglevel)

        taskFactory = TaskFactory()

        # make pipeline out of command line arguments (can return empty
        # pipeline)
        try:
            pipeline = self.makePipeline(args)
        except Exception as exc:
            print("Failed to build pipeline: {}".format(exc), file=sys.stderr)
            raise

        if args.subcommand == "build":
            # stop here but process --show option first
            self.showInfo(args, pipeline)
            return 0

        # make quantum graph
        try:
            qgraph = self.makeGraph(pipeline, args)
        except Exception as exc:
            print("Failed to build graph: {}".format(exc), file=sys.stderr)
            raise

        # optionally dump some info
        self.showInfo(args, pipeline, qgraph)

        if qgraph is None:
            # No need to raise an exception here, the code that makes the
            # graph should have printed a warning message already.
            return 2

        if args.subcommand == "qgraph":
            # stop here
            return 0

        # execute
        if args.subcommand == "run":
            return self.runPipeline(qgraph, taskFactory, args)

    @staticmethod
    def configLog(longlog, logLevels):
        """Configure logging system.

        Parameters
        ----------
        longlog : `bool`
            If `True` then make log messages appear in "long format".
        logLevels : `list` of `tuple`
            Per-component logging levels; each item in the list is a tuple
            (component, level), where ``component`` is a logger name or
            `None` for the root logger and ``level`` is a logging level name
            ('DEBUG', 'INFO', etc.).
        """
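        # Example with illustrative component names: logLevels=[(None,
        # "INFO"), ("lsst.daf.butler", "DEBUG")] sets the root logger to
        # INFO and one component logger to DEBUG.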

        if longlog:
            message_fmt = "%-5p %d{yyyy-MM-ddTHH:mm:ss.SSSZ} %c (%X{LABEL})(%F:%L)- %m%n"
        else:
            message_fmt = "%c %p: %m%n"

        # global logging config
        lsst.log.configure_prop(_LOG_PROP.format(message_fmt))

        # Forward all Python logging to lsst.log
        lgr = logging.getLogger()
        lgr.setLevel(logging.INFO)  # same as in log4cxx config above
        lgr.addHandler(lsst.log.LogHandler())

        # also capture warnings and send them to logging
        logging.captureWarnings(True)

        # configure individual loggers
        for component, level in logLevels:
            level = getattr(lsst.log.Log, level.upper(), None)
            if level is not None:
                # set logging level for lsst.log
                logger = lsst.log.Log.getLogger(component or "")
                logger.setLevel(level)
                # set logging level for Python logging
                pyLevel = lsst.log.LevelTranslator.lsstLog2logging(level)
                logging.getLogger(component).setLevel(pyLevel)

    def makePipeline(self, args):
        """Build a pipeline from command line arguments.

        Parameters
        ----------
        args : `argparse.Namespace`
            Parsed command line.

        Returns
        -------
        pipeline : `~lsst.pipe.base.Pipeline`
        """
        if args.pipeline:
            pipeline = Pipeline.fromFile(args.pipeline)
        else:
            pipeline = Pipeline("anonymous")

        # loop over all pipeline actions and apply them in order
        for action in args.pipeline_actions:

            if action.action == "add_instrument":
                pipeline.addInstrument(action.value)

            elif action.action == "new_task":
                pipeline.addTask(action.value, action.label)

            elif action.action == "delete_task":
                pipeline.removeTask(action.label)

            elif action.action == "config":
                # action value string is "field=value", split it at '='
                field, _, value = action.value.partition("=")
                pipeline.addConfigOverride(action.label, field, value)
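                # e.g. an action value of "doWrite=False" (field name is
                # illustrative) applies field "doWrite" with the string value
                # "False" to the task with this label.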

            elif action.action == "configfile":
                pipeline.addConfigFile(action.label, action.value)

            else:
                raise ValueError(f"Unexpected pipeline action: {action.action}")

        if args.save_pipeline:
            pipeline.toFile(args.save_pipeline)

        if args.pipeline_dot:
            pipeline2dot(pipeline, args.pipeline_dot)

        return pipeline

    def makeGraph(self, pipeline, args):
        """Build a graph from command line arguments.

        Parameters
        ----------
        pipeline : `~lsst.pipe.base.Pipeline`
            Pipeline, can be empty or `None` if the graph is read from a
            pickle file.
        args : `argparse.Namespace`
            Parsed command line.

        Returns
        -------
        graph : `~lsst.pipe.base.QuantumGraph` or `None`
            If the resulting graph is empty then `None` is returned.
        """

        if args.qgraph:

            # Un-pickling a QGraph needs the dimensions universe defined in
            # the registry. The easiest way to get it now is to initialize
            # the whole data butler. Butler requires a run or collection in
            # its constructor, but in this case we do not care about (or do
            # not know) what collection to use, so give it an empty name.
            butler = Butler(config=args.butler_config, collection="")

            with open(args.qgraph, 'rb') as pickleFile:
                qgraph = pickle.load(pickleFile)
                if not isinstance(qgraph, QuantumGraph):
                    raise TypeError("QuantumGraph pickle file has incorrect object type: {}".format(
                        type(qgraph)))

            # pipeline can not be provided in this case
            if pipeline:
                raise ValueError("Pipeline must not be given when quantum graph is read from file.")

        else:

            if not pipeline:
                raise ValueError("Pipeline must be given for quantum graph construction.")

            # build collection names
            inputs = args.input.copy()
            defaultInputs = inputs.pop("", None)
            outputs = args.output.copy()
            defaultOutputs = outputs.pop("", None)

            # Make butler instance. From this Butler we only need the
            # Registry instance. Input/output collections are handled by
            # pre-flight and we don't want to be constrained here by Butler's
            # restrictions on collection names.
            collection = defaultInputs[0] if defaultInputs else None
            butler = Butler(config=args.butler_config, collection=collection)

            # if default input collections are not given on command line then
            # use one from Butler (has to be configured in butler config)
            if not defaultInputs:
                defaultInputs = [butler.collection]
            inputCollections = defaultdict(functools.partial(list, defaultInputs))
            inputCollections.update(inputs)
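            # functools.partial(list, defaultInputs) makes the defaultdict
            # return a fresh copy of defaultInputs for any dataset type not
            # explicitly present in ``inputs``.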

            outputCollection = defaultOutputs
            if outputs:
                # TODO: this may never be supported; maybe we should just
                # remove the command-line option?
                raise NotImplementedError("Different output collections for different dataset "
                                          "types are not currently supported.")

            # make execution plan (a.k.a. DAG) for pipeline
            graphBuilder = GraphBuilder(butler.registry,
                                        skipExisting=args.skip_existing,
                                        clobberExisting=args.clobber_output)
            qgraph = graphBuilder.makeGraph(pipeline, inputCollections, outputCollection, args.data_query)

        # count quanta in the graph; give a warning and return None if it is
        # empty
        nQuanta = qgraph.countQuanta()
        if nQuanta == 0:
            warnings.warn("QuantumGraph is empty", stacklevel=2)
            return None
        else:
            _LOG.info("QuantumGraph contains %d quanta for %d tasks",
                      nQuanta, len(qgraph))

        if args.save_qgraph:
            with open(args.save_qgraph, "wb") as pickleFile:
                pickle.dump(qgraph, pickleFile)

        if args.save_single_quanta:
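            # args.save_single_quanta is used as a str.format template, e.g.
            # "quantum-{}.pickle" (name illustrative), producing one file per
            # quantum index.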

            for iq, sqgraph in enumerate(qgraph.quantaAsQgraph()):
                filename = args.save_single_quanta.format(iq)
                with open(filename, "wb") as pickleFile:
                    pickle.dump(sqgraph, pickleFile)

        if args.qgraph_dot:
            graph2dot(qgraph, args.qgraph_dot)

        return qgraph

    def runPipeline(self, graph, taskFactory, args, butler=None):
        """Execute complete QuantumGraph.

        Parameters
        ----------
        graph : `QuantumGraph`
            Execution graph.
        taskFactory : `~lsst.pipe.base.TaskFactory`
            Task factory.
        args : `argparse.Namespace`
            Parsed command line.
        butler : `~lsst.daf.butler.Butler`, optional
            Data Butler instance; if not defined then a new instance is made
            using command line options.
        """

        # If default output collection is given then use it to override
        # butler-configured one.
        run = args.output.get("", None)

        # make butler instance
        if butler is None:
            butler = Butler(config=args.butler_config, run=run)

        # at this point we require that output collection was defined
        if not butler.run:
            raise ValueError("no output collection defined in data butler")

        # Enable lsstDebug debugging. Note that this is done once in the
        # main process before PreExecInit and it is also repeated before
        # running each task in SingleQuantumExecutor (which may not be
        # needed if `multiprocessing` always uses the fork start method).
        if args.enableLsstDebug:
            try:
                _LOG.debug("Will try to import debug.py")
                import debug  # noqa:F401
            except ImportError:
                _LOG.warning("No 'debug' module found.")

        preExecInit = PreExecInit(butler, taskFactory, args.skip_existing, args.clobber_output)
        preExecInit.initialize(graph,
                               saveInitOutputs=not args.skip_init_writes,
                               registerDatasetTypes=args.register_dataset_types)

        if not args.init_only:
            executor = MPGraphExecutor(numProc=args.processes, timeout=self.MP_TIMEOUT,
                                       skipExisting=args.skip_existing,
                                       clobberOutput=args.clobber_output,
                                       enableLsstDebug=args.enableLsstDebug)
            with util.profile(args.profile, _LOG):
                executor.execute(graph, butler, taskFactory)

    def showInfo(self, args, pipeline, graph=None):
        """Display useful info about pipeline and environment.

        Parameters
        ----------
        args : `argparse.Namespace`
            Parsed command line.
        pipeline : `Pipeline`
            Pipeline definition.
        graph : `QuantumGraph`, optional
            Execution graph.
        """

        showOpts = args.show
        for what in showOpts:
            showCommand, _, showArgs = what.partition("=")

            if showCommand in ["pipeline", "config", "dump-config", "history", "tasks"]:
                if not pipeline:
                    _LOG.warning("Pipeline is required for --show=%s", showCommand)
                    continue

            if showCommand in ["graph", "workflow"]:
                if not graph:
                    _LOG.warning("QuantumGraph is required for --show=%s", showCommand)
                    continue

            if showCommand == "pipeline":
                print(pipeline)
            elif showCommand == "config":
                self._showConfig(pipeline, showArgs, False)
            elif showCommand == "dump-config":
                self._showConfig(pipeline, showArgs, True)
            elif showCommand == "history":
                self._showConfigHistory(pipeline, showArgs)
            elif showCommand == "tasks":
                self._showTaskHierarchy(pipeline)
            elif showCommand == "graph":
                if graph:
                    self._showGraph(graph)
            elif showCommand == "workflow":
                if graph:
                    self._showWorkflow(graph, args)
            else:
                print("Unknown value for show: %s (choose from '%s')" %
                      (what,
                       "', '".join("pipeline config[=XXX] dump-config[=Task] history=XXX "
                                   "tasks graph workflow".split())),
                      file=sys.stderr)
                sys.exit(1)

    def _showConfig(self, pipeline, showArgs, dumpFullConfig):
        """Show task configuration.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition.
        showArgs : `str`
            Defines what to show.
        dumpFullConfig : `bool`
            If `True` then dump complete task configuration with all imports.
        """

        stream = sys.stdout
        if dumpFullConfig:
            # Task label can be given with this option
            taskName = showArgs
        else:
            # The argument can have form [TaskLabel::][pattern:NOIGNORECASE]
            matConfig = re.search(r"^(?:(\w+)::)?(?:config.)?(.+)?", showArgs)
            taskName = matConfig.group(1)
            pattern = matConfig.group(2)
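            # e.g. a showArgs of "isr::doWrite" (label and field are
            # illustrative) yields taskName "isr" and pattern "doWrite";
            # either part may be omitted.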

            if pattern:
                stream = _FilteredStream(pattern)

        tasks = util.filterTasks(pipeline, taskName)
        if not tasks:
            print("Pipeline has no tasks named {}".format(taskName), file=sys.stderr)
            sys.exit(1)

        for taskDef in tasks:
            print("### Configuration for task `{}'".format(taskDef.label))
            taskDef.config.saveToStream(stream, root="config", skipImports=not dumpFullConfig)

    def _showConfigHistory(self, pipeline, showArgs):
        """Show history for task configuration.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition.
        showArgs : `str`
            Defines what to show.
        """

        taskName = None
        pattern = None
        matHistory = re.search(r"^(?:(\w+)::)?(?:config[.])?(.+)", showArgs)
        if matHistory:
            taskName = matHistory.group(1)
            pattern = matHistory.group(2)
        if not pattern:
            print("Please provide a value with --show history (e.g. history=Task::param)",
                  file=sys.stderr)
            sys.exit(1)

        tasks = util.filterTasks(pipeline, taskName)
        if not tasks:
            print(f"Pipeline has no tasks named {taskName}", file=sys.stderr)
            sys.exit(1)

        cpath, _, cname = pattern.rpartition(".")
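        # e.g. a pattern of "astrometry.matcher.maxOffsetPix" (illustrative)
        # gives cpath "astrometry.matcher" and cname "maxOffsetPix"; cpath is
        # empty for a top-level field.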

        found = False
        for taskDef in tasks:
            try:
                if not cpath:
                    # looking for top-level field
                    hconfig = taskDef.config
                else:
                    hconfig = eval("config." + cpath, {}, {"config": taskDef.config})
            except AttributeError:
                # This config object has no such field, but maybe some other
                # task has it.
                continue
            except Exception:
                # Any other exception probably means an error in the
                # expression.
                print(f"ERROR: Failed to evaluate field expression `{pattern}'", file=sys.stderr)
                sys.exit(1)

            if hasattr(hconfig, cname):
                print(f"### Configuration field for task `{taskDef.label}'")
                print(pexConfig.history.format(hconfig, cname))
                found = True

        if not found:
            print(f"None of the tasks has a field named {pattern}", file=sys.stderr)
            sys.exit(1)

549 

550 def _showTaskHierarchy(self, pipeline): 

551 """Print task hierarchy to stdout 

552 

553 Parameters 

554 ---------- 

555 pipeline: `Pipeline` 

556 """ 

557 for taskDef in pipeline.toExpandedPipeline(): 

558 print("### Subtasks for task `{}'".format(taskDef.taskName)) 

559 

560 for configName, taskName in util.subTaskIter(taskDef.config): 

561 print("{}: {}".format(configName, taskName)) 

    def _showGraph(self, graph):
        """Print quanta information to stdout.

        Parameters
        ----------
        graph : `QuantumGraph`
            Execution graph.
        """

        for taskNodes in graph:
            print(taskNodes.taskDef)

            for iq, quantum in enumerate(taskNodes.quanta):
                print("  Quantum {}:".format(iq))
                print("    inputs:")
                for key, refs in quantum.predictedInputs.items():
                    dataIds = ["DataId({})".format(ref.dataId) for ref in refs]
                    print("      {}: [{}]".format(key, ", ".join(dataIds)))
                print("    outputs:")
                for key, refs in quantum.outputs.items():
                    dataIds = ["DataId({})".format(ref.dataId) for ref in refs]
                    print("      {}: [{}]".format(key, ", ".join(dataIds)))

    def _showWorkflow(self, graph, args):
        """Print quanta information and dependencies to stdout.

        The input and predicted output URIs based on the Butler repo are
        printed.

        Parameters
        ----------
        graph : `QuantumGraph`
            Execution graph.
        args : `argparse.Namespace`
            Parsed command line.
        """

        run = args.output.get("", None)
        butler = Butler(config=args.butler_config, run=run)
        hashToParent = {}
        for iq, (taskDef, quantum) in enumerate(graph.quanta()):
            shortname = taskDef.taskName.split('.')[-1]
            print("Quantum {}: {}".format(iq, shortname))
            print("  inputs:")
            for key, refs in quantum.predictedInputs.items():
                for ref in refs:
                    if butler.datastore.exists(ref):
                        print("    {}".format(butler.datastore.getUri(ref)))
                    else:
                        fakeRef = DatasetRef(ref.datasetType, ref.dataId, run=run)
                        print("    {}".format(butler.datastore.getUri(fakeRef, predict=True)))
            print("  outputs:")
            for key, refs in quantum.outputs.items():
                for ref in refs:
                    if butler.datastore.exists(ref):
                        print("    {}".format(butler.datastore.getUri(ref)))
                    else:
                        fakeRef = DatasetRef(ref.datasetType, ref.dataId, run=run)
                        print("    {}".format(butler.datastore.getUri(fakeRef, predict=True)))
                    # Store hash to figure out dependency
                    dhash = hash((key, ref.dataId))
                    hashToParent[dhash] = iq
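
        # Link consumers to producers: a quantum whose predicted input hash
        # matches an output hash recorded above depends on the quantum that
        # produced that dataset.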

        uses = set()
        for iq, (taskDef, quantum) in enumerate(graph.quanta()):
            for key, refs in quantum.predictedInputs.items():
                for ref in refs:
                    dhash = hash((key, ref.dataId))
                    if dhash in hashToParent and (iq, hashToParent[dhash]) not in uses:
                        parentIq = hashToParent[dhash]
                        uses.add((iq, parentIq))  # iq uses parentIq
                        print("Parent Quantum {} - Child Quantum {}".format(parentIq, iq))

631 print("Parent Quantum {} - Child Quantum {}".format(parentIq, iq))