Coverage for python/lsst/pipe/base/cmdLineTask.py: 17%

230 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-14 16:10 -0700

1# 

2# LSST Data Management System 

3# Copyright 2008-2015 AURA/LSST. 

4# 

5# This product includes software developed by the 

6# LSST Project (http://www.lsst.org/). 

7# 

8# This program is free software: you can redistribute it and/or modify 

9# it under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, either version 3 of the License, or 

11# (at your option) any later version. 

12# 

13# This program is distributed in the hope that it will be useful, 

14# but WITHOUT ANY WARRANTY; without even the implied warranty of 

15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

16# GNU General Public License for more details. 

17# 

18# You should have received a copy of the LSST License Statement and 

19# the GNU General Public License along with this program. If not, 

20# see <https://www.lsstcorp.org/LegalNotices/>. 

21# 

22__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner", "LegacyTaskRunner"] 

23 

24import contextlib 

25import functools 

26import sys 

27import traceback 

28 

29import lsst.afw.table as afwTable 

30import lsst.log 

31import lsst.utils.introspection 

32import lsst.utils.logging 

33from lsst.base import Packages, disableImplicitThreading 

34 

35from .argumentParser import ArgumentParser 

36from .struct import Struct 

37from .task import Task, TaskError 

38 

39 

40def _runPool(pool, timeout, function, iterable): 

41 """Wrapper around ``pool.map_async``, to handle timeout 

42 

43 This is required so as to trigger an immediate interrupt on the 

44 KeyboardInterrupt (Ctrl-C); see 

45 http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool 

46 """ 

47 return pool.map_async(function, iterable).get(timeout) 

48 

49 

@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or
        empty).
    log : `logging.Logger`, optional
        Log object for logging the profile operations.

    If profiling is enabled, the context manager returns the cProfile.Profile
    object (otherwise it returns None), which allows additional control over
    profiling. You can obtain this using the "as" clause, e.g.:

    .. code-block:: python

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like:

    .. code-block:: bash

        python -c 'import pstats; \
            pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Profiling disabled: still behave as a (null) context manager.
        yield
        return
    # Deferred import: cProfile is only needed when profiling is requested.
    from cProfile import Profile

    prof = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    prof.enable()
    try:
        yield prof
    finally:
        # Always stop profiling and persist the stats, even if the profiled
        # code raised; previously an exception skipped the dump entirely and
        # left the profiler enabled.
        prof.disable()
        prof.dump_stats(filename)
        if log is not None:
            log.info("cProfile stats written to %s", filename)

94 

95 

class TaskRunner:
    """Run a command-line task, using `multiprocessing` if requested.

    Parameters
    ----------
    TaskClass : `lsst.pipe.base.Task` subclass
        The class of the task to run.
    parsedCmd : `argparse.Namespace`
        The parsed command-line arguments, as returned by the task's argument
        parser's `~lsst.pipe.base.ArgumentParser.parse_args` method.

        .. warning::

           Do not store ``parsedCmd``, as this instance is pickled (if
           multiprocessing) and parsedCmd may contain non-picklable elements.
           It certainly contains more data than we need to send to each
           instance of the task.
    doReturnResults : `bool`, optional
        Should run return the collected result from each invocation of the
        task? This is only intended for unit tests and similar use. It can
        easily exhaust memory (if the task returns enough data and you call it
        enough times) and it will fail when using multiprocessing if the
        returned data cannot be pickled.

        Note that even if ``doReturnResults`` is False a struct with a single
        member "exitStatus" is returned, with value 0 or 1 to be returned to
        the unix shell.

    Raises
    ------
    ImportError
        Raised if multiprocessing is requested (and the task supports it) but
        the multiprocessing library cannot be imported.

    Notes
    -----
    Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a
    task runner. By default it is this class, but some tasks require a
    subclass. See the manual :ref:`creating-a-command-line-task` for more
    information. See `CmdLineTask.parseAndRun` to see how a task runner is
    used.

    You may use this task runner for your command-line task if your task has a
    ``runDataRef`` method that takes exactly one argument: a butler data
    reference. Otherwise you must provide a task-specific subclass of
    this runner for your task's ``RunnerClass`` that overrides
    `TaskRunner.getTargetList` and possibly
    `TaskRunner.__call__`. See `TaskRunner.getTargetList` for details.

    This design matches the common pattern for command-line tasks: the
    ``runDataRef`` method takes a single data reference, of some suitable name.
    Additional arguments are rare, and if present, require a subclass of
    `TaskRunner` that calls these additional arguments by name.

    Instances of this class must be picklable in order to be compatible with
    multiprocessing. If multiprocessing is requested
    (``parsedCmd.numProcesses > 1``) then `runDataRef` calls
    `prepareForMultiProcessing` to jettison optional non-picklable elements.
    If your task runner is not compatible with multiprocessing then indicate
    this in your task by setting class variable ``canMultiprocess=False``.

    Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires
    specifying a timeout`__. This timeout (in sec) can be specified as the
    ``timeout`` element in the output from `~lsst.pipe.base.ArgumentParser`
    (the ``parsedCmd``), if available, otherwise we use `TaskRunner.TIMEOUT`.

    By default, we disable "implicit" threading -- ie, as provided by
    underlying numerical libraries such as MKL or BLAS. This is designed to
    avoid thread contention both when a single command line task spawns
    multiple processes and when multiple users are running on a shared system.
    Users can override this behaviour by setting the
    ``LSST_ALLOW_IMPLICIT_THREADS`` environment variable.

    .. __: http://bugs.python.org/issue8296
    .. __: http://stackoverflow.com/questions/1408356/
    """

    TIMEOUT = 3600 * 24 * 30
    """Default timeout (seconds) for multiprocessing."""

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        # Copy only the pieces of parsedCmd that each invocation needs; the
        # runner itself is pickled when multiprocessing, so parsedCmd as a
        # whole must not be stored (see class docstring warning).
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        self.numProcesses = int(getattr(parsedCmd, "processes", 1))

        # Fall back to the class default when no (positive) timeout is given;
        # a timeout is needed to make KeyboardInterrupt work (class docstring).
        self.timeout = getattr(parsedCmd, "timeout", None)
        if self.timeout is None or self.timeout <= 0:
            self.timeout = self.TIMEOUT

        if self.numProcesses > 1:
            if not TaskClass.canMultiprocess:
                self.log.warning("This task does not support multiprocessing; using one process")
                self.numProcesses = 1

    def prepareForMultiProcessing(self):
        """Prepare this instance for multiprocessing

        Optional non-picklable elements are removed.

        This is only called if the task is run under multiprocessing.
        """
        # The log object is not picklable; __call__ recreates one per worker.
        self.log = None

    def run(self, parsedCmd):
        """Run the task on all targets.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command `argparse.Namespace`.

        Returns
        -------
        resultList : `list`
            A list of results returned by `TaskRunner.__call__`, or an empty
            list if `TaskRunner.__call__` is not called (e.g. if
            `TaskRunner.precall` returns `False`). See `TaskRunner.__call__`
            for details.

        Notes
        -----
        The task is run under multiprocessing if `TaskRunner.numProcesses`
        is more than 1; otherwise processing is serial.
        """
        resultList = []
        disableImplicitThreading()  # To prevent thread contention
        if self.numProcesses > 1:
            import multiprocessing

            self.prepareForMultiProcessing()
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
            # Bind pool and timeout so mapFunc has the same (func, iterable)
            # signature as the builtin map used in the serial branch.
            mapFunc = functools.partial(_runPool, pool, self.timeout)
        else:
            pool = None
            mapFunc = map

        if self.precall(parsedCmd):
            profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
            log = parsedCmd.log
            targetList = self.getTargetList(parsedCmd)
            if len(targetList) > 0:
                with profile(profileName, log):
                    # Run the task using self.__call__
                    resultList = list(mapFunc(self, targetList))
            else:
                log.warning(
                    "Not running the task because there is no data to process; "
                    'you may preview data using "--show data"'
                )

        if pool is not None:
            pool.close()
            pool.join()

        return resultList

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            The parsed command object returned by
            `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
        kwargs
            Any additional keyword arguments. In the default `TaskRunner` this
            is an empty dict, but having it simplifies overriding `TaskRunner`
            for tasks whose runDataRef method takes additional arguments
            (see case (1) below).

        Notes
        -----
        The default implementation of `TaskRunner.getTargetList` and
        `TaskRunner.__call__` works for any command-line task whose
        ``runDataRef`` method takes exactly one argument: a data reference.
        Otherwise you must provide a variant of TaskRunner that overrides
        `TaskRunner.getTargetList` and possibly `TaskRunner.__call__`.
        There are two cases.

        **Case 1**

        If your command-line task has a ``runDataRef`` method that takes one
        data reference followed by additional arguments, then you need only
        override `TaskRunner.getTargetList` to return the additional
        arguments as an argument dict. To make this easier, your overridden
        version of `~TaskRunner.getTargetList` may call
        `TaskRunner.getTargetList` with the extra arguments as keyword
        arguments. For example, the following adds an argument dict containing
        a single key: "calExpList", whose value is the list of data IDs for
        the calexp ID argument:

        .. code-block:: python

            def getTargetList(parsedCmd):
                return TaskRunner.getTargetList(
                    parsedCmd,
                    calExpList=parsedCmd.calexp.idList
                )

        It is equivalent to this slightly longer version:

        .. code-block:: python

            @staticmethod
            def getTargetList(parsedCmd):
                argDict = dict(calExpList=parsedCmd.calexp.idList)
                return [(dataId, argDict) for dataId in parsedCmd.id.idList]

        **Case 2**

        If your task does not meet condition (1) then you must override both
        TaskRunner.getTargetList and `TaskRunner.__call__`. You may do this
        however you see fit, so long as `TaskRunner.getTargetList`
        returns a list, each of whose elements is sent to
        `TaskRunner.__call__`, which runs your task.
        """
        return [(ref, kwargs) for ref in parsedCmd.id.refList]

    def makeTask(self, parsedCmd=None, args=None):
        """Create a Task instance.

        Parameters
        ----------
        parsedCmd
            Parsed command-line options (used for extra task args by some task
            runners).
        args
            Args tuple passed to `TaskRunner.__call__` (used for extra task
            arguments by some task runners).

        Notes
        -----
        ``makeTask`` can be called with either the ``parsedCmd`` argument or
        ``args`` argument set to None, but it must construct identical Task
        instances in either case.

        Subclasses may ignore this method entirely if they reimplement both
        `TaskRunner.precall` and `TaskRunner.__call__`.
        """
        return self.TaskClass(config=self.config, log=self.log)

    def _precallImpl(self, task, parsedCmd):
        """The main work of `precall`.

        We write package versions, schemas and configs, or compare these to
        existing files on disk if present.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
        task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

    def precall(self, parsedCmd):
        """Hook for code that should run exactly once, before multiprocessing.

        Notes
        -----
        Must return True if `TaskRunner.__call__` should subsequently be
        called.

        .. warning::

            Implementations must take care to ensure that no unpicklable
            attributes are added to the TaskRunner itself, for compatibility
            with multiprocessing.

        The default implementation writes package versions, schemas and
        configs, or compares them to existing files on disk if present.
        """
        task = self.makeTask(parsedCmd=parsedCmd)

        if self.doRaise:
            self._precallImpl(task, parsedCmd)
        else:
            try:
                self._precallImpl(task, parsedCmd)
            except Exception as e:
                task.log.fatal("Failed in task initialization: %s", e)
                # TaskError is an expected, already-explained failure; only
                # print a traceback for unexpected exception types.
                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
                return False
        return True

    def __call__(self, args):
        """Run the Task on a single target.

        Parameters
        ----------
        args
            Arguments for Task.runDataRef()

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Contains these fields if ``doReturnResults`` is `True`:

            - ``dataRef``: the provided data reference.
            - ``metadata``: task metadata after execution of run.
            - ``result``: result returned by task run, or `None` if the task
              fails.
            - ``exitStatus``: 0 if the task completed successfully, 1
              otherwise.

            If ``doReturnResults`` is `False` the struct contains:

            - ``exitStatus``: 0 if the task completed successfully, 1
              otherwise.

        Notes
        -----
        This default implementation assumes that the ``args`` is a tuple
        containing a data reference and a dict of keyword arguments.

        .. warning::

            If you override this method and wish to return something when
            ``doReturnResults`` is `False`, then it must be picklable to
            support multiprocessing and it should be small enough that pickling
            and unpickling do not add excessive overhead.
        """
        dataRef, kwargs = args
        if self.log is None:
            # Recreate the log dropped by prepareForMultiProcessing (we are
            # in a multiprocessing worker here).
            self.log = lsst.utils.logging.getLogger()
        if hasattr(dataRef, "dataId"):
            lsst.log.MDC("LABEL", str(dataRef.dataId))
        elif isinstance(dataRef, (list, tuple)):
            lsst.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
        task = self.makeTask(args=args)
        result = None  # in case the task fails
        exitStatus = 0  # exit status for the shell
        if self.doRaise:
            result = self.runTask(task, dataRef, kwargs)
        else:
            try:
                result = self.runTask(task, dataRef, kwargs)
            except Exception as e:
                # The shell exit value will be the number of dataRefs returning
                # non-zero, so the actual value used here is lost.
                exitStatus = 1

                # don't use a try block as we need to preserve the original
                # exception
                eName = type(e).__name__
                if hasattr(dataRef, "dataId"):
                    task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
                elif isinstance(dataRef, (list, tuple)):
                    task.log.fatal(
                        "Failed on dataIds=[%s]: %s: %s",
                        ", ".join(str(ref.dataId) for ref in dataRef),
                        eName,
                        e,
                    )
                else:
                    task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)

                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)

        # Ensure all errors have been logged and aren't hanging around in a
        # buffer
        sys.stdout.flush()
        sys.stderr.flush()

        task.writeMetadata(dataRef)

        # remove MDC so it does not show up outside of task context
        lsst.log.MDCRemove("LABEL")

        if self.doReturnResults:
            return Struct(
                exitStatus=exitStatus,
                dataRef=dataRef,
                metadata=task.metadata,
                result=result,
            )
        else:
            return Struct(
                exitStatus=exitStatus,
            )

    def runTask(self, task, dataRef, kwargs):
        """Make the actual call to `runDataRef` for this task.

        Parameters
        ----------
        task : `lsst.pipe.base.CmdLineTask` class
            The class of the task to run.
        dataRef
            Butler data reference that contains the data the task will process.
        kwargs
            Any additional keyword arguments. See `TaskRunner.getTargetList`
            above.

        Notes
        -----
        The default implementation of `TaskRunner.runTask` works for any
        command-line task which has a ``runDataRef`` method that takes a data
        reference and an optional set of additional keyword arguments.
        This method returns the results generated by the task's `runDataRef`
        method.

        """
        return task.runDataRef(dataRef, **kwargs)

505 

506 

class LegacyTaskRunner(TaskRunner):
    r"""A `TaskRunner` for `CmdLineTask`\ s which calls the `Task`\ 's `run`
    method on a `dataRef` rather than the `runDataRef` method.
    """

    def runTask(self, task, dataRef, kwargs):
        """Invoke the task via its ``run`` method rather than ``runDataRef``.

        See `TaskRunner.runTask` above for parameter details.
        """
        runMethod = task.run
        return runMethod(dataRef, **kwargs)

517 

518 

class ButlerInitializedTaskRunner(TaskRunner):
    r"""A `TaskRunner` for `CmdLineTask`\ s that require a ``butler`` keyword
    argument to be passed to their constructor.
    """

    def makeTask(self, parsedCmd=None, args=None):
        """A variant of the base version that passes a butler argument to the
        task's constructor.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command-line options, as returned by the
            `~lsst.pipe.base.ArgumentParser`; if specified then args is
            ignored.
        args
            Other arguments; if ``parsedCmd`` is `None` then this must be
            specified.

        Raises
        ------
        RuntimeError
            Raised if ``parsedCmd`` and ``args`` are both `None`.
        """
        # Guard clause first: exactly one source for the butler must exist.
        if parsedCmd is None and args is None:
            raise RuntimeError("parsedCmd or args must be specified")
        if parsedCmd is not None:
            # parsedCmd takes precedence when both are supplied.
            butler = parsedCmd.butler
        else:
            dataRef, _ = args
            butler = dataRef.butlerSubset.butler
        return self.TaskClass(config=self.config, log=self.log, butler=butler)

551 

552 

class CmdLineTask(Task):
    """Base class for command-line tasks: tasks that may be executed from the
    command-line.

    Notes
    -----
    See :ref:`task-framework-overview` to learn what tasks are and
    :ref:`creating-a-command-line-task` for more information about writing
    command-line tasks.

    Subclasses must specify the following class variables:

    - ``ConfigClass``: configuration class for your task (a subclass of
      `lsst.pex.config.Config`, or if your task needs no configuration, then
      `lsst.pex.config.Config` itself).
    - ``_DefaultName``: default name used for this task (a `str`).

    Subclasses may also specify the following class variables:

    - ``RunnerClass``: a task runner class. The default is ``TaskRunner``,
      which works for any task with a runDataRef method that takes exactly one
      argument: a data reference. If your task does not meet this requirement
      then you must supply a variant of ``TaskRunner``; see ``TaskRunner``
      for more information.
    - ``canMultiprocess``: the default is `True`; set `False` if your task
      does not support multiprocessing.

    Subclasses must specify a method named ``runDataRef``:

    - By default ``runDataRef`` accepts a single butler data reference, but
      you can specify an alternate task runner (subclass of ``TaskRunner``) as
      the value of class variable ``RunnerClass`` if your run method needs
      something else.
    - ``runDataRef`` is expected to return its data in a
      `lsst.pipe.base.Struct`. This provides safety for evolution of the task
      since new values may be added without harming existing code.
    - The data returned by ``runDataRef`` must be picklable if your task is to
      support multiprocessing.
    """

    RunnerClass = TaskRunner
    canMultiprocess = True

    @classmethod
    def applyOverrides(cls, config):
        """A hook to allow a task to change the values of its config *after*
        the camera-specific overrides are loaded but before any command-line
        overrides are applied.

        Parameters
        ----------
        config : instance of task's ``ConfigClass``
            Task configuration.

        Notes
        -----
        This is necessary in some cases because the camera-specific overrides
        may retarget subtasks, wiping out changes made in
        ConfigClass.setDefaults. See LSST Trac ticket #2282 for more
        discussion.

        .. warning::

            This is called by CmdLineTask.parseAndRun; other ways of
            constructing a config will not apply these overrides.
        """
        pass

    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """Parse an argument list and run the command.

        Parameters
        ----------
        args : `list`, optional
            List of command-line arguments; if `None` use `sys.argv`.
        config : `lsst.pex.config.Config`-type, optional
            Config for task. If `None` use `Task.ConfigClass`.
        log : `logging.Logger`-type, optional
            Log. If `None` use the default log.
        doReturnResults : `bool`, optional
            If `True`, return the results of this task. Default is `False`.
            This is only intended for unit tests and similar use. It can
            easily exhaust memory (if the task returns enough data and you
            call it enough times) and it will fail when using multiprocessing
            if the returned data cannot be pickled.

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Fields are:

            ``argumentParser``
                the argument parser (`lsst.pipe.base.ArgumentParser`).
            ``parsedCmd``
                the parsed command returned by the argument parser's
                `~lsst.pipe.base.ArgumentParser.parse_args` method
                (`argparse.Namespace`).
            ``taskRunner``
                the task runner used to run the task (an instance of
                `Task.RunnerClass`).
            ``resultList``
                results returned by the task runner's ``run`` method, one entry
                per invocation (`list`). This will typically be a list of
                `Struct`, each containing at least an ``exitStatus`` integer
                (0 or 1); see `Task.RunnerClass` (`TaskRunner` by default) for
                more details.

        Notes
        -----
        Calling this method with no arguments specified is the standard way to
        run a command-line task from the command-line. For an example see
        ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other file in that
        directory.

        If one or more of the dataIds fails then this routine will exit (with
        a status giving the number of failed dataIds) rather than returning
        this struct; this behaviour can be overridden by specifying the
        ``--noExit`` command-line option.
        """
        if args is None:
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            # Reconstruct a command-like string for logging when called
            # programmatically with an explicit argument list.
            commandAsStr = "{}{}".format(lsst.utils.introspection.get_caller_name(stacklevel=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully
        # configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)

        try:
            nFailed = sum(((res.exitStatus != 0) for res in resultList))
        except (TypeError, AttributeError) as e:
            # NOTE: TypeError if resultList is None, AttributeError if it
            # doesn't have exitStatus.
            parsedCmd.log.warning("Unable to retrieve exit status (%s); assuming success", e)
            nFailed = 0

        if nFailed > 0:
            if parsedCmd.noExit:
                parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
            else:
                sys.exit(nFailed)

        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )

    @classmethod
    def _makeArgumentParser(cls):
        """Create and return an argument parser.

        Returns
        -------
        parser : `lsst.pipe.base.ArgumentParser`
            The argument parser for this task.

        Notes
        -----
        By default this returns an `~lsst.pipe.base.ArgumentParser` with one
        ID argument named `--id` of dataset type ``raw``.

        Your task subclass may need to override this method to change the
        dataset type or data ref level, or to add additional data ID arguments.
        If you add additional data ID arguments or your task's runDataRef
        method takes more than a single data reference then you will also have
        to provide a task-specific task runner (see TaskRunner for more
        information).
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(
            name="--id", datasetType="raw", help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3"
        )
        return parser

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """Write the configuration used for processing the data, or check that
        an existing one is equal to the new one if present.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the config. The config is written to
            dataset type `CmdLineTask._getConfigName`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a config already has
            been saved:

            - `True`: overwrite or rename the existing config, depending on
              ``doBackup``.
            - `False`: raise `TaskError` if this config does not match the
              existing config.
        doBackup : `bool`, optional
            Set to `True` to backup the config files if clobbering.
        """
        configName = self._getConfigName()
        if configName is None:
            return
        if clobber:
            butler.put(self.config, configName, doBackup=doBackup)
        elif butler.datasetExists(configName, write=True):
            # this may be subject to a race condition; see #2789
            try:
                oldConfig = butler.get(configName, immediate=True)
            except Exception as exc:
                # Bug fix: the original message contained the literal text
                # "(exc)" instead of interpolating the exception; also chain
                # the original exception so the traceback is preserved.
                raise type(exc)(
                    f"Unable to read stored config file {configName} ({exc}); "
                    "consider using --clobber-config"
                ) from exc

            def logConfigMismatch(msg):
                self.log.fatal("Comparing configuration: %s", msg)

            if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                raise TaskError(
                    f"Config does not match existing task config {configName!r} on disk; "
                    "tasks configurations must be consistent within the same output repo "
                    "(override with --clobber-config)"
                )
        else:
            butler.put(self.config, configName)

    def writeSchemas(self, butler, clobber=False, doBackup=True):
        """Write the schemas returned by
        `lsst.pipe.base.Task.getAllSchemaCatalogs`.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the schema. Each schema is written to the
            dataset type specified as the key in the dict returned by
            `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a schema already has
            been saved:

            - `True`: overwrite or rename the existing schema, depending on
              ``doBackup``.
            - `False`: raise `TaskError` if this schema does not match the
              existing schema.
        doBackup : `bool`, optional
            Set to `True` to backup the schema files if clobbering.

        Notes
        -----
        If ``clobber`` is `False` and an existing schema does not match a
        current schema, then some schemas may have been saved successfully
        and others may not, and there is no easy way to tell which is which.
        """
        for dataset, catalog in self.getAllSchemaCatalogs().items():
            schemaDataset = dataset + "_schema"
            if clobber:
                butler.put(catalog, schemaDataset, doBackup=doBackup)
            elif butler.datasetExists(schemaDataset, write=True):
                oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
                if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
                    raise TaskError(
                        f"New schema does not match schema {dataset!r} on disk; "
                        "schemas must be consistent within the same output repo "
                        "(override with --clobber-config)"
                    )
            else:
                butler.put(catalog, schemaDataset)

    def writeMetadata(self, dataRef):
        """Write the metadata produced from processing the data.

        Parameters
        ----------
        dataRef
            Butler data reference used to write the metadata.
            The metadata is written to dataset type
            `CmdLineTask._getMetadataName`.
        """
        try:
            metadataName = self._getMetadataName()
            if metadataName is not None:
                dataRef.put(self.getFullMetadata(), metadataName)
        except Exception as e:
            # Metadata persistence is best-effort; never fail the task for it.
            self.log.warning("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)

    def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
        """Compare and write package versions.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to read/write the package versions.
        clobber : `bool`, optional
            A boolean flag that controls what happens if versions already have
            been saved:

            - `True`: overwrite or rename the existing version info, depending
              on ``doBackup``.
            - `False`: raise `TaskError` if this version info does not match
              the existing.
        doBackup : `bool`, optional
            If `True` and clobbering, old package version files are backed up.
        dataset : `str`, optional
            Name of dataset to read/write.

        Raises
        ------
        TaskError
            Raised if there is a version mismatch with current and persisted
            lists of package versions.

        Notes
        -----
        Note that this operation is subject to a race condition.
        """
        packages = Packages.fromSystem()

        if clobber:
            return butler.put(packages, dataset, doBackup=doBackup)
        if not butler.datasetExists(dataset, write=True):
            return butler.put(packages, dataset)

        try:
            old = butler.get(dataset, immediate=True)
        except Exception as exc:
            # Chain the original exception so the traceback is preserved.
            raise type(exc)(
                f"Unable to read stored version dataset {dataset} ({exc}); "
                "consider using --clobber-versions or --no-versions"
            ) from exc
        # Note that because we can only detect python modules that have been
        # imported, the stored list of products may be more or less complete
        # than what we have now. What's important is that the products that
        # are in common have the same version.
        diff = packages.difference(old)
        if diff:
            versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
            raise TaskError(
                f"Version mismatch ({versions_str}); consider using --clobber-versions or --no-versions"
            )
        # Update the old set of packages in case we have more packages that
        # haven't been persisted.
        extra = packages.extra(old)
        if extra:
            old.update(packages)
            butler.put(old, dataset, doBackup=doBackup)

    def _getConfigName(self):
        """Get the name of the config dataset type, or `None` if config is not
        to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class
        method.
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """Get the name of the metadata dataset type, or `None` if metadata is
        not to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class
        method.
        """
        return self._DefaultName + "_metadata"