Coverage for python/lsst/pipe/base/cmdLineTask.py: 15%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

230 statements  

1# 

2# LSST Data Management System 

3# Copyright 2008-2015 AURA/LSST. 

4# 

5# This product includes software developed by the 

6# LSST Project (http://www.lsst.org/). 

7# 

8# This program is free software: you can redistribute it and/or modify 

9# it under the terms of the GNU General Public License as published by 

10# the Free Software Foundation, either version 3 of the License, or 

11# (at your option) any later version. 

12# 

13# This program is distributed in the hope that it will be useful, 

14# but WITHOUT ANY WARRANTY; without even the implied warranty of 

15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

16# GNU General Public License for more details. 

17# 

18# You should have received a copy of the LSST License Statement and 

19# the GNU General Public License along with this program. If not, 

20# see <https://www.lsstcorp.org/LegalNotices/>. 

21# 

22__all__ = ["CmdLineTask", "TaskRunner", "ButlerInitializedTaskRunner", "LegacyTaskRunner"] 

23 

24import sys 

25import traceback 

26import functools 

27import contextlib 

28 

29import lsst.log 

30import lsst.utils 

31from lsst.base import disableImplicitThreading 

32import lsst.afw.table as afwTable 

33from .task import Task, TaskError 

34from .struct import Struct 

35from .argumentParser import ArgumentParser 

36from .task_logging import getTaskLogger 

37from lsst.base import Packages 

38 

39 

40def _runPool(pool, timeout, function, iterable): 

41 """Wrapper around ``pool.map_async``, to handle timeout 

42 

43 This is required so as to trigger an immediate interrupt on the 

44 KeyboardInterrupt (Ctrl-C); see 

45 http://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool 

46 """ 

47 return pool.map_async(function, iterable).get(timeout) 

48 

49 

@contextlib.contextmanager
def profile(filename, log=None):
    """Context manager for profiling with cProfile.

    Parameters
    ----------
    filename : `str`
        Filename to which to write profile (profiling disabled if `None` or
        empty).
    log : `logging.Logger`, optional
        Log object for logging the profile operations.

    If profiling is enabled, the context manager returns the cProfile.Profile
    object (otherwise it returns None), which allows additional control over
    profiling.  You can obtain this using the "as" clause, e.g.:

    .. code-block:: python

        with profile(filename) as prof:
            runYourCodeHere()

    The output cumulative profile can be printed with a command-line like:

    .. code-block:: bash

        python -c 'import pstats; \
            pstats.Stats("<filename>").sort_stats("cumtime").print_stats(30)'
    """
    if not filename:
        # Profiling disabled: still act as a context manager, yielding None
        yield
        return
    from cProfile import Profile
    profile = Profile()
    if log is not None:
        log.info("Enabling cProfile profiling")
    profile.enable()
    try:
        yield profile
    finally:
        # Stop profiling and write stats even if the profiled code raised:
        # a partial profile is still useful for debugging, and previously
        # an exception in the body silently discarded all collected stats.
        profile.disable()
        profile.dump_stats(filename)
        if log is not None:
            log.info("cProfile stats written to %s", filename)

93 

94 

class TaskRunner:
    """Run a command-line task, using `multiprocessing` if requested.

    Parameters
    ----------
    TaskClass : `lsst.pipe.base.Task` subclass
        The class of the task to run.
    parsedCmd : `argparse.Namespace`
        The parsed command-line arguments, as returned by the task's argument
        parser's `~lsst.pipe.base.ArgumentParser.parse_args` method.

        .. warning::

           Do not store ``parsedCmd``, as this instance is pickled (if
           multiprocessing) and parsedCmd may contain non-picklable elements.
           It certainly contains more data than we need to send to each
           instance of the task.
    doReturnResults : `bool`, optional
        Should run return the collected result from each invocation of the
        task? This is only intended for unit tests and similar use. It can
        easily exhaust memory (if the task returns enough data and you call it
        enough times) and it will fail when using multiprocessing if the
        returned data cannot be pickled.

        Note that even if ``doReturnResults`` is False a struct with a single
        member "exitStatus" is returned, with value 0 or 1 to be returned to
        the unix shell.

    Raises
    ------
    ImportError
        Raised if multiprocessing is requested (and the task supports it) but
        the multiprocessing library cannot be imported.

    Notes
    -----
    Each command-line task (subclass of `lsst.pipe.base.CmdLineTask`) has a
    task runner. By default it is this class, but some tasks require a
    subclass. See the manual :ref:`creating-a-command-line-task` for more
    information. See `CmdLineTask.parseAndRun` to see how a task runner is
    used.

    You may use this task runner for your command-line task if your task has a
    ``runDataRef`` method that takes exactly one argument: a butler data
    reference. Otherwise you must provide a task-specific subclass of
    this runner for your task's ``RunnerClass`` that overrides
    `TaskRunner.getTargetList` and possibly
    `TaskRunner.__call__`. See `TaskRunner.getTargetList` for details.

    This design matches the common pattern for command-line tasks: the
    ``runDataRef`` method takes a single data reference, of some suitable name.
    Additional arguments are rare, and if present, require a subclass of
    `TaskRunner` that calls these additional arguments by name.

    Instances of this class must be picklable in order to be compatible with
    multiprocessing. If multiprocessing is requested
    (``parsedCmd.numProcesses > 1``) then `runDataRef` calls
    `prepareForMultiProcessing` to jettison optional non-picklable elements.
    If your task runner is not compatible with multiprocessing then indicate
    this in your task by setting class variable ``canMultiprocess=False``.

    Due to a `python bug`__, handling a `KeyboardInterrupt` properly `requires
    specifying a timeout`__. This timeout (in sec) can be specified as the
    ``timeout`` element in the output from `~lsst.pipe.base.ArgumentParser`
    (the ``parsedCmd``), if available, otherwise we use `TaskRunner.TIMEOUT`.

    By default, we disable "implicit" threading -- ie, as provided by
    underlying numerical libraries such as MKL or BLAS. This is designed to
    avoid thread contention both when a single command line task spawns
    multiple processes and when multiple users are running on a shared system.
    Users can override this behaviour by setting the
    ``LSST_ALLOW_IMPLICIT_THREADS`` environment variable.

    .. __: http://bugs.python.org/issue8296
    .. __: http://stackoverflow.com/questions/1408356/
    """

    TIMEOUT = 3600*24*30
    """Default timeout (seconds) for multiprocessing."""

    def __init__(self, TaskClass, parsedCmd, doReturnResults=False):
        # Copy only what each task invocation needs out of parsedCmd;
        # the runner itself must stay picklable (see class docstring).
        self.TaskClass = TaskClass
        self.doReturnResults = bool(doReturnResults)
        self.config = parsedCmd.config
        self.log = parsedCmd.log
        self.doRaise = bool(parsedCmd.doraise)
        self.clobberConfig = bool(parsedCmd.clobberConfig)
        self.doBackup = not bool(parsedCmd.noBackupConfig)
        self.numProcesses = int(getattr(parsedCmd, 'processes', 1))

        # Fall back to the class default when the parser supplied no
        # timeout or a non-positive one.
        self.timeout = getattr(parsedCmd, 'timeout', None)
        if self.timeout is None or self.timeout <= 0:
            self.timeout = self.TIMEOUT

        if self.numProcesses > 1:
            if not TaskClass.canMultiprocess:
                self.log.warning("This task does not support multiprocessing; using one process")
                self.numProcesses = 1

    def prepareForMultiProcessing(self):
        """Prepare this instance for multiprocessing

        Optional non-picklable elements are removed.

        This is only called if the task is run under multiprocessing.
        """
        # The log object is not picklable; __call__ recreates one lazily
        # in each worker process.
        self.log = None

    def run(self, parsedCmd):
        """Run the task on all targets.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command `argparse.Namespace`.

        Returns
        -------
        resultList : `list`
            A list of results returned by `TaskRunner.__call__`, or an empty
            list if `TaskRunner.__call__` is not called (e.g. if
            `TaskRunner.precall` returns `False`). See `TaskRunner.__call__`
            for details.

        Notes
        -----
        The task is run under multiprocessing if `TaskRunner.numProcesses`
        is more than 1; otherwise processing is serial.
        """
        resultList = []
        disableImplicitThreading()  # To prevent thread contention
        if self.numProcesses > 1:
            import multiprocessing
            # Must strip non-picklable state *before* the pool forks/pickles
            # this instance.
            self.prepareForMultiProcessing()
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=1)
            mapFunc = functools.partial(_runPool, pool, self.timeout)
        else:
            pool = None
            mapFunc = map

        if self.precall(parsedCmd):
            profileName = parsedCmd.profile if hasattr(parsedCmd, "profile") else None
            log = parsedCmd.log
            targetList = self.getTargetList(parsedCmd)
            if len(targetList) > 0:
                with profile(profileName, log):
                    # Run the task using self.__call__
                    resultList = list(mapFunc(self, targetList))
            else:
                log.warning("Not running the task because there is no data to process; "
                            "you may preview data using \"--show data\"")

        if pool is not None:
            pool.close()
            pool.join()

        return resultList

    @staticmethod
    def getTargetList(parsedCmd, **kwargs):
        """Get a list of (dataRef, kwargs) for `TaskRunner.__call__`.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            The parsed command object returned by
            `lsst.pipe.base.argumentParser.ArgumentParser.parse_args`.
        kwargs
            Any additional keyword arguments. In the default `TaskRunner` this
            is an empty dict, but having it simplifies overriding `TaskRunner`
            for tasks whose runDataRef method takes additional arguments
            (see case (1) below).

        Notes
        -----
        The default implementation of `TaskRunner.getTargetList` and
        `TaskRunner.__call__` works for any command-line task whose
        ``runDataRef`` method takes exactly one argument: a data reference.
        Otherwise you must provide a variant of TaskRunner that overrides
        `TaskRunner.getTargetList` and possibly `TaskRunner.__call__`.
        There are two cases.

        **Case 1**

        If your command-line task has a ``runDataRef`` method that takes one
        data reference followed by additional arguments, then you need only
        override `TaskRunner.getTargetList` to return the additional
        arguments as an argument dict. To make this easier, your overridden
        version of `~TaskRunner.getTargetList` may call
        `TaskRunner.getTargetList` with the extra arguments as keyword
        arguments. For example, the following adds an argument dict containing
        a single key: "calExpList", whose value is the list of data IDs for
        the calexp ID argument:

        .. code-block:: python

            def getTargetList(parsedCmd):
                return TaskRunner.getTargetList(
                    parsedCmd,
                    calExpList=parsedCmd.calexp.idList
                )

        It is equivalent to this slightly longer version:

        .. code-block:: python

            @staticmethod
            def getTargetList(parsedCmd):
                argDict = dict(calExpList=parsedCmd.calexp.idList)
                return [(dataId, argDict) for dataId in parsedCmd.id.idList]

        **Case 2**

        If your task does not meet condition (1) then you must override both
        TaskRunner.getTargetList and `TaskRunner.__call__`. You may do this
        however you see fit, so long as `TaskRunner.getTargetList`
        returns a list, each of whose elements is sent to
        `TaskRunner.__call__`, which runs your task.
        """
        return [(ref, kwargs) for ref in parsedCmd.id.refList]

    def makeTask(self, parsedCmd=None, args=None):
        """Create a Task instance.

        Parameters
        ----------
        parsedCmd
            Parsed command-line options (used for extra task args by some task
            runners).
        args
            Args tuple passed to `TaskRunner.__call__` (used for extra task
            arguments by some task runners).

        Notes
        -----
        ``makeTask`` can be called with either the ``parsedCmd`` argument or
        ``args`` argument set to None, but it must construct identical Task
        instances in either case.

        Subclasses may ignore this method entirely if they reimplement both
        `TaskRunner.precall` and `TaskRunner.__call__`.
        """
        return self.TaskClass(config=self.config, log=self.log)

    def _precallImpl(self, task, parsedCmd):
        """The main work of `precall`.

        We write package versions, schemas and configs, or compare these to
        existing files on disk if present.
        """
        if not parsedCmd.noVersions:
            task.writePackageVersions(parsedCmd.butler, clobber=parsedCmd.clobberVersions)
        task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)
        task.writeSchemas(parsedCmd.butler, clobber=self.clobberConfig, doBackup=self.doBackup)

    def precall(self, parsedCmd):
        """Hook for code that should run exactly once, before multiprocessing.

        Notes
        -----
        Must return True if `TaskRunner.__call__` should subsequently be
        called.

        .. warning::

           Implementations must take care to ensure that no unpicklable
           attributes are added to the TaskRunner itself, for compatibility
           with multiprocessing.

        The default implementation writes package versions, schemas and
        configs, or compares them to existing files on disk if present.
        """
        task = self.makeTask(parsedCmd=parsedCmd)

        if self.doRaise:
            self._precallImpl(task, parsedCmd)
        else:
            try:
                self._precallImpl(task, parsedCmd)
            except Exception as e:
                task.log.fatal("Failed in task initialization: %s", e)
                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)
                return False
        return True

    def __call__(self, args):
        """Run the Task on a single target.

        Parameters
        ----------
        args
            Arguments for Task.runDataRef()

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Contains these fields if ``doReturnResults`` is `True`:

            - ``dataRef``: the provided data reference.
            - ``metadata``: task metadata after execution of run.
            - ``result``: result returned by task run, or `None` if the task
              fails.
            - ``exitStatus``: 0 if the task completed successfully, 1
              otherwise.

            If ``doReturnResults`` is `False` the struct contains:

            - ``exitStatus``: 0 if the task completed successfully, 1
              otherwise.

        Notes
        -----
        This default implementation assumes that the ``args`` is a tuple
        containing a data reference and a dict of keyword arguments.

        .. warning::

           If you override this method and wish to return something when
           ``doReturnResults`` is `False`, then it must be picklable to
           support multiprocessing and it should be small enough that pickling
           and unpickling do not add excessive overhead.
        """
        dataRef, kwargs = args
        # Recreate the log in a multiprocessing worker, where
        # prepareForMultiProcessing removed it before pickling.
        if self.log is None:
            self.log = getTaskLogger()
        # Tag all log messages from this target with its dataId.
        if hasattr(dataRef, "dataId"):
            lsst.log.MDC("LABEL", str(dataRef.dataId))
        elif isinstance(dataRef, (list, tuple)):
            lsst.log.MDC("LABEL", str([ref.dataId for ref in dataRef if hasattr(ref, "dataId")]))
        task = self.makeTask(args=args)
        result = None  # in case the task fails
        exitStatus = 0  # exit status for the shell
        if self.doRaise:
            result = self.runTask(task, dataRef, kwargs)
        else:
            try:
                result = self.runTask(task, dataRef, kwargs)
            except Exception as e:
                # The shell exit value will be the number of dataRefs returning
                # non-zero, so the actual value used here is lost.
                exitStatus = 1

                # don't use a try block as we need to preserve the original
                # exception
                eName = type(e).__name__
                if hasattr(dataRef, "dataId"):
                    task.log.fatal("Failed on dataId=%s: %s: %s", dataRef.dataId, eName, e)
                elif isinstance(dataRef, (list, tuple)):
                    task.log.fatal("Failed on dataIds=[%s]: %s: %s",
                                   ", ".join(str(ref.dataId) for ref in dataRef), eName, e)
                else:
                    task.log.fatal("Failed on dataRef=%s: %s: %s", dataRef, eName, e)

                if not isinstance(e, TaskError):
                    traceback.print_exc(file=sys.stderr)

        # Ensure all errors have been logged and aren't hanging around in a
        # buffer
        sys.stdout.flush()
        sys.stderr.flush()

        task.writeMetadata(dataRef)

        # remove MDC so it does not show up outside of task context
        lsst.log.MDCRemove("LABEL")

        if self.doReturnResults:
            return Struct(
                exitStatus=exitStatus,
                dataRef=dataRef,
                metadata=task.metadata,
                result=result,
            )
        else:
            return Struct(
                exitStatus=exitStatus,
            )

    def runTask(self, task, dataRef, kwargs):
        """Make the actual call to `runDataRef` for this task.

        Parameters
        ----------
        task : `lsst.pipe.base.CmdLineTask` class
            The class of the task to run.
        dataRef
            Butler data reference that contains the data the task will process.
        kwargs
            Any additional keyword arguments. See `TaskRunner.getTargetList`
            above.

        Notes
        -----
        The default implementation of `TaskRunner.runTask` works for any
        command-line task which has a ``runDataRef`` method that takes a data
        reference and an optional set of additional keyword arguments.
        This method returns the results generated by the task's `runDataRef`
        method.

        """
        return task.runDataRef(dataRef, **kwargs)

497 

498 

class LegacyTaskRunner(TaskRunner):
    r"""A `TaskRunner` for `CmdLineTask`\ s which calls the `Task`\ 's `run`
    method on a `dataRef` rather than the `runDataRef` method.
    """

    def runTask(self, task, dataRef, kwargs):
        """Invoke the task's `run` method rather than `runDataRef`.

        See `TaskRunner.runTask` for a description of the parameters.
        """
        result = task.run(dataRef, **kwargs)
        return result

509 

510 

class ButlerInitializedTaskRunner(TaskRunner):
    r"""A `TaskRunner` for `CmdLineTask`\ s that require a ``butler`` keyword
    argument to be passed to their constructor.
    """

    def makeTask(self, parsedCmd=None, args=None):
        """A variant of the base version that passes a butler argument to the
        task's constructor.

        Parameters
        ----------
        parsedCmd : `argparse.Namespace`
            Parsed command-line options, as returned by the
            `~lsst.pipe.base.ArgumentParser`; if specified then args is
            ignored.
        args
            Other arguments; if ``parsedCmd`` is `None` then this must be
            specified.

        Raises
        ------
        RuntimeError
            Raised if ``parsedCmd`` and ``args`` are both `None`.
        """
        # Exactly one source for the butler must be available.
        if parsedCmd is None and args is None:
            raise RuntimeError("parsedCmd or args must be specified")
        if parsedCmd is not None:
            # parsedCmd takes precedence over args
            butler = parsedCmd.butler
        else:
            # args is a (dataRef, kwargs) tuple as passed to __call__
            dataRef, _ = args
            butler = dataRef.butlerSubset.butler
        return self.TaskClass(config=self.config, log=self.log, butler=butler)

543 

544 

class CmdLineTask(Task):
    """Base class for command-line tasks: tasks that may be executed from the
    command-line.

    Notes
    -----
    See :ref:`task-framework-overview` to learn what tasks are and
    :ref:`creating-a-command-line-task` for more information about writing
    command-line tasks.

    Subclasses must specify the following class variables:

    - ``ConfigClass``: configuration class for your task (a subclass of
      `lsst.pex.config.Config`, or if your task needs no configuration, then
      `lsst.pex.config.Config` itself).
    - ``_DefaultName``: default name used for this task (a `str`).

    Subclasses may also specify the following class variables:

    - ``RunnerClass``: a task runner class. The default is ``TaskRunner``,
      which works for any task with a runDataRef method that takes exactly one
      argument: a data reference. If your task does not meet this requirement
      then you must supply a variant of ``TaskRunner``; see ``TaskRunner``
      for more information.
    - ``canMultiprocess``: the default is `True`; set `False` if your task
      does not support multiprocessing.

    Subclasses must specify a method named ``runDataRef``:

    - By default ``runDataRef`` accepts a single butler data reference, but
      you can specify an alternate task runner (subclass of ``TaskRunner``) as
      the value of class variable ``RunnerClass`` if your run method needs
      something else.
    - ``runDataRef`` is expected to return its data in a
      `lsst.pipe.base.Struct`. This provides safety for evolution of the task
      since new values may be added without harming existing code.
    - The data returned by ``runDataRef`` must be picklable if your task is to
      support multiprocessing.
    """
    RunnerClass = TaskRunner
    canMultiprocess = True

    @classmethod
    def applyOverrides(cls, config):
        """A hook to allow a task to change the values of its config *after*
        the camera-specific overrides are loaded but before any command-line
        overrides are applied.

        Parameters
        ----------
        config : instance of task's ``ConfigClass``
            Task configuration.

        Notes
        -----
        This is necessary in some cases because the camera-specific overrides
        may retarget subtasks, wiping out changes made in
        ConfigClass.setDefaults. See LSST Trac ticket #2282 for more
        discussion.

        .. warning::

           This is called by CmdLineTask.parseAndRun; other ways of
           constructing a config will not apply these overrides.
        """
        pass

    @classmethod
    def parseAndRun(cls, args=None, config=None, log=None, doReturnResults=False):
        """Parse an argument list and run the command.

        Parameters
        ----------
        args : `list`, optional
            List of command-line arguments; if `None` use `sys.argv`.
        config : `lsst.pex.config.Config`-type, optional
            Config for task. If `None` use `Task.ConfigClass`.
        log : `logging.Logger`-type, optional
            Log. If `None` use the default log.
        doReturnResults : `bool`, optional
            If `True`, return the results of this task. Default is `False`.
            This is only intended for unit tests and similar use. It can
            easily exhaust memory (if the task returns enough data and you
            call it enough times) and it will fail when using multiprocessing
            if the returned data cannot be pickled.

        Returns
        -------
        struct : `lsst.pipe.base.Struct`
            Fields are:

            ``argumentParser``
                the argument parser (`lsst.pipe.base.ArgumentParser`).
            ``parsedCmd``
                the parsed command returned by the argument parser's
                `~lsst.pipe.base.ArgumentParser.parse_args` method
                (`argparse.Namespace`).
            ``taskRunner``
                the task runner used to run the task (an instance of
                `Task.RunnerClass`).
            ``resultList``
                results returned by the task runner's ``run`` method, one entry
                per invocation (`list`). This will typically be a list of
                `Struct`, each containing at least an ``exitStatus`` integer
                (0 or 1); see `Task.RunnerClass` (`TaskRunner` by default) for
                more details.

        Notes
        -----
        Calling this method with no arguments specified is the standard way to
        run a command-line task from the command-line. For an example see
        ``pipe_tasks`` ``bin/makeSkyMap.py`` or almost any other file in that
        directory.

        If one or more of the dataIds fails then this routine will exit (with
        a status giving the number of failed dataIds) rather than returning
        this struct; this behaviour can be overridden by specifying the
        ``--noExit`` command-line option.
        """
        if args is None:
            commandAsStr = " ".join(sys.argv)
            args = sys.argv[1:]
        else:
            commandAsStr = "{}{}".format(lsst.utils.get_caller_name(skip=1), tuple(args))

        argumentParser = cls._makeArgumentParser()
        if config is None:
            config = cls.ConfigClass()
        parsedCmd = argumentParser.parse_args(config=config, args=args, log=log, override=cls.applyOverrides)
        # print this message after parsing the command so the log is fully
        # configured
        parsedCmd.log.info("Running: %s", commandAsStr)

        taskRunner = cls.RunnerClass(TaskClass=cls, parsedCmd=parsedCmd, doReturnResults=doReturnResults)
        resultList = taskRunner.run(parsedCmd)

        try:
            nFailed = sum(((res.exitStatus != 0) for res in resultList))
        except (TypeError, AttributeError) as e:
            # NOTE: TypeError if resultList is None, AttributeError if it
            # doesn't have exitStatus.
            parsedCmd.log.warning("Unable to retrieve exit status (%s); assuming success", e)
            nFailed = 0

        if nFailed > 0:
            if parsedCmd.noExit:
                parsedCmd.log.error("%d dataRefs failed; not exiting as --noExit was set", nFailed)
            else:
                sys.exit(nFailed)

        return Struct(
            argumentParser=argumentParser,
            parsedCmd=parsedCmd,
            taskRunner=taskRunner,
            resultList=resultList,
        )

    @classmethod
    def _makeArgumentParser(cls):
        """Create and return an argument parser.

        Returns
        -------
        parser : `lsst.pipe.base.ArgumentParser`
            The argument parser for this task.

        Notes
        -----
        By default this returns an `~lsst.pipe.base.ArgumentParser` with one
        ID argument named `--id` of dataset type ``raw``.

        Your task subclass may need to override this method to change the
        dataset type or data ref level, or to add additional data ID arguments.
        If you add additional data ID arguments or your task's runDataRef
        method takes more than a single data reference then you will also have
        to provide a task-specific task runner (see TaskRunner for more
        information).
        """
        parser = ArgumentParser(name=cls._DefaultName)
        parser.add_id_argument(name="--id", datasetType="raw",
                               help="data IDs, e.g. --id visit=12345 ccd=1,2^0,3")
        return parser

    def writeConfig(self, butler, clobber=False, doBackup=True):
        """Write the configuration used for processing the data, or check that
        an existing one is equal to the new one if present.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the config. The config is written to
            dataset type `CmdLineTask._getConfigName`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a config already has
            been saved:

            - `True`: overwrite or rename the existing config, depending on
              ``doBackup``.
            - `False`: raise `TaskError` if this config does not match the
              existing config.
        doBackup : `bool`, optional
            Set to `True` to backup the config files if clobbering.
        """
        configName = self._getConfigName()
        if configName is None:
            return
        if clobber:
            butler.put(self.config, configName, doBackup=doBackup)
        elif butler.datasetExists(configName, write=True):
            # this may be subject to a race condition; see #2789
            try:
                oldConfig = butler.get(configName, immediate=True)
            except Exception as exc:
                # BUG FIX: "(exc)" was a literal in the f-string, so the
                # underlying error text was never shown; interpolate it,
                # consistent with writePackageVersions below.
                raise type(exc)(f"Unable to read stored config file {configName} ({exc}); "
                                "consider using --clobber-config")

            def logConfigMismatch(msg):
                self.log.fatal("Comparing configuration: %s", msg)

            if not self.config.compare(oldConfig, shortcut=False, output=logConfigMismatch):
                raise TaskError(
                    f"Config does not match existing task config {configName!r} on disk; "
                    "tasks configurations must be consistent within the same output repo "
                    "(override with --clobber-config)")
        else:
            butler.put(self.config, configName)

    def writeSchemas(self, butler, clobber=False, doBackup=True):
        """Write the schemas returned by
        `lsst.pipe.base.Task.getAllSchemaCatalogs`.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to write the schema. Each schema is written to the
            dataset type specified as the key in the dict returned by
            `~lsst.pipe.base.Task.getAllSchemaCatalogs`.
        clobber : `bool`, optional
            A boolean flag that controls what happens if a schema already has
            been saved:

            - `True`: overwrite or rename the existing schema, depending on
              ``doBackup``.
            - `False`: raise `TaskError` if this schema does not match the
              existing schema.
        doBackup : `bool`, optional
            Set to `True` to backup the schema files if clobbering.

        Notes
        -----
        If ``clobber`` is `False` and an existing schema does not match a
        current schema, then some schemas may have been saved successfully
        and others may not, and there is no easy way to tell which is which.
        """
        for dataset, catalog in self.getAllSchemaCatalogs().items():
            schemaDataset = dataset + "_schema"
            if clobber:
                butler.put(catalog, schemaDataset, doBackup=doBackup)
            elif butler.datasetExists(schemaDataset, write=True):
                oldSchema = butler.get(schemaDataset, immediate=True).getSchema()
                if not oldSchema.compare(catalog.getSchema(), afwTable.Schema.IDENTICAL):
                    raise TaskError(
                        f"New schema does not match schema {dataset!r} on disk; "
                        "schemas must be consistent within the same output repo "
                        "(override with --clobber-config)")
            else:
                butler.put(catalog, schemaDataset)

    def writeMetadata(self, dataRef):
        """Write the metadata produced from processing the data.

        Parameters
        ----------
        dataRef
            Butler data reference used to write the metadata.
            The metadata is written to dataset type
            `CmdLineTask._getMetadataName`.
        """
        try:
            metadataName = self._getMetadataName()
            if metadataName is not None:
                dataRef.put(self.getFullMetadata(), metadataName)
        except Exception as e:
            # Metadata persistence is best-effort: log and continue rather
            # than failing the whole task invocation.
            self.log.warning("Could not persist metadata for dataId=%s: %s", dataRef.dataId, e)

    def writePackageVersions(self, butler, clobber=False, doBackup=True, dataset="packages"):
        """Compare and write package versions.

        Parameters
        ----------
        butler : `lsst.daf.persistence.Butler`
            Data butler used to read/write the package versions.
        clobber : `bool`, optional
            A boolean flag that controls what happens if versions already have
            been saved:

            - `True`: overwrite or rename the existing version info, depending
              on ``doBackup``.
            - `False`: raise `TaskError` if this version info does not match
              the existing.
        doBackup : `bool`, optional
            If `True` and clobbering, old package version files are backed up.
        dataset : `str`, optional
            Name of dataset to read/write.

        Raises
        ------
        TaskError
            Raised if there is a version mismatch with current and persisted
            lists of package versions.

        Notes
        -----
        Note that this operation is subject to a race condition.
        """
        packages = Packages.fromSystem()

        if clobber:
            return butler.put(packages, dataset, doBackup=doBackup)
        if not butler.datasetExists(dataset, write=True):
            return butler.put(packages, dataset)

        try:
            old = butler.get(dataset, immediate=True)
        except Exception as exc:
            raise type(exc)(f"Unable to read stored version dataset {dataset} ({exc}); "
                            "consider using --clobber-versions or --no-versions")
        # Note that because we can only detect python modules that have been
        # imported, the stored list of products may be more or less complete
        # than what we have now. What's important is that the products that
        # are in common have the same version.
        diff = packages.difference(old)
        if diff:
            versions_str = "; ".join(f"{pkg}: {diff[pkg][1]} vs {diff[pkg][0]}" for pkg in diff)
            raise TaskError(
                f"Version mismatch ({versions_str}); consider using --clobber-versions or --no-versions")
        # Update the old set of packages in case we have more packages that
        # haven't been persisted.
        extra = packages.extra(old)
        if extra:
            old.update(packages)
            butler.put(old, dataset, doBackup=doBackup)

    def _getConfigName(self):
        """Get the name of the config dataset type, or `None` if config is not
        to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class
        method.
        """
        return self._DefaultName + "_config"

    def _getMetadataName(self):
        """Get the name of the metadata dataset type, or `None` if metadata is
        not to be persisted.

        Notes
        -----
        The name may depend on the config; that is why this is not a class
        method.
        """
        return self._DefaultName + "_metadata"