Coverage for python/lsst/pipe/base/all_dimensions_quantum_graph_builder.py: 18%


# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The standard, general-purpose implementation of the QuantumGraph-generation
algorithm.
"""

from __future__ import annotations

__all__ = ("AllDimensionsQuantumGraphBuilder",)

import dataclasses
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, final

from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.utils.timer import timeMethod

from ._datasetQueryConstraints import DatasetQueryConstraintVariant
from .quantum_graph_builder import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphBuilder,
    QuantumGraphBuilderError,
    QuantumGraphSkeleton,
    QuantumKey,
)

if TYPE_CHECKING:
    from lsst.daf.butler import Butler, DataCoordinateQueryResults, DimensionGroup
    from lsst.utils.logging import LsstLogAdapter

    from .pipeline_graph import DatasetTypeNode, PipelineGraph, TaskNode


@final
class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
    """An implementation of `QuantumGraphBuilder` that uses a single large
    query for data IDs covering all dimensions in the pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph. Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository. Should be read-only.
    where : `str`
        Butler expression language constraint to apply to all data IDs.
    dataset_query_constraint : `DatasetQueryConstraintVariant`, optional
        Specification of which overall-input datasets should be used to
        constrain the initial data ID queries. Not including an important
        constraint can result in catastrophically large query results that
        take too long to process, while including too many makes the query
        much more complex, increasing the chances that the database will
        choose a bad (sometimes catastrophically bad) query plan.
    bind : `~collections.abc.Mapping`, optional
        Variable substitutions for the ``where`` expression.
    **kwargs
        Additional keyword arguments forwarded to `QuantumGraphBuilder`.

    Notes
    -----
    This is a general-purpose algorithm that delegates the problem of
    determining which "end" of the pipeline is more constrained (beginning by
    input collection contents vs. end by the ``where`` string) to the database
    query planner, which *usually* does a good job.

    This algorithm suffers from a serious limitation, which we refer to as the
    "tract slicing" problem from its most common variant: the ``where`` string
    and general data ID intersection rules apply to *all* data IDs in the
    graph. For example, if a ``tract`` constraint is present in the ``where``
    string or an overall-input dataset, then it is impossible for any data ID
    that does not overlap that tract to be present anywhere in the pipeline,
    such as a ``{visit, detector}`` combination where the ``visit`` overlaps
    the ``tract`` even if the ``detector`` does not.
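
    Examples
    --------
    A minimal usage sketch, illustrative only: it assumes the base-class
    keyword arguments ``input_collections`` and ``output_run``, the inherited
    ``build`` entry point, and ``Pipeline.to_graph``; the repository path,
    pipeline file, collection names, and data ID values are placeholders::

        from lsst.daf.butler import Butler
        from lsst.pipe.base import Pipeline
        from lsst.pipe.base.all_dimensions_quantum_graph_builder import (
            AllDimensionsQuantumGraphBuilder,
        )

        butler = Butler("/repo/main")  # a read-only client is sufficient
        pipeline_graph = Pipeline.fromFile("my_pipeline.yaml").to_graph()
        builder = AllDimensionsQuantumGraphBuilder(
            pipeline_graph,
            butler,
            where="instrument = 'HSC' AND visit = 12345",
            input_collections=["HSC/defaults"],
            output_run="u/someone/test-run",
        )
        quantum_graph = builder.build()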

    """

    def __init__(
        self,
        pipeline_graph: PipelineGraph,
        butler: Butler,
        *,
        where: str,
        dataset_query_constraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ):
        super().__init__(pipeline_graph, butler, **kwargs)
        self.where = where
        self.dataset_query_constraint = dataset_query_constraint
        self.bind = bind

    @timeMethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        # Docstring inherited.
        # There is some chance that the dimension query for one subgraph would
        # be the same as or a dimension-subset of another. This is an
        # optimization opportunity we're not currently taking advantage of.
        with _AllDimensionsQuery.from_builder(self, subgraph) as query:
            skeleton = self._make_subgraph_skeleton(query)
            self._find_followup_datasets(query, skeleton)
            return skeleton

    @timeMethod
    def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton:
        """Build a `QuantumGraphSkeleton` by iterating over the result rows
        of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.

        Returns
        -------
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph.
        """
        # First we make containers of empty-dimensions quantum and dataset
        # keys, and add those to the skeleton, since empty data IDs are
        # logically subsets of any data ID. We'll copy those to initialize the
        # containers of keys for each result row. We don't ever explicitly add
        # nodes to the skeleton for these, and that's okay because networkx
        # adds nodes implicitly when an edge to that node is added, and we
        # don't want to add nodes for init datasets here.
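        # (Illustrative example, not specific to any pipeline: a global
        # configuration-like input with no dimensions has a single, empty
        # data ID, so its key is copied into every row's containers below and
        # connected to the quanta of whichever tasks consume it in that row.)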

        skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
        empty_dimensions_dataset_keys = {}
        for dataset_type_name in query.empty_dimensions_dataset_types.keys():
            empty_dimensions_dataset_keys[dataset_type_name] = skeleton.add_dataset_node(
                dataset_type_name, self.empty_data_id
            )
        empty_dimensions_quantum_keys = []
        for task_label in query.empty_dimensions_tasks.keys():
            empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
        self.log.info("Iterating over query results to associate quanta with datasets.")
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other. This is the slowest
        # client-side part of QG generation, and it's often the slowest part
        # overall, so inside this loop is where it's really critical to avoid
        # expensive things, especially in the nested loops.
        n_rows = 0
        for common_data_id in query.common_data_ids:
            # Create a data ID for each set of dimensions used by one or more
            # tasks or dataset types, and use that to record all quanta and
            # dataset data IDs for this row.
            dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
            quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
            for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
                data_id = common_data_id.subset(dimensions)
                for dataset_type_name in dataset_type_nodes.keys():
                    dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
                        dataset_type_name, data_id
                    )
                for task_label in task_nodes.keys():
                    quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
            # Whether these quanta are new or existing, we can now associate
            # the dataset data IDs for this row with them. The fact that a
            # quantum data ID and a dataset data ID both came from the same
            # result row is what tells us they should be associated. Many of
            # these associations will be duplicates (because another query row
            # that differed from this one only in irrelevant dimensions already
            # added them), and our use of sets should take care of that.
            for quantum_key in quantum_keys_for_row:
                for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
                    skeleton.add_input_edge(
                        quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
                    )
                for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
                    skeleton.add_output_edge(
                        quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
                    )
            n_rows += 1
        if n_rows == 0:
            query.log_failure(self.log)
        else:
            n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
            self.log.info(
                "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
                n_quanta,
                skeleton.n_nodes - n_quanta,
                skeleton.n_edges,
                n_rows,
            )
        return skeleton

    @timeMethod
    def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
        """Populate `existing_datasets` by performing follow-up queries joined
        to column-subsets of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph, updated in place.
        """
        for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
            data_ids = query.common_data_ids.subset(dimensions, unique=True)
            # Iterate over regular input/output dataset type nodes with these
            # dimensions to find those datasets using straightforward
            # follow-up queries.
            for dataset_type_node in dataset_types_in_group.values():
                if dataset_type_node.name in query.overall_inputs:
                    # Dataset type is an overall input; we always need to try
                    # to find these.
                    count = 0
                    try:
                        for ref in data_ids.find_datasets(dataset_type_node.name, self.input_collections):
                            self.existing_datasets.inputs[
                                DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
                    )
                    continue
                if self.skip_existing_in:
                    # Dataset type is an intermediate or output; need to find
                    # these only if they're from previously executed quanta
                    # that we might skip...
                    count = 0
                    try:
                        for ref in data_ids.find_datasets(dataset_type_node.name, self.skip_existing_in):
                            key = DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            self.existing_datasets.outputs_for_skip[key] = ref
                            count += 1
                            if ref.run == self.output_run:
                                self.existing_datasets.outputs_in_the_way[key] = ref
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.skip_existing_in,
                    )
                if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                    # ...or if they're in the way and would need to be
                    # clobbered (and we haven't already found them in the
                    # previous block).
                    count = 0
                    try:
                        for ref in data_ids.find_datasets(dataset_type_node.name, [self.output_run]):
                            self.existing_datasets.outputs_in_the_way[
                                DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.output_run,
                    )
            del dataset_type_node
            # Iterate over tasks with these dimensions to perform follow-up
            # queries for prerequisite inputs, which may have dimensions that
            # were not in ``common_data_ids`` and/or require temporal joins to
            # calibration validity ranges.
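            # (A typical instance, offered only as an illustration: a
            # calibration input such as "bias" or "flat", which is found by
            # matching its validity range against each quantum's timespan
            # rather than by a plain data ID join.)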

            for task_node in tasks_in_group.values():
                task_prerequisite_info = self.prerequisite_info[task_node.label]
                for connection_name, finder in list(task_prerequisite_info.finders.items()):
                    if finder.lookup_function is not None:
                        self.log.verbose(
                            "Deferring prerequisite input %r of task %r to per-quantum processing "
                            "(lookup function provided).",
                            finder.dataset_type_node.name,
                            task_node.label,
                        )
                        continue
                    # We also fall back to the base class if there is a
                    # nontrivial spatial or temporal join in the lookup.
                    if finder.dataset_skypix or finder.dataset_other_spatial:
                        if task_prerequisite_info.bounds.spatial_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for spatial-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.spatial:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has spatial data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    if finder.dataset_has_timespan:
                        if task_prerequisite_info.bounds.temporal_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for temporal-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.temporal:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has temporal data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    # We have a simple case where we can do a single query
                    # that joins the query we already have for the task data
                    # IDs to the datasets we're looking for.
                    count = 0
                    try:
                        query_results = data_ids.find_related_datasets(
                            finder.dataset_type_node.dataset_type, self.input_collections
                        )
                    except MissingDatasetTypeError:
                        query_results = []
                    for data_id, ref in query_results:
                        dataset_key = PrerequisiteDatasetKey(finder.dataset_type_node.name, ref.id.bytes)
                        quantum_key = QuantumKey(task_node.label, data_id.required_values)
                        # The column-subset operation used to make `data_ids`
                        # from `common_data_ids` can strip away post-query
                        # filtering; e.g. if we start with a {visit, patch}
                        # query but subset down to just {visit}, we can't keep
                        # the patch.region column we need for that filtering.
                        # This means we can get some data IDs that weren't in
                        # the original query (e.g. visits that don't overlap
                        # the same patch, but do overlap some common skypix
                        # ID). We don't want to add quanta with those data IDs
                        # here, which is why we pass
                        # ignore_unrecognized_quanta=True.
                        if skeleton.add_input_edge(quantum_key, dataset_key, ignore_unrecognized_quanta=True):
                            self.existing_datasets.inputs[dataset_key] = ref
                            count += 1
                    # Remove this finder from the mapping so the base class
                    # knows it doesn't have to look for these prerequisites.
                    del task_prerequisite_info.finders[connection_name]
                    self.log.verbose(
                        "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
                        count,
                        finder.dataset_type_node.name,
                        task_node.label,
                    )


@dataclasses.dataclass(eq=False, repr=False)
class _AllDimensionsQuery:
    """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
    per-subgraph state.

    This object should always be constructed by `from_builder`, which returns
    an instance wrapped with a context manager. This controls the lifetime of
    the temporary table referenced by `common_data_ids`.
    """

    subgraph: PipelineGraph
    """Graph of this subset of the pipeline."""

    grouped_by_dimensions: dict[
        DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]
    ] = dataclasses.field(default_factory=dict)
    """The tasks and dataset types of this subset of the pipeline, grouped
    by their dimensions.

    The tasks and dataset types with empty dimensions are not included;
    they're in other attributes since they are usually used differently.
    Prerequisite dataset types are also not included.
    """

    empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
    """The tasks of this subset of the pipeline that have empty dimensions."""

    empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """The dataset types of this subset of the pipeline that have empty
    dimensions.

    Prerequisite dataset types are not included.
    """

    overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """Pipeline graph nodes for all non-prerequisite, non-init overall-input
    dataset types for this subset of the pipeline.
    """

    query_args: dict[str, Any] = dataclasses.field(default_factory=dict)
    """All keyword arguments passed to `lsst.daf.butler.Registry.queryDataIds`.
    """

    common_data_ids: DataCoordinateQueryResults = dataclasses.field(init=False)
    """Results of the materialized initial data ID query."""

    @classmethod
    @contextmanager
    def from_builder(
        cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
    ) -> Iterator[_AllDimensionsQuery]:
        """Construct and run the query, returning an instance guarded by
        a context manager.

        Parameters
        ----------
        builder : `AllDimensionsQuantumGraphBuilder`
            Builder object this helper is associated with.
        subgraph : `pipeline_graph.PipelineGraph`
            Subset of the pipeline being processed.

        Returns
        -------
        context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
            An instance of this class, inside a context manager that manages
            the lifetime of its temporary database table.
        """
        result = cls(subgraph)
        builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
        result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
        (
            result.empty_dimensions_tasks,
            result.empty_dimensions_dataset_types,
        ) = result.grouped_by_dimensions.pop(builder.universe.empty.as_group())
        result.overall_inputs = {
            name: node  # type: ignore
            for name, node in result.subgraph.iter_overall_inputs()
            if not node.is_prerequisite  # type: ignore
        }
        dimension_names: set[str] = set()
        for dimensions_for_group in result.grouped_by_dimensions.keys():
            dimension_names.update(dimensions_for_group.names)
        dimensions = builder.universe.conform(dimension_names)
        builder.log.debug("Building query for data IDs.")
        result.query_args = {
            "dimensions": dimensions,
            "where": builder.where,
            "data_id": result.subgraph.data_id,
            "bind": builder.bind,
        }
        if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
            builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
            result.query_args["datasets"] = {
                name
                for name, dataset_type_node in result.overall_inputs.items()
                if (
                    dataset_type_node.is_initial_query_constraint
                    and name not in result.empty_dimensions_dataset_types
                )
            }
            result.query_args["collections"] = builder.input_collections
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
            builder.log.debug("Not using dataset existence to constrain query.")
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
            constraint = set(builder.dataset_query_constraint)
            inputs = result.overall_inputs.keys() - result.empty_dimensions_dataset_types.keys()
            if remainder := constraint.difference(inputs):
                raise QuantumGraphBuilderError(
                    f"{remainder} dataset type(s) specified as a graph constraint, but"
                    f" do not appear as an overall input to the specified pipeline: {inputs}."
                    " Note that component datasets are not permitted as constraints."
                )
            builder.log.debug(f"Constraining graph query using {constraint}")
            result.query_args["datasets"] = constraint
            result.query_args["collections"] = builder.input_collections
        else:
            raise QuantumGraphBuilderError(
                f"Unable to handle type {builder.dataset_query_constraint} "
                "given as datasetQueryConstraint."
            )
        builder.log.verbose("Querying for data IDs with arguments:")
        builder.log.verbose(" dimensions=%s,", list(result.query_args["dimensions"].names))
        builder.log.verbose(" data_id=%s,", dict(result.query_args["data_id"].required))
        if result.query_args["where"]:
            builder.log.verbose(" where=%s,", repr(result.query_args["where"]))
        if "datasets" in result.query_args:
            builder.log.verbose(" datasets=%s,", list(result.query_args["datasets"]))
        if "collections" in result.query_args:
            builder.log.verbose(" collections=%s,", list(result.query_args["collections"]))
        with builder.butler._query() as query:
            with query.data_ids(**result.query_args).materialize() as common_data_ids:
                builder.log.debug("Expanding data IDs.")
                result.common_data_ids = common_data_ids.expanded()
                yield result

    def log_failure(self, log: LsstLogAdapter) -> None:
        """Emit a series of CRITICAL-level log messages that attempt to
        explain why the initial data ID query returned no rows.

        Parameters
        ----------
        log : `lsst.utils.logging.LsstLogAdapter`
            The logger to use to emit log messages.
        """
        log.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
        for message in self.common_data_ids.explain_no_results():
            log.critical(message)
        log.critical(
            "To reproduce this query for debugging purposes, run "
            "Registry.queryDataIds with these arguments:"
        )
        # We could just repr() the query_args dict to get something
        # the user could make sense of, but it's friendlier to
        # put these args in an easier-to-reconstruct equivalent form
        # so they can read it more easily and copy and paste into
        # a Python terminal.
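        # For illustration only: the keyword spellings below mirror this log
        # output (and may differ slightly from the actual Registry.queryDataIds
        # signature), and the values are hypothetical, but the reconstructed
        # call would look something like
        #
        #     registry.queryDataIds(
        #         dimensions=["instrument", "visit", "detector"],
        #         data_id={"instrument": "HSC"},
        #         where="skymap = 'hsc_rings_v1' AND tract = 9813",
        #         datasets={"raw"},
        #         collections=["HSC/defaults"],
        #     )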

        log.critical(" dimensions=%s,", list(self.query_args["dimensions"].names))
        log.critical(" data_id=%s,", dict(self.query_args["data_id"].required))
        if self.query_args["where"]:
            log.critical(" where=%s,", repr(self.query_args["where"]))
        if "datasets" in self.query_args:
            log.critical(" datasets=%s,", list(self.query_args["datasets"]))
        if "collections" in self.query_args:
            log.critical(" collections=%s,", list(self.query_args["collections"]))