Coverage for python/lsst/pipe/base/all_dimensions_quantum_graph_builder.py: 19%

200 statements  


# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The standard, general-purpose implementation of the QuantumGraph-generation
algorithm.
"""

from __future__ import annotations

__all__ = ("AllDimensionsQuantumGraphBuilder",)

import dataclasses
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from typing import Any, final

from lsst.daf.butler import Butler, DimensionGraph
from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
from lsst.utils.logging import LsstLogAdapter
from lsst.utils.timer import timeMethod

from ._datasetQueryConstraints import DatasetQueryConstraintVariant
from .pipeline_graph import DatasetTypeNode, PipelineGraph, TaskNode
from .quantum_graph_builder import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphBuilder,
    QuantumGraphBuilderError,
    QuantumGraphSkeleton,
    QuantumKey,
)


@final
class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
    """An implementation of `QuantumGraphBuilder` that uses a single large
    query for data IDs covering all dimensions in the pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph. Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository. Should be read-only.
    where : `str`
        Butler expression language constraint to apply to all data IDs.
    dataset_query_constraint : `DatasetQueryConstraintVariant`, optional
        Specification of which overall-input datasets should be used to
        constrain the initial data ID queries. Not including an important
        constraint can result in catastrophically large query results that
        take too long to process, while including too many makes the query
        much more complex, increasing the chances that the database will
        choose a bad (sometimes catastrophically bad) query plan.
    bind : `~collections.abc.Mapping`, optional
        Variable substitutions for the ``where`` expression.
    **kwargs
        Additional keyword arguments forwarded to `QuantumGraphBuilder`.

    Notes
    -----
    This is a general-purpose algorithm that delegates the problem of
    determining which "end" of the pipeline is more constrained (beginning by
    input collection contents vs. end by the ``where`` string) to the database
    query planner, which *usually* does a good job.

    This algorithm suffers from a serious limitation, which we refer to as the
    "tract slicing" problem after its most common variant: the ``where``
    string and general data ID intersection rules apply to *all* data IDs in
    the graph. For example, if a ``tract`` constraint is present in the
    ``where`` string or an overall-input dataset, then it is impossible for
    any data ID that does not overlap that tract to be present anywhere in the
    pipeline, such as a ``{visit, detector}`` combination where the ``visit``
    overlaps the ``tract`` even if the ``detector`` does not.
    """
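    # A minimal construction sketch. This is illustrative only: the repository
    # path, pipeline file, collection names, and data ID constraint below are
    # hypothetical, and ``build()`` is the entry point inherited from
    # `QuantumGraphBuilder`.
    #
    #     from lsst.daf.butler import Butler
    #     from lsst.pipe.base import Pipeline
    #
    #     butler = Butler("/repo/example")
    #     pipeline_graph = Pipeline.from_uri("my_pipeline.yaml").to_graph()
    #     builder = AllDimensionsQuantumGraphBuilder(
    #         pipeline_graph,
    #         butler,
    #         where="instrument = 'HSC' AND visit = 1228",
    #         input_collections=["HSC/defaults"],
    #         output_run="u/someone/example-run",
    #     )
    #     quantum_graph = builder.build()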

    def __init__(
        self,
        pipeline_graph: PipelineGraph,
        butler: Butler,
        *,
        where: str,
        dataset_query_constraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ):
        super().__init__(pipeline_graph, butler, **kwargs)
        self.where = where
        self.dataset_query_constraint = dataset_query_constraint
        self.bind = bind

    @timeMethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        # Docstring inherited.
        # There is some chance that the dimension query for one subgraph would
        # be the same as or a dimension-subset of another. This is an
        # optimization opportunity we're not currently taking advantage of.
        with _AllDimensionsQuery.from_builder(self, subgraph) as query:
            skeleton = self._make_subgraph_skeleton(query)
            self._find_followup_datasets(query, skeleton)
        return skeleton

    @timeMethod
    def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton:
        """Build a `QuantumGraphSkeleton` by iterating over the result rows
        of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.

        Returns
        -------
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph.
        """
        # First we make containers of empty-dimensions quantum and dataset
        # keys, and add those to the skeleton, since empty data IDs are
        # logically subsets of any data ID. We'll copy those to initialize the
        # containers of keys for each result row. We don't ever explicitly add
        # nodes to the skeleton for these, and that's okay because networkx
        # adds nodes implicitly when an edge to that node is added, and we
        # don't want to add nodes for init datasets here.
        skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
        empty_dimensions_dataset_keys = {}
        for dataset_type_name in query.empty_dimensions_dataset_types.keys():
            empty_dimensions_dataset_keys[dataset_type_name] = skeleton.add_dataset_node(
                dataset_type_name, self.empty_data_id
            )
        empty_dimensions_quantum_keys = []
        for task_label in query.empty_dimensions_tasks.keys():
            empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
        self.log.info("Iterating over query results to associate quanta with datasets.")
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other. This is the slowest
        # client-side part of QG generation, and it's often the slowest part
        # overall, so inside this loop is where it's really critical to avoid
        # expensive things, especially in the nested loops.
        n_rows = 0
        for common_data_id in query.common_data_ids:
            # Create a data ID for each set of dimensions used by one or more
            # tasks or dataset types, and use that to record all quanta and
            # dataset data IDs for this row.
            dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
            quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
            for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
                data_id = common_data_id.subset(dimensions)
                for dataset_type_name in dataset_type_nodes.keys():
                    dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
                        dataset_type_name, data_id
                    )
                for task_label in task_nodes.keys():
                    quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
            # Whether these quanta are new or existing, we can now associate
            # the dataset data IDs for this row with them. The fact that a
            # quantum data ID and a dataset data ID both came from the same
            # result row is what tells us they should be associated. Many of
            # these associations will be duplicates (because another query row
            # that differed from this one only in irrelevant dimensions
            # already added them), and our use of sets should take care of
            # that.
            for quantum_key in quantum_keys_for_row:
                for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
                    skeleton.add_input_edge(
                        quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
                    )
                for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
                    skeleton.add_output_edge(
                        quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
                    )
            n_rows += 1
        if n_rows == 0:
            query.log_failure(self.log)
        else:
            n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
            self.log.info(
                "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
                n_quanta,
                skeleton.n_nodes - n_quanta,
                skeleton.n_edges,
                n_rows,
            )
        return skeleton
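    # Illustrative sketch of the per-row fan-out performed above (the task and
    # dataset type names are hypothetical): a single query row whose data ID
    # has dimensions {instrument, visit, detector, tract, patch, band} is
    # subset once per dimension group, e.g.
    #
    #     {instrument, visit, detector} -> quantum node for task "calibrate",
    #                                      dataset node for "calexp"
    #     {tract, patch, band}          -> dataset node for "deepCoadd"
    #
    # and every quantum key produced from the row is then connected, through
    # its task's input/output edges in the pipeline graph, to the dataset keys
    # produced from the same row.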

    @timeMethod
    def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
        """Populate `existing_datasets` by performing follow-up queries joined
        to column-subsets of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph, to which prerequisite-input edges are
            added in place.
        """
        for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
            data_ids = query.common_data_ids.subset(dimensions, unique=True)
            # Iterate over regular input/output dataset type nodes with these
            # dimensions to find those datasets using straightforward
            # follow-up queries.
            for dataset_type_node in dataset_types_in_group.values():
                if dataset_type_node.name in query.overall_inputs:
                    # Dataset type is an overall input; we always need to try
                    # to find these.
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.input_collections):
                            self.existing_datasets.inputs[
                                DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
                    )
                    continue
                if self.skip_existing_in:
                    # Dataset type is an intermediate or output; we only need
                    # to find these if they're from previously executed quanta
                    # that we might skip...
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.skip_existing_in):
                            key = DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            self.existing_datasets.outputs_for_skip[key] = ref
                            count += 1
                            if ref.run == self.output_run:
                                self.existing_datasets.outputs_in_the_way[key] = ref
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.skip_existing_in,
                    )
                if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                    # ...or if they're in the way and would need to be
                    # clobbered (and we haven't already found them in the
                    # previous block).
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, [self.output_run]):
                            self.existing_datasets.outputs_in_the_way[
                                DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.output_run,
                    )
            del dataset_type_node
            # Iterate over tasks with these dimensions to perform follow-up
            # queries for prerequisite inputs, which may have dimensions that
            # were not in ``common_data_ids`` and/or require temporal joins to
            # calibration validity ranges.
            for task_node in tasks_in_group.values():
                task_prerequisite_info = self.prerequisite_info[task_node.label]
                for connection_name, finder in list(task_prerequisite_info.finders.items()):
                    if finder.lookup_function is not None:
                        self.log.verbose(
                            "Deferring prerequisite input %r of task %r to per-quantum processing "
                            "(lookup function provided).",
                            finder.dataset_type_node.name,
                            task_node.label,
                        )
                        continue
                    # We also fall back to the base class if there is a
                    # nontrivial spatial or temporal join in the lookup.
                    if finder.dataset_skypix or finder.dataset_other_spatial:
                        if task_prerequisite_info.bounds.spatial_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for spatial-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.spatial:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has spatial data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    if finder.dataset_has_timespan:
                        if task_prerequisite_info.bounds.temporal_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for temporal-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.temporal:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has temporal data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    # We have a simple case where we can do a single query
                    # that joins the query we already have for the task data
                    # IDs to the datasets we're looking for.
                    count = 0
                    try:
                        query_results = data_ids.findRelatedDatasets(
                            finder.dataset_type_node.dataset_type, self.input_collections
                        )
                    except MissingDatasetTypeError:
                        query_results = []
                    for data_id, ref in query_results:
                        dataset_key = PrerequisiteDatasetKey(finder.dataset_type_node.name, ref.id.bytes)
                        quantum_key = QuantumKey(task_node.label, data_id.values_tuple())
                        # The column-subset operation used to make `data_ids`
                        # from `common_data_ids` can strip away post-query
                        # filtering; e.g. if we start with a {visit, patch}
                        # query but subset down to just {visit}, we can't keep
                        # the patch.region column we need for that filtering.
                        # This means we can get some data IDs that weren't in
                        # the original query (e.g. visits that don't overlap
                        # the same patch, but do overlap the same common
                        # skypix ID). We don't want to add quanta with those
                        # data IDs here, which is why we pass
                        # ignore_unrecognized_quanta=True.
                        if skeleton.add_input_edge(quantum_key, dataset_key, ignore_unrecognized_quanta=True):
                            self.existing_datasets.inputs[dataset_key] = ref
                            count += 1
                    # Remove this finder from the mapping so the base class
                    # knows it doesn't have to look for these prerequisites.
                    del task_prerequisite_info.finders[connection_name]
                    self.log.verbose(
                        "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
                        count,
                        finder.dataset_type_node.name,
                        task_node.label,
                    )


@dataclasses.dataclass(eq=False, repr=False)
class _AllDimensionsQuery:
    """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
    per-subgraph state.

    This object should always be constructed by `from_builder`, which returns
    an instance wrapped with a context manager. This controls the lifetime of
    the temporary table referenced by `common_data_ids`.
    """

    subgraph: PipelineGraph
    """Graph of this subset of the pipeline."""

    grouped_by_dimensions: dict[
        DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]
    ] = dataclasses.field(default_factory=dict)
    """The tasks and dataset types of this subset of the pipeline, grouped
    by their dimensions.

    The tasks and dataset types with empty dimensions are not included;
    they're in other attributes since they are usually used differently.
    Prerequisite dataset types are also not included.
    """

    empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
    """The tasks of this subset of the pipeline that have empty dimensions."""

    empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """The dataset types of this subset of the pipeline that have empty
    dimensions.

    Prerequisite dataset types are not included.
    """

    overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """Pipeline graph nodes for all non-prerequisite, non-init overall-input
    dataset types for this subset of the pipeline.
    """

    query_args: dict[str, Any] = dataclasses.field(default_factory=dict)
    """All keyword arguments passed to `lsst.daf.butler.Registry.queryDataIds`.
    """

    common_data_ids: DataCoordinateQueryResults = dataclasses.field(init=False)
    """Results of the materialized initial data ID query."""
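    # Illustrative (hypothetical) contents of ``query_args``; the keys are
    # exactly the keyword arguments that `from_builder` below passes to
    # `lsst.daf.butler.Registry.queryDataIds`:
    #
    #     {
    #         "dimensions": <DimensionGraph of all task/dataset dimensions>,
    #         "where": "instrument = 'HSC' AND visit = 1228",
    #         "dataId": <data ID of the pipeline subgraph>,
    #         "bind": None,
    #         "datasets": {"raw", "bias", "flat"},
    #         "collections": ["HSC/defaults"],
    #     }
    #
    # ``datasets`` and ``collections`` are only present when the dataset query
    # constraint is ``ALL`` or an explicit list.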

    @classmethod
    @contextmanager
    def from_builder(
        cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
    ) -> Iterator[_AllDimensionsQuery]:
        """Construct and run the query, returning an instance guarded by
        a context manager.

        Parameters
        ----------
        builder : `AllDimensionsQuantumGraphBuilder`
            Builder object this helper is associated with.
        subgraph : `pipeline_graph.PipelineGraph`
            Subset of the pipeline being processed.

        Returns
        -------
        context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
            An instance of this class, inside a context manager that manages
            the lifetime of its temporary database table.
        """
        result = cls(subgraph)
        builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
        result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
        (
            result.empty_dimensions_tasks,
            result.empty_dimensions_dataset_types,
        ) = result.grouped_by_dimensions.pop(builder.universe.empty)
        result.overall_inputs = {
            name: node  # type: ignore
            for name, node in result.subgraph.iter_overall_inputs()
            if not node.is_prerequisite  # type: ignore
        }
        dimension_names: set[str] = set()
        for dimensions_for_group in result.grouped_by_dimensions.keys():
            dimension_names.update(dimensions_for_group.names)
        dimensions = builder.universe.extract(dimension_names)
        builder.log.debug("Building query for data IDs.")
        result.query_args = {
            "dimensions": dimensions,
            "where": builder.where,
            "dataId": result.subgraph.data_id,
            "bind": builder.bind,
        }
        if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
            builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
            result.query_args["datasets"] = {
                name
                for name, dataset_type_node in result.overall_inputs.items()
                if (
                    dataset_type_node.is_initial_query_constraint
                    and name not in result.empty_dimensions_dataset_types
                )
            }
            result.query_args["collections"] = builder.input_collections
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
            builder.log.debug("Not using dataset existence to constrain query.")
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
            constraint = set(builder.dataset_query_constraint)
            inputs = result.overall_inputs.keys() - result.empty_dimensions_dataset_types.keys()
            if remainder := constraint.difference(inputs):
                raise QuantumGraphBuilderError(
                    f"{remainder} dataset type(s) specified as a graph constraint, but"
                    f" do not appear as an overall input to the specified pipeline: {inputs}."
                    " Note that component datasets are not permitted as constraints."
                )
            builder.log.debug(f"Constraining graph query using {constraint}")
            result.query_args["datasets"] = constraint
            result.query_args["collections"] = builder.input_collections
        else:
            raise QuantumGraphBuilderError(
                f"Unable to handle type {builder.dataset_query_constraint} "
                "given as datasetQueryConstraint."
            )
        builder.log.verbose("Querying for data IDs with arguments:")
        builder.log.verbose("    dimensions=%s,", list(result.query_args["dimensions"].names))
        builder.log.verbose("    dataId=%s,", result.query_args["dataId"].byName())
        if result.query_args["where"]:
            builder.log.verbose("    where=%s,", repr(result.query_args["where"]))
        if "datasets" in result.query_args:
            builder.log.verbose("    datasets=%s,", list(result.query_args["datasets"]))
        if "collections" in result.query_args:
            builder.log.verbose("    collections=%s,", list(result.query_args["collections"]))
        with builder.butler.registry.queryDataIds(**result.query_args).materialize() as common_data_ids:
            builder.log.debug("Expanding data IDs.")
            result.common_data_ids = common_data_ids.expanded()
            yield result

    def log_failure(self, log: LsstLogAdapter) -> None:
        """Emit a series of CRITICAL-level log messages that attempt to
        explain why the initial data ID query returned no rows.

        Parameters
        ----------
        log : `lsst.utils.logging.LsstLogAdapter`
            Logger to emit the messages to.
        """
        log.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
        for message in self.common_data_ids.explain_no_results():
            log.critical(message)
        log.critical(
            "To reproduce this query for debugging purposes, run "
            "Registry.queryDataIds with these arguments:"
        )
        # We could just repr() the query_args dict to get something the user
        # could make sense of, but it's friendlier to put these args in an
        # easier-to-reconstruct equivalent form so they can read it more
        # easily and copy and paste into a Python terminal.
        log.critical("    dimensions=%s,", list(self.query_args["dimensions"].names))
        log.critical("    dataId=%s,", self.query_args["dataId"].byName())
        if self.query_args["where"]:
            log.critical("    where=%s,", repr(self.query_args["where"]))
        if "datasets" in self.query_args:
            log.critical("    datasets=%s,", list(self.query_args["datasets"]))
        if "collections" in self.query_args:
            log.critical("    collections=%s,", list(self.query_args["collections"]))
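# A sketch of reproducing a failed initial query interactively, following the
# advice logged by `_AllDimensionsQuery.log_failure` above (the repository
# path and argument values are hypothetical stand-ins for the values printed
# in the CRITICAL log output):
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler("/repo/example")
#     data_ids = butler.registry.queryDataIds(
#         dimensions=["instrument", "visit", "detector", "tract", "patch"],
#         dataId={"skymap": "hsc_rings_v1"},
#         where="instrument = 'HSC' AND tract = 9813",
#         datasets={"raw"},
#         collections=["HSC/defaults"],
#     )
#     print(data_ids.count(exact=False), "rows")
#     print(data_ids.explain_no_results())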