Coverage for python/lsst/pipe/base/all_dimensions_quantum_graph_builder.py: 19% (200 statements), coverage.py v7.3.0, created at 2023-08-31 09:39 +0000

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The standard, general-purpose implementation of the QuantumGraph-generation
algorithm.
"""

from __future__ import annotations

__all__ = ("AllDimensionsQuantumGraphBuilder",)

import dataclasses
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from typing import Any, final

from lsst.daf.butler import Butler, DimensionGraph
from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
from lsst.utils.logging import LsstLogAdapter
from lsst.utils.timer import timeMethod

from ._datasetQueryConstraints import DatasetQueryConstraintVariant
from .pipeline_graph import DatasetTypeNode, PipelineGraph, TaskNode
from .quantum_graph_builder import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphBuilder,
    QuantumGraphBuilderError,
    QuantumGraphSkeleton,
    QuantumKey,
)


@final
class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
    """An implementation of `QuantumGraphBuilder` that uses a single large
    query for data IDs covering all dimensions in the pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph. Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository. Should be read-only.
    where : `str`
        Butler expression language constraint to apply to all data IDs.
    dataset_query_constraint : `DatasetQueryConstraintVariant`, optional
        Specification of which overall-input datasets should be used to
        constrain the initial data ID queries. Not including an important
        constraint can result in catastrophically large query results that take
        too long to process, while including too many makes the query much more
        complex, increasing the chances that the database will choose a bad
        (sometimes catastrophically bad) query plan.
    bind : `~collections.abc.Mapping`, optional
        Variable substitutions for the ``where`` expression.
    **kwargs
        Additional keyword arguments forwarded to `QuantumGraphBuilder`.

    Notes
    -----
    This is a general-purpose algorithm that delegates the problem of
    determining which "end" of the pipeline is more constrained (beginning by
    input collection contents vs. end by the ``where`` string) to the database
    query planner, which *usually* does a good job.

    This algorithm suffers from a serious limitation, which we refer to as the
    "tract slicing" problem from its most common variant: the ``where`` string
    and general data ID intersection rules apply to *all* data IDs in the
    graph. For example, if a ``tract`` constraint is present in the ``where``
    string or an overall-input dataset, then it is impossible for any data ID
    that does not overlap that tract to be present anywhere in the pipeline,
    such as a ``{visit, detector}`` combination where the ``visit`` overlaps
    the ``tract`` even if the ``detector`` does not.
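
    A minimal usage sketch follows; the repository path, ``where`` constraint,
    and collection/run names are illustrative assumptions, and
    ``input_collections``, ``output_run``, and `build` come from the
    `QuantumGraphBuilder` base class::

        # pipeline_graph: a PipelineGraph constructed elsewhere.
        butler = Butler("/path/to/repo")  # hypothetical repository
        builder = AllDimensionsQuantumGraphBuilder(
            pipeline_graph,
            butler,
            where="instrument = 'HSC' AND visit = 1228",  # hypothetical constraint
            input_collections=["HSC/defaults"],
            output_run="u/someone/example-run",
        )
        quantum_graph = builder.build()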

94 """ 

95 

96 def __init__( 

97 self, 

98 pipeline_graph: PipelineGraph, 

99 butler: Butler, 

100 *, 

101 where: str, 

102 dataset_query_constraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL, 

103 bind: Mapping[str, Any] | None = None, 

104 **kwargs: Any, 

105 ): 

106 super().__init__(pipeline_graph, butler, **kwargs) 

107 self.where = where 

108 self.dataset_query_constraint = dataset_query_constraint 

109 self.bind = bind 

110 

111 @timeMethod 

112 def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton: 

113 # Docstring inherited. 

114 # There is some chance that the dimension query for one subgraph would 

115 # be the same as or a dimension-subset of another. This is an 

116 # optimization opportunity we're not currently taking advantage of. 

117 with _AllDimensionsQuery.from_builder(self, subgraph) as query: 

118 skeleton = self._make_subgraph_skeleton(query) 

119 self._find_followup_datasets(query, skeleton) 

120 return skeleton 

121 

122 @timeMethod 

123 def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton: 

124 """Build a `QuantumGraphSkeleton` by iterating over the result rows 

125 of the initial data ID query. 

126 

127 Parameters 

128 ---------- 

129 query : `_AllDimensionsQuery` 

130 Object representing the full-pipeline data ID query. 

131 

132 Returns 

133 ------- 

134 skeleton : `QuantumGraphSkeleton` 

135 Preliminary quantum graph. 

136 """ 

        # First we make containers of empty-dimensions quantum and dataset
        # keys, and add those to the skeleton, since empty data IDs are
        # logically subsets of any data ID. We'll copy those to initialize the
        # containers of keys for each result row. We don't ever explicitly add
        # nodes to the skeleton for these, and that's okay because networkx
        # adds nodes implicitly when an edge to that node is added, and we
        # don't want to add nodes for init datasets here.
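        # For illustration only (the dataset type name is hypothetical): a
        # zero-dimension dataset type such as a pipeline-wide
        # "pipeline_summary" would get a single key here, and every query row
        # processed below starts from a copy of these containers so it can be
        # connected to that key.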

        skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
        empty_dimensions_dataset_keys = {}
        for dataset_type_name in query.empty_dimensions_dataset_types.keys():
            empty_dimensions_dataset_keys[dataset_type_name] = skeleton.add_dataset_node(
                dataset_type_name, self.empty_data_id
            )
        empty_dimensions_quantum_keys = []
        for task_label in query.empty_dimensions_tasks.keys():
            empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
        self.log.info("Iterating over query results to associate quanta with datasets.")
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other. This is the slowest
        # client-side part of QG generation, and it's often the slowest part
        # overall, so inside this loop is where it's really critical to avoid
        # expensive things, especially in the nested loops.
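        # As a purely hypothetical illustration: a row with data ID values
        # like {visit: 903334, detector: 22, tract: 9813, patch: 42} would be
        # subset to {visit, detector} for a per-detector task's quantum and to
        # {tract, patch} for a coadd-level dataset, and those keys would then
        # be linked below because they came from the same row.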

        n_rows = 0
        for common_data_id in query.common_data_ids:
            # Create a data ID for each set of dimensions used by one or more
            # tasks or dataset types, and use that to record all quanta and
            # dataset data IDs for this row.
            dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
            quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
            for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
                data_id = common_data_id.subset(dimensions)
                for dataset_type_name in dataset_type_nodes.keys():
                    dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
                        dataset_type_name, data_id
                    )
                for task_label in task_nodes.keys():
                    quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))

            # Whether these quanta are new or existing, we can now associate
            # the dataset data IDs for this row with them. The fact that a
            # quantum data ID and a dataset data ID both came from the same
            # result row is what tells us they should be associated. Many of
            # these associations will be duplicates (because another query row
            # that differed from this one only in irrelevant dimensions already
            # added them), and our use of sets should take care of that.

            for quantum_key in quantum_keys_for_row:
                for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
                    skeleton.add_input_edge(
                        quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
                    )
                for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
                    skeleton.add_output_edge(
                        quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
                    )
            n_rows += 1
        if n_rows == 0:
            query.log_failure(self.log)
        else:
            n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
            self.log.info(
                "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
                n_quanta,
                skeleton.n_nodes - n_quanta,
                skeleton.n_edges,
                n_rows,
            )
        return skeleton

    @timeMethod
    def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
        """Populate `existing_datasets` by performing follow-up queries joined
        to column-subsets of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph, updated in place with prerequisite
            input edges.
        """

        for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
            data_ids = query.common_data_ids.subset(dimensions, unique=True)
            # Iterate over regular input/output dataset type nodes with these
            # dimensions to find those datasets using straightforward followup
            # queries.
            for dataset_type_node in dataset_types_in_group.values():
                if dataset_type_node.name in query.overall_inputs:
                    # Dataset type is an overall input; we always need to try
                    # to find these.
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.input_collections):
                            self.existing_datasets.inputs[
                                DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
                    )
                    continue

                if self.skip_existing_in:
                    # Dataset type is an intermediate or output; we only need
                    # to find these if they're from previously executed quanta
                    # that we might skip...

                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.skip_existing_in):
                            key = DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            self.existing_datasets.outputs_for_skip[key] = ref
                            count += 1
                            if ref.run == self.output_run:
                                self.existing_datasets.outputs_in_the_way[key] = ref
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.skip_existing_in,
                    )
                if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                    # ...or if they're in the way and would need to be
                    # clobbered (and we haven't already found them in the
                    # previous block).
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, [self.output_run]):
                            self.existing_datasets.outputs_in_the_way[
                                DatasetKey(dataset_type_node.name, ref.dataId.values_tuple())
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.output_run,
                    )
            del dataset_type_node
            # Iterate over tasks with these dimensions to perform follow-up
            # queries for prerequisite inputs, which may have dimensions that
            # were not in ``common_data_ids`` and/or require temporal joins to
            # calibration validity ranges.
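            # For example (hypothetical dataset types): a task reading a
            # "flat" or "bias" calibration needs a join against the
            # calibration's validity range rather than a plain data ID match;
            # simple cases are handled with the follow-up queries below and
            # the rest are deferred to per-quantum processing.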

            for task_node in tasks_in_group.values():
                task_prerequisite_info = self.prerequisite_info[task_node.label]
                for connection_name, finder in list(task_prerequisite_info.finders.items()):
                    if finder.lookup_function is not None:
                        self.log.verbose(
                            "Deferring prerequisite input %r of task %r to per-quantum processing "
                            "(lookup function provided).",
                            finder.dataset_type_node.name,
                            task_node.label,
                        )
                        continue
                    # We also fall back to the base class if there is a
                    # nontrivial spatial or temporal join in the lookup.
                    if finder.dataset_skypix or finder.dataset_other_spatial:
                        if task_prerequisite_info.bounds.spatial_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for spatial-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.spatial:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has spatial data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    if finder.dataset_has_timespan:

                        if task_prerequisite_info.bounds.temporal_connections:

                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for temporal-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.temporal:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has temporal data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    # We have a simple case where we can do a single query
                    # that joins the query we already have for the task data
                    # IDs to the datasets we're looking for.
                    count = 0
                    try:
                        query_results = data_ids.findRelatedDatasets(
                            finder.dataset_type_node.dataset_type, self.input_collections
                        )
                    except MissingDatasetTypeError:
                        query_results = []
                    for data_id, ref in query_results:
                        dataset_key = PrerequisiteDatasetKey(finder.dataset_type_node.name, ref.id.bytes)
                        quantum_key = QuantumKey(task_node.label, data_id.values_tuple())

                        # The column-subset operation used to make `data_ids`
                        # from `common_data_ids` can strip away post-query
                        # filtering; e.g. if we start with a {visit, patch}
                        # query but subset down to just {visit}, we can't keep
                        # the patch.region column we need for that filtering.
                        # This means we can get some data IDs that weren't in
                        # the original query (e.g. visits that don't overlap
                        # the same patch, but do overlap some common skypix
                        # ID). We don't want to add quanta with those data
                        # IDs, which is why we pass
                        # ignore_unrecognized_quanta=True here.

                        if skeleton.add_input_edge(quantum_key, dataset_key, ignore_unrecognized_quanta=True):
                            self.existing_datasets.inputs[dataset_key] = ref
                            count += 1
                    # Remove this finder from the mapping so the base class
                    # knows it doesn't have to look for these prerequisites.
                    del task_prerequisite_info.finders[connection_name]
                    self.log.verbose(
                        "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
                        count,
                        finder.dataset_type_node.name,
                        task_node.label,
                    )


@dataclasses.dataclass(eq=False, repr=False)
class _AllDimensionsQuery:
    """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
    per-subgraph state.

    This object should always be constructed by `from_builder`, which returns
    an instance wrapped with a context manager. This controls the lifetime of
    the temporary table referenced by `common_data_ids`.
    """

    subgraph: PipelineGraph
    """Graph of this subset of the pipeline."""

    grouped_by_dimensions: dict[
        DimensionGraph, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]
    ] = dataclasses.field(default_factory=dict)
    """The tasks and dataset types of this subset of the pipeline, grouped
    by their dimensions.

    The tasks and dataset types with empty dimensions are not included; they're
    in other attributes since they are usually used differently. Prerequisite
    dataset types are also not included.
    """

    empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
    """The tasks of this subset of the pipeline that have empty dimensions."""

    empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """The dataset types of this subset of the pipeline that have empty
    dimensions.

    Prerequisite dataset types are not included.
    """

    overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """Pipeline graph nodes for all non-prerequisite, non-init overall-input
    dataset types for this subset of the pipeline.
    """

    query_args: dict[str, Any] = dataclasses.field(default_factory=dict)
    """All keyword arguments passed to `lsst.daf.butler.Registry.queryDataIds`.
    """

    common_data_ids: DataCoordinateQueryResults = dataclasses.field(init=False)
    """Results of the materialized initial data ID query."""

    @classmethod
    @contextmanager
    def from_builder(
        cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
    ) -> Iterator[_AllDimensionsQuery]:
        """Construct and run the query, returning an instance guarded by
        a context manager.

        Parameters
        ----------
        builder : `AllDimensionsQuantumGraphBuilder`
            Builder object this helper is associated with.
        subgraph : `pipeline_graph.PipelineGraph`
            Subset of the pipeline being processed.

        Returns
        -------
        context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
            An instance of this class, inside a context manager that manages
            the lifetime of its temporary database table.
        """

        result = cls(subgraph)
        builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
        result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
        (
            result.empty_dimensions_tasks,
            result.empty_dimensions_dataset_types,
        ) = result.grouped_by_dimensions.pop(builder.universe.empty)
        result.overall_inputs = {
            name: node  # type: ignore
            for name, node in result.subgraph.iter_overall_inputs()
            if not node.is_prerequisite  # type: ignore
        }
        dimension_names: set[str] = set()
        for dimensions_for_group in result.grouped_by_dimensions.keys():
            dimension_names.update(dimensions_for_group.names)
        dimensions = builder.universe.extract(dimension_names)
        builder.log.debug("Building query for data IDs.")
        result.query_args = {
            "dimensions": dimensions,
            "where": builder.where,
            "dataId": result.subgraph.data_id,
            "bind": builder.bind,
        }
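        # As a hypothetical illustration, at this point query_args might be
        # {"dimensions": {instrument, visit, detector, tract, patch, band},
        #  "where": "instrument = 'HSC'", "dataId": {}, "bind": None}; the
        # branches below may add "datasets" and "collections" constraints.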

        if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
            builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
            result.query_args["datasets"] = {
                name
                for name, dataset_type_node in result.overall_inputs.items()
                if (
                    dataset_type_node.is_initial_query_constraint
                    and name not in result.empty_dimensions_dataset_types
                )
            }
            result.query_args["collections"] = builder.input_collections
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
            builder.log.debug("Not using dataset existence to constrain query.")
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
            constraint = set(builder.dataset_query_constraint)

            inputs = result.overall_inputs.keys() - result.empty_dimensions_dataset_types.keys()

            if remainder := constraint.difference(inputs):
                raise QuantumGraphBuilderError(
                    f"{remainder} dataset type(s) specified as a graph constraint, but"
                    f" do not appear as an overall input to the specified pipeline: {inputs}."
                    " Note that component datasets are not permitted as constraints."
                )
            builder.log.debug(f"Constraining graph query using {constraint}")
            result.query_args["datasets"] = constraint
            result.query_args["collections"] = builder.input_collections
        else:
            raise QuantumGraphBuilderError(
                f"Unable to handle type {builder.dataset_query_constraint} "
                "given as datasetQueryConstraint."
            )

        builder.log.verbose("Querying for data IDs with arguments:")
        builder.log.verbose(" dimensions=%s,", list(result.query_args["dimensions"].names))
        builder.log.verbose(" dataId=%s,", result.query_args["dataId"].byName())
        if result.query_args["where"]:
            builder.log.verbose(" where=%s,", repr(result.query_args["where"]))
        if "datasets" in result.query_args:
            builder.log.verbose(" datasets=%s,", list(result.query_args["datasets"]))
        if "collections" in result.query_args:
            builder.log.verbose(" collections=%s,", list(result.query_args["collections"]))
        with builder.butler.registry.queryDataIds(**result.query_args).materialize() as common_data_ids:
            builder.log.debug("Expanding data IDs.")
            result.common_data_ids = common_data_ids.expanded()
            yield result

    def log_failure(self, log: LsstLogAdapter) -> None:
        """Emit a series of CRITICAL-level log messages that attempt to
        explain why the initial data ID query returned no rows.

        Parameters
        ----------
        log : `lsst.utils.logging.LsstLogAdapter`
            Logger to send messages to.
        """

        log.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
        for message in self.common_data_ids.explain_no_results():
            log.critical(message)
        log.critical(
            "To reproduce this query for debugging purposes, run "
            "Registry.queryDataIds with these arguments:"
        )

        # We could just repr() the query_args dict to get something the user
        # could make sense of, but it's friendlier to put these args in an
        # easier-to-reconstruct equivalent form so they can read it more
        # easily and copy and paste it into a Python terminal.

        log.critical(" dimensions=%s,", list(self.query_args["dimensions"].names))
        log.critical(" dataId=%s,", self.query_args["dataId"].byName())
        if self.query_args["where"]:
            log.critical(" where=%s,", repr(self.query_args["where"]))
        if "datasets" in self.query_args:
            log.critical(" datasets=%s,", list(self.query_args["datasets"]))
        if "collections" in self.query_args:
            log.critical(" collections=%s,", list(self.query_args["collections"]))