Coverage for python/lsst/pipe/base/all_dimensions_quantum_graph_builder.py: 18%

197 statements

coverage.py v7.4.4, created at 2024-04-10 03:25 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""The standard, general-purpose implementation of the QuantumGraph-generation
algorithm.
"""

from __future__ import annotations

__all__ = ("AllDimensionsQuantumGraphBuilder", "DatasetQueryConstraintVariant")

import dataclasses
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, final

from lsst.daf.butler.registry import MissingDatasetTypeError
from lsst.utils.timer import timeMethod

from ._datasetQueryConstraints import DatasetQueryConstraintVariant
from .quantum_graph_builder import (
    DatasetKey,
    PrerequisiteDatasetKey,
    QuantumGraphBuilder,
    QuantumGraphBuilderError,
    QuantumGraphSkeleton,
    QuantumKey,
)

if TYPE_CHECKING:
    from lsst.daf.butler import Butler, DimensionGroup
    from lsst.daf.butler.registry.queries import DataCoordinateQueryResults
    from lsst.utils.logging import LsstLogAdapter

    from .pipeline_graph import DatasetTypeNode, PipelineGraph, TaskNode


@final
class AllDimensionsQuantumGraphBuilder(QuantumGraphBuilder):
    """An implementation of `QuantumGraphBuilder` that uses a single large
    query for data IDs covering all dimensions in the pipeline.

    Parameters
    ----------
    pipeline_graph : `.pipeline_graph.PipelineGraph`
        Pipeline to build a `QuantumGraph` from, as a graph. Will be resolved
        in-place with the given butler (any existing resolution is ignored).
    butler : `lsst.daf.butler.Butler`
        Client for the data repository. Should be read-only.
    where : `str`, optional
        Butler expression language constraint to apply to all data IDs.
    dataset_query_constraint : `DatasetQueryConstraintVariant`, optional
        Specification of which overall-input datasets should be used to
        constrain the initial data ID queries. Not including an important
        constraint can result in catastrophically large query results that take
        too long to process, while including too many makes the query much more
        complex, increasing the chances that the database will choose a bad
        (sometimes catastrophically bad) query plan.
    bind : `~collections.abc.Mapping`, optional
        Variable substitutions for the ``where`` expression.
    **kwargs
        Additional keyword arguments forwarded to `QuantumGraphBuilder`.

    Notes
    -----
    This is a general-purpose algorithm that delegates the problem of
    determining which "end" of the pipeline is more constrained (beginning by
    input collection contents vs. end by the ``where`` string) to the database
    query planner, which *usually* does a good job.

    This algorithm suffers from a serious limitation, which we refer to as the
    "tract slicing" problem from its most common variant: the ``where`` string
    and general data ID intersection rules apply to *all* data IDs in the
    graph. For example, if a ``tract`` constraint is present in the ``where``
    string or an overall-input dataset, then it is impossible for any data ID
    that does not overlap that tract to be present anywhere in the pipeline,
    such as a ``{visit, detector}`` combination whose ``detector`` region does
    not overlap the ``tract``, even though the ``visit`` does.
    """

    def __init__(
        self,
        pipeline_graph: PipelineGraph,
        butler: Butler,
        *,
        where: str = "",
        dataset_query_constraint: DatasetQueryConstraintVariant = DatasetQueryConstraintVariant.ALL,
        bind: Mapping[str, Any] | None = None,
        **kwargs: Any,
    ):
        super().__init__(pipeline_graph, butler, **kwargs)
        self.where = where
        self.dataset_query_constraint = dataset_query_constraint
        self.bind = bind

    @timeMethod
    def process_subgraph(self, subgraph: PipelineGraph) -> QuantumGraphSkeleton:
        # Docstring inherited.
        # There is some chance that the dimension query for one subgraph would
        # be the same as or a dimension-subset of another. This is an
        # optimization opportunity we're not currently taking advantage of.
        with _AllDimensionsQuery.from_builder(self, subgraph) as query:
            skeleton = self._make_subgraph_skeleton(query)
            self._find_followup_datasets(query, skeleton)
        return skeleton

    @timeMethod
    def _make_subgraph_skeleton(self, query: _AllDimensionsQuery) -> QuantumGraphSkeleton:
        """Build a `QuantumGraphSkeleton` by iterating over the result rows
        of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.

        Returns
        -------
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph.
        """
        # First we make containers of empty-dimensions quantum and dataset
        # keys, and add those to the skeleton, since empty data IDs are
        # logically subsets of any data ID. We'll copy those to initialize the
        # containers of keys for each result row. We don't ever explicitly add
        # nodes to the skeleton for these, and that's okay because networkx
        # adds nodes implicitly when an edge to that node is added, and we
        # don't want to add nodes for init datasets here.
        skeleton = QuantumGraphSkeleton(query.subgraph.tasks)
        empty_dimensions_dataset_keys = {}
        for dataset_type_name in query.empty_dimensions_dataset_types.keys():
            empty_dimensions_dataset_keys[dataset_type_name] = skeleton.add_dataset_node(
                dataset_type_name, self.empty_data_id
            )
        empty_dimensions_quantum_keys = []
        for task_label in query.empty_dimensions_tasks.keys():
            empty_dimensions_quantum_keys.append(skeleton.add_quantum_node(task_label, self.empty_data_id))
        self.log.info("Iterating over query results to associate quanta with datasets.")
        # Iterate over query results, populating data IDs for datasets and
        # quanta and then connecting them to each other. This is the slowest
        # client-side part of QG generation, and it's often the slowest part
        # overall, so inside this loop is where it's really critical to avoid
        # expensive things, especially in the nested loops.
        n_rows = 0
        for common_data_id in query.common_data_ids:
            # Create a data ID for each set of dimensions used by one or more
            # tasks or dataset types, and use that to record all quanta and
            # dataset data IDs for this row.
            dataset_keys_for_row: dict[str, DatasetKey] = empty_dimensions_dataset_keys.copy()
            quantum_keys_for_row: list[QuantumKey] = empty_dimensions_quantum_keys.copy()
            for dimensions, (task_nodes, dataset_type_nodes) in query.grouped_by_dimensions.items():
                data_id = common_data_id.subset(dimensions)
                for dataset_type_name in dataset_type_nodes.keys():
                    dataset_keys_for_row[dataset_type_name] = skeleton.add_dataset_node(
                        dataset_type_name, data_id
                    )
                for task_label in task_nodes.keys():
                    quantum_keys_for_row.append(skeleton.add_quantum_node(task_label, data_id))
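
            # Illustrative example (editor's note, not part of the original
            # source; the values below are made up): a single query row such as
            #
            #   {instrument: 'HSC', visit: 903342, detector: 10,
            #    skymap: 'hsc_rings_v1', tract: 9813, patch: 42}
            #
            # is subset to {instrument, visit, detector} for a per-detector
            # task's quantum, to {skymap, tract, patch} for a coadd-level
            # dataset type, and the empty-dimensions keys copied in above cover
            # dataset types and tasks with no dimensions at all.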

            # Whether these quanta are new or existing, we can now associate
            # the dataset data IDs for this row with them. The fact that a
            # quantum data ID and a dataset data ID both came from the same
            # result row is what tells us they should be associated. Many of
            # these associations will be duplicates (because another query row
            # that differed from this one only in irrelevant dimensions already
            # added them), and our use of sets should take care of that.
            for quantum_key in quantum_keys_for_row:
                for read_edge in self._pipeline_graph.tasks[quantum_key.task_label].inputs.values():
                    skeleton.add_input_edge(
                        quantum_key, dataset_keys_for_row[read_edge.parent_dataset_type_name]
                    )
                for write_edge in self._pipeline_graph.tasks[quantum_key.task_label].iter_all_outputs():
                    skeleton.add_output_edge(
                        quantum_key, dataset_keys_for_row[write_edge.parent_dataset_type_name]
                    )
            n_rows += 1
        if n_rows == 0:
            query.log_failure(self.log)
        else:
            n_quanta = sum(len(skeleton.get_quanta(task_label)) for task_label in query.subgraph.tasks)
            self.log.info(
                "Initial bipartite graph has %d quanta, %d dataset nodes, and %d edges from %d query row(s).",
                n_quanta,
                skeleton.n_nodes - n_quanta,
                skeleton.n_edges,
                n_rows,
            )
        return skeleton

    @timeMethod
    def _find_followup_datasets(self, query: _AllDimensionsQuery, skeleton: QuantumGraphSkeleton) -> None:
        """Populate `existing_datasets` by performing follow-up queries joined
        to column-subsets of the initial data ID query.

        Parameters
        ----------
        query : `_AllDimensionsQuery`
            Object representing the full-pipeline data ID query.
        skeleton : `QuantumGraphSkeleton`
            Preliminary quantum graph, updated in place with prerequisite-input
            edges.
        """
        for dimensions, (tasks_in_group, dataset_types_in_group) in query.grouped_by_dimensions.items():
            data_ids = query.common_data_ids.subset(dimensions, unique=True)
            # Iterate over regular input/output dataset type nodes with these
            # dimensions to find those datasets using straightforward followup
            # queries.
            for dataset_type_node in dataset_types_in_group.values():
                if dataset_type_node.name in query.overall_inputs:
                    # Dataset type is an overall input; we always need to try
                    # to find these.
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.input_collections):
                            self.existing_datasets.inputs[
                                DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d overall-input dataset(s) of type %r.", count, dataset_type_node.name
                    )
                    continue
                if self.skip_existing_in:
                    # Dataset type is an intermediate or output; we only need
                    # to find these if they're from previously executed quanta
                    # that we might skip...
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, self.skip_existing_in):
                            key = DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            self.existing_datasets.outputs_for_skip[key] = ref
                            count += 1
                            if ref.run == self.output_run:
                                self.existing_datasets.outputs_in_the_way[key] = ref
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.skip_existing_in,
                    )
                if self.output_run_exists and not self.skip_existing_starts_with_output_run:
                    # ...or if they're in the way and would need to be
                    # clobbered (and we haven't already found them in the
                    # previous block).
                    count = 0
                    try:
                        for ref in data_ids.findDatasets(dataset_type_node.name, [self.output_run]):
                            self.existing_datasets.outputs_in_the_way[
                                DatasetKey(dataset_type_node.name, ref.dataId.required_values)
                            ] = ref
                            count += 1
                    except MissingDatasetTypeError:
                        pass
                    self.log.verbose(
                        "Found %d output dataset(s) of type %r in %s.",
                        count,
                        dataset_type_node.name,
                        self.output_run,
                    )
            del dataset_type_node
            # Iterate over tasks with these dimensions to perform follow-up
            # queries for prerequisite inputs, which may have dimensions that
            # were not in ``common_data_ids`` and/or require temporal joins to
            # calibration validity ranges.
            for task_node in tasks_in_group.values():
                task_prerequisite_info = self.prerequisite_info[task_node.label]
                for connection_name, finder in list(task_prerequisite_info.finders.items()):
                    if finder.lookup_function is not None:
                        self.log.verbose(
                            "Deferring prerequisite input %r of task %r to per-quantum processing "
                            "(lookup function provided).",
                            finder.dataset_type_node.name,
                            task_node.label,
                        )
                        continue
                    # We also fall back to the base class if there is a
                    # nontrivial spatial or temporal join in the lookup.
                    if finder.dataset_skypix or finder.dataset_other_spatial:
                        if task_prerequisite_info.bounds.spatial_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for spatial-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.spatial:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has spatial data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    if finder.dataset_has_timespan:
                        if task_prerequisite_info.bounds.temporal_connections:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(for temporal-bounds-connections handling).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                        if not task_node.dimensions.temporal:
                            self.log.verbose(
                                "Deferring prerequisite input %r of task %r to per-quantum processing "
                                "(dataset has temporal data IDs, but task does not).",
                                finder.dataset_type_node.name,
                                task_node.label,
                            )
                            continue
                    # We have a simple case where we can do a single query
                    # that joins the query we already have for the task data
                    # IDs to the datasets we're looking for.
                    count = 0
                    try:
                        query_results = data_ids.findRelatedDatasets(
                            finder.dataset_type_node.dataset_type, self.input_collections
                        )
                    except MissingDatasetTypeError:
                        query_results = []
                    for data_id, ref in query_results:
                        dataset_key = PrerequisiteDatasetKey(finder.dataset_type_node.name, ref.id.bytes)
                        quantum_key = QuantumKey(task_node.label, data_id.required_values)
                        # The column-subset operation used to make `data_ids`
                        # from `common_data_ids` can strip away post-query
                        # filtering; e.g. if we start with a {visit, patch}
                        # query but subset down to just {visit}, we can't keep
                        # the patch.region column we need for that filtering.
                        # This means we can get some data IDs that weren't in
                        # the original query (e.g. visits that don't overlap
                        # the same patch, but do overlap some common skypix
                        # ID). We don't want to add quanta with those data IDs
                        # here, which is why we pass
                        # ignore_unrecognized_quanta=True.
                        if skeleton.add_input_edge(quantum_key, dataset_key, ignore_unrecognized_quanta=True):
                            self.existing_datasets.inputs[dataset_key] = ref
                            count += 1
                    # Remove this finder from the mapping so the base class
                    # knows it doesn't have to look for these prerequisites.
                    del task_prerequisite_info.finders[connection_name]
                    self.log.verbose(
                        "Added %d prerequisite input edge(s) from dataset type %r to task %r.",
                        count,
                        finder.dataset_type_node.name,
                        task_node.label,
                    )


@dataclasses.dataclass(eq=False, repr=False)
class _AllDimensionsQuery:
    """A helper class for `AllDimensionsQuantumGraphBuilder` that holds all
    per-subgraph state.

    This object should always be constructed by `from_builder`, which returns
    an instance wrapped with a context manager. This controls the lifetime of
    the temporary table referenced by `common_data_ids`.
    """

    subgraph: PipelineGraph
    """Graph of this subset of the pipeline."""

    grouped_by_dimensions: dict[DimensionGroup, tuple[dict[str, TaskNode], dict[str, DatasetTypeNode]]] = (
        dataclasses.field(default_factory=dict)
    )
    """The tasks and dataset types of this subset of the pipeline, grouped
    by their dimensions.

    The tasks and dataset types with empty dimensions are not included; they're
    in other attributes since they are usually used differently. Prerequisite
    dataset types are also not included.
    """

    empty_dimensions_tasks: dict[str, TaskNode] = dataclasses.field(default_factory=dict)
    """The tasks of this subset of the pipeline that have empty dimensions."""

    empty_dimensions_dataset_types: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """The dataset types of this subset of the pipeline that have empty
    dimensions.

    Prerequisite dataset types are not included.
    """

    overall_inputs: dict[str, DatasetTypeNode] = dataclasses.field(default_factory=dict)
    """Pipeline graph nodes for all non-prerequisite, non-init overall-input
    dataset types for this subset of the pipeline.
    """

    query_args: dict[str, Any] = dataclasses.field(default_factory=dict)
    """All keyword arguments passed to `lsst.daf.butler.Registry.queryDataIds`.
    """

    common_data_ids: DataCoordinateQueryResults = dataclasses.field(init=False)
    """Results of the materialized initial data ID query."""

    @classmethod
    @contextmanager
    def from_builder(
        cls, builder: AllDimensionsQuantumGraphBuilder, subgraph: PipelineGraph
    ) -> Iterator[_AllDimensionsQuery]:
        """Construct and run the query, returning an instance guarded by
        a context manager.

        Parameters
        ----------
        builder : `AllDimensionsQuantumGraphBuilder`
            Builder object this helper is associated with.
        subgraph : `pipeline_graph.PipelineGraph`
            Subset of the pipeline being processed.

        Returns
        -------
        context : `AbstractContextManager` [ `_AllDimensionsQuery` ]
            An instance of this class, inside a context manager that manages
            the lifetime of its temporary database table.
        """
        result = cls(subgraph)
        builder.log.debug("Analyzing subgraph dimensions and overall-inputs.")
        result.grouped_by_dimensions = result.subgraph.group_by_dimensions()
        (
            result.empty_dimensions_tasks,
            result.empty_dimensions_dataset_types,
        ) = result.grouped_by_dimensions.pop(builder.universe.empty.as_group())
        result.overall_inputs = {
            name: node  # type: ignore
            for name, node in result.subgraph.iter_overall_inputs()
            if not node.is_prerequisite  # type: ignore
        }
        dimension_names: set[str] = set()
        for dimensions_for_group in result.grouped_by_dimensions.keys():
            dimension_names.update(dimensions_for_group.names)
        dimensions = builder.universe.conform(dimension_names)
        builder.log.debug("Building query for data IDs.")
        result.query_args = {
            "dimensions": dimensions,
            "where": builder.where,
            "dataId": result.subgraph.data_id,
            "bind": builder.bind,
        }
        if builder.dataset_query_constraint == DatasetQueryConstraintVariant.ALL:
            builder.log.debug("Constraining graph query using all datasets not marked as deferred.")
            result.query_args["datasets"] = {
                name
                for name, dataset_type_node in result.overall_inputs.items()
                if (
                    dataset_type_node.is_initial_query_constraint
                    and name not in result.empty_dimensions_dataset_types
                )
            }
            result.query_args["collections"] = builder.input_collections
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.OFF:
            builder.log.debug("Not using dataset existence to constrain query.")
        elif builder.dataset_query_constraint == DatasetQueryConstraintVariant.LIST:
            constraint = set(builder.dataset_query_constraint)
            inputs = result.overall_inputs.keys() - result.empty_dimensions_dataset_types.keys()
            if remainder := constraint.difference(inputs):
                raise QuantumGraphBuilderError(
                    f"{remainder} dataset type(s) specified as a graph constraint, but"
                    f" do not appear as an overall input to the specified pipeline: {inputs}."
                    " Note that component datasets are not permitted as constraints."
                )
            builder.log.debug(f"Constraining graph query using {constraint}")
            result.query_args["datasets"] = constraint
            result.query_args["collections"] = builder.input_collections
        else:
            raise QuantumGraphBuilderError(
                f"Unable to handle type {builder.dataset_query_constraint} "
                "given as datasetQueryConstraint."
            )
        builder.log.verbose("Querying for data IDs with arguments:")
        builder.log.verbose(" dimensions=%s,", list(result.query_args["dimensions"].names))
        builder.log.verbose(" dataId=%s,", dict(result.query_args["dataId"].required))
        if result.query_args["where"]:
            builder.log.verbose(" where=%s,", repr(result.query_args["where"]))
        if "datasets" in result.query_args:
            builder.log.verbose(" datasets=%s,", list(result.query_args["datasets"]))
        if "collections" in result.query_args:
            builder.log.verbose(" collections=%s,", list(result.query_args["collections"]))
        with builder.butler.registry.caching_context():
            with builder.butler.registry.queryDataIds(**result.query_args).materialize() as common_data_ids:
                builder.log.debug("Expanding data IDs.")
                result.common_data_ids = common_data_ids.expanded()
                yield result

    def log_failure(self, log: LsstLogAdapter) -> None:
        """Emit a series of CRITICAL-level log messages that attempt to explain
        why the initial data ID query returned no rows.

        Parameters
        ----------
        log : `lsst.utils.logging.LsstLogAdapter`
            The logger to use to emit log messages.
        """
        log.critical("Initial data ID query returned no rows, so QuantumGraph will be empty.")
        for message in self.common_data_ids.explain_no_results():
            log.critical(message)
        log.critical(
            "To reproduce this query for debugging purposes, run "
            "Registry.queryDataIds with these arguments:"
        )
        # We could just repr() the query_args dict to get something
        # the user could make sense of, but it's friendlier to
        # put these args in an easier-to-reconstruct equivalent form
        # so they can read it more easily and copy and paste into
        # a Python terminal.
        log.critical(" dimensions=%s,", list(self.query_args["dimensions"].names))
        log.critical(" dataId=%s,", dict(self.query_args["dataId"].required))
        if self.query_args["where"]:
            log.critical(" where=%s,", repr(self.query_args["where"]))
        if "datasets" in self.query_args:
            log.critical(" datasets=%s,", list(self.query_args["datasets"]))
        if "collections" in self.query_args:
            log.critical(" collections=%s,", list(self.query_args["collections"]))
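
# Editor's illustrative note (not part of the original source): the arguments
# logged by `_AllDimensionsQuery.log_failure` map directly onto a registry
# call that reproduces the failing data ID search, roughly like this (the
# dimension, dataset type, and collection names below are placeholders):
#
#   data_ids = butler.registry.queryDataIds(
#       dimensions=["visit", "detector", "tract", "patch"],
#       dataId={"instrument": "HSC", "skymap": "hsc_rings_v1"},
#       where="tract = 9813",
#       datasets={"calexp"},
#       collections=["HSC/defaults"],
#   )
#   print(list(data_ids.explain_no_results()))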