Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 9%

149 statements  

coverage.py v7.4.4, created at 2024-04-17 02:45 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
from collections import defaultdict
from collections.abc import Callable, Iterable, Mapping

from lsst.daf.butler import Butler, Config, DatasetRef, DatasetType, Registry
from lsst.daf.butler.direct_butler import DirectButler
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
from lsst.daf.butler.repo_relocation import BUTLER_ROOT_TAG
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph

DataSetTypeRefMap = Mapping[DatasetType, set[DatasetRef]]


def _validate_dataset_type(
    candidate: DatasetType, previous: dict[str | DatasetType, DatasetType], registry: Registry
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [ `str` | `~lsst.daf.butler.DatasetType`, \
            `~lsst.daf.butler.DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.
    registry : `lsst.daf.butler.Registry`
        Main registry whose dataset type registration should override the
        given one if it exists.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # We haven't seen this dataset type in this graph before, but it may
    # already be in the registry.
    try:
        registryDsType = registry.getDatasetType(name)
        previous[candidate] = registryDsType
        return registryDsType
    except MissingDatasetTypeError:
        pass
    # Dataset type is totally new. Store it by name and by dataset type so
    # it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate
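
# Illustrative sketch of the caching behaviour above (``variant_a`` and
# ``variant_b`` are hypothetical dataset types sharing a name but differing
# only by a convertible storage class; shown as a comment so the module
# itself is unchanged):
#
#     previous: dict[str | DatasetType, DatasetType] = {}
#     first = _validate_dataset_type(variant_a, previous, butler.registry)
#     second = _validate_dataset_type(variant_b, previous, butler.registry)
#     assert second == first  # later compatible variants resolve to the first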


def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
) -> tuple[set[DatasetRef], DataSetTypeRefMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to DatasetRefs for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DataSetTypeRefMap = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but we
    # must ensure that only a single dataset type definition is accumulated
    # in the loop below. This data structure caches every dataset type
    # encountered and stores the compatible alternative.
    datasetTypes: dict[str | DatasetType, DatasetType] = {}

    # Find the initOutput refs.
    initOutputRefs = list(graph.globalInitOutputRefs())
    for task_def in graph.iterTaskGraph():
        task_refs = graph.initOutputRefs(task_def)
        if task_refs:
            initOutputRefs.extend(task_refs)

    for ref in initOutputRefs:
        dataset_type = ref.datasetType
        if dataset_type.component() is not None:
            dataset_type = dataset_type.makeCompositeDatasetType()
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes, butler.registry)
        inserts[dataset_type].add(ref)

    # Output references may be resolved even if they do not exist. Find all
    # actually existing refs.
    check_refs: set[DatasetRef] = set()
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, DatasetRef | list[DatasetRef]] = getattr(quantum, attrName)
            for refs in attr.values():
                # initInputs maps each dataset type to a single ref rather
                # than a list, so normalize before iterating.
                if not isinstance(refs, list | tuple):
                    refs = [refs]
                for ref in refs:
                    if ref.isComponent():
                        ref = ref.makeCompositeRef()
                    check_refs.add(ref)
    exist_map = butler._exists_many(check_refs, full_check=False)
    existing_ids = {ref.id for ref, exists in exist_map.items() if exists}
    del exist_map

    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list | tuple):
                    refs = [refs]
                if type.component() is not None:
                    type = type.makeCompositeDatasetType()
                type = _validate_dataset_type(type, datasetTypes, butler.registry)
                # Iterate over all the references: if one already exists it
                # should be exported; if not it should be inserted into the
                # new registry.
                for ref in refs:
                    # Component dataset ID is the same as its parent ID, so
                    # checking component in existing_ids works OK.
                    if ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        # Make sure we export this with the registry's dataset
                        # type, since transfer_from doesn't handle storage
                        # class differences (maybe it should, but it's not
                        # bad to be defensive here even if that changes).
                        if type != ref.datasetType:
                            ref = ref.overrideStorageClass(type.storageClass)
                            assert ref.datasetType == type, "Dataset types should not differ in other ways."
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here.
                            continue
                        inserts[type].add(ref)

    return exports, inserts
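
# Illustrative sketch of how the two return values are consumed (this mirrors
# buildExecutionButler below; nothing new is introduced here):
#
#     exports, inserts = _accumulate(butler, graph)
#     # "exports" (datasets that already exist) are handed to transfer_from();
#     # "inserts" (predicted outputs) are registered and imported into the
#     # new registry by _import().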


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method could be removed in the future if collection export logic
    # changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
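
# For illustration (collection names below are hypothetical): given a CHAINED
# collection "HSC/defaults" whose chain contains "HSC/raw/all" and "HSC/calib",
# the helper returns the chain plus its members so all of them get exported:
#
#     names = _discoverCollections(butler, {"HSC/defaults"})
#     # names == {"HSC/defaults", "HSC/raw/all", "HSC/calib"}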


def _export(
    butler: DirectButler, collections: Iterable[str] | None, inserts: DataSetTypeRefMap
) -> io.StringIO:
    # This exports relevant dimension records and collections using daf butler
    # objects; however, it reaches in deep and does not use the public methods
    # so that it can export to a string buffer and skip disk access. This
    # does not export the datasets themselves, since we use transfer_from for
    # that.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.dimensions)
    exporter = RepoExportContext(butler._registry, butler._datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, refs in inserts.items():
        exporter.saveDataIds([ref.dataId for ref in refs])

    # Use the given collections if any were supplied; otherwise fall back to
    # the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer


def _setupNewButler(
    butler: DirectButler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: ResourcePath | None = None,
) -> Butler:
    """Set up the execution butler.

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `~lsst.resources.ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Does the ``outputLocation`` exist, and if so, should it be clobbered?
    datastoreRoot : `~lsst.resources.ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far
        # and the location exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # The obscore manager cannot be used with an execution butler.
    config.pop(("registry", "managers", "obscore"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler.from_config(config, writeable=True)


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeRefMap,
    run: str | None,
    butlerModifier: Callable[[Butler], Butler] | None,
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, refs in inserts.items():
        # Storage class differences should have already been resolved by
        # calls to _validate_dataset_type in _accumulate, resulting in the
        # Registry dataset type whenever that exists.
        newButler.registry.registerDatasetType(dsType)
        newButler.registry._importDatasets(refs)

    return newButler


def buildExecutionButler(
    butler: DirectButler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: str | None,
    *,
    clobber: bool = False,
    butlerModifier: Callable[[Butler], Butler] | None = None,
    collections: Iterable[str] | None = None,
    datastoreRoot: ResourcePathExpression | None = None,
    transfer: str = "auto",
) -> Butler:
    r"""Create an execution butler.

    Responsible for exporting
    input `QuantumGraph`\s into a new minimal `~lsst.daf.butler.Butler` which
    only contains datasets specified by the `QuantumGraph`.

    These datasets are both those that already exist in the input
    `~lsst.daf.butler.Butler`, and those that are expected to be produced
    during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from
        which existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any
        `QuantumGraph`\s that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `~lsst.resources.ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.resources.ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
        to be produced. Examples of what this method could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `~lsst.resources.ResourcePath`, optional
        Root directory for the datastore of the execution butler. If `None`,
        then the original butler's datastore will be used.
    transfer : `str`, optional
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means to ``"copy"`` if the ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
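
    Examples
    --------
    A minimal sketch of typical use (the repository path, graph file, and
    output location below are illustrative and not part of this module)::

        from lsst.daf.butler import Butler
        from lsst.pipe.base import QuantumGraph
        from lsst.pipe.base.executionButlerBuilder import buildExecutionButler

        butler = Butler.from_config("/repo/main", writeable=False)
        qgraph = QuantumGraph.loadUri("pipeline.qgraph")
        executionButler = buildExecutionButler(
            butler,
            qgraph,
            outputLocation="/scratch/execution_butler",
            run=None,
            clobber=True,
        )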

    """
    # Now require that if run is given it must match the graph run.
    if run and graph.metadata and run != (graph_run := graph.metadata.get("output_run")):
        raise ValueError(f"The given run, {run!r}, does not match that specified in the graph, {graph_run!r}")

    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    exports, inserts = _accumulate(butler, graph)
    yamlBuffer = _export(butler, collections, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler