Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 9%

148 statements  

coverage.py v7.3.2, created at 2023-10-11 09:32 +0000

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
from collections import defaultdict
from collections.abc import Callable, Iterable, Mapping

from lsst.daf.butler import Butler, Config, DatasetRef, DatasetType, Registry
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
from lsst.daf.butler.repo_relocation import BUTLER_ROOT_TAG
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph

DataSetTypeRefMap = Mapping[DatasetType, set[DatasetRef]]

def _validate_dataset_type(
    candidate: DatasetType, previous: dict[str | DatasetType, DatasetType], registry: Registry
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [ `str` | `~lsst.daf.butler.DatasetType`, \
            `~lsst.daf.butler.DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.
    registry : `lsst.daf.butler.Registry`
        Main registry whose dataset type registration should override the
        given one if it exists.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """

    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # We haven't seen this dataset type in this graph before, but it may
    # already be in the registry.
    try:
        registryDsType = registry.getDatasetType(name)
        previous[candidate] = registryDsType
        return registryDsType
    except MissingDatasetTypeError:
        pass
    # Dataset type is totally new. Store it by name and by dataset type so
    # it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate
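# Illustrative sketch (not part of the original module): the "index by name
# and by object" caching idea used by _validate_dataset_type, reduced to plain
# Python so it runs without a butler. A dataset type is modelled here as a
# hypothetical ``(name, storage_class)`` tuple; compatibility checks are
# omitted for brevity.
def _vet_example(candidate: tuple[str, str], previous: dict) -> tuple[str, str]:
    name = candidate[0]
    # Exact variant already vetted: return the cached answer immediately.
    if (checked := previous.get(candidate)) is not None:
        return checked
    # Same name seen before: map this variant to the first one encountered.
    if (first := previous.get(name)) is not None:
        previous[candidate] = first
        return first
    # Brand new name: record it under both keys so either lookup succeeds.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate


# Example: the second storage-class variant of "calexp" resolves to the first.
# _cache: dict = {}
# _vet_example(("calexp", "ExposureF"), _cache)  -> ("calexp", "ExposureF")
# _vet_example(("calexp", "Exposure"), _cache)   -> ("calexp", "ExposureF")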

def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
) -> tuple[set[DatasetRef], DataSetTypeRefMap]:
    # accumulate the DatasetRefs that will be transferred to the execution
    # registry

    # exports holds all the existing data that will be migrated to the
    # execution butler
    exports: set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph
    inserts: DataSetTypeRefMap = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but
    # we must ensure that only a single dataset type definition is
    # accumulated in the loop below. This data structure caches every dataset
    # type encountered and stores the compatible alternative.
    datasetTypes: dict[str | DatasetType, DatasetType] = {}

    # Find the initOutput refs.
    initOutputRefs = list(graph.globalInitOutputRefs())
    for task_def in graph.iterTaskGraph():
        task_refs = graph.initOutputRefs(task_def)
        if task_refs:
            initOutputRefs.extend(task_refs)

    for ref in initOutputRefs:
        dataset_type = ref.datasetType
        if dataset_type.component() is not None:
            dataset_type = dataset_type.makeCompositeDatasetType()
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes, butler.registry)
        inserts[dataset_type].add(ref)

    # Output references may be resolved even if they do not exist. Find all
    # actually existing refs.
    check_refs: set[DatasetRef] = set()
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, DatasetRef | list[DatasetRef]] = getattr(quantum, attrName)
            for refs in attr.values():
                # This if block exists because initInputs items have a
                # different signature (a single ref rather than a list).
                if not isinstance(refs, list | tuple):
                    refs = [refs]
                for ref in refs:
                    if ref.isComponent():
                        ref = ref.makeCompositeRef()
                    check_refs.add(ref)
    exist_map = butler._exists_many(check_refs, full_check=False)
    existing_ids = {ref.id for ref, exists in exist_map.items() if exists}
    del exist_map

    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list | tuple):
                    refs = [refs]
                if type.component() is not None:
                    type = type.makeCompositeDatasetType()
                type = _validate_dataset_type(type, datasetTypes, butler.registry)
                # Iterate over all the references; if one already exists it
                # should be exported, otherwise it should be inserted into
                # the new registry.
                for ref in refs:
                    # Component dataset ID is the same as its parent ID, so
                    # checking component in existing_ids works OK.
                    if ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        # Make sure we export this with the registry's dataset
                        # type, since transfer_from doesn't handle storage
                        # class differences (maybe it should, but it's not
                        # bad to be defensive here even if that changes).
                        if type != ref.datasetType:
                            ref = ref.overrideStorageClass(type.storageClass)
                            assert ref.datasetType == type, "Dataset types should not differ in other ways."
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here
                            continue
                        inserts[type].add(ref)

    return exports, inserts
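# Illustrative sketch (not part of the original module): the exports/inserts
# split performed by _accumulate, reduced to plain Python. ``ref_exists`` is a
# hypothetical stand-in for the answer returned by Butler._exists_many.
def _example_split(refs: list[str], ref_exists: dict[str, bool]) -> tuple[set[str], set[str]]:
    exports: set[str] = set()  # datasets that already exist and will be transferred
    inserts: set[str] = set()  # datasets the graph is expected to produce
    for ref in refs:
        (exports if ref_exists.get(ref, False) else inserts).add(ref)
    return exports, inserts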

def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
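# Illustrative sketch (not part of the original module): the fixed-point loop
# used by _discoverCollections, with a plain dict (``chains``, hypothetical)
# standing in for the registry's chained-collection lookup.
def _example_discover(collections: set[str], chains: dict[str, set[str]]) -> set[str]:
    while True:
        discovered = set(collections)
        for name in collections:
            discovered |= chains.get(name, set())  # follow one level of chaining
        if len(discovered) == len(collections):  # nothing new found: fixed point
            return discovered
        collections = discovered


# Example: a chain that points at another chain is fully flattened.
# _example_discover({"top"}, {"top": {"mid"}, "mid": {"runA", "runB"}})
# -> {"top", "mid", "runA", "runB"}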

def _export(butler: Butler, collections: Iterable[str] | None, inserts: DataSetTypeRefMap) -> io.StringIO:
    # This exports relevant dimension records and collections using daf butler
    # objects; however, it reaches in deep and does not use the public methods
    # so that it can export to a string buffer and skip disk access. This
    # does not export the datasets themselves, since we use transfer_from for
    # that.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.dimensions)
    exporter = RepoExportContext(butler._registry, butler._datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, refs in inserts.items():
        exporter.saveDataIds([ref.dataId for ref in refs])

    # Look for any defined collections; if none were given, use the defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer
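# Illustrative sketch (not part of the original module): the export-to-buffer
# and rewind pattern used by _export, with plain text lines standing in for
# the YAML payload produced by the real backend.
def _example_in_memory_export(records: list[str]) -> io.StringIO:
    buffer = io.StringIO()
    for record in records:
        buffer.write(record + "\n")  # "export" each record into memory
    buffer.seek(0)  # rewind so a subsequent read sees the data from the start
    return buffer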

def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: ResourcePath | None = None,
) -> Butler:
    """Set up the execution butler.

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `~lsst.resources.ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Does the ``outputLocation`` exist, and if so, should it be clobbered?
    datastoreRoot : `~lsst.resources.ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # the directory exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in main registry.
    config.pop(("registry", "namespace"), None)

    # Obscore manager cannot be used with execution butler.
    config.pop(("registry", "managers", "obscore"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler
    return Butler(config, writeable=True)
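# Illustrative summary (not part of the original module): the configuration
# keys that _setupNewButler overrides relative to the original butler config,
# gathered as a plain mapping for readability. Values shown as "..." are
# computed at runtime from the output location or the original config, and
# the registry namespace and obscore manager entries are additionally removed.
_EXAMPLE_EXECUTION_BUTLER_OVERRIDES: dict[tuple[str, ...], object] = {
    ("root",): "...",  # URL of outputLocation
    ("registry", "db"): "sqlite:///<butlerRoot>/gen3.sqlite3",
    ("datastore", "root"): "...",  # datastoreRoot, or the original config directory
    ("datastore", "trust_get_request"): True,
}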

def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeRefMap,
    run: str | None,
    butlerModifier: Callable[[Butler], Butler] | None,
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, refs in inserts.items():
        # Storage class differences should have already been resolved by calls
        # to _validate_dataset_type in _export, resulting in the Registry
        # dataset type whenever that exists.
        newButler.registry.registerDatasetType(dsType)
        newButler.registry._importDatasets(refs)

    return newButler

def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: str | None,
    *,
    clobber: bool = False,
    butlerModifier: Callable[[Butler], Butler] | None = None,
    collections: Iterable[str] | None = None,
    datastoreRoot: ResourcePathExpression | None = None,
    transfer: str = "auto",
) -> Butler:
    r"""Create an execution butler.

    This function is responsible for exporting input `QuantumGraph`\s into a
    new minimal `~lsst.daf.butler.Butler` which only contains datasets
    specified by the `QuantumGraph`.

    These datasets are both those that already exist in the input
    `~lsst.daf.butler.Butler`, and those that are expected to be produced
    during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any
        `QuantumGraph`\s that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `~lsst.resources.ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.resources.ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this method could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `~lsst.resources.ResourcePath`, optional
        Root directory for datastore of execution butler. If `None`, then the
        original butler's datastore will be used.
    transfer : `str`, optional
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means to ``"copy"`` if the ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and clobber is `False`.
    NotADirectoryError
        Raised if specified output URI does not correspond to a directory.
    """

    # Now require that if run is given it must match the graph run.
    if run and graph.metadata and run != (graph_run := graph.metadata.get("output_run")):
        raise ValueError(f"The given run, {run!r}, does not match that specified in the graph, {graph_run!r}")

    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    exports, inserts = _accumulate(butler, graph)
    yamlBuffer = _export(butler, collections, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler
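# Illustrative usage sketch (not part of the original module): how
# buildExecutionButler might be invoked. The repository path, graph file,
# output location, and run name are all hypothetical, and the snippet is kept
# commented out because it performs real repository I/O.
#
# from lsst.daf.butler import Butler
# from lsst.pipe.base import QuantumGraph
# from lsst.pipe.base.executionButlerBuilder import buildExecutionButler
#
# butler = Butler("/repo/main", writeable=False)
# qgraph = QuantumGraph.loadUri("pipeline.qgraph")
# execution_butler = buildExecutionButler(
#     butler,
#     qgraph,
#     outputLocation="/scratch/execution_butler",
#     run="u/someuser/demo_run",
#     clobber=True,
# )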