Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 9% (148 statements)


# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
from collections import defaultdict
from collections.abc import Callable, Iterable, Mapping

from lsst.daf.butler import Butler, Config, DatasetRef, DatasetType, Registry
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph

DataSetTypeRefMap = Mapping[DatasetType, set[DatasetRef]]



def _validate_dataset_type(
    candidate: DatasetType, previous: dict[str | DatasetType, DatasetType], registry: Registry
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [ `str` | `~lsst.daf.butler.DatasetType`, \
            `~lsst.daf.butler.DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.
    registry : `lsst.daf.butler.Registry`
        Main registry whose dataset type registration should override the
        given one if it exists.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """

    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # We haven't seen this dataset type in this graph before, but it may
    # already be in the registry.
    try:
        registryDsType = registry.getDatasetType(name)
        previous[candidate] = registryDsType
        return registryDsType
    except MissingDatasetTypeError:
        pass
    # Dataset type is totally new. Store it by name and by dataset type so
    # it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate


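# Illustrative sketch (not part of the original module): the double-keyed
# cache contract used by _validate_dataset_type, with plain "name@storageClass"
# strings standing in for DatasetType objects. The real function additionally
# consults the registry and checks storage-class compatibility in both
# directions before collapsing variants.
def _sketch_cache_lookup(candidate: str, previous: dict[str, str]) -> str:
    if (checked := previous.get(candidate)) is not None:
        return checked  # this exact variant was vetted before
    name = candidate.split("@")[0]
    if (prev := previous.get(name)) is not None:
        previous[candidate] = prev  # remember the mapping for next time
        return prev  # always return the first variant seen for this name
    previous[name] = candidate  # brand new: index by name ...
    previous[candidate] = candidate  # ... and by the full variant
    return candidate


# Example behaviour of the sketch: two compatible variants of the same
# dataset type name collapse to whichever was seen first, e.g.
#     cache: dict[str, str] = {}
#     _sketch_cache_lookup("calexp@ExposureF", cache)     -> "calexp@ExposureF"
#     _sketch_cache_lookup("calexp@ArrowAstropy", cache)  -> "calexp@ExposureF"

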

def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
) -> tuple[set[DatasetRef], DataSetTypeRefMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DataSetTypeRefMap = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but we
    # must ensure that only a single dataset type definition is accumulated
    # in the loop below. This data structure caches every dataset type
    # encountered and stores the compatible alternative.
    datasetTypes: dict[str | DatasetType, DatasetType] = {}


    # Find the initOutput refs.
    initOutputRefs = list(graph.globalInitOutputRefs())
    for task_def in graph.iterTaskGraph():
        task_refs = graph.initOutputRefs(task_def)
        if task_refs:
            initOutputRefs.extend(task_refs)

    for ref in initOutputRefs:
        dataset_type = ref.datasetType
        if dataset_type.component() is not None:
            dataset_type = dataset_type.makeCompositeDatasetType()
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes, butler.registry)
        inserts[dataset_type].add(ref)


    # Output references may be resolved even if they do not exist. Find all
    # actually existing refs.
    check_refs: set[DatasetRef] = set()
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, DatasetRef | list[DatasetRef]] = getattr(quantum, attrName)
            for refs in attr.values():
                # This if block is needed because initInputs has a different
                # signature for its items.
                if not isinstance(refs, list | tuple):
                    refs = [refs]
                for ref in refs:
                    if ref.isComponent():
                        ref = ref.makeCompositeRef()
                    check_refs.add(ref)
    exist_map = butler._exists_many(check_refs, full_check=False)
    existing_ids = {ref.id for ref, exists in exist_map.items() if exists}
    del exist_map


    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list | tuple):
                    refs = [refs]
                if type.component() is not None:
                    type = type.makeCompositeDatasetType()
                type = _validate_dataset_type(type, datasetTypes, butler.registry)
                # Iterate over all the references: if a reference already
                # exists it should be exported; if not it should be inserted
                # into the new registry.
                for ref in refs:
                    # Component dataset ID is the same as its parent ID, so
                    # checking component in existing_ids works OK.
                    if ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        # Make sure we export this with the registry's dataset
                        # type, since transfer_from doesn't handle storage
                        # class differences (maybe it should, but it's not
                        # bad to be defensive here even if that changes).
                        if type != ref.datasetType:
                            ref = ref.overrideStorageClass(type.storageClass)
                            assert ref.datasetType == type, "Dataset types should not differ in other ways."
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here.
                            continue
                        inserts[type].add(ref)

    return exports, inserts


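# Illustrative sketch (not part of the original module): the export/insert
# partition performed by _accumulate, with opaque string ids standing in for
# DatasetRefs. Refs that already exist in the source butler are exported
# (and transferred later); everything else is expected to be produced during
# execution and is only registered in the new repository.
def _sketch_partition(refs: set[str], existing_ids: set[str]) -> tuple[set[str], set[str]]:
    exports = {r for r in refs if r in existing_ids}
    inserts = refs - exports
    return exports, inserts

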

def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method might be removable in the future if the collection export logic
    # changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections


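# Illustrative sketch (not part of the original module): the same
# grow-until-stable idea as _discoverCollections, using a plain mapping of
# chain name -> child collections instead of a Registry query.
def _sketch_flatten_chains(seeds: set[str], chains: dict[str, set[str]]) -> set[str]:
    collections = set(seeds)
    while True:
        discovered = collections | {c for name in collections for c in chains.get(name, set())}
        if len(discovered) > len(collections):
            collections = discovered  # new members found; expand again
        else:
            return collections  # fixed point reached

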

def _export(butler: Butler, collections: Iterable[str] | None, inserts: DataSetTypeRefMap) -> io.StringIO:
    # This exports relevant dimension records and collections using daf butler
    # objects; however, it reaches in deep and does not use the public methods
    # so that it can export to a string buffer and skip disk access. This
    # does not export the datasets themselves, since we use transfer_from for
    # that.
    yamlBuffer = io.StringIO()
    # YAML is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.dimensions)
    exporter = RepoExportContext(butler._registry, butler._datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, refs in inserts.items():
        exporter.saveDataIds([ref.dataId for ref in refs])

    # Look for any defined collections; if there are none, use the defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer


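# Illustrative sketch (not part of the original module): the in-memory round
# trip that _export and _import rely on. Writing to an io.StringIO and seeking
# back to the start lets the consumer read the buffer as if it were a file,
# without touching disk. The YAML snippet is a placeholder, not real export
# output.
def _sketch_inmemory_roundtrip() -> str:
    buffer = io.StringIO()
    buffer.write("collections:\n  - example/run\n")  # stand-in for the YAML export
    buffer.seek(0)  # rewind so the reader sees the data from the beginning
    return buffer.read()

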

def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: ResourcePath | None = None,
) -> Butler:
    """Set up the execution butler.

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `~lsst.resources.ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Whether the ``outputLocation`` already exists. If `True`, the caller
        has already approved clobbering its registry.
    datastoreRoot : `~lsst.resources.ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """

    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # the location exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # The obscore manager cannot be used with an execution butler.
    config.pop(("registry", "managers", "obscore"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # We must use the dimension configuration from the original butler
    # rather than the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)


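# Minimal sketch (not part of the original module) of the nested-key override
# pattern used above with lsst.daf.butler.Config; the configuration values are
# illustrative placeholders only.
def _sketch_config_overrides() -> Config:
    config = Config({"registry": {"db": "postgresql+psycopg2://host/db", "namespace": "prod"}})
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"  # retarget the registry
    config.pop(("registry", "namespace"), None)  # drop a setting that must not propagate
    return config

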

def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeRefMap,
    run: str | None,
    butlerModifier: Callable[[Butler], Butler] | None,
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register the datasets to be produced and insert them into the registry.
    for dsType, refs in inserts.items():
        # Storage class differences should have already been resolved by calls
        # to _validate_dataset_type in _accumulate, resulting in the Registry
        # dataset type whenever that exists.
        newButler.registry.registerDatasetType(dsType)
        newButler.registry._importDatasets(refs)

    return newButler


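# Illustrative sketch (not part of the original module): a butlerModifier
# callable that could be passed to buildExecutionButler. It runs after the
# existing datasets have been imported and before the expected outputs are
# inserted. The run name is a hypothetical placeholder.
def _example_butler_modifier(butler: Butler) -> Butler:
    butler.registry.registerRun("u/example/extra_run")  # hypothetical RUN collection
    return butler

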

def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: str | None,
    *,
    clobber: bool = False,
    butlerModifier: Callable[[Butler], Butler] | None = None,
    collections: Iterable[str] | None = None,
    datastoreRoot: ResourcePathExpression | None = None,
    transfer: str = "auto",
) -> Butler:
    r"""Create an execution butler.

    Responsible for exporting
    input `QuantumGraph`\s into a new minimal `~lsst.daf.butler.Butler` which
    only contains datasets specified by the `QuantumGraph`.

    These datasets are both those that already exist in the input
    `~lsst.daf.butler.Butler`, and those that are expected to be produced
    during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any
        `QuantumGraph`\s that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `~lsst.resources.ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.resources.ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this method could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `~lsst.resources.ResourcePath`, optional
        Root directory for the datastore of the execution butler. If `None`,
        the original butler's datastore will be used.
    transfer : `str`
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means to ``"copy"`` if the ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """

    # If a run is given it must match the output run recorded in the graph.
    if run and graph.metadata and run != (graph_run := graph.metadata.get("output_run")):
        raise ValueError(f"The given run, {run!r}, does not match that specified in the graph, {graph_run!r}")

    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    exports, inserts = _accumulate(butler, graph)
    yamlBuffer = _export(butler, collections, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler
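

# Illustrative usage sketch (not part of the original module). The repository
# path, graph file, and output directory are hypothetical placeholders; the
# graph is assumed to have been generated against the same repository, and
# QuantumGraph.loadUri is assumed to recover the dimension universe from the
# saved graph file.
def _example_build_execution_butler() -> Butler:
    butler = Butler("/repo/main", writeable=False)  # hypothetical source repo
    qgraph = QuantumGraph.loadUri("/scratch/pipeline.qgraph")  # hypothetical graph file
    return buildExecutionButler(
        butler,
        qgraph,
        outputLocation="/scratch/exec_butler",  # hypothetical output directory
        run=qgraph.metadata.get("output_run"),  # must match the run recorded in the graph
        clobber=True,
        transfer="auto",
    )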