Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 9%

149 statements  

coverage.py v7.2.5, created at 2023-05-09 09:17 +0000

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
from collections import defaultdict
from typing import Callable, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DatasetRef, DatasetType, Registry
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph

DataSetTypeRefMap = Mapping[DatasetType, Set[DatasetRef]]


def _validate_dataset_type(
    candidate: DatasetType, previous: dict[Union[str, DatasetType], DatasetType], registry: Registry
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [Union[`str`, `DatasetType`], `DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.
    registry : `lsst.daf.butler.Registry`
        Main registry whose dataset type registration should override the
        given one if it exists.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # We haven't seen this dataset type in this graph before, but it may
    # already be in the registry.
    try:
        registryDsType = registry.getDatasetType(name)
        previous[candidate] = registryDsType
        return registryDsType
    except MissingDatasetTypeError:
        pass
    # Dataset type is totally new. Store it by name and by dataset type so
    # it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate


def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
) -> Tuple[Set[DatasetRef], DataSetTypeRefMap]:
    # accumulate the DatasetRefs that will be transferred to the execution
    # registry

    # exports holds all the existing data that will be migrated to the
    # execution butler
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to DatasetRefs for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph
    inserts: DataSetTypeRefMap = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay but
    # we must ensure that only a single dataset type definition is
    # accumulated in the loop below. This data structure caches every dataset
    # type encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}

    # Find the initOutput refs.
    initOutputRefs = list(graph.globalInitOutputRefs())
    for task_def in graph.iterTaskGraph():
        task_refs = graph.initOutputRefs(task_def)
        if task_refs:
            initOutputRefs.extend(task_refs)

    for ref in initOutputRefs:
        dataset_type = ref.datasetType
        if dataset_type.component() is not None:
            dataset_type = dataset_type.makeCompositeDatasetType()
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes, butler.registry)
        inserts[dataset_type].add(ref)

    # Output references may be resolved even if they do not exist. Find all
    # actually existing refs.
    check_refs: Set[DatasetRef] = set()
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)
            for type, refs in attr.items():
                # This if block is because init inputs has a different
                # signature for its items
                if not isinstance(refs, list):
                    refs = [refs]
                for ref in refs:
                    if ref.id is not None:
                        # We could check existence of individual components,
                        # but it should be less work to check their parent.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        check_refs.add(ref)
    exist_map = butler.datastore.knows_these(check_refs)
    existing_ids = set(ref.id for ref, exists in exist_map.items() if exists)
    del exist_map

    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list):
                    refs = [refs]
                if type.component() is not None:
                    type = type.makeCompositeDatasetType()
                type = _validate_dataset_type(type, datasetTypes, butler.registry)
                # Iterate over all the references; if a ref already exists it
                # should be exported, if not it should be inserted into the
                # new registry.
                for ref in refs:
                    # Component dataset ID is the same as its parent ID, so
                    # checking component in existing_ids works OK.
                    if ref.id is not None and ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        # Make sure we export this with the registry's dataset
                        # type, since transfer_from doesn't handle storage
                        # class differences (maybe it should, but it's not
                        # bad to be defensive here even if that changes).
                        if type != ref.datasetType:
                            ref = ref.overrideStorageClass(type.storageClass)
                            assert ref.datasetType == type, "Dataset types should not differ in other ways."
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here
                            continue
                        inserts[type].add(ref)

    return exports, inserts


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections


def _export(butler: Butler, collections: Optional[Iterable[str]], inserts: DataSetTypeRefMap) -> io.StringIO:
    # This exports relevant dimension records and collections using daf butler
    # objects, however it reaches in deep and does not use the public methods
    # so that it can export it to a string buffer and skip disk access. This
    # does not export the datasets themselves, since we use transfer_from for
    # that.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.registry.dimensions)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, refs in inserts.items():
        exporter.saveDataIds([ref.dataId for ref in refs])

    # Look for any defined collection, if not get the defaults
    if collections is None:
        collections = butler.registry.defaults.collections

    # look up all collections associated with those inputs, this follows
    # all chains to make sure everything is properly exported
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported
    yamlBuffer.seek(0)
    return yamlBuffer


def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: Optional[ResourcePath] = None,
) -> Butler:
    """Set up the execution butler

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Does the ``outputLocation`` exist, and if so, should it be clobbered?
    datastoreRoot : `ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location
    if dirExists:
        # Remove the existing table, if the code got this far and this exists
        # clobber must be true
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in main registry.
    config.pop(("registry", "namespace"), None)

    # Obscore manager cannot be used with execution butler.
    config.pop(("registry", "managers", "obscore"), None)

    # record the current root of the datastore if it is specified relative
    # to the butler root
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler
    return Butler(config, writeable=True)


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeRefMap,
    run: Optional[str],
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # import the existing datasets using "split" mode. "split" is safe
    # because execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry
    for dsType, refs in inserts.items():
        # Storage class differences should have already been resolved by calls
        # to _validate_dataset_type in _accumulate, resulting in the Registry
        # dataset type whenever that exists.
        newButler.registry.registerDatasetType(dsType)
        newButler.registry._importDatasets(refs)

    return newButler


def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: Optional[str],
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
    datastoreRoot: Optional[ResourcePathExpression] = None,
    transfer: str = "auto",
) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
        to be produced. Examples of what this method could do include
        things such as creating collections/runs etc.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `ResourcePath`, optional
        Root directory for datastore of execution butler. If `None`, then the
        original butler's datastore will be used.
    transfer : `str`
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means to ``"copy"`` if the ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """

    # Now require that if run is given it must match the graph run.
    if run and graph.metadata and run != (graph_run := graph.metadata.get("output_run")):
        raise ValueError(f"The given run, {run!r}, does not match that specified in the graph, {graph_run!r}")

    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to Fail Fast if the output exists
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    exports, inserts = _accumulate(butler, graph)
    yamlBuffer = _export(butler, collections, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler
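

# A minimal usage sketch: the repository path, graph file name, and output
# location below are assumed placeholders, and loading the serialized graph
# with QuantumGraph.loadUri reflects an assumption about the caller's workflow
# rather than anything defined in this module.
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base.graph import QuantumGraph
#     from lsst.pipe.base.executionButlerBuilder import buildExecutionButler
#
#     butler = Butler("/repo/main")
#     qgraph = QuantumGraph.loadUri("pipeline.qgraph")
#     execution_butler = buildExecutionButler(
#         butler,
#         qgraph,
#         outputLocation="execution_butler/",
#         run=None,  # or the output run recorded in the graph metadata
#         clobber=True,
#     )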