Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 11%

137 statements  

coverage.py v6.5.0, created at 2023-01-27 09:56 +0000

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetRef, DatasetType
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]

def _validate_dataset_type(
    candidate: DatasetType, previous: dict[Union[str, DatasetType], DatasetType]
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [Union[`str`, `DatasetType`], `DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # New dataset type encountered. Store it by name and by dataset type
    # so it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate
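# Illustrative sketch of the caching behaviour above (the variable names are
# hypothetical and this is not executed here): if two variants of a dataset
# type share a name but use interchangeable storage classes, the first
# definition seen wins for every later lookup.
#
#     cache: dict[Union[str, DatasetType], DatasetType] = {}
#     first = _validate_dataset_type(calexp_exposure_variant, cache)
#     second = _validate_dataset_type(calexp_other_variant, cache)
#     assert second is first  # the earlier, compatible definition is reused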

def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but we
    # must ensure that only a single dataset type definition is accumulated
    # in the loop below. This data structure caches every dataset type
    # encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions.
    # initInputs are part of Quantum and that's the only place the graph stores
    # the dataset IDs, so we process them there even though each Quantum for a
    # task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes)
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    # Output references may be resolved even if they do not exist. Find all
    # actually existing refs.
    check_refs: Set[DatasetRef] = set()
    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)
            for type, refs in attr.items():
                # This if block is needed because initInputs has a different
                # signature for its items.
                if not isinstance(refs, list):
                    refs = [refs]
                for ref in refs:
                    if ref.id is not None:
                        # We could check existence of individual components,
                        # but it should be less work to check their parent.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        check_refs.add(ref)
    exist_map = butler.datastore.knows_these(check_refs)
    existing_ids = set(ref.id for ref, exists in exist_map.items() if exists)
    del exist_map

    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references: if a ref already exists it
                # should be exported; if not, it should be inserted into the
                # new registry.
                for ref in refs:
                    # Component dataset ID is the same as its parent ID, so
                    # checking component in existing_ids works OK.
                    if ref.id is not None and ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here.
                            continue
                        type = _validate_dataset_type(type, datasetTypes)
                        inserts[type].add(ref.dataId)
    return exports, inserts
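# Rough sketch of the two collections returned by _accumulate (identifiers are
# hypothetical and reprs abbreviated):
#
#     exports -> {DatasetRef("calexp", {"visit": ..., ...}, id=...), ...}   # already exist
#     inserts -> {DatasetType("deepCoadd", ...): {dataId1, dataId2, ...}}   # to be produced
#
# exports is later handed to _export and transfer_from; inserts is registered
# and inserted into the new registry by _import.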

def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may become removable in the future if the collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
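# Illustrative behaviour (collection names are hypothetical): starting from a
# single CHAINED collection, the loop keeps re-querying until the set stops
# growing, so both the chain and all of its children end up in the result.
#
#     _discoverCollections(butler, ["u/user/coadd"])
#     # -> {"u/user/coadd", "u/user/coadd/run1", "u/user/coadd/run2"}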

def _export(
    butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef], inserts: DataSetTypeMap
) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf butler objects; however, it reaches in deep and does not use the
    # public methods, so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # YAML is hard-coded, since the class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.registry.dimensions)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Use any explicitly provided collections; otherwise fall back to the
    # registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer
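# For debugging, the same in-memory buffer could be written to disk before it
# is handed to _import (sketch; the filename is arbitrary):
#
#     buffer = _export(butler, None, exports, inserts)
#     with open("execution_export.yaml", "w") as fh:
#         fh.write(buffer.getvalue())  # getvalue() does not disturb the read position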

def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: Optional[ResourcePath] = None,
) -> Butler:
    """Set up the execution butler

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Whether ``outputLocation`` already exists. If `True`, the caller has
        already approved clobbering it.
    datastoreRoot : `ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # this exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)
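# The execution-butler config therefore differs from its parent roughly as
# follows (sketch derived from the assignments above; URLs are placeholders):
#
#     root: <outputLocation URL>
#     allow_put_of_predefined_dataset: true
#     registry:
#       db: sqlite:///<butlerRoot>/gen3.sqlite3
#     datastore:
#       root: <datastoreRoot URL, or the original butler root>
#       trust_get_request: true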

def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeMap,
    run: Optional[str],
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, dataIds in inserts.items():
        # There may be inconsistencies with storage class definitions
        # so those differences must be checked.
        try:
            newButler.registry.registerDatasetType(dsType)
        except ConflictingDefinitionError:
            # We do not at this point know whether the dataset type is
            # an intermediate (and so must be able to support conversion
            # from the registry storage class to an input) or solely an output
            # dataset type. Test both compatibilities.
            registryDsType = newButler.registry.getDatasetType(dsType.name)
            if registryDsType.is_compatible_with(dsType) and dsType.is_compatible_with(registryDsType):
                # Ensure that we use the registry type when inserting.
                dsType = registryDsType
            else:
                # Not compatible so re-raise the original exception.
                raise

        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler
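# A butlerModifier callable (accepted by _import and buildExecutionButler)
# receives the freshly imported execution butler and must return a Butler.
# A minimal hypothetical example that pre-registers an extra run collection:
#
#     def addExtraRun(butler: Butler) -> Butler:
#         butler.registry.registerRun("u/user/extra_run")  # hypothetical name
#         return butler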

def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: Optional[str],
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
    datastoreRoot: Optional[ResourcePathExpression] = None,
    transfer: str = "auto",
) -> Butler:
    r"""Export an input `QuantumGraph` into a new, minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
        to be produced. Examples of what this method could do include
        things such as creating collections/runs/etc.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `ResourcePath`, optional
        Root directory for the datastore of the execution butler. If `None`,
        then the original butler's datastore will be used.
    transfer : `str`
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means to ``"copy"`` if the ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """
    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all DatasetTypes from the pipeline and check any that already
    # exist in the registry for consistency. This does not check that all
    # dataset types here exist, because they might want to register dataset
    # types later. It would be nice to also check that, but to do that we
    # would need to be told whether they plan to register dataset types later
    # (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)

    exports, inserts = _accumulate(butler, graph, dataset_types)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler
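# Illustrative end-to-end usage (repo paths, collection and run names are
# hypothetical; the QuantumGraph would normally come from pipetask or
# GraphBuilder):
#
#     butler = Butler("/repo/main", collections=["HSC/defaults"])
#     qgraph = QuantumGraph.loadUri("pipeline.qgraph", butler.registry.dimensions)
#     executionButler = buildExecutionButler(
#         butler,
#         qgraph,
#         outputLocation="/scratch/execution_butler",
#         run="u/user/example_run",
#         clobber=True,
#     )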