Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 12%

124 statements  

coverage.py v6.4.2, created at 2022-07-14 16:10 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetRef, DatasetType
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _validate_dataset_type(
    candidate: DatasetType, previous: dict[Union[str, DatasetType], DatasetType]
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [Union[`str`, `DatasetType`], `DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
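
    Examples
    --------
    A minimal sketch of the intended behavior; ``catalog_variant`` and
    ``dataframe_variant`` are hypothetical `DatasetType` instances that
    share a name but use different (convertible) storage classes::

        cache: dict[Union[str, DatasetType], DatasetType] = {}
        first = _validate_dataset_type(catalog_variant, cache)
        # A later, compatible variant with the same name resolves back to
        # the first definition that was seen.
        assert _validate_dataset_type(dataframe_variant, cache) is first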

    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # New dataset type encountered. Store it by name and by dataset type
    # so it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate


def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # accumulate the DatasetRefs that will be transferred to the execution
    # registry

    # exports holds all the existing data that will be migrated to the
    # execution butler
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay but
    # we must ensure that only a single dataset type definition is
    # accumulated in the loop below. This data structure caches every dataset
    # type encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved. initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes)
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This check is needed because initInputs maps each dataset
                # type to a single ref rather than to a list of refs.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references. If a ref has an id it
                # already exists and should be exported; if not, it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here
                            continue
                        type = _validate_dataset_type(type, datasetTypes)
                        inserts[type].add(ref.dataId)
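    # For illustration only (hypothetical dataset type names): if the quanta
    # read an already-existing "calexp" and predict a new "src" catalog, the
    # resolved "calexp" refs end up in ``exports``, while ``inserts`` maps the
    # "src" dataset type to the data IDs that still need to be inserted.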

    return exports, inserts


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes.
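    # For example (hypothetical names): a chained collection "HSC/defaults"
    # that contains "HSC/raw/all" and "HSC/calib" will have both children
    # added to the returned set alongside the chain itself.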

    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections


def _export(
    butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef], inserts: DataSetTypeMap
) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf_butler objects; it reaches in deep and does not use the public
    # methods so that it can export to a string buffer and skip disk access.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.registry.dimensions)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for input are transferred.
    # Butler.transfer_from() does not (yet) transfer records.
    dataIds = set(ref.dataId for ref in exports)
    exporter.saveDataIds(dataIds)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Use any explicitly defined collections; if none were given, fall back
    # to the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer


def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: Optional[ResourcePath] = None,
) -> Butler:
    """Set up the execution butler.

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Whether ``outputLocation`` already exists. If it does, the caller has
        already confirmed that it may be clobbered.
    datastoreRoot : `ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # the location exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
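    # The expected outputs are pre-inserted into the execution registry (see
    # ``_import``), so ``put()`` calls made during execution must be allowed
    # to target those predefined datasets; the option above enables that.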

    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler
    return Butler(config, writeable=True)


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeMap,
    run: Optional[str],
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry
    for dsType, dataIds in inserts.items():
        # There may be inconsistencies with storage class definitions
        # so those differences must be checked.
        try:
            newButler.registry.registerDatasetType(dsType)
        except ConflictingDefinitionError:
            # We do not at this point know whether the dataset type is
            # an intermediate (and so must be able to support conversion
            # from the registry storage class to an input) or solely an output
            # dataset type. Test both compatibilities.
            registryDsType = newButler.registry.getDatasetType(dsType.name)
            if registryDsType.is_compatible_with(dsType) and dsType.is_compatible_with(registryDsType):
                # Ensure that we use the registry type when inserting.
                dsType = registryDsType
            else:
                # Not compatible so re-raise the original exception.
                raise

        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler
def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: Optional[str],
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
    datastoreRoot: Optional[ResourcePathExpression] = None,
    transfer: str = "auto",
) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this callable could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `ResourcePath`, optional
        Root directory for the datastore of the execution butler. If `None`,
        the original butler's datastore will be used.
    transfer : `str`
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means ``"copy"`` if ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
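
    Examples
    --------
    A minimal sketch of typical use, assuming ``graph`` is a `QuantumGraph`
    that was built against ``butler``; the repository path, output location,
    and run name below are hypothetical::

        from lsst.daf.butler import Butler

        butler = Butler("/repo/main")
        executionButler = buildExecutionButler(
            butler,
            graph,
            outputLocation="/scratch/execution-butler",
            run="u/someone/test-run",
            clobber=True,
        )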

    """
    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all dataset types from the pipeline and check any that already
    # exist in the registry for consistency. This does not check that all
    # dataset types here exist, because the caller might want to register
    # dataset types later. It would be nice to also check that, but to do that
    # we would need to be told whether they plan to register dataset types
    # later (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)

    exports, inserts = _accumulate(graph, dataset_types)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
    )

    return newButler