Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 10%

141 statements  

coverage.py v7.2.3, created at 2023-04-19 04:01 -0700

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetRef, DatasetType, Registry
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph
from .pipeline import PipelineDatasetTypes

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _validate_dataset_type(
    candidate: DatasetType, previous: dict[Union[str, DatasetType], DatasetType], registry: Registry
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [Union[`str`, `DatasetType`], `DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.
    registry : `lsst.daf.butler.Registry`
        Main registry whose dataset type registration should override the
        given one if it exists.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # We haven't seen this dataset type in this graph before, but it may
    # already be in the registry.
    try:
        registryDsType = registry.getDatasetType(name)
        previous[candidate] = registryDsType
        return registryDsType
    except MissingDatasetTypeError:
        pass
    # Dataset type is totally new. Store it by name and by dataset type so
    # it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate
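

# Illustrative sketch (not part of the original module): it shows how a single
# cache dict is threaded through repeated _validate_dataset_type calls so that
# every candidate sharing a name resolves to one definition. The helper below
# is hypothetical and never called; ``candidates`` and ``registry`` are assumed
# to be supplied by the caller.
def _demo_resolve_dataset_types(
    candidates: Iterable[DatasetType], registry: Registry
) -> List[DatasetType]:
    # One shared cache, keyed both by name and by dataset type, exactly as
    # _validate_dataset_type expects.
    seen: dict[Union[str, DatasetType], DatasetType] = {}
    return [_validate_dataset_type(candidate, seen, registry) for candidate in candidates]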


def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # accumulate the DatasetRefs that will be transferred to the execution
    # registry

    # exports holds all the existing data that will be migrated to the
    # execution butler
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but
    # we must ensure that only a single dataset type definition is
    # accumulated in the loop below. This data structure caches every dataset
    # type encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions.
    # initInputs are part of Quantum and that's the only place the graph stores
    # the dataset IDs, so we process them there even though each Quantum for a
    # task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes, butler.registry)
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    # Output references may be resolved even if they do not exist. Find all
    # actually existing refs.
    check_refs: Set[DatasetRef] = set()
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)
            for type, refs in attr.items():
                # This if block exists because initInputs has a different
                # signature for its items
                if not isinstance(refs, list):
                    refs = [refs]
                for ref in refs:
                    if ref.id is not None:
                        # We could check existence of individual components,
                        # but it should be less work to check their parent.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        check_refs.add(ref)
    exist_map = butler.datastore.knows_these(check_refs)
    existing_ids = set(ref.id for ref, exists in exist_map.items() if exists)
    del exist_map

    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references; if a ref already exists it
                # should be exported, and if not it should be inserted into
                # the new registry
                for ref in refs:
                    # Component dataset ID is the same as its parent ID, so
                    # checking component in existing_ids works OK.
                    if ref.id is not None and ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        # Make sure we export this with the registry's dataset
                        # type, since transfer_from doesn't handle storage
                        # class differences (maybe it should, but it's not
                        # bad to be defensive here even if that changes).
                        type = _validate_dataset_type(type, datasetTypes, butler.registry)
                        if type != ref.datasetType:
                            ref = ref.overrideStorageClass(type.storageClass)
                            assert ref.datasetType == type, "Dataset types should not differ in other ways."
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here
                            continue
                        type = _validate_dataset_type(type, datasetTypes, butler.registry)
                        inserts[type].add(ref.dataId)
    return exports, inserts
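

# Illustrative sketch (not part of the original module): a hypothetical,
# never-called helper showing the shape of what _accumulate returns -- a set
# of existing refs to export and a mapping from dataset type to the data IDs
# that will later be inserted as predicted outputs.
def _demo_summarize_accumulation(exports: Set[DatasetRef], inserts: DataSetTypeMap) -> str:
    # Count the data IDs to be registered per dataset type name.
    per_type = {dataset_type.name: len(data_ids) for dataset_type, data_ids in inserts.items()}
    return f"{len(exports)} existing refs to export; predicted outputs per type: {per_type}"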


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be removable in the future if the collection export logic
    # changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
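

# Illustrative sketch (not part of the original module): a hypothetical,
# never-called helper that reports which collections the recursive discovery
# adds beyond the ones given, i.e. the members pulled in by flattening
# CHAINED collections.
def _demo_chain_expansion(butler: Butler, collections: Iterable[str]) -> set[str]:
    return _discoverCollections(butler, collections) - set(collections)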


def _export(butler: Butler, collections: Optional[Iterable[str]], inserts: DataSetTypeMap) -> io.StringIO:
    # This exports the relevant dimension records and collections using daf
    # butler objects. It reaches in deep and does not use the public methods,
    # so that it can export to a string buffer and skip disk access. This
    # does not export the datasets themselves, since we use transfer_from for
    # that.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.registry.dimensions)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Use any explicitly given collections; if none were given, fall back
    # to the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported
    yamlBuffer.seek(0)
    return yamlBuffer
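

# Illustrative sketch (not part of the original module): a hypothetical,
# never-called helper that materializes the in-memory YAML export so it can
# be inspected or sanity-checked without touching disk; passing
# ``collections=None`` uses the registry defaults, as in _export above.
def _demo_export_yaml_text(butler: Butler, inserts: DataSetTypeMap) -> str:
    buffer = _export(butler, None, inserts)
    return buffer.getvalue()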


def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: Optional[ResourcePath] = None,
) -> Butler:
    """Set up the execution butler.

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Does the ``outputLocation`` exist, and if so, should it be clobbered?
    datastoreRoot : `ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # the location exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in main registry.
    config.pop(("registry", "namespace"), None)

    # record the current root of the datastore if it is specified relative
    # to the butler root
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler
    return Butler(config, writeable=True)

331 

332 

333def _import( 

334 yamlBuffer: io.StringIO, 

335 newButler: Butler, 

336 inserts: DataSetTypeMap, 

337 run: Optional[str], 

338 butlerModifier: Optional[Callable[[Butler], Butler]], 

339) -> Butler: 

340 # This method takes the exports from the existing butler, imports 

341 # them into the newly created butler, and then inserts the datasets 

342 # that are expected to be produced. 

343 

344 # import the existing datasets using "split" mode. "split" is safe 

345 # because execution butler is assumed to be able to see all the file 

346 # locations that the main datastore can see. "split" supports some 

347 # absolute URIs in the datastore. 

348 newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split") 

349 

350 # If there is modifier callable, run it to make necessary updates 

351 # to the new butler. 

352 if butlerModifier is not None: 

353 newButler = butlerModifier(newButler) 

354 

355 # Register datasets to be produced and insert them into the registry 

356 for dsType, dataIds in inserts.items(): 

357 # Storage class differences should have already been resolved by calls 

358 # _validate_dataset_type in _export, resulting in the Registry dataset 

359 # type whenever that exists. 

360 newButler.registry.registerDatasetType(dsType) 

361 newButler.registry.insertDatasets(dsType, dataIds, run) 

362 

363 return newButler 


def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: Optional[str],
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
    datastoreRoot: Optional[ResourcePathExpression] = None,
    transfer: str = "auto",
) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
        to be produced. Examples of what this method could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `ResourcePath`, optional
        Root directory for the datastore of the execution butler. If `None`,
        the original butler's datastore will be used.
    transfer : `str`
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means ``"copy"`` if ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """
    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to fail fast if the output exists
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all dataset types from the pipeline and check any that already
    # exist in the registry for consistency. This does not check that all
    # dataset types here exist, because callers might want to register dataset
    # types later. It would be nice to also check that, but to do so we would
    # need to be told whether they plan to register dataset types later
    # (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)

    exports, inserts = _accumulate(butler, graph, dataset_types)
    yamlBuffer = _export(butler, collections, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler
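

# Illustrative sketch (not part of the original module): a typical call,
# assuming the caller already has a Butler for the source repository and a
# QuantumGraph in memory. The output location and run collection names below
# are hypothetical; the helper is never called.
def _demo_build_execution_butler(butler: Butler, graph: QuantumGraph) -> Butler:
    return buildExecutionButler(
        butler,
        graph,
        outputLocation="/repo/main/execution",  # hypothetical path
        run="u/example/run",  # hypothetical output run collection
        clobber=True,  # replace any previous execution butler at this path
    )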