Coverage for python / lsst / daf / butler / script / _pruneDatasets.py: 34%

87 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 08:41 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29__all__ = ["pruneDatasets"] 

30 

31import itertools 

32from collections.abc import Callable, Iterable 

33from enum import Enum, auto 

34from typing import TYPE_CHECKING, Any 

35 

36from .._butler import Butler 

37from .._collection_type import CollectionType 

38from .queryDatasets import QueryDatasets 

39 

40if TYPE_CHECKING: 

41 from astropy.table import Table 

42 

43 

class PruneDatasetsResult:
    """Contains the results of a prune-datasets action.

    The action may not be complete if the caller requested a confirmation, in
    which case calling ``onConfirmation`` will perform the action.

    Parameters
    ----------
    tables : `list` [`astropy.table.Table`], optional
        The astropy tables that will be or were deleted. If `None` (the
        default) a new empty list is used.
    state : `PruneDatasetsResult.State`, optional
        The initial state of execution of the action, if `None` the result
        state is ``INIT``, by default `None`.
    errDict : `dict` [`str`, `str`] or `None`
        Place to store error messages. Will be created if not given. A
        caller-supplied dict is stored as-is, even when it is empty.

    Attributes
    ----------
    tables : `list` [`astropy.table.Table`]
        Same as in Parameters.
    state : ``PruneDatasetsResult.State``
        The current state of the action.
    onConfirmation : `~collections.abc.Callable` or `None`
        The function to call to perform the action if the caller wants to
        confirm the tables before performing the action. Initialized to
        `None`.
    action : `dict` [`str`, `~typing.Any`] or `None`
        Describes the removal action for a dry run; when set it is a dict
        with keys ``disassociate``, ``unstore``, ``purge``, and
        ``collections``. Initialized to `None`.
    errDict : `dict` [`str`, `str`]
        Variables related to the error that may be substituted into a
        user-visible string.
    """

    action: dict[str, Any] | None
    onConfirmation: Callable | None
    errDict: dict[str, str]

    class State(Enum):
        """State associated with dataset pruning request."""

        INIT = auto()
        DRY_RUN_COMPLETE = auto()
        AWAITING_CONFIRMATION = auto()
        FINISHED = auto()
        ERR_PURGE_AND_DISASSOCIATE = auto()
        ERR_NO_COLLECTION_RESTRICTION = auto()
        ERR_PRUNE_ON_NOT_RUN = auto()
        ERR_NO_OP = auto()

    def __init__(
        self,
        tables: list[Table] | None = None,
        state: State | None = None,
        errDict: dict[str, str] | None = None,
    ):
        # Use explicit ``is None`` checks rather than truthiness so that a
        # caller-supplied (possibly empty) container is never silently
        # replaced by a fresh one.
        self.state = self.State.INIT if state is None else state
        self.tables = [] if tables is None else tables
        self.onConfirmation = None
        # Action describes the removal action for dry-run, will be a dict with
        # keys disassociate, unstore, purge, and collections.
        self.action = None
        # errDict is a container for variables related to the error that may be
        # substituted into a user-visible string.
        self.errDict = {} if errDict is None else errDict

    @property
    def dryRun(self) -> bool:
        """Whether the state is ``DRY_RUN_COMPLETE`` (`bool`)."""
        return self.state is self.State.DRY_RUN_COMPLETE

    @property
    def confirm(self) -> bool:
        """Whether the state is ``AWAITING_CONFIRMATION`` (`bool`)."""
        return self.state is self.State.AWAITING_CONFIRMATION

    @property
    def finished(self) -> bool:
        """Whether the state is ``FINISHED`` (`bool`)."""
        return self.state is self.State.FINISHED

    @property
    def errPurgeAndDisassociate(self) -> bool:
        """Whether the state is ``ERR_PURGE_AND_DISASSOCIATE`` (`bool`)."""
        return self.state is self.State.ERR_PURGE_AND_DISASSOCIATE

    @property
    def errNoCollectionRestriction(self) -> bool:
        """Whether the state is ``ERR_NO_COLLECTION_RESTRICTION`` (`bool`)."""
        return self.state is self.State.ERR_NO_COLLECTION_RESTRICTION

    @property
    def errPruneOnNotRun(self) -> bool:
        """Whether the state is ``ERR_PRUNE_ON_NOT_RUN`` (`bool`)."""
        return self.state is self.State.ERR_PRUNE_ON_NOT_RUN

    @property
    def errNoOp(self) -> bool:
        """Whether the state is ``ERR_NO_OP`` (`bool`)."""
        return self.state is self.State.ERR_NO_OP

131 

132 

def pruneDatasets(
    repo: str,
    collections: Iterable[str],
    datasets: Iterable[str],
    where: str,
    disassociate_tags: Iterable[str],
    unstore: bool,
    purge_run: str,
    dry_run: bool,
    confirm: bool,
    find_all: bool,
) -> PruneDatasetsResult:
    """Prune datasets from a repository.

    Parameters
    ----------
    repo : `str`
        URI to the location of the repo or URI to a config file describing the
        repo and its location.
    collections : `~collections.abc.Iterable` [`str`]
        Glob-style search strings that identify the collections to search
        for. If empty, ``purge_run`` (or, failing that,
        ``disassociate_tags``) is searched instead.
    datasets : `~collections.abc.Iterable` [`str`]
        Glob-style search strings that identify the dataset type names to
        search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    disassociate_tags : `~collections.abc.Iterable` [`str`]
        TAGGED collections to disassociate the datasets from. Must be empty
        when ``purge_run`` is given.
    unstore : `bool`
        Same as the unstore argument to ``Butler.pruneDatasets``.
    purge_run : `str`
        Completely remove datasets from the ``Registry``. Note that current
        implementation accepts any RUN-type collection, but will remove
        datasets from all collections in ``collections`` if it is non-empty.
    dry_run : `bool`
        Get results for what would be removed but do not remove.
    confirm : `bool`
        Get results for what would be removed and return the results for
        display & confirmation, with a completion function to run after
        confirmation.
    find_all : `bool`
        If False, for each result data ID, will only delete the dataset from
        the first collection in which a dataset of that dataset type appears
        (according to the order of ``collections`` passed in). If used,
        ``collections`` must specify at least one expression and must not
        contain wildcards. This is the inverse of ``QueryDataset``'s find_first
        option.

    Returns
    -------
    results : `PruneDatasetsResult`
        A data structure that contains information about datasets for removal,
        removal status, and options to continue in some cases.

    Notes
    -----
    The matrix of legal & illegal combinations of purge, unstore, and
    disassociate is this:

    - none of (purge, unstore, disassociate): error, nothing to do
    - purge only: ok
    - unstore only: ok
    - disassociate only: ok
    - purge+unstore: ok, just ignore unstore (purge effectively implies
      unstore)
    - purge+disassociate: this is an error (instead of ignoring disassociate),
      because that comes with a collection argument that we can't respect, and
      that might be confusing (purge will disassociate from all TAGGED
      collections, not just the one given)
    - purge+unstore+disassociate: an error, for the same reason as just
      purge+disassociate
    - unstore+disassociate: ok; these operations are unrelated to each other
    """
    State = PruneDatasetsResult.State

    # Reject requests that ask for nothing, or for an illegal combination.
    if not (disassociate_tags or unstore or purge_run):
        return PruneDatasetsResult(state=State.ERR_NO_OP)
    if purge_run and disassociate_tags:
        return PruneDatasetsResult(state=State.ERR_PURGE_AND_DISASSOCIATE)

    # Without an explicit collection restriction, search the purge run if
    # there is one, otherwise the tagged collections being disassociated.
    if not collections:
        fallback = (purge_run,) if purge_run else disassociate_tags
        if fallback:
            collections = fallback
    if not collections:
        return PruneDatasetsResult(state=State.ERR_NO_COLLECTION_RESTRICTION)

    with Butler.from_config(repo, without_datastore=True) as butler:
        # Purging is only permitted on a RUN-type collection.
        if purge_run and butler.collections.get_info(purge_run).type is not CollectionType.RUN:
            return PruneDatasetsResult(
                state=State.ERR_PRUNE_ON_NOT_RUN,
                errDict={"collection": purge_run},
            )

        query = QueryDatasets(
            butler=butler,
            glob=datasets,
            collections=collections,
            where=where,
            # find_first is on by default (it requires collections, which are
            # guaranteed at this point); the user turns it off by asking to
            # find all.
            find_first=not find_all,
            show_uri=False,
        )
        refs = list(itertools.chain.from_iterable(query.getDatasets()))
        result = PruneDatasetsResult(list(query.getTables()))

    # Purging implies both disassociation and unstoring.
    purge = bool(purge_run)
    disassociate = bool(disassociate_tags) or purge
    unstore = unstore or purge

    if dry_run:
        result.state = State.DRY_RUN_COMPLETE
        result.action = {
            "disassociate": disassociate,
            "purge": purge,
            "unstore": unstore,
            "collections": collections,
        }
        return result

    def doPruneDatasets() -> PruneDatasetsResult:
        # Open a separate writeable butler to perform the actual removal.
        with Butler.from_config(repo, writeable=True) as writeable_butler:
            writeable_butler.pruneDatasets(
                refs=refs,
                disassociate=disassociate,
                tags=disassociate_tags or (),
                purge=purge,
                unstore=unstore,
            )
        result.state = State.FINISHED
        return result

    if not confirm:
        return doPruneDatasets()

    # Defer the removal until the caller has confirmed it.
    result.state = State.AWAITING_CONFIRMATION
    result.onConfirmation = doPruneDatasets
    return result