Coverage for python/lsst/daf/butler/script/_pruneDatasets.py: 41% (84 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

from collections.abc import Callable, Iterable
from enum import Enum, auto
from typing import TYPE_CHECKING, Any

from .._butler import Butler
from ..registry import CollectionType
from .queryDatasets import QueryDatasets

if TYPE_CHECKING:
    from astropy.table import Table


class PruneDatasetsResult:
    """Contains the results of a prune-datasets action.

    The action may not be complete if the caller requested a confirmation, in
    which case calling ``onConfirmation`` will perform the action.

    Parameters
    ----------
    tables : `list` [`astropy.table.Table`], optional
        The astropy tables that will be or were deleted, by default `None`.
    state : `PruneDatasetsResult.State`, optional
        The initial state of execution of the action; if `None` the result
        state is ``INIT``. By default `None`.

    Attributes
    ----------
    tables
        Same as in Parameters.
    state : `PruneDatasetsResult.State`
        The current state of the action.
    onConfirmation : `Callable` or `None`
        The function to call to perform the action if the caller wants to
        confirm the tables before performing the action.
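
    Examples
    --------
    A minimal, illustrative sketch of the confirmation flow, assuming
    ``result`` was returned by `pruneDatasets` with ``confirm=True`` (the
    printing and prompting shown here are not part of this module)::

        if result.confirm:
            for table in result.tables:
                print(table)
            if input("Proceed? [y/N] ") == "y" and result.onConfirmation:
                result.onConfirmation()
        if result.finished:
            print("Datasets pruned.")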
64 """
66 action: dict[str, Any] | None
67 onConfirmation: Callable | None
69 class State(Enum):
70 """State associated with dataset pruning request."""
72 INIT = auto()
73 DRY_RUN_COMPLETE = auto()
74 AWAITING_CONFIRMATION = auto()
75 FINISHED = auto()
76 ERR_PURGE_AND_DISASSOCIATE = auto()
77 ERR_NO_COLLECTION_RESTRICTION = auto()
78 ERR_PRUNE_ON_NOT_RUN = auto()
79 ERR_NO_OP = auto()

    def __init__(
        self,
        tables: list[Table] | None = None,
        state: State | None = None,
        errDict: dict[str, str] | None = None,
    ):
        self.state = state or self.State.INIT
        if tables is None:
            tables = []
        self.tables = tables
        self.onConfirmation = None
        # For a dry run, ``action`` describes the removal that would be
        # performed: a dict with keys disassociate, unstore, purge, and
        # collections.
        self.action = None
        # errDict is a container for variables related to the error that may
        # be substituted into a user-visible string.
        self.errDict = errDict or {}

    @property
    def dryRun(self) -> bool:
        return self.state is self.State.DRY_RUN_COMPLETE

    @property
    def confirm(self) -> bool:
        return self.state is self.State.AWAITING_CONFIRMATION

    @property
    def finished(self) -> bool:
        return self.state is self.State.FINISHED

    @property
    def errPurgeAndDisassociate(self) -> bool:
        return self.state is self.State.ERR_PURGE_AND_DISASSOCIATE

    @property
    def errNoCollectionRestriction(self) -> bool:
        return self.state is self.State.ERR_NO_COLLECTION_RESTRICTION

    @property
    def errPruneOnNotRun(self) -> bool:
        return self.state is self.State.ERR_PRUNE_ON_NOT_RUN

    @property
    def errNoOp(self) -> bool:
        return self.state is self.State.ERR_NO_OP


def pruneDatasets(
    repo: str,
    collections: Iterable[str],
    datasets: Iterable[str],
    where: str,
    disassociate_tags: Iterable[str],
    unstore: bool,
    purge_run: str,
    dry_run: bool,
    confirm: bool,
    find_all: bool,
) -> PruneDatasetsResult:
140 """Prune datasets from a repository.
142 Parameters
143 ----------
144 repo : `str`
145 URI to the location of the repo or URI to a config file describing the
146 repo and its location.
147 collections : iterable [`str`]
148 A list of glob-style search string that identify the collections to
149 search for.
150 datasets : iterable [`str`]
151 A list of glob-style search string that identify the dataset type names
152 to search for.
153 where : `str`
154 A string expression similar to a SQL WHERE clause. May involve any
155 column of a dimension table or (as a shortcut for the primary key
156 column of a dimension table) dimension name.
157 disassociate_tags : `list` [`str`]
158 TAGGED collections to disassociate the datasets from. If not `None`
159 then ``purge_run`` must be `None`.
160 unstore : `bool`
161 Same as the unstore argument to ``Butler.pruneDatasets``.
162 purge_run : `str`
163 Completely remove datasets from the ``Registry``. Note that current
164 implementation accepts any RUN-type collection, but will remove
165 datasets from all collections in ``collections`` if it is non-empty.
166 dry_run : `bool`
167 Get results for what would be removed but do not remove.
168 confirm : `bool`
169 Get results for what would be removed and return the results for
170 display & confirmation, with a completion function to run after
171 confirmation.
172 find_all : `bool`
173 If False, for each result data ID, will only delete the dataset from
174 the first collection in which a dataset of that dataset type appears
175 (according to the order of ``collections`` passed in). If used,
176 ``collections`` must specify at least one expression and must not
177 contain wildcards. This is the inverse of ``QueryDataset``'s find_first
178 option.
180 Notes
181 -----
182 The matrix of legal & illegal combinations of purge, unstore, and
183 disassociate is this:
184 - none of (purge, unstore, disassociate): error, nothing to do
185 - purge only: ok
186 - unstore only: ok
187 - disassociate only: ok
188 - purge+unstore: ok, just ignore unstore (purge effectively implies
189 unstore)
190 - purge+disassociate: this is an error (instead of ignoring disassociate),
191 because that comes with a collection argument that we can't respect, and
192 that might be confusing (purge will disassociate from all TAGGED
193 collections, not just the one given)
194 - purge+unstore+disassociate: an error, for the same reason as just
195 purge+disassociate
196 - unstore+disassociate: ok; these operations are unrelated to each other
198 Returns
199 -------
200 results : `PruneDatasetsResult`
201 A data structure that contains information about datasets for removal,
202 removal status, and options to continue in some cases.
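
    Examples
    --------
    A minimal, illustrative sketch of a dry run that only unstores datasets
    (the repository path, collection, and dataset type names are placeholders,
    not real values)::

        result = pruneDatasets(
            repo="/path/to/repo",
            collections=("imported_g",),
            datasets=("flat",),
            where="",
            disassociate_tags=(),
            unstore=True,
            purge_run="",
            dry_run=True,
            confirm=False,
            find_all=False,
        )
        assert result.dryRun
        # result.tables describes what would have been removed;
        # result.action records the requested disassociate/purge/unstore
        # flags and the collections searched.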
203 """
204 if not disassociate_tags and not unstore and not purge_run:
205 return PruneDatasetsResult(state=PruneDatasetsResult.State.ERR_NO_OP)
207 if disassociate_tags and purge_run:
208 return PruneDatasetsResult(state=PruneDatasetsResult.State.ERR_PURGE_AND_DISASSOCIATE)
210 # If collections is not specified and a purge_run is, use the purge_run for
211 # collections, or if disassociate_tags is then use that.
212 if not collections:
213 if purge_run:
214 collections = (purge_run,)
215 elif disassociate_tags:
216 collections = disassociate_tags
218 if not collections:
219 return PruneDatasetsResult(state=PruneDatasetsResult.State.ERR_NO_COLLECTION_RESTRICTION)

    butler = Butler.from_config(repo)

    # If purging, verify that the collection to purge is a RUN-type
    # collection.
    if purge_run:
        collectionType = butler.registry.getCollectionType(purge_run)
        if collectionType is not CollectionType.RUN:
            return PruneDatasetsResult(
                state=PruneDatasetsResult.State.ERR_PRUNE_ON_NOT_RUN, errDict=dict(collection=purge_run)
            )

    datasets_found = QueryDatasets(
        repo=repo,
        glob=datasets,
        collections=collections,
        where=where,
        # By default we want find_first to be True if collections are
        # provided (else False); find_first requires collections to be
        # provided. But the user may specify that they want to find all,
        # thus forcing find_first to be False.
        find_first=not find_all,
        show_uri=False,
    )

    result = PruneDatasetsResult(datasets_found.getTables())

    disassociate = bool(disassociate_tags) or bool(purge_run)
    purge = bool(purge_run)
    unstore = unstore or bool(purge_run)

    if dry_run:
        result.state = PruneDatasetsResult.State.DRY_RUN_COMPLETE
        result.action = dict(disassociate=disassociate, purge=purge, unstore=unstore, collections=collections)
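        # Illustration (the run name is hypothetical): purging a RUN
        # collection "u/someone/run" with no other collections specified
        # would yield action == {"disassociate": True, "purge": True,
        # "unstore": True, "collections": ("u/someone/run",)}.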
        return result

    def doPruneDatasets() -> PruneDatasetsResult:
        butler = Butler.from_config(repo, writeable=True)
        butler.pruneDatasets(
            refs=datasets_found.getDatasets(),
            disassociate=disassociate,
            tags=disassociate_tags or (),
            purge=purge,
            unstore=unstore,
        )
        result.state = PruneDatasetsResult.State.FINISHED
        return result

    if confirm:
        result.state = PruneDatasetsResult.State.AWAITING_CONFIRMATION
        result.onConfirmation = doPruneDatasets
        return result

    return doPruneDatasets()