Coverage for python / lsst / daf / butler / script / _pruneDatasets.py: 34%
87 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-28 08:36 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
27from __future__ import annotations
29__all__ = ["pruneDatasets"]
31import itertools
32from collections.abc import Callable, Iterable
33from enum import Enum, auto
34from typing import TYPE_CHECKING, Any
36from .._butler import Butler
37from .._collection_type import CollectionType
38from .queryDatasets import QueryDatasets
40if TYPE_CHECKING:
41 from astropy.table import Table
class PruneDatasetsResult:
    """Contains the results of a prune-datasets action.

    The action may not be complete if the caller requested a confirmation, in
    which case calling ``onConfirmation`` will perform the action.

    Parameters
    ----------
    tables : `list` [`astropy.table.Table`], optional
        The astropy tables that will be or were deleted, by default None.
    state : `PruneDatasetsResult.State`, optional
        The initial state of execution of the action, if `None` the result
        state is ``INIT``, by default `None`.
    errDict : `dict` [`str`, `str`] or `None`
        Place to store error messages. Will be created if not given.

    Attributes
    ----------
    tables : `list` [`astropy.table.Table`]
        Same as in Parameters.
    state : ``PruneDatasetsResult.State``
        The current state of the action.
    onConfirmation : `~collections.abc.Callable` or `None`
        The function to call to perform the action if the caller wants to
        confirm the tables before performing the action.
    """

    # For a dry run: a dict with keys disassociate, unstore, purge, and
    # collections describing what would have been done.
    action: dict[str, Any] | None
    # Deferred completion callback, populated only when awaiting confirmation.
    onConfirmation: Callable | None

    class State(Enum):
        """State associated with dataset pruning request."""

        INIT = auto()
        DRY_RUN_COMPLETE = auto()
        AWAITING_CONFIRMATION = auto()
        FINISHED = auto()
        ERR_PURGE_AND_DISASSOCIATE = auto()
        ERR_NO_COLLECTION_RESTRICTION = auto()
        ERR_PRUNE_ON_NOT_RUN = auto()
        ERR_NO_OP = auto()

    def __init__(
        self,
        tables: list[Table] | None = None,
        state: State | None = None,
        errDict: dict[str, str] | None = None,
    ):
        self.state = self.State.INIT if state is None else state
        self.tables = [] if tables is None else tables
        self.onConfirmation = None
        self.action = None
        # Container for variables related to the error that may be
        # substituted into a user-visible string.
        self.errDict = errDict if errDict else {}

    @property
    def dryRun(self) -> bool:
        """Whether this result came from a dry run."""
        return self.state is self.State.DRY_RUN_COMPLETE

    @property
    def confirm(self) -> bool:
        """Whether the action is waiting for caller confirmation."""
        return self.state is self.State.AWAITING_CONFIRMATION

    @property
    def finished(self) -> bool:
        """Whether the removal action has completed."""
        return self.state is self.State.FINISHED

    @property
    def errPurgeAndDisassociate(self) -> bool:
        """Whether purge and disassociate were illegally combined."""
        return self.state is self.State.ERR_PURGE_AND_DISASSOCIATE

    @property
    def errNoCollectionRestriction(self) -> bool:
        """Whether no collection restriction could be determined."""
        return self.state is self.State.ERR_NO_COLLECTION_RESTRICTION

    @property
    def errPruneOnNotRun(self) -> bool:
        """Whether purge targeted a non-RUN collection."""
        return self.state is self.State.ERR_PRUNE_ON_NOT_RUN

    @property
    def errNoOp(self) -> bool:
        """Whether no removal operation was requested at all."""
        return self.state is self.State.ERR_NO_OP
def pruneDatasets(
    repo: str,
    collections: Iterable[str],
    datasets: Iterable[str],
    where: str,
    disassociate_tags: Iterable[str],
    unstore: bool,
    purge_run: str,
    dry_run: bool,
    confirm: bool,
    find_all: bool,
) -> PruneDatasetsResult:
    """Prune datasets from a repository.

    Parameters
    ----------
    repo : `str`
        URI to the location of the repo or URI to a config file describing the
        repo and its location.
    collections : `~collections.abc.Iterable` [`str`]
        A list of glob-style search string that identify the collections to
        search for.
    datasets : `~collections.abc.Iterable` [`str`]
        A list of glob-style search string that identify the dataset type names
        to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    disassociate_tags : `list` [`str`]
        TAGGED collections to disassociate the datasets from. If not `None`
        then ``purge_run`` must be `None`.
    unstore : `bool`
        Same as the unstore argument to ``Butler.pruneDatasets``.
    purge_run : `str`
        Completely remove datasets from the ``Registry``. Note that current
        implementation accepts any RUN-type collection, but will remove
        datasets from all collections in ``collections`` if it is non-empty.
    dry_run : `bool`
        Get results for what would be removed but do not remove.
    confirm : `bool`
        Get results for what would be removed and return the results for
        display & confirmation, with a completion function to run after
        confirmation.
    find_all : `bool`
        If False, for each result data ID, will only delete the dataset from
        the first collection in which a dataset of that dataset type appears
        (according to the order of ``collections`` passed in). If used,
        ``collections`` must specify at least one expression and must not
        contain wildcards. This is the inverse of ``QueryDataset``'s
        find_first option.

    Notes
    -----
    The matrix of legal & illegal combinations of purge, unstore, and
    disassociate is this:
    - none of (purge, unstore, disassociate): error, nothing to do
    - purge only: ok
    - unstore only: ok
    - disassociate only: ok
    - purge+unstore: ok, just ignore unstore (purge effectively implies
      unstore)
    - purge+disassociate: this is an error (instead of ignoring disassociate),
      because that comes with a collection argument that we can't respect, and
      that might be confusing (purge will disassociate from all TAGGED
      collections, not just the one given)
    - purge+unstore+disassociate: an error, for the same reason as just
      purge+disassociate
    - unstore+disassociate: ok; these operations are unrelated to each other

    Returns
    -------
    results : `PruneDatasetsResult`
        A data structure that contains information about datasets for removal,
        removal status, and options to continue in some cases.
    """
    State = PruneDatasetsResult.State

    # Nothing to do: none of the three removal operations was requested.
    if not (disassociate_tags or unstore or purge_run):
        return PruneDatasetsResult(state=State.ERR_NO_OP)

    # Purge disassociates from all TAGGED collections, so combining it with an
    # explicit disassociate list is rejected (see the matrix in Notes).
    if disassociate_tags and purge_run:
        return PruneDatasetsResult(state=State.ERR_PURGE_AND_DISASSOCIATE)

    # When no collections were given, fall back to the purge_run collection,
    # or failing that to the disassociate_tags collections; with neither
    # available there is no collection restriction at all, which is an error.
    if not collections:
        if purge_run:
            collections = (purge_run,)
        elif disassociate_tags:
            collections = disassociate_tags

    if not collections:
        return PruneDatasetsResult(state=State.ERR_NO_COLLECTION_RESTRICTION)

    # Query with a datastore-less butler; purging requires the target
    # collection to be of RUN type.
    with Butler.from_config(repo, without_datastore=True) as butler:
        if purge_run:
            info = butler.collections.get_info(purge_run)
            if info.type is not CollectionType.RUN:
                return PruneDatasetsResult(
                    state=State.ERR_PRUNE_ON_NOT_RUN, errDict={"collection": purge_run}
                )

        # find_first defaults to True when collections are provided (it
        # requires them); the caller can force find-all instead.
        query = QueryDatasets(
            butler=butler,
            glob=datasets,
            collections=collections,
            where=where,
            find_first=not find_all,
            show_uri=False,
        )
        refs = list(itertools.chain.from_iterable(query.getDatasets()))

        result = PruneDatasetsResult(list(query.getTables()))

    # Purge implies both disassociate (from every TAGGED collection) and
    # unstore.
    purge = bool(purge_run)
    disassociate = bool(disassociate_tags) or purge
    unstore = unstore or purge

    if dry_run:
        result.state = State.DRY_RUN_COMPLETE
        result.action = {
            "disassociate": disassociate,
            "purge": purge,
            "unstore": unstore,
            "collections": collections,
        }
        return result

    def doPruneDatasets() -> PruneDatasetsResult:
        # Perform the removal; a fresh writeable butler is needed because the
        # query butler above was opened without a datastore.
        with Butler.from_config(repo, writeable=True) as butler:
            butler.pruneDatasets(
                refs=refs,
                disassociate=disassociate,
                tags=disassociate_tags or (),
                purge=purge,
                unstore=unstore,
            )
            result.state = State.FINISHED
            return result

    if confirm:
        result.state = State.AWAITING_CONFIRMATION
        result.onConfirmation = doPruneDatasets
        return result

    return doPruneDatasets()