Coverage for python/lsst/daf/butler/script/queryDatasets.py: 27%

67 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-16 02:58 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29import dataclasses 

30from collections import defaultdict 

31from collections.abc import Iterable 

32from types import EllipsisType 

33from typing import TYPE_CHECKING 

34 

35import numpy as np 

36from astropy.table import Table as AstropyTable 

37 

38from .._butler import Butler 

39from ..cli.utils import sortAstropyTable 

40 

41if TYPE_CHECKING: 

42 from lsst.daf.butler import DatasetRef 

43 from lsst.daf.butler.registry.queries import DatasetQueryResults 

44 from lsst.resources import ResourcePath 

45 

46 

@dataclasses.dataclass(frozen=True)
class _RefInfo:
    """Immutable pairing of a dataset ref with the URI string shown for it.

    The dataclass is frozen, so instances are hashable and can be stored in
    a `set`; two instances with equal fields compare equal.
    """

    # The dataset reference that this record describes.
    datasetRef: DatasetRef
    # String form of the URI associated with the ref, or `None` if no URI
    # was supplied.
    uri: str | None

51 

52 

class _Table:
    """Collect the rows belonging to one dataset type and render them as an
    astropy table.

    Rows are held in a `set`, so adding the same dataset ref and URI more
    than once yields a single row.
    """

    datasetRefs: set[_RefInfo]

    def __init__(self) -> None:
        self.datasetRefs = set()

    def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
        """Record one row for the table.

        ``uri`` is optional, but usage must be consistent: either pass it on
        every call made to a given ``_Table`` instance or on none of them.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            The dataset ref to include as a row.
        uri : `lsst.resources.ResourcePath`, optional
            The URI to display as the file location for this row. Defaults
            to `None`, in which case no URI is stored for the row.
        """
        # Only the string form is kept; a falsy ``uri`` is stored as None.
        self.datasetRefs.add(_RefInfo(datasetRef, None if not uri else str(uri)))

    def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
        """Build a sorted astropy table from the accumulated rows.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name placed in the ``type`` column of every row.

        Returns
        -------
        table : `astropy.table.Table`
            Table with ``type``, ``run`` and ``id`` columns, one column per
            data ID dimension and, when URIs were recorded, a ``URI`` column.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs have been added to this table.
        """
        # An empty table is not expected: a _Table is meant to be created
        # as a side effect of adding its first dataset.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # Any one entry serves as the template for the column layout.
        sample = next(iter(self.datasetRefs))
        sampleDataId = sample.datasetRef.dataId
        dimensions = [
            sampleDataId.universe.dimensions[key] for key in sampleDataId.dimensions.data_coordinate_keys
        ]

        columnNames = ["type", "run", "id"] + [str(dimension) for dimension in dimensions]

        # Numeric columns need an explicit dtype: the per-row Table
        # constructor does not infer one, and sorting misbehaves without it.
        numericTypes = {float: np.float64, int: np.int64}
        columnTypes = [None, None, str] + [
            numericTypes.get(type(value)) for value in sampleDataId.full_values
        ]

        if sample.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        # One row per recorded entry; the URI cell is appended only for
        # entries that actually carry a URI.
        rows = [
            [
                datasetTypeName,
                info.datasetRef.run,
                str(info.datasetRef.id),
                *info.datasetRef.dataId.full_values,
                *([info.uri] if info.uri else []),
            ]
            for info in self.datasetRefs
        ]

        unsorted = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(unsorted, dimensions, ["type", "run"])

132 

133 

134class QueryDatasets: 

135 """Get dataset refs from a repository. 

136 

137 Parameters 

138 ---------- 

139 glob : iterable [`str`] 

140 A list of glob-style search string that fully or partially identify 

141 the dataset type names to search for. 

142 collections : iterable [`str`] 

143 A list of glob-style search string that fully or partially identify 

144 the collections to search for. 

145 where : `str` 

146 A string expression similar to a SQL WHERE clause. May involve any 

147 column of a dimension table or (as a shortcut for the primary key 

148 column of a dimension table) dimension name. 

149 find_first : `bool` 

150 For each result data ID, only yield one DatasetRef of each DatasetType, 

151 from the first collection in which a dataset of that dataset type 

152 appears (according to the order of `collections` passed in). If used, 

153 `collections` must specify at least one expression and must not contain 

154 wildcards. 

155 show_uri : `bool` 

156 If True, include the dataset URI in the output. 

157 repo : `str` or `None` 

158 URI to the location of the repo or URI to a config file describing the 

159 repo and its location. One of `repo` and `butler` must be `None` and 

160 the other must not be `None`. 

161 butler : `lsst.daf.butler.Butler` or `None` 

162 The butler to use to query. One of `repo` and `butler` must be `None` 

163 and the other must not be `None`. 

164 """ 

165 

166 def __init__( 

167 self, 

168 glob: Iterable[str], 

169 collections: Iterable[str], 

170 where: str, 

171 find_first: bool, 

172 show_uri: bool, 

173 repo: str | None = None, 

174 butler: Butler | None = None, 

175 ): 

176 if (repo and butler) or (not repo and not butler): 

177 raise RuntimeError("One of repo and butler must be provided and the other must be None.") 

178 # show_uri requires a datastore. 

179 without_datastore = not show_uri 

180 self.butler = butler or Butler.from_config(repo, without_datastore=without_datastore) 

181 self._getDatasets(glob, collections, where, find_first) 

182 self.showUri = show_uri 

183 

184 def _getDatasets( 

185 self, glob: Iterable[str], collections: Iterable[str], where: str, find_first: bool 

186 ) -> None: 

187 datasetTypes = glob or ... 

188 query_collections: Iterable[str] | EllipsisType = collections or ... 

189 

190 self.datasets = self.butler.registry.queryDatasets( 

191 datasetType=datasetTypes, collections=query_collections, where=where, findFirst=find_first 

192 ).expanded() 

193 

194 def getTables(self) -> list[AstropyTable]: 

195 """Get the datasets as a list of astropy tables. 

196 

197 Returns 

198 ------- 

199 datasetTables : `list` [``astropy.table._Table``] 

200 A list of astropy tables, one for each dataset type. 

201 """ 

202 tables: dict[str, _Table] = defaultdict(_Table) 

203 if not self.showUri: 

204 for dataset_ref in self.datasets: 

205 tables[dataset_ref.datasetType.name].add(dataset_ref) 

206 else: 

207 d = list(self.datasets) 

208 ref_uris = self.butler.get_many_uris(d, predict=True) 

209 for ref, uris in ref_uris.items(): 

210 if uris.primaryURI: 

211 tables[ref.datasetType.name].add(ref, uris.primaryURI) 

212 for name, uri in uris.componentURIs.items(): 

213 tables[ref.datasetType.componentTypeName(name)].add(ref, uri) 

214 

215 return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()] 

216 

217 def getDatasets(self) -> DatasetQueryResults: 

218 """Get the datasets as a list of ``DatasetQueryResults``. 

219 

220 Returns 

221 ------- 

222 refs : `lsst.daf.butler.registry.queries.DatasetQueryResults` 

223 Dataset references matching the given query criteria. 

224 """ 

225 return self.datasets