Coverage for python/lsst/daf/butler/script/queryDatasets.py: 27%

67 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-06 10:53 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29import dataclasses 

30from collections import defaultdict 

31from collections.abc import Iterable 

32from types import EllipsisType 

33from typing import TYPE_CHECKING 

34 

35import numpy as np 

36from astropy.table import Table as AstropyTable 

37 

38from .._butler import Butler 

39from ..cli.utils import sortAstropyTable 

40 

41if TYPE_CHECKING: 

42 from lsst.daf.butler import DatasetRef 

43 from lsst.daf.butler.registry.queries import DatasetQueryResults 

44 from lsst.resources import ResourcePath 

45 

46 

@dataclasses.dataclass(frozen=True)
class _RefInfo:
    """One table row: a dataset ref and, optionally, its URI as a string.

    The dataclass is frozen, so instances get a generated ``__hash__`` and
    can be stored in the ``set`` that `_Table` uses to drop duplicate rows.
    """

    # The dataset reference the row describes.
    datasetRef: DatasetRef
    # String form of the dataset's URI, or None when no URI is reported.
    uri: str | None

51 

52 

class _Table:
    """Collect the rows for a single dataset type and render them as an
    astropy table.

    Rows are kept in a `set`, so adding the same dataset ref (with the same
    URI) more than once yields a single row.
    """

    datasetRefs: set[_RefInfo]

    def __init__(self) -> None:
        self.datasetRefs = set()

    def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
        """Record one row of information in the table.

        ``uri`` is optional but must be used consistently: either provided
        on every call made to a given ``_Table`` instance, or on none of
        them.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            A dataset ref that will become a row in the table.
        uri : `lsst.resources.ResourcePath`, optional
            The URI to show as the file location in the table. Defaults to
            `None`, meaning no URI is recorded for this row.
        """
        # Store the URI in string form; a falsy ``uri`` is recorded as None.
        self.datasetRefs.add(_RefInfo(datasetRef, None if not uri else str(uri)))

    def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
        """Build an astropy table from the rows added so far.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : `astropy.table.Table`
            The table holding one row per stored dataset ref, as returned by
            ``sortAstropyTable``.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs have been added to this table.
        """
        # Should never happen; adding a dataset should be the action that
        # causes a _Table to be created.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # Use an arbitrary entry as the template for the column layout.
        template = next(iter(self.datasetRefs))
        template_data_id = template.datasetRef.dataId
        dimensions = [
            template_data_id.universe.dimensions[key]
            for key in template_data_id.dimensions.data_coordinate_keys
        ]

        columnNames = ["type", "run", "id"]
        columnNames.extend(str(dimension) for dimension in dimensions)

        # Need to hint the column types for numbers since the per-row
        # constructor of Table does not work this out on its own and sorting
        # will not work properly without.
        typeMap = {float: np.float64, int: np.int64}
        columnTypes = [None, None, str]
        columnTypes.extend(typeMap.get(type(value)) for value in template_data_id.full_values)

        # The URI column is present only if the template entry has a URI.
        if template.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        def make_row(info: _RefInfo) -> list:
            # One row: type name, run, dataset id, data ID values and, when
            # this entry has one, its URI.
            ref = info.datasetRef
            cells = [datasetTypeName, ref.run, str(ref.id), *ref.dataId.full_values]
            if info.uri:
                cells.append(info.uri)
            return cells

        rows = [make_row(info) for info in self.datasetRefs]

        dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(dataset_table, dimensions, ["type", "run"])

132 

133 

134class QueryDatasets: 

135 """Get dataset refs from a repository. 

136 

137 Parameters 

138 ---------- 

139 glob : iterable [`str`] 

140 A list of glob-style search string that fully or partially identify 

141 the dataset type names to search for. 

142 collections : iterable [`str`] 

143 A list of glob-style search string that fully or partially identify 

144 the collections to search for. 

145 where : `str` 

146 A string expression similar to a SQL WHERE clause. May involve any 

147 column of a dimension table or (as a shortcut for the primary key 

148 column of a dimension table) dimension name. 

149 find_first : `bool` 

150 For each result data ID, only yield one DatasetRef of each DatasetType, 

151 from the first collection in which a dataset of that dataset type 

152 appears (according to the order of `collections` passed in). If used, 

153 `collections` must specify at least one expression and must not contain 

154 wildcards. 

155 show_uri : `bool` 

156 If True, include the dataset URI in the output. 

157 repo : `str` or `None` 

158 URI to the location of the repo or URI to a config file describing the 

159 repo and its location. One of `repo` and `butler` must be `None` and 

160 the other must not be `None`. 

161 butler : `lsst.daf.butler.Butler` or `None` 

162 The butler to use to query. One of `repo` and `butler` must be `None` 

163 and the other must not be `None`. 

164 

165 """ 

166 

167 def __init__( 

168 self, 

169 glob: Iterable[str], 

170 collections: Iterable[str], 

171 where: str, 

172 find_first: bool, 

173 show_uri: bool, 

174 repo: str | None = None, 

175 butler: Butler | None = None, 

176 ): 

177 if (repo and butler) or (not repo and not butler): 

178 raise RuntimeError("One of repo and butler must be provided and the other must be None.") 

179 # show_uri requires a datastore. 

180 without_datastore = not show_uri 

181 self.butler = butler or Butler.from_config(repo, without_datastore=without_datastore) 

182 self._getDatasets(glob, collections, where, find_first) 

183 self.showUri = show_uri 

184 

185 def _getDatasets( 

186 self, glob: Iterable[str], collections: Iterable[str], where: str, find_first: bool 

187 ) -> None: 

188 datasetTypes = glob or ... 

189 query_collections: Iterable[str] | EllipsisType = collections or ... 

190 

191 self.datasets = self.butler.registry.queryDatasets( 

192 datasetType=datasetTypes, collections=query_collections, where=where, findFirst=find_first 

193 ).expanded() 

194 

195 def getTables(self) -> list[AstropyTable]: 

196 """Get the datasets as a list of astropy tables. 

197 

198 Returns 

199 ------- 

200 datasetTables : `list` [``astropy.table._Table``] 

201 A list of astropy tables, one for each dataset type. 

202 """ 

203 tables: dict[str, _Table] = defaultdict(_Table) 

204 if not self.showUri: 

205 for dataset_ref in self.datasets: 

206 tables[dataset_ref.datasetType.name].add(dataset_ref) 

207 else: 

208 d = list(self.datasets) 

209 ref_uris = self.butler.get_many_uris(d, predict=True) 

210 for ref, uris in ref_uris.items(): 

211 if uris.primaryURI: 

212 tables[ref.datasetType.name].add(ref, uris.primaryURI) 

213 for name, uri in uris.componentURIs.items(): 

214 tables[ref.datasetType.componentTypeName(name)].add(ref, uri) 

215 

216 return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()] 

217 

218 def getDatasets(self) -> DatasetQueryResults: 

219 """Get the datasets as a list of ``DatasetQueryResults``. 

220 

221 Returns 

222 ------- 

223 refs : `lsst.daf.butler.registry.queries.DatasetQueryResults` 

224 Dataset references matching the given query criteria. 

225 """ 

226 return self.datasets