Coverage for python/lsst/daf/butler/script/queryDatasets.py: 26%

75 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-03-11 02:06 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23import dataclasses 

24import uuid 

25from collections import defaultdict 

26from collections.abc import Iterable 

27from typing import TYPE_CHECKING 

28 

29import numpy as np 

30from astropy.table import Table as AstropyTable 

31 

32from .._butler import Butler 

33from ..cli.utils import sortAstropyTable 

34 

35if TYPE_CHECKING: 35 ↛ 36line 35 didn't jump to line 36, because the condition on line 35 was never true

36 from lsst.daf.butler import DatasetRef 

37 from lsst.daf.butler.registry.queries import DatasetQueryResults 

38 from lsst.resources import ResourcePath 

39 

40 

@dataclasses.dataclass(frozen=True)
class _RefInfo:
    """Immutable pairing of a dataset ref with its optional URI string.

    The dataclass is frozen, so instances are hashable and can be stored in
    a `set`.
    """

    # The dataset reference that the table row describes.
    datasetRef: DatasetRef

    # String form of the dataset's location, or `None` if there is none.
    uri: str | None

45 

46 

class _Table:
    """Collect the rows belonging to one dataset type and render them as an
    astropy table.

    Rows are kept in a `set`, so adding the same dataset ref with the same
    URI more than once produces a single row.
    """

    datasetRefs: set[_RefInfo]

    def __init__(self) -> None:
        self.datasetRefs = set()

    def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
        """Record one row of information for the table.

        ``uri`` is optional, but its use must be consistent: either provide
        it on every call made to a given ``_Table`` instance, or on none.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            The dataset ref to include as a row in the table.
        uri : `lsst.resources.ResourcePath`, optional
            The URI to show as the file location in the table. Defaults to
            `None`.
        """
        if uri:
            location = str(uri)
        else:
            location = None
        self.datasetRefs.add(_RefInfo(datasetRef, location))

    def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
        """Build an astropy table from the accumulated rows.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : `astropy.table.Table`
            The table holding one row per stored dataset ref, as returned by
            ``sortAstropyTable``.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs have been added to this instance.
        """
        # Should never happen; adding a dataset should be the action that
        # causes a _Table to be created.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # An arbitrary stored entry is used to derive the column layout.
        sample = next(iter(self.datasetRefs))
        dimensions = list(sample.datasetRef.dataId.full.keys())

        columnNames = ["type", "run", "id"]
        columnNames.extend(str(dimension) for dimension in dimensions)

        # UUID ids are written out as strings; any other id is typed int64.
        idType = str if isinstance(sample.datasetRef.id, uuid.UUID) else np.int64

        # Need to hint the column types for numbers since the per-row
        # constructor of Table does not work this out on its own and sorting
        # will not work properly without.
        typeMap = {float: np.float64, int: np.int64}
        columnTypes = [None, None, idType]
        columnTypes.extend(typeMap.get(type(value)) for value in sample.datasetRef.dataId.full.values())

        # The URI column is only present when the sampled entry has a URI.
        if sample.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for info in self.datasetRefs:
            ref = info.datasetRef
            row = [datasetTypeName, ref.run, ref.id if idType is not str else str(ref.id)]
            row.extend(ref.dataId.full.values())
            if info.uri:
                row.append(info.uri)
            rows.append(row)

        dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(dataset_table, dimensions, ["type", "run"])

131 

132 

class QueryDatasets:
    """Query a repository for dataset refs.

    The query is issued when the object is constructed; the results are
    then available through `getDatasets` and `getTables`.

    Parameters
    ----------
    glob : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        dataset type names to search for. If empty, the query is made with
        ``...`` as the dataset type.
    collections : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        collections to search. If empty, the query is made with ``...`` as
        the collections.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.
    repo : `str` or `None`
        URI to the location of the repo or URI to a config file describing the
        repo and its location. One of `repo` and `butler` must be `None` and
        the other must not be `None`.
    butler : `lsst.daf.butler.Butler` or `None`
        The butler to use to query. One of `repo` and `butler` must be `None`
        and the other must not be `None`.

    Raises
    ------
    RuntimeError
        Raised if both or neither of ``repo`` and ``butler`` are provided.
    """

    def __init__(
        self,
        glob: Iterable[str],
        collections: Iterable[str],
        where: str,
        find_first: bool,
        show_uri: bool,
        repo: str | None = None,
        butler: Butler | None = None,
    ):
        # Exactly one of the two must be supplied: reject both and neither.
        if bool(repo) == bool(butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        self.butler = butler if butler else Butler(repo)
        self._getDatasets(glob, collections, where, find_first)
        self.showUri = show_uri

    def _getDatasets(
        self, glob: Iterable[str], collections: Iterable[str], where: str, find_first: bool
    ) -> None:
        """Run the registry query and store the expanded results on
        ``self.datasets``.
        """
        # An empty selection is replaced by ``...`` before querying.
        datasetTypes = glob or ...
        query_collections = collections or ...

        results = self.butler.registry.queryDatasets(
            datasetType=datasetTypes,
            collections=query_collections,
            where=where,
            findFirst=find_first,
        )
        self.datasets = results.expanded()

    def getTables(self) -> list[AstropyTable]:
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [``astropy.table.Table``]
            A list of astropy tables, one for each dataset type.
        """
        tables: dict[str, _Table] = defaultdict(_Table)
        if self.showUri:
            refs = list(self.datasets)
            uriLookup = self.butler.datastore.getManyURIs(refs, predict=True)
            for ref, uris in uriLookup.items():
                if uris.primaryURI:
                    tables[ref.datasetType.name].add(ref, uris.primaryURI)
                # Each component URI goes into the table named after its
                # component dataset type.
                for component, uri in uris.componentURIs.items():
                    tables[ref.datasetType.componentTypeName(component)].add(ref, uri)
        else:
            for ref in self.datasets:
                tables[ref.datasetType.name].add(ref)

        return [table.getAstropyTable(name) for name, table in tables.items()]

    def getDatasets(self) -> DatasetQueryResults:
        """Get the results of the query made at construction.

        Returns
        -------
        refs : `lsst.daf.butler.registry.queries.DatasetQueryResults`
            Dataset references matching the given query criteria.
        """
        return self.datasets