Coverage for python/lsst/daf/butler/script/queryDatasets.py: 25%

71 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2023-04-22 02:18 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23import dataclasses 

24import uuid 

25from collections import defaultdict 

26from collections.abc import Iterable 

27from typing import TYPE_CHECKING 

28 

29import numpy as np 

30from astropy.table import Table as AstropyTable 

31 

32from .._butler import Butler 

33from ..cli.utils import sortAstropyTable 

34 

35if TYPE_CHECKING: 

36 from lsst.daf.butler import DatasetRef 

37 from lsst.daf.butler.registry.queries import DatasetQueryResults 

38 from lsst.resources import ResourcePath 

39 

40 

41@dataclasses.dataclass(frozen=True) 

42class _RefInfo: 

43 datasetRef: DatasetRef 

44 uri: str | None 

45 

46 

47class _Table: 

48 """Aggregates rows for a single dataset type, and creates an astropy table 

49 with the aggregated data. Eliminates duplicate rows. 

50 """ 

51 

52 datasetRefs: set[_RefInfo] 

53 

54 def __init__(self) -> None: 

55 self.datasetRefs = set() 

56 

57 def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None: 

58 """Add a row of information to the table. 

59 

60 ``uri`` is optional but must be the consistent; provided or not, for 

61 every call to a ``_Table`` instance. 

62 

63 Parameters 

64 ---------- 

65 datasetRef : `DatasetRef` 

66 A dataset ref that will be added as a row in the table. 

67 uri : `lsst.resources.ResourcePath`, optional 

68 The URI to show as a file location in the table, by default None 

69 """ 

70 uri_str = str(uri) if uri else None 

71 self.datasetRefs.add(_RefInfo(datasetRef, uri_str)) 

72 

73 def getAstropyTable(self, datasetTypeName: str) -> AstropyTable: 

74 """Get the table as an astropy table. 

75 

76 Parameters 

77 ---------- 

78 datasetTypeName : `str` 

79 The dataset type name to show in the ``type`` column of the table. 

80 

81 Returns 

82 ------- 

83 table : `astropy.table._Table` 

84 The table with the provided column names and rows. 

85 """ 

86 

87 def _id_type(datasetRef: DatasetRef) -> type[str] | type[np.int64]: 

88 if isinstance(datasetRef.id, uuid.UUID): 

89 return str 

90 else: 

91 return np.int64 

92 

93 # Should never happen; adding a dataset should be the action that 

94 # causes a _Table to be created. 

95 if not self.datasetRefs: 

96 raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}") 

97 

98 refInfo = next(iter(self.datasetRefs)) 

99 dimensions = list(refInfo.datasetRef.dataId.full.keys()) 

100 columnNames = ["type", "run", "id", *[str(item) for item in dimensions]] 

101 

102 # Need to hint the column types for numbers since the per-row 

103 # constructor of Table does not work this out on its own and sorting 

104 # will not work properly without. 

105 typeMap = {float: np.float64, int: np.int64} 

106 idType = _id_type(refInfo.datasetRef) 

107 columnTypes = [ 

108 None, 

109 None, 

110 idType, 

111 *[typeMap.get(type(value)) for value in refInfo.datasetRef.dataId.full.values()], 

112 ] 

113 if refInfo.uri: 

114 columnNames.append("URI") 

115 columnTypes.append(None) 

116 

117 rows = [] 

118 for refInfo in self.datasetRefs: 

119 row = [ 

120 datasetTypeName, 

121 refInfo.datasetRef.run, 

122 str(refInfo.datasetRef.id) if idType is str else refInfo.datasetRef.id, 

123 *[value for value in refInfo.datasetRef.dataId.full.values()], 

124 ] 

125 if refInfo.uri: 

126 row.append(refInfo.uri) 

127 rows.append(row) 

128 

129 dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes) 

130 return sortAstropyTable(dataset_table, dimensions, ["type", "run"]) 

131 

132 

class QueryDatasets:
    """Query a repository for dataset refs.

    Parameters
    ----------
    glob : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        dataset type names to search for.
    collections : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each
        DatasetType, from the first collection in which a dataset of that
        dataset type appears (according to the order of `collections` passed
        in). If used, `collections` must specify at least one expression and
        must not contain wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.
    repo : `str` or `None`
        URI to the location of the repo or URI to a config file describing
        the repo and its location. Exactly one of `repo` and `butler` must
        be provided; the other must be `None`.
    butler : `lsst.daf.butler.Butler` or `None`
        The butler to use to query. Exactly one of `repo` and `butler` must
        be provided; the other must be `None`.

    Raises
    ------
    RuntimeError
        Raised if both or neither of ``repo`` and ``butler`` are provided.
    """

    def __init__(
        self,
        glob: Iterable[str],
        collections: Iterable[str],
        where: str,
        find_first: bool,
        show_uri: bool,
        repo: str | None = None,
        butler: Butler | None = None,
    ):
        # Exactly one of the two must be given: both-set and both-unset are
        # the cases where their truthiness matches.
        if bool(repo) == bool(butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        self.butler = butler if butler else Butler(repo)
        self._getDatasets(glob, collections, where, find_first)
        self.showUri = show_uri

    def _getDatasets(
        self, glob: Iterable[str], collections: Iterable[str], where: str, find_first: bool
    ) -> None:
        """Run the registry query and store the expanded results on
        ``self.datasets``.

        A falsy ``glob`` or ``collections`` is replaced with ``...``
        (Ellipsis) before being handed to the registry.
        """
        results = self.butler.registry.queryDatasets(
            datasetType=glob or ...,
            collections=collections or ...,
            where=where,
            findFirst=find_first,
        )
        self.datasets = results.expanded()

    def getTables(self) -> list[AstropyTable]:
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [`astropy.table.Table`]
            A list of astropy tables, one for each dataset type.
        """
        tables: dict[str, _Table] = defaultdict(_Table)
        if self.showUri:
            # The results are materialized into a list before the URIs for
            # all refs are requested in a single call.
            refs = list(self.datasets)
            uris_by_ref = self.butler.datastore.getManyURIs(refs, predict=True)
            for ref, uris in uris_by_ref.items():
                if uris.primaryURI:
                    tables[ref.datasetType.name].add(ref, uris.primaryURI)
                # Each component URI is filed under its component dataset
                # type name.
                for component, component_uri in uris.componentURIs.items():
                    tables[ref.datasetType.componentTypeName(component)].add(ref, component_uri)
        else:
            for ref in self.datasets:
                tables[ref.datasetType.name].add(ref)

        astropy_tables = []
        for type_name, table in tables.items():
            astropy_tables.append(table.getAstropyTable(type_name))
        return astropy_tables

    def getDatasets(self) -> DatasetQueryResults:
        """Get the datasets found by the query.

        Returns
        -------
        refs : `lsst.daf.butler.registry.queries.DatasetQueryResults`
            Dataset references matching the given query criteria.
        """
        return self.datasets