Coverage for python/lsst/daf/butler/script/queryDatasets.py: 26%

72 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-09 02:11 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23import dataclasses 

24import uuid 

25from collections import defaultdict 

26from collections.abc import Iterable 

27from typing import TYPE_CHECKING 

28 

29import numpy as np 

30from astropy.table import Table as AstropyTable 

31from lsst.utils.ellipsis import Ellipsis, EllipsisType 

32 

33from .._butler import Butler 

34from ..cli.utils import sortAstropyTable 

35 

36if TYPE_CHECKING: 

37 from lsst.daf.butler import DatasetRef 

38 from lsst.daf.butler.registry.queries import DatasetQueryResults 

39 from lsst.resources import ResourcePath 

40 

41 

@dataclasses.dataclass(frozen=True)
class _RefInfo:
    """Pairing of a dataset reference with the URI string shown for it.

    The dataclass is frozen (and therefore hashable as long as its fields
    are), which lets instances be stored in the `set` kept by `_Table`.
    """

    # The dataset reference that one table row describes.
    datasetRef: DatasetRef

    # String form of the dataset location, or `None` when no URI is shown.
    uri: str | None

46 

47 

class _Table:
    """Aggregates rows for a single dataset type, and creates an astropy table
    with the aggregated data. Eliminates duplicate rows.
    """

    datasetRefs: set[_RefInfo]

    def __init__(self) -> None:
        # A set, so that adding an equal (ref, uri) pair twice yields one row.
        self.datasetRefs = set()

    def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
        """Add a row of information to the table.

        ``uri`` is optional but must be used consistently: either provided
        for every call on a given ``_Table`` instance, or for none of them.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            A dataset ref that will be added as a row in the table.
        uri : `lsst.resources.ResourcePath`, optional
            The URI to show as a file location in the table, by default None
        """
        # Store the URI as a plain string (or None) alongside the ref.
        uri_str = str(uri) if uri else None
        self.datasetRefs.add(_RefInfo(datasetRef, uri_str))

    def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
        """Get the table as an astropy table.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : `astropy.table.Table`
            The table with the provided column names and rows.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs have been added to this table.
        """
        # Should never happen; adding a dataset should be the action that
        # causes a _Table to be created.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # An arbitrary entry serves as the template for the column layout
        # (dimension names, column types and whether a URI column exists).
        template = next(iter(self.datasetRefs))
        templateDataId = template.datasetRef.dataId.full
        dimensions = list(templateDataId.keys())
        columnNames = ["type", "run", "id", *[str(item) for item in dimensions]]

        # Need to hint the column types for numbers since the per-row
        # constructor of Table does not work this out on its own and sorting
        # will not work properly without.
        typeMap = {float: np.float64, int: np.int64}
        # UUID dataset IDs are shown as strings; anything else is treated as
        # a 64-bit integer.
        idType: type[str] | type[np.int64] = (
            str if isinstance(template.datasetRef.id, uuid.UUID) else np.int64
        )
        columnTypes = [
            None,
            None,
            idType,
            *[typeMap.get(type(value)) for value in templateDataId.values()],
        ]
        if template.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for info in self.datasetRefs:
            ref = info.datasetRef
            row = [
                datasetTypeName,
                ref.run,
                str(ref.id) if idType is str else ref.id,
                *ref.dataId.full.values(),
            ]
            if info.uri:
                row.append(info.uri)
            rows.append(row)

        dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        # Sorting is delegated to the shared CLI helper, which is given the
        # dimension names plus the "type" and "run" columns.
        return sortAstropyTable(dataset_table, dimensions, ["type", "run"])

132 

133 

class QueryDatasets:
    """Get dataset refs from a repository.

    Parameters
    ----------
    glob : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the dataset type names to search for.
    collections : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.
    repo : `str` or `None`
        URI to the location of the repo or URI to a config file describing the
        repo and its location. One of `repo` and `butler` must be `None` and
        the other must not be `None`.
    butler : `lsst.daf.butler.Butler` or `None`
        The butler to use to query. One of `repo` and `butler` must be `None`
        and the other must not be `None`.

    Raises
    ------
    RuntimeError
        Raised if both or neither of ``repo`` and ``butler`` are provided.
    """

    def __init__(
        self,
        glob: Iterable[str],
        collections: Iterable[str],
        where: str,
        find_first: bool,
        show_uri: bool,
        repo: str | None = None,
        butler: Butler | None = None,
    ):
        if (repo and butler) or (not repo and not butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        # Use the supplied butler if there is one, otherwise build one from
        # the repo location.
        self.butler = butler or Butler(repo)
        self.showUri = show_uri
        # The query is issued at construction time; its results are kept on
        # ``self.datasets``.
        self._getDatasets(glob, collections, where, find_first)

    def _getDatasets(
        self, glob: Iterable[str], collections: Iterable[str], where: str, find_first: bool
    ) -> None:
        """Run the registry query and store the expanded results.

        Parameters
        ----------
        glob : iterable [`str`]
            Dataset type search expressions; if empty, ``Ellipsis`` is passed
            to the registry instead.
        collections : iterable [`str`]
            Collection search expressions; if empty, ``Ellipsis`` is passed
            to the registry instead.
        where : `str`
            Query constraint expression, forwarded unchanged.
        find_first : `bool`
            Forwarded to the registry as ``findFirst``.
        """
        # An empty selection is replaced by Ellipsis for both arguments.
        datasetTypes: Iterable[str] | EllipsisType = glob or Ellipsis
        query_collections: Iterable[str] | EllipsisType = collections or Ellipsis

        self.datasets = self.butler.registry.queryDatasets(
            datasetType=datasetTypes, collections=query_collections, where=where, findFirst=find_first
        ).expanded()

    def getTables(self) -> list[AstropyTable]:
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [`astropy.table.Table`]
            A list of astropy tables, one for each dataset type.
        """
        tables: dict[str, _Table] = defaultdict(_Table)
        if not self.showUri:
            for dataset_ref in self.datasets:
                tables[dataset_ref.datasetType.name].add(dataset_ref)
        else:
            # Materialize the query results so that the URIs for all refs
            # can be requested from the datastore in a single call.
            refs = list(self.datasets)
            # NOTE(review): ``predict=True`` is forwarded as-is; presumably it
            # lets the datastore report URIs for datasets it does not hold
            # yet -- confirm against the datastore API.
            ref_uris = self.butler.datastore.getManyURIs(refs, predict=True)
            for ref, uris in ref_uris.items():
                if uris.primaryURI:
                    tables[ref.datasetType.name].add(ref, uris.primaryURI)
                # Component URIs are filed under the component dataset type
                # name, so each component gets a table of its own.
                for name, uri in uris.componentURIs.items():
                    tables[ref.datasetType.componentTypeName(name)].add(ref, uri)

        return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()]

    def getDatasets(self) -> DatasetQueryResults:
        """Get the datasets as a ``DatasetQueryResults`` object.

        Returns
        -------
        refs : `lsst.daf.butler.registry.queries.DatasetQueryResults`
            Dataset references matching the given query criteria.
        """
        return self.datasets