Coverage for python/lsst/daf/butler/script/queryDatasets.py: 27%

67 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27from __future__ import annotations 

28 

29import dataclasses 

30from collections import defaultdict 

31from collections.abc import Iterable 

32from types import EllipsisType 

33from typing import TYPE_CHECKING 

34 

35import numpy as np 

36from astropy.table import Table as AstropyTable 

37 

38from .._butler import Butler 

39from ..cli.utils import sortAstropyTable 

40 

41if TYPE_CHECKING: 

42 from lsst.daf.butler import DatasetRef 

43 from lsst.daf.butler.registry.queries import DatasetQueryResults 

44 from lsst.resources import ResourcePath 

45 

46 

@dataclasses.dataclass(frozen=True)
class _RefInfo:
    """Immutable pairing of a dataset ref with an optional URI string.

    The dataclass is frozen, so instances cannot be modified after
    construction and are hashable as long as their fields are; ``_Table``
    relies on this to keep them in a `set`.
    """

    # The dataset reference that a table row describes.
    datasetRef: DatasetRef
    # String form of the location to display for the dataset, or `None`
    # when no URI is being reported.
    uri: str | None

51 

52 

class _Table:
    """Collect the rows belonging to one dataset type and render them as an
    astropy table.

    Rows are held in a `set`, so adding the same ref/URI pair more than once
    yields a single row.
    """

    datasetRefs: set[_RefInfo]

    def __init__(self) -> None:
        self.datasetRefs = set()

    def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
        """Record one dataset as a row of the table.

        ``uri`` is optional, but usage must be consistent: either provide it
        on every call made to a given ``_Table`` instance or on none of them.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            The dataset ref to add as a row.
        uri : `lsst.resources.ResourcePath`, optional
            The URI to show as the file location for this row. A falsy value
            (including the default `None`) is stored as `None`.
        """
        if uri:
            location = str(uri)
        else:
            location = None
        self.datasetRefs.add(_RefInfo(datasetRef, location))

    def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
        """Build an astropy table from the accumulated rows.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : `astropy.table.Table`
            The table as returned by ``sortAstropyTable``, called with the
            data ID dimensions followed by ``type`` and ``run`` as sort keys.

        Raises
        ------
        RuntimeError
            Raised if no rows have been added.
        """
        # Not expected in practice: a _Table is only created as a side effect
        # of adding a dataset to it.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # Any one row serves as the template for the column layout.
        template = next(iter(self.datasetRefs))
        template_data_id = template.datasetRef.dataId.full
        dimensions = list(template_data_id.keys())

        columnNames = ["type", "run", "id"]
        columnNames.extend(str(dimension) for dimension in dimensions)

        # The per-row Table constructor does not infer numeric column types
        # by itself, and sorting misbehaves without them, so hint float and
        # int columns explicitly. Every other column gets `None` as its hint.
        numeric_dtypes = {float: np.float64, int: np.int64}
        columnTypes = [None, None, str]
        columnTypes.extend(numeric_dtypes.get(type(value)) for value in template_data_id.values())

        # The URI column exists only when the template row carries a URI.
        if template.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for info in self.datasetRefs:
            ref = info.datasetRef
            cells = [datasetTypeName, ref.run, str(ref.id)]
            cells.extend(ref.dataId.full.values())
            if info.uri:
                cells.append(info.uri)
            rows.append(cells)

        unsorted_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(unsorted_table, dimensions, ["type", "run"])

129 

130 

131class QueryDatasets: 

132 """Get dataset refs from a repository. 

133 

134 Parameters 

135 ---------- 

136 glob : iterable [`str`] 

137 A list of glob-style search string that fully or partially identify 

138 the dataset type names to search for. 

139 collections : iterable [`str`] 

140 A list of glob-style search string that fully or partially identify 

141 the collections to search for. 

142 where : `str` 

143 A string expression similar to a SQL WHERE clause. May involve any 

144 column of a dimension table or (as a shortcut for the primary key 

145 column of a dimension table) dimension name. 

146 find_first : `bool` 

147 For each result data ID, only yield one DatasetRef of each DatasetType, 

148 from the first collection in which a dataset of that dataset type 

149 appears (according to the order of `collections` passed in). If used, 

150 `collections` must specify at least one expression and must not contain 

151 wildcards. 

152 show_uri : `bool` 

153 If True, include the dataset URI in the output. 

154 repo : `str` or `None` 

155 URI to the location of the repo or URI to a config file describing the 

156 repo and its location. One of `repo` and `butler` must be `None` and 

157 the other must not be `None`. 

158 butler : `lsst.daf.butler.Butler` or `None` 

159 The butler to use to query. One of `repo` and `butler` must be `None` 

160 and the other must not be `None`. 

161 

162 """ 

163 

164 def __init__( 

165 self, 

166 glob: Iterable[str], 

167 collections: Iterable[str], 

168 where: str, 

169 find_first: bool, 

170 show_uri: bool, 

171 repo: str | None = None, 

172 butler: Butler | None = None, 

173 ): 

174 if (repo and butler) or (not repo and not butler): 

175 raise RuntimeError("One of repo and butler must be provided and the other must be None.") 

176 # show_uri requires a datastore. 

177 without_datastore = not show_uri 

178 self.butler = butler or Butler(repo, without_datastore=without_datastore) 

179 self._getDatasets(glob, collections, where, find_first) 

180 self.showUri = show_uri 

181 

182 def _getDatasets( 

183 self, glob: Iterable[str], collections: Iterable[str], where: str, find_first: bool 

184 ) -> None: 

185 datasetTypes = glob or ... 

186 query_collections: Iterable[str] | EllipsisType = collections or ... 

187 

188 self.datasets = self.butler.registry.queryDatasets( 

189 datasetType=datasetTypes, collections=query_collections, where=where, findFirst=find_first 

190 ).expanded() 

191 

192 def getTables(self) -> list[AstropyTable]: 

193 """Get the datasets as a list of astropy tables. 

194 

195 Returns 

196 ------- 

197 datasetTables : `list` [``astropy.table._Table``] 

198 A list of astropy tables, one for each dataset type. 

199 """ 

200 tables: dict[str, _Table] = defaultdict(_Table) 

201 if not self.showUri: 

202 for dataset_ref in self.datasets: 

203 tables[dataset_ref.datasetType.name].add(dataset_ref) 

204 else: 

205 d = list(self.datasets) 

206 ref_uris = self.butler.get_many_uris(d, predict=True) 

207 for ref, uris in ref_uris.items(): 

208 if uris.primaryURI: 

209 tables[ref.datasetType.name].add(ref, uris.primaryURI) 

210 for name, uri in uris.componentURIs.items(): 

211 tables[ref.datasetType.componentTypeName(name)].add(ref, uri) 

212 

213 return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()] 

214 

215 def getDatasets(self) -> DatasetQueryResults: 

216 """Get the datasets as a list of ``DatasetQueryResults``. 

217 

218 Returns 

219 ------- 

220 refs : `lsst.daf.butler.registry.queries.DatasetQueryResults` 

221 Dataset references matching the given query criteria. 

222 """ 

223 return self.datasets