Coverage for python/lsst/daf/butler/script/queryDatasets.py: 20%

68 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2022-08-30 02:26 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23import uuid 

24from collections import defaultdict, namedtuple 

25from typing import Dict 

26 

27import numpy as np 

28from astropy.table import Table as AstropyTable 

29 

30from .._butler import Butler 

31from ..cli.utils import sortAstropyTable 

32 

# Lightweight, hashable (datasetRef, uri) pair. Hashability matters because
# ``_Table`` stores these in a set to eliminate duplicate rows.
_RefInfo = namedtuple("_RefInfo", ["datasetRef", "uri"])

34 

35 

class _Table:
    """Aggregate rows for a single dataset type and create an astropy table
    from the aggregated data. Duplicate rows are eliminated.
    """

    def __init__(self):
        # A set of ``_RefInfo`` tuples, so adding the same (ref, uri) pair
        # more than once has no effect.
        self.datasetRefs = set()

    def add(self, datasetRef, uri=None):
        """Add a row of information to the table.

        ``uri`` is optional but must be consistent: either provided for
        every call to a ``_Table`` instance, or for none of them.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            A dataset ref that will be added as a row in the table.
        uri : `lsst.resources.ResourcePath`, optional
            The URI to show as a file location in the table, by default None
        """
        # Store the URI as a string; a falsy value (e.g. `None`) is kept
        # unchanged.
        if uri:
            uri = str(uri)
        self.datasetRefs.add(_RefInfo(datasetRef, uri))

    def getAstropyTable(self, datasetTypeName):
        """Get the table as an astropy table.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : `astropy.table.Table`
            The table with the provided column names and rows, after being
            passed through ``sortAstropyTable`` together with the dimensions
            and the ``type`` and ``run`` column names.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs have been added to this table.
        """
        # Should never happen; adding a dataset should be the action that
        # causes a _Table to be created.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # An arbitrary entry serves as the template from which the column
        # names and column types are derived.
        firstInfo = next(iter(self.datasetRefs))
        firstRef = firstInfo.datasetRef
        dimensions = list(firstRef.dataId.full.keys())
        columnNames = ["type", "run", "id", *[str(item) for item in dimensions]]

        # Need to hint the column types for numbers since the per-row
        # constructor of Table does not work this out on its own and sorting
        # will not work properly without.
        typeMap = {float: np.float64, int: np.int64}
        # UUID dataset IDs are shown as strings; anything else is treated as
        # a 64-bit integer.
        idType = str if isinstance(firstRef.id, uuid.UUID) else np.int64
        columnTypes = [
            None,
            None,
            idType,
            *[typeMap.get(type(value)) for value in firstRef.dataId.full.values()],
        ]
        if firstInfo.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for info in self.datasetRefs:
            ref = info.datasetRef
            row = [
                datasetTypeName,
                ref.run,
                str(ref.id) if idType is str else ref.id,
                *ref.dataId.full.values(),
            ]
            if info.uri:
                row.append(info.uri)
            rows.append(row)

        dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(dataset_table, dimensions, ["type", "run"])

119 

120 

class QueryDatasets:
    """Get dataset refs from a repository.

    Parameters
    ----------
    repo : `str` or `None`
        URI to the location of the repo or URI to a config file describing the
        repo and its location. One of `repo` and `butler` must be `None` and
        the other must not be `None`.
    butler : ``lsst.daf.butler.Butler`` or `None`
        The butler to use to query. One of `repo` and `butler` must be `None`
        and the other must not be `None`.
    glob : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the dataset type names to search for.
    collections : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.

    Raises
    ------
    RuntimeError
        Raised if both or neither of ``repo`` and ``butler`` are provided.
    """

    def __init__(self, glob, collections, where, find_first, show_uri, repo=None, butler=None):
        # Exactly one of ``repo`` and ``butler`` may be given.
        if (repo and butler) or (not repo and not butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        self.butler = butler or Butler(repo)
        self._getDatasets(glob, collections, where, find_first)
        self.showUri = show_uri

    def _getDatasets(self, glob, collections, where, find_first):
        """Run the registry query and store the expanded results in
        ``self.datasets``.

        Parameters
        ----------
        glob : iterable [`str`]
            Dataset type search expressions; replaced by ``...`` when empty.
        collections : iterable [`str`]
            Collection search expressions; replaced by ``...`` when empty.
        where : `str`
            Query expression forwarded to the registry unchanged.
        find_first : `bool`
            Forwarded to the registry as ``findFirst``.
        """
        # NOTE(review): ``...`` is presumably the registry's "match
        # everything" wildcard -- confirm against Registry.queryDatasets.
        if not glob:
            glob = ...
        if not collections:
            collections = ...

        self.datasets = self.butler.registry.queryDatasets(
            datasetType=glob, collections=collections, where=where, findFirst=find_first
        ).expanded()

    def getTables(self):
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [``astropy.table.Table``]
            A list of astropy tables, one for each dataset type.
        """
        tables: Dict[str, _Table] = defaultdict(_Table)
        if not self.showUri:
            for dataset_ref in self.datasets:
                tables[dataset_ref.datasetType.name].add(dataset_ref)
        else:
            # Materialize the query results so the URIs can be looked up in a
            # single datastore call.
            refs = list(self.datasets)
            ref_uris = self.butler.datastore.getManyURIs(refs, predict=True)
            for ref, uris in ref_uris.items():
                # A primary URI is listed under the dataset type itself.
                if uris.primaryURI:
                    tables[ref.datasetType.name].add(ref, uris.primaryURI)
                # Each component URI is listed under the corresponding
                # component dataset type name.
                for name, uri in uris.componentURIs.items():
                    tables[ref.datasetType.componentTypeName(name)].add(ref, uri)

        return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()]

    def getDatasets(self):
        """Get the datasets as a list of ``DatasetQueryResults``.

        Returns
        -------
        refs : ``queries.DatasetQueryResults``
            Dataset references matching the given query criteria.
        """
        return self.datasets