Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from astropy.table import Table as AstropyTable 

23from collections import defaultdict, namedtuple 

24import numpy as np 

25 

26from .. import Butler 

27from ..core.utils import globToRegex 

28 

# Lightweight record pairing a dataset ref with the URI string (or None)
# that should be displayed alongside it in a table row.
_RefInfo = namedtuple("RefInfo", ["datasetRef", "uri"])

30 

31 

class _Table:
    """Aggregate rows for a single dataset type and create an astropy table
    from the aggregated data.

    Rows are kept in a `set`, so adding the same dataset ref / URI pair more
    than once produces a single row.
    """

    def __init__(self):
        self.datasetRefs = set()

    @staticmethod
    def _hintColumnTypes(values):
        """Return a column dtype hint for each of the given values.

        Parameters
        ----------
        values : iterable
            Example values, one per column.

        Returns
        -------
        columnTypes : `list`
            ``np.float64`` for `float` values, ``np.int64`` for `int` values
            and `None` (no hint) for values of any other exact type.
        """
        # ``np.float`` was only an alias for the builtin ``float``; it was
        # deprecated in NumPy 1.20 and removed in 1.24. ``np.float64`` is the
        # same dtype and exists in every NumPy release.
        typeMap = {float: np.float64, int: np.int64}
        return [typeMap.get(type(value)) for value in values]

    def add(self, datasetRef, uri=None):
        """Add a row of information to the table.

        ``uri`` is optional but must be consistent; provided or not, for
        every call to a ``_Table`` instance.

        Parameters
        ----------
        datasetRef : ``DatasetRef``
            A dataset ref that will be added as a row in the table.
        uri : ``ButlerURI``, optional
            The URI to show as a file location in the table, by default None
        """
        if uri:
            # Store the string form; this is what is shown in the table.
            uri = str(uri)
        self.datasetRefs.add(_RefInfo(datasetRef, uri))

    def getAstropyTable(self, datasetTypeName):
        """Get the table as an astropy table.

        The columns are ``type``, ``run``, ``id``, one column per key of the
        full data ID and, if URIs were provided, ``URI``. Rows are sorted by
        type and run, then by the required dimensions of spatial or temporal
        dimensions, then by the remaining dimensions; ``id`` is not used for
        sorting.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : `astropy.table.Table`
            The sorted table, with one row per added dataset ref.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs have been added to this table.
        """
        # Should never happen; adding a dataset should be the action that
        # causes a _Table to be created.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # Any entry can serve as the template for the column layout; all
        # entries are expected to share dimensions and URI presence.
        templateInfo = next(iter(self.datasetRefs))
        templateDataId = templateInfo.datasetRef.dataId.full
        dimensions = list(templateDataId.keys())
        columnNames = ["type", "run", "id",
                       *[str(item) for item in dimensions]]

        # Need to hint the column types for numbers since the per-row
        # constructor of Table does not work this out on its own and sorting
        # will not work properly without.
        columnTypes = [None, None, np.int64,
                       *self._hintColumnTypes(templateDataId.values())]
        if templateInfo.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for refInfo in self.datasetRefs:
            row = [datasetTypeName,
                   refInfo.datasetRef.run,
                   refInfo.datasetRef.id,
                   *refInfo.datasetRef.dataId.full.values()]
            if refInfo.uri:
                row.append(refInfo.uri)
            rows.append(row)

        dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)

        # For sorting we want to ignore the id
        # We also want to move temporal or spatial dimensions earlier
        sort_first = ["type", "run"]
        sort_early = []
        sort_late = []
        for dim in dimensions:
            if dim.spatial or dim.temporal:
                sort_early.extend(dim.required.names)
            else:
                sort_late.append(str(dim))

        # The required names above means that we have the possibility of
        # repeats of sort keys. Now have to remove them
        # (order is retained by dict creation).
        sort_keys = list(dict.fromkeys(sort_first + sort_early + sort_late))

        dataset_table.sort(sort_keys)
        return dataset_table

126 

127 

def queryDatasets(repo, glob, collections, where, find_first, show_uri):
    """Query a repository for datasets and tabulate them per dataset type.

    Parameters
    ----------
    repo : `str`
        URI to the location of the repo or URI to a config file describing the
        repo and its location.
    glob : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        dataset type names to search for. If they produce no search
        expressions, ``...`` is passed to the registry query instead.
    collections : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        collections to search for. If empty, ``...`` is passed to the
        registry query instead.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards; in that case the collection names are passed to the query
        as given, without glob conversion.
    show_uri : `bool`
        If True, include the dataset URI in the output.

    Returns
    -------
    datasetTables : `list` [`astropy.table.Table`]
        A list of astropy tables, one for each dataset type.
    """
    butler = Butler(repo)

    # An empty dataset type expression is replaced by ``...``.
    datasetTypeExpr = globToRegex(glob) or ...

    if not collections:
        collections = ...
    elif not find_first:
        # Glob conversion is only applied when find_first is not requested;
        # otherwise the collection names are used exactly as given.
        collections = globToRegex(collections)

    foundRefs = butler.registry.queryDatasets(datasetType=datasetTypeExpr,
                                              collections=collections,
                                              where=where,
                                              findFirst=find_first)

    # One _Table per dataset type name (or component type name), created
    # on first use.
    tablesByType = defaultdict(_Table)

    for ref in foundRefs:
        if show_uri:
            primary, components = butler.getURIs(ref, collections=ref.run)
            if primary:
                tablesByType[ref.datasetType.name].add(ref, primary)
            # Each component URI goes into the table for its own component
            # dataset type.
            for componentName, componentURI in components.items():
                tablesByType[ref.datasetType.componentTypeName(componentName)].add(ref, componentURI)
        else:
            tablesByType[ref.datasetType.name].add(ref)

    return [tbl.getAstropyTable(typeName) for typeName, tbl in tablesByType.items()]

182 if primaryURI: 

183 tables[datasetRef.datasetType.name].add(datasetRef, primaryURI) 

184 for name, uri in componentURIs.items(): 

185 tables[datasetRef.datasetType.componentTypeName(name)].add(datasetRef, uri) 

186 

187 return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()]