Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23from astropy.table import Table as AstropyTable 

24from collections import defaultdict, namedtuple 

25from typing import Any, Dict 

26import numpy as np 

27 

28from .. import Butler 

29from ..core.utils import globToRegex 

30from ..cli.utils import sortAstropyTable 

31 

32 

# Hashable record pairing a dataset ref with the (already stringified) URI
# shown for it; hashability lets ``_Table`` de-duplicate rows with a set.
_RefInfo = namedtuple("_RefInfo", ("datasetRef", "uri"))

34 

35 

class _Table:
    """Collect the rows belonging to one dataset type and render them as an
    astropy table. Duplicate rows are dropped because rows are kept in a set.
    """

    def __init__(self):
        # Set of ``_RefInfo`` tuples; using a set removes duplicate rows.
        self.datasetRefs = set()

    def add(self, datasetRef, uri=None):
        """Record one row of information for the table.

        ``uri`` is optional but its use must be consistent: either provide
        it on every call to a given ``_Table`` instance or on none of them.

        Parameters
        ----------
        datasetRef : ``DatasetRef``
            The dataset ref to show as a row in the table.
        uri : ``ButlerURI``, optional
            The URI to show as a file location in the table, by default None
        """
        # Store the URI in string form; a falsy value is kept untouched.
        location = str(uri) if uri else uri
        self.datasetRefs.add(_RefInfo(datasetRef, location))

    def getAstropyTable(self, datasetTypeName):
        """Build an astropy table from the accumulated rows.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : `astropy.table.Table`
            The table of accumulated rows, as returned by
            ``sortAstropyTable``.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs were added to this instance.
        """
        # Should never happen; a _Table is only created as a consequence of
        # adding a dataset to it.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # Any one entry serves as the template for the column layout.
        exemplar = next(iter(self.datasetRefs))
        exemplarDataId = exemplar.datasetRef.dataId.full
        dimensions = list(exemplarDataId.keys())

        columnNames = ["type", "run", "id"]
        columnNames.extend(str(dimension) for dimension in dimensions)

        # Numeric columns need an explicit type hint: the row-wise Table
        # constructor does not infer them, and sorting misbehaves without.
        typeMap = {float: np.float64, int: np.int64}
        columnTypes = [None, None, np.int64]
        columnTypes.extend(typeMap.get(type(value)) for value in exemplarDataId.values())

        if exemplar.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for info in self.datasetRefs:
            ref = info.datasetRef
            row = [datasetTypeName, ref.run, ref.id, *ref.dataId.full.values()]
            if info.uri:
                row.append(info.uri)
            rows.append(row)

        unsorted = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(unsorted, dimensions, ["type", "run"])

106 

107 

class QueryDatasets:
    """Query a repository for dataset refs.

    Parameters
    ----------
    repo : `str` or `None`
        URI to the location of the repo or URI to a config file describing the
        repo and its location. Exactly one of `repo` and `butler` must be
        given; the other must be `None`.
    butler : ``lsst.daf.butler.Butler`` or `None`
        The butler to use to query. Exactly one of `repo` and `butler` must
        be given; the other must be `None`.
    glob : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the dataset type names to search for.
    collections : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.

    Raises
    ------
    RuntimeError
        Raised if both or neither of `repo` and `butler` are provided.
    """

    def __init__(self, glob, collections, where, find_first, show_uri, repo=None, butler=None):
        # Exactly one of the two butler sources may be supplied.
        if bool(repo) == bool(butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        self.butler = butler if butler else Butler(repo)
        self._getDatasets(glob, collections, where, find_first)
        self.showUri = show_uri

    def _getDatasets(self, glob, collections, where, find_first):
        """Run the registry query and keep its result in ``self.datasets``.

        Parameters
        ----------
        glob : iterable [`str`]
            Glob-style expressions selecting dataset type names.
        collections : iterable [`str`]
            Glob-style expressions selecting collections; passed through
            unchanged when ``find_first`` is set.
        where : `str`
            Query constraint expression forwarded to the registry.
        find_first : `bool`
            Forwarded to the registry as ``findFirst``.
        """
        # Fall back to ``...`` when the translated expression is empty.
        datasetTypes: Any = globToRegex(glob) or ...

        # Collections are translated only when not doing a find-first search.
        searchCollections = collections if find_first else globToRegex(collections)

        self.datasets = self.butler.registry.queryDatasets(
            datasetType=datasetTypes,
            collections=searchCollections,
            where=where,
            findFirst=find_first,
        )

    def getTables(self):
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [``astropy.table.Table``]
            A list of astropy tables, one for each dataset type.
        """
        tablesByType: Dict[str, _Table] = defaultdict(_Table)
        for ref in self.datasets:
            if not self.showUri:
                tablesByType[ref.datasetType.name].add(ref)
                continue

            primary, components = self.butler.getURIs(ref, collections=ref.run)
            if primary:
                tablesByType[ref.datasetType.name].add(ref, primary)
            # Each component URI is filed under its component dataset type.
            for componentName, componentUri in components.items():
                tablesByType[ref.datasetType.componentTypeName(componentName)].add(ref, componentUri)

        return [table.getAstropyTable(typeName) for typeName, table in tablesByType.items()]

    def getDatasets(self):
        """Get the datasets as the query results object.

        Returns
        -------
        refs : ``queries.DatasetQueryResults``
            Dataset references matching the given query criteria.
        """
        return self.datasets