Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

21from __future__ import annotations 

22 

23from astropy.table import Table as AstropyTable 

24from collections import defaultdict, namedtuple 

25from typing import Any, Dict 

26import numpy as np 

27import uuid 

28 

29from .. import Butler 

30from ..core.utils import globToRegex 

31from ..cli.utils import sortAstropyTable 

32 

33 

34_RefInfo = namedtuple("_RefInfo", ["datasetRef", "uri"]) 

35 

36 

37class _Table: 

38 """Aggregates rows for a single dataset type, and creates an astropy table 

39 with the aggregated data. Eliminates duplicate rows. 

40 """ 

41 

42 def __init__(self): 

43 self.datasetRefs = set() 

44 

45 def add(self, datasetRef, uri=None): 

46 """Add a row of information to the table. 

47 

48 ``uri`` is optional but must be the consistent; provided or not, for 

49 every call to a ``_Table`` instance. 

50 

51 Parameters 

52 ---------- 

53 datasetRef : ``DatasetRef`` 

54 A dataset ref that will be added as a row in the table. 

55 uri : ``ButlerURI``, optional 

56 The URI to show as a file location in the table, by default None 

57 """ 

58 if uri: 

59 uri = str(uri) 

60 self.datasetRefs.add(_RefInfo(datasetRef, uri)) 

61 

62 def getAstropyTable(self, datasetTypeName): 

63 """Get the table as an astropy table. 

64 

65 Parameters 

66 ---------- 

67 datasetTypeName : `str` 

68 The dataset type name to show in the ``type`` column of the table. 

69 

70 Returns 

71 ------- 

72 table : `astropy.table._Table` 

73 The table with the provided column names and rows. 

74 """ 

75 def _id_type(datasetRef): 

76 if isinstance(datasetRef.id, uuid.UUID): 

77 return str 

78 else: 

79 return np.int64 

80 

81 # Should never happen; adding a dataset should be the action that 

82 # causes a _Table to be created. 

83 if not self.datasetRefs: 

84 raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}") 

85 

86 refInfo = next(iter(self.datasetRefs)) 

87 dimensions = list(refInfo.datasetRef.dataId.full.keys()) 

88 columnNames = ["type", "run", "id", 

89 *[str(item) for item in dimensions]] 

90 

91 # Need to hint the column types for numbers since the per-row 

92 # constructor of Table does not work this out on its own and sorting 

93 # will not work properly without. 

94 typeMap = {float: np.float64, int: np.int64} 

95 idType = _id_type(refInfo.datasetRef) 

96 columnTypes = [None, None, idType, 

97 *[typeMap.get(type(value)) for value in refInfo.datasetRef.dataId.full.values()]] 

98 if refInfo.uri: 

99 columnNames.append("URI") 

100 columnTypes.append(None) 

101 

102 rows = [] 

103 for refInfo in self.datasetRefs: 

104 row = [datasetTypeName, 

105 refInfo.datasetRef.run, 

106 str(refInfo.datasetRef.id) if idType is str else refInfo.datasetRef.id, 

107 *[value for value in refInfo.datasetRef.dataId.full.values()]] 

108 if refInfo.uri: 

109 row.append(refInfo.uri) 

110 rows.append(row) 

111 

112 dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes) 

113 return sortAstropyTable(dataset_table, dimensions, ["type", "run"]) 

114 

115 

class QueryDatasets:
    """Query a repository for dataset refs.

    Parameters
    ----------
    repo : `str` or `None`
        URI to the location of the repo or URI to a config file describing the
        repo and its location. One of `repo` and `butler` must be `None` and
        the other must not be `None`.
    butler : ``lsst.daf.butler.Butler`` or `None`
        The butler to use to query. One of `repo` and `butler` must be `None`
        and the other must not be `None`.
    glob : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        dataset type names to search for.
    collections : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.

    Raises
    ------
    RuntimeError
        Raised if both or neither of ``repo`` and ``butler`` are provided.
    """

    def __init__(self, glob, collections, where, find_first, show_uri, repo=None, butler=None):
        # Exactly one of the two ways of locating the repository is allowed.
        if bool(repo) == bool(butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        self.butler = butler or Butler(repo)
        self._getDatasets(glob, collections, where, find_first)
        self.showUri = show_uri

    def _getDatasets(self, glob, collections, where, find_first):
        """Run the registry query and store the results in ``self.datasets``.

        Parameters
        ----------
        glob : iterable [`str`]
            Glob-style expressions for the dataset type names.
        collections : iterable [`str`]
            Expressions for the collections to search.
        where : `str`
            Query expression passed through to the registry.
        find_first : `bool`
            Passed through to the registry as ``findFirst``.
        """
        # Fall back to ``...`` when the glob expressions produce nothing
        # (presumably the registry's "match everything" value; confirm
        # against ``queryDatasets``).
        datasetType: Any = globToRegex(glob) or ...

        # Collections are only converted with ``globToRegex`` when find_first
        # is not requested; otherwise they are passed on exactly as given.
        searchCollections = collections if find_first else globToRegex(collections)

        self.datasets = self.butler.registry.queryDatasets(
            datasetType=datasetType,
            collections=searchCollections,
            where=where,
            findFirst=find_first,
        )

    def getTables(self):
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [``astropy.table.Table``]
            A list of astropy tables, one for each dataset type.
        """
        tables: Dict[str, _Table] = defaultdict(_Table)
        for ref in self.datasets:
            if self.showUri:
                primaryURI, componentURIs = self.butler.getURIs(ref, collections=ref.run)
                if primaryURI:
                    tables[ref.datasetType.name].add(ref, primaryURI)
                # Each component URI goes into the table of its own component
                # dataset type name.
                for component, uri in componentURIs.items():
                    tables[ref.datasetType.componentTypeName(component)].add(ref, uri)
            else:
                tables[ref.datasetType.name].add(ref)

        return [table.getAstropyTable(name) for name, table in tables.items()]

    def getDatasets(self):
        """Get the query results exactly as returned by the registry.

        Returns
        -------
        refs : ``queries.DatasetQueryResults``
            Dataset references matching the given query criteria.
        """
        return self.datasets