Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ("DatasetRegistryStorage", "Like", "DatasetTypeExpression", "CollectionsExpression") 

24 

25from dataclasses import dataclass 

26from typing import Mapping, Optional, Sequence, List, Union 

27 

28from sqlalchemy.sql import FromClause, select, case, and_, or_, ColumnElement 

29from sqlalchemy.engine import Connection 

30 

31from ...core import ( 

32 DatasetType, 

33 ExpandedDataCoordinate, 

34 DimensionGraph, 

35 DimensionUniverse, 

36) 

37 

38 

@dataclass(frozen=True)
class Like:
    """Marker wrapping a string that is a SQL ``LIKE`` pattern.

    Wrapping a string in `Like` indicates that it should be matched with
    the SQL ``LIKE`` operator rather than compared as a complete name.
    """

    # The pattern string, expressed in SQL ``LIKE`` wildcard syntax.
    pattern: str

49 

50 

DatasetTypeExpression = Union[DatasetType, str, Like, type(...)]
"""Type alias for the expressions accepted when querying for a dataset type.

An expression may be a complete `DatasetType` instance, a dataset type name
(`str`), a `Like` pattern, or the special value ``...`` (Ellipsis), which is
a full wildcard matching any `DatasetType`.
"""

CollectionsExpression = Union[Sequence[Union[str, Like]], type(...)]
"""Type alias for the expressions accepted to describe the collections to be
searched for a dataset.

An expression may be a sequence whose elements are collection names (`str`)
and/or `Like` patterns, or the special value ``...`` (Ellipsis), which is a
full wildcard indicating that all collections will be searched.
"""

66 

67 

def makeCollectionsWhereExpression(column: ColumnElement,
                                   collections: CollectionsExpression) -> Optional[ColumnElement]:
    """Build a boolean SQL expression selecting the given collections.

    Parameters
    ----------
    column : `sqlalchemy.sql.ColumnElement`
        The "collection" name column from a dataset subquery or table.
    collections : `list` of `str` or `Like`, or ``...``
        An expression indicating the collections to be searched: complete
        collection names (`str` values), wildcard patterns (`Like`
        instances), or the special value ``...`` meaning all collections.

    Returns
    -------
    where : `sqlalchemy.sql.ColumnElement` or `None`
        A boolean SQL expression object, or `None` when all collections are
        to be searched.  In that case the WHERE expression would be the
        literal "true", and returning `None` lets callers keep it out of
        their SQL queries entirely.
    """
    # The full wildcard imposes no restriction at all.
    if collections is ...:
        return None
    # Partition the expression: LIKE patterns become individual LIKE
    # clauses (in their original order); exact names are collected so they
    # can be tested with a single equality or IN clause.
    exactNames = [item for item in collections if not isinstance(item, Like)]
    clauses = [column.like(item.pattern) for item in collections if isinstance(item, Like)]
    if len(exactNames) == 1:
        # A lone name reads better (and may optimize better) as "=".
        clauses.append(column == exactNames[0])
    elif len(exactNames) > 1:
        clauses.append(column.in_(exactNames))
    return or_(*clauses)

106 

107 

class DatasetRegistryStorage:
    """An object managing ``dataset`` and related tables in a `Registry`.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        A SQLAlchemy connection object, typically shared with the `Registry`
        that will own the storage instances.
    universe : `DimensionUniverse`
        The set of all dimensions for which storage instances should be
        constructed.
    tables : `dict`
        A dictionary mapping table name to a `sqlalchemy.sql.FromClause`
        representing that table.

    Notes
    -----
    Future changes will convert this concrete class into a polymorphic
    hierarchy modeled after `DimensionRecordStorage`, with many more
    `SqlRegistry` method implementations delegating to it.  Its interface
    may change significantly at the same time.  At present, this
    functionality has been factored out of `SqlRegistry` (with a bit of
    duplication) to allow the initial `QueryBuilder` design and
    implementation to be more forward-looking.
    """
    def __init__(self, connection: Connection, universe: DimensionUniverse,
                 tables: Mapping[str, FromClause]):
        self._connection = connection
        self._universe = universe
        # Cache the individual tables this class works with directly.
        self._datasetTypeTable = tables["dataset_type"]
        self._datasetTypeDimensionsTable = tables["dataset_type_dimensions"]
        self._datasetTable = tables["dataset"]
        self._datasetCollectionTable = tables["dataset_collection"]

    def fetchDatasetTypes(self, datasetType: DatasetTypeExpression = ..., *,
                          collections: CollectionsExpression = ...,
                          dataId: Optional[ExpandedDataCoordinate] = None) -> List[DatasetType]:
        """Retrieve `DatasetType` instances from the database matching an
        expression.

        Parameters
        ----------
        datasetType : `str`, `Like`, `DatasetType`, or ``...``
            An expression indicating the dataset type(s) to fetch.  A true
            `DatasetType` instance is returned directly without querying the
            database.  A `str` fetches the dataset type with that exact name
            if it exists; a `Like` fetches those whose names match the
            pattern; ``...`` fetches all dataset types.  If nothing matches,
            an empty `list` is returned.
        collections : sequence of `str` or `Like`, or ``...``
            An expression indicating collections that *may* be used to limit
            the dataset types returned to only those that might have
            datasets in these collections.  This is intended as an
            optimization for higher-level functionality; it may simply be
            ignored, and cannot be relied upon to filter the returned
            dataset types.
        dataId : `ExpandedDataCoordinate`, optional
            A data ID that *may* be used to limit the dataset types returned
            to only those with datasets matching the given data ID.  Like
            ``collections``, this is an optional optimization hint and
            cannot be relied upon for filtering.

        Returns
        -------
        datasetTypes : `list` of `DatasetType`
            All datasets in the registry matching the given arguments.
        """
        if isinstance(datasetType, DatasetType):
            # A true DatasetType needs no database lookup.  We *could*
            # return an empty list if we could cheaply show that no entries
            # of this dataset type match the given data ID or collections,
            # but we are not required to do that filtering.
            return [datasetType]
        nameColumn = self._datasetTypeTable.columns.dataset_type_name
        if datasetType is ...:
            # "..." is a full wildcard: no restriction on the name.
            whereTerms = []
        elif isinstance(datasetType, str):
            whereTerms = [nameColumn == datasetType]
        elif isinstance(datasetType, Like):
            whereTerms = [nameColumn.like(datasetType.pattern)]
        else:
            raise TypeError(f"Unexpected dataset type expression '{datasetType}' in query.")
        query = select([
            nameColumn,
            self._datasetTypeTable.columns.storage_class,
            self._datasetTypeDimensionsTable.columns.dimension_name,
        ]).select_from(
            self._datasetTypeTable.join(self._datasetTypeDimensionsTable)
        )
        if whereTerms:
            query = query.where(*whereTerms)
        # The collections and dataId arguments are currently ignored; they
        # are accepted so future code *may* restrict the list of returned
        # dataset types, but are not required to be used.
        # Each dataset type appears once per dimension; group the rows by
        # name, accumulating the set of dimension names as we go.
        grouped = {}
        for name, storageClassName, dimensionName in self._connection.execute(query).fetchall():
            grouped.setdefault(name, (storageClassName, set()))[1].add(dimensionName)
        return [
            DatasetType(name,
                        dimensions=DimensionGraph(self._universe, names=dimensionNames),
                        storageClass=storageClassName)
            for name, (storageClassName, dimensionNames) in grouped.items()
        ]

    def getDatasetSubquery(self, datasetType: DatasetType, *,
                           collections: CollectionsExpression,
                           dataId: Optional[ExpandedDataCoordinate] = None,
                           isResult: bool = True,
                           addRank: bool = False) -> FromClause:
        """Return a SQL expression that searches for a dataset of a
        particular type in one or more collections.

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of dataset to search for.  Must be a true `DatasetType`;
            call `fetchDatasetTypes` first to expand an expression if
            desired.
        collections : sequence of `str` or `Like`, or ``...``
            An expression describing the collections in which to search for
            the datasets.  ``...`` indicates that all collections should be
            searched.  Returned datasets are guaranteed to be from one of
            the given collections (unlike the behavior of the same argument
            in `fetchDatasetTypes`).
        dataId : `ExpandedDataCoordinate`, optional
            A data ID that *may* be used to limit the datasets returned to
            only those matching the given data ID.  This is intended as an
            optimization for higher-level functionality; it may simply be
            ignored, and cannot be relied upon for filtering.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that all entries in ``collections`` be
            regular strings, so there is a clear search order.  Ignored if
            ``isResult`` is `False`.

        Returns
        -------
        subquery : `sqlalchemy.sql.FromClause`
            Named subquery or table that can be used in the FROM clause of
            a SELECT query.  Has at least columns for all dimensions in
            ``datasetType.dimensions``; may have additional columns
            depending on the values of ``isResult`` and ``addRank``.
        """
        # Dimension columns are always included: they are what joins this
        # subquery against the other tables in the larger query.
        columns = [self._datasetTable.columns[dimension.name]
                   for dimension in datasetType.dimensions]
        # dataset_id and the collection-rank column are only useful when the
        # caller will actually select columns from this subquery.
        if isResult:
            columns.append(self._datasetTable.columns.dataset_id)
            if addRank:
                if collections is ...:
                    raise TypeError("Cannot rank collections when no collections are provided.")
                # Map each collection name to its position in the search
                # order; patterns have no well-defined position, so reject
                # them.
                ranks = {}
                for rank, collection in enumerate(collections):
                    if isinstance(collection, Like):
                        raise TypeError(
                            f"Cannot rank collections that include LIKE pattern '{collection.pattern}'."
                        )
                    ranks[collection] = rank
                columns.append(
                    case(ranks, value=self._datasetCollectionTable.columns.collection).label("rank")
                )
        whereTerms = [self._datasetTable.columns.dataset_type_name == datasetType.name]
        collectionsTerm = makeCollectionsWhereExpression(
            self._datasetCollectionTable.columns.collection,
            collections,
        )
        if collectionsTerm is not None:
            whereTerms.append(collectionsTerm)
        joined = self._datasetTable.join(self._datasetCollectionTable)
        return select(columns).select_from(joined).where(and_(*whereTerms)).alias(datasetType.name)