
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["DatasetRegistryStorage"]

from typing import Any, Mapping, Iterator, Optional

import sqlalchemy

from ...core import (
    DatasetType,
    DimensionGraph,
    DimensionUniverse,
)
from .._collectionType import CollectionType
from ..interfaces import CollectionManager, CollectionRecord
from ..wildcards import CategorizedWildcard, CollectionSearch, CollectionQuery


class DatasetRegistryStorage:
    """An object managing ``dataset`` and related tables in a `Registry`.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        A SQLAlchemy connection object, typically shared with the `Registry`
        that will own the storage instances.
    universe : `DimensionUniverse`
        The set of all dimensions for which storage instances should be
        constructed.
    tables : `dict`
        A dictionary mapping table name to a `sqlalchemy.sql.FromClause`
        representing that table.
    collections : `CollectionManager`
        Manager object for collections in the same `Registry`, used to look
        up collection records and foreign key names.

    Notes
    -----
    Future changes will convert this concrete class into a polymorphic
    hierarchy modeled after `DimensionRecordStorage`, with many more
    `SqlRegistry` method implementations delegating to it.  Its interface
    may change significantly at the same time.  At present, this functionality
    has been factored out of `SqlRegistry` (with a bit of duplication) to
    allow the initial `QueryBuilder` design and implementation to be more
    forward-looking.
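
    Examples
    --------
    A minimal construction sketch; ``connection``, ``universe``, ``tables``,
    and ``collections`` below stand in for objects that the enclosing
    `Registry` would normally provide, and are not defined here::

        storage = DatasetRegistryStorage(
            connection=connection,
            universe=universe,
            tables=tables,
            collections=collections,
        )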

63 """ 

64 def __init__(self, connection: sqlalchemy.engine.Connection, universe: DimensionUniverse, 

65 tables: Mapping[str, sqlalchemy.sql.FromClause], *, 

66 collections: CollectionManager): 

67 self._connection = connection 

68 self._universe = universe 

69 self._collections = collections 

70 self._datasetTypeTable = tables["dataset_type"] 

71 self._datasetTypeDimensionsTable = tables["dataset_type_dimensions"] 

72 self._datasetTable = tables["dataset"] 

73 self._datasetCollectionTable = tables["dataset_collection"] 

74 

    def fetchDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
        """Retrieve `DatasetType` instances from the database matching an
        expression.

        Parameters
        ----------
        expression
            An expression indicating the dataset type(s) to fetch.  The
            default (``...``) matches all dataset types.
            See :ref:`daf_butler_dataset_type_expressions` for more
            information.

        Yields
        ------
        datasetType : `DatasetType`
            A dataset type matching the given expression.
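
        Examples
        --------
        A sketch of listing every registered dataset type; ``storage`` is
        assumed to be an existing `DatasetRegistryStorage` instance, and the
        default expression (``...``) matches everything::

            for datasetType in storage.fetchDatasetTypes():
                print(datasetType.name, datasetType.dimensions)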

90 """ 

91 query = sqlalchemy.sql.select([ 

92 self._datasetTypeTable.columns.dataset_type_name, 

93 self._datasetTypeTable.columns.storage_class, 

94 self._datasetTypeDimensionsTable.columns.dimension_name, 

95 ]).select_from( 

96 self._datasetTypeTable.join(self._datasetTypeDimensionsTable) 

97 ) 

98 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

99 if wildcard is not ...: 

100 where = wildcard.makeWhereExpression(self._datasetTypeTable.columns.dataset_type_name) 

101 if where is None: 

102 return 

103 query = query.where(where) 

104 # Run the query and group by dataset type name. 

105 grouped = {} 

106 for row in self._connection.execute(query).fetchall(): 

107 datasetTypeName, storageClassName, dimensionName = row 

108 _, dimensionNames = grouped.setdefault(datasetTypeName, (storageClassName, set())) 

109 dimensionNames.add(dimensionName) 

110 for datasetTypeName, (storageClassName, dimensionNames) in grouped.items(): 

111 yield DatasetType(datasetTypeName, 

112 dimensions=DimensionGraph(self._universe, names=dimensionNames), 

113 storageClass=storageClassName) 

114 

    def getDatasetSubquery(self, datasetType: DatasetType, *,
                           collections: Any,
                           isResult: bool = True,
                           addRank: bool = False) -> Optional[sqlalchemy.sql.FromClause]:
        """Return a SQL expression that searches for a dataset of a particular
        type in one or more collections.

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of dataset to search for.  Must be a true `DatasetType`;
            call `fetchDatasetTypes` first to expand an expression if desired.
        collections
            An expression describing the collections to search and any
            restrictions on the dataset types to search within them.
            See :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires ``collections`` to be an *ordered*
            expression (regular expressions and `...` are not allowed).

        Returns
        -------
        subquery : `sqlalchemy.sql.FromClause` or `None`
            Named subquery or table that can be used in the FROM clause of
            a SELECT query.  Has at least columns for all dimensions in
            ``datasetType.dimensions``; may have additional columns depending
            on the values of ``isResult`` and ``addRank``.  May be `None` if
            it is known that the query would return no results.
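
        Examples
        --------
        A sketch of embedding the returned subquery in a larger SELECT;
        ``storage`` and ``datasetType`` are assumed to exist already, and
        ``"my_run"`` is a placeholder collection name::

            subquery = storage.getDatasetSubquery(datasetType,
                                                  collections=["my_run"])
            if subquery is not None:
                query = sqlalchemy.sql.select(
                    [subquery.columns.dataset_id]
                ).select_from(subquery)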

148 """ 

149 # Always include dimension columns, because that's what we use to 

150 # join against other tables. 

151 columns = [self._datasetTable.columns[dimension.name] for dimension in datasetType.dimensions] 

152 

153 def finishSubquery(select: sqlalchemy.sql.Select, collectionRecord: CollectionRecord): 

154 if collectionRecord.type is CollectionType.TAGGED: 

155 collectionColumn = \ 

156 self._datasetCollectionTable.columns[self._collections.getCollectionForeignKeyName()] 

157 fromClause = self._datasetTable.join(self._datasetCollectionTable) 

158 elif collectionRecord.type is CollectionType.RUN: 

159 collectionColumn = self._datasetTable.columns[self._collections.getRunForeignKeyName()] 

160 fromClause = self._datasetTable 

161 else: 

162 raise NotImplementedError(f"Unrecognized CollectionType: '{collectionRecord.type}'.") 

163 return select.select_from( 

164 fromClause 

165 ).where( 

166 sqlalchemy.sql.and_(self._datasetTable.columns.dataset_type_name == datasetType.name, 

167 collectionColumn == collectionRecord.key) 

168 ) 

169 

170 # A list of single-collection queries that we'll UNION together. 

171 subsubqueries = [] 

172 

173 # Only include dataset_id and the rank of the collection in the given 

174 # list if caller has indicated that they're going to be actually 

175 # selecting columns from this subquery in the larger query. 

176 if isResult: 

177 columns.append(self._datasetTable.columns.dataset_id) 

178 if addRank: 

179 collections = CollectionSearch.fromExpression(collections) 

180 for n, record in enumerate(collections.iter(self._collections, datasetType=datasetType)): 

181 subsubqueries.append( 

182 finishSubquery( 

183 sqlalchemy.sql.select( 

184 columns + [sqlalchemy.sql.literal(n).label("rank")] 

185 ), 

186 record 

187 ) 

188 ) 

189 return sqlalchemy.sql.union_all(*subsubqueries).alias(datasetType.name) 

190 

191 # The code path for not adding ranks is similar, but we don't need to 

192 # add the literal rank column, and we transform the collections 

193 # expression into a CollectionQuery instead of a CollectionSearch. 

194 collections = CollectionQuery.fromExpression(collections) 

195 for record in collections.iter(self._collections, datasetType=datasetType): 

196 subsubqueries.append(finishSubquery(sqlalchemy.sql.select(columns), record)) 

197 if not subsubqueries: 

198 return None 

199 return sqlalchemy.sql.union_all(*subsubqueries).alias(datasetType.name)