Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = ["DatasetRegistryStorage"] 

24 

25from typing import Any, Mapping, Iterator 

26 

27import sqlalchemy 

28 

29from ...core import ( 

30 DatasetType, 

31 DimensionGraph, 

32 DimensionUniverse, 

33) 

34from .._collectionType import CollectionType 

35from ..interfaces import CollectionManager, CollectionRecord 

36from ..wildcards import CategorizedWildcard, CollectionSearch, CollectionQuery 

37 

38 

class DatasetRegistryStorage:
    """Manager for the ``dataset`` table and its satellites in a `Registry`.

    Parameters
    ----------
    connection : `sqlalchemy.engine.Connection`
        SQLAlchemy connection, typically shared with the owning `Registry`.
    universe : `DimensionUniverse`
        All dimensions for which storage instances may be constructed.
    tables : `dict`
        Mapping from table name to the `sqlalchemy.sql.FromClause` that
        represents it; must contain ``dataset``, ``dataset_type``,
        ``dataset_type_dimensions``, and ``dataset_collection``.
    collections : `CollectionManager`
        Manager for the collection/run tables of the same `Registry`.

    Notes
    -----
    Future changes will convert this concrete class into a polymorphic
    hierarchy modeled after `DimensionRecordStorage`, with many more
    `SqlRegistry` method implementations delegating to it.  Its interface
    may change significantly at the same time.  At present, this
    functionality has been factored out of `SqlRegistry` (with a bit of
    duplication) to allow the initial `QueryBuilder` design and
    implementation to be more forward-looking.
    """
    def __init__(self, connection: sqlalchemy.engine.Connection, universe: DimensionUniverse,
                 tables: Mapping[str, sqlalchemy.sql.FromClause], *,
                 collections: CollectionManager):
        self._connection = connection
        self._universe = universe
        self._collections = collections
        # Cache the individual table clauses we use repeatedly.
        self._datasetTypeTable = tables["dataset_type"]
        self._datasetTypeDimensionsTable = tables["dataset_type_dimensions"]
        self._datasetTable = tables["dataset"]
        self._datasetCollectionTable = tables["dataset_collection"]

    def fetchDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]:
        """Retrieve `DatasetType` instances from the database matching an
        expression.

        Parameters
        ----------
        expression
            An expression indicating the dataset type(s) to fetch.
            See :ref:`daf_butler_dataset_type_expressions` for more
            information.

        Yields
        ------
        datasetType
            A dataset type matching the given argument.
        """
        typeTable = self._datasetTypeTable
        dimsTable = self._datasetTypeDimensionsTable
        # One row per (dataset type, dimension) pair; we regroup in Python.
        query = sqlalchemy.sql.select([
            typeTable.columns.dataset_type_name,
            typeTable.columns.storage_class,
            dimsTable.columns.dimension_name,
        ]).select_from(typeTable.join(dimsTable))
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if wildcard is not ...:
            whereClause = wildcard.makeWhereExpression(typeTable.columns.dataset_type_name)
            if whereClause is None:
                # Expression can match nothing at all; yield nothing.
                return
            query = query.where(whereClause)
        # Accumulate a (storage class, set-of-dimension-names) pair keyed by
        # dataset type name.
        accumulated = {}
        for name, storageClassName, dimensionName in self._connection.execute(query).fetchall():
            if name not in accumulated:
                accumulated[name] = (storageClassName, set())
            accumulated[name][1].add(dimensionName)
        for name, (storageClassName, dimensionNames) in accumulated.items():
            yield DatasetType(name,
                              dimensions=DimensionGraph(self._universe, names=dimensionNames),
                              storageClass=storageClassName)

    def getDatasetSubquery(self, datasetType: DatasetType, *,
                           collections: Any,
                           isResult: bool = True,
                           addRank: bool = False) -> sqlalchemy.sql.FromClause:
        """Return a SQL expression that searches for a dataset of a particular
        type in one or more collections.

        Parameters
        ----------
        datasetType : `DatasetType`
            Type of dataset to search for.  Must be a true `DatasetType`;
            call `fetchDatasetTypes` first to expand an expression if desired.
        collections
            An expression describing the collections to search and any
            restrictions on the dataset types to search within them.
            See :ref:`daf_butler_collection_expressions` for more information.
        isResult : `bool`, optional
            If `True` (default), include the ``dataset_id`` column in the
            result columns of the query.
        addRank : `bool`, optional
            If `True` (`False` is default), also include a calculated column
            that ranks the collection in which the dataset was found (lower
            is better).  Requires that ``collections`` must be an *ordered*
            expression (regular expressions and `...` are not allowed).

        Returns
        -------
        subquery : `sqlalchemy.sql.FromClause`
            Named subquery or table that can be used in the FROM clause of
            a SELECT query.  Has at least columns for all dimensions in
            ``datasetType.dimensions``; may have additional columns depending
            on the values of ``isResult`` and ``addRank``.
        """
        datasetTable = self._datasetTable

        def _constrain(select: sqlalchemy.sql.Select, record: CollectionRecord):
            # Attach FROM and WHERE clauses restricting ``select`` to the
            # given dataset type within one particular collection.
            if record.type is CollectionType.TAGGED:
                # Tagged membership lives in the join table.
                fkName = self._collections.getCollectionForeignKeyName()
                collectionColumn = self._datasetCollectionTable.columns[fkName]
                source = datasetTable.join(self._datasetCollectionTable)
            elif record.type is CollectionType.RUN:
                # Run membership is recorded directly on the dataset row.
                collectionColumn = datasetTable.columns[self._collections.getRunForeignKeyName()]
                source = datasetTable
            else:
                raise NotImplementedError(f"Unrecognized CollectionType: '{record.type}'.")
            condition = sqlalchemy.sql.and_(
                datasetTable.columns.dataset_type_name == datasetType.name,
                collectionColumn == record.key,
            )
            return select.select_from(source).where(condition)

        # Dimension columns are always selected; they are what we join on.
        columns = [datasetTable.columns[dimension.name] for dimension in datasetType.dimensions]
        # Only include dataset_id (and optionally rank) when the caller will
        # actually select columns from this subquery in the larger query.
        if isResult:
            columns.append(datasetTable.columns.dataset_id)
            if addRank:
                # Ranked search requires an ordered collection expression.
                search = CollectionSearch.fromExpression(collections)
                selects = [
                    _constrain(
                        sqlalchemy.sql.select(
                            columns + [sqlalchemy.sql.literal(position).label("rank")]
                        ),
                        record,
                    )
                    for position, record in enumerate(
                        search.iter(self._collections, datasetType=datasetType)
                    )
                ]
                return sqlalchemy.sql.union_all(*selects).alias(datasetType.name)
        # No rank column needed; an unordered CollectionQuery suffices here.
        unordered = CollectionQuery.fromExpression(collections)
        selects = [
            _constrain(sqlalchemy.sql.select(columns), record)
            for record in unordered.iter(self._collections, datasetType=datasetType)
        ]
        return sqlalchemy.sql.union_all(*selects).alias(datasetType.name)