Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 97%

81 statements  

coverage.py v6.5.0, created at 2023-01-05 10:35 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("CollectionSummaryManager",)

from collections.abc import Callable, Iterable
from typing import Any, Generic, TypeVar

import sqlalchemy

from ....core import (
    DatasetType,
    GovernorDimension,
    NamedKeyDict,
    NamedKeyMapping,
    addDimensionForeignKey,
    ddl,
)
from ..._collection_summary import CollectionSummary
from ..._collectionType import CollectionType
from ...interfaces import (
    ChainedCollectionRecord,
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)

_T = TypeVar("_T")

class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : `_T`
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions : `NamedKeyMapping` [ `GovernorDimension`, `_T` ]
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
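        # Illustrative shape of the resulting table, assuming the collection
        # manager uses an integer ``collection_id`` key (the actual column
        # name and type come from ``addCollectionForeignKey``):
        #
        #     CREATE TABLE collection_summary_dataset_type (
        #         collection_id BIGINT NOT NULL REFERENCES collection ON DELETE CASCADE,
        #         dataset_type_id BIGINT NOT NULL REFERENCES dataset_type (id) ON DELETE CASCADE,
        #         PRIMARY KEY (collection_id, dataset_type_id)
        #     );
        #
        # The per-governor-dimension tables built below are analogous, with
        # the dimension's own primary key column in place of
        # ``dataset_type_id``.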

        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.getGovernorDimensions():
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )

class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.sql.Table],
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        self._cache: dict[Any, CollectionSummary] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """

        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
        )

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store.  Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """

        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
        for dimension, values in summary.governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )

        # Update the in-memory cache, too.  These changes will remain even if
        # the database inserts above are rolled back by some later exception
        # in the same transaction, but that's okay: we never promise that a
        # CollectionSummary has _just_ the dataset types and governor
        # dimension values that are actually present, only that it is
        # guaranteed to contain any dataset types or governor dimension
        # values that _may_ be present.
        # That guarantee (and the possibility of rollbacks) means we can't
        # get away with checking the cache before we try the database
        # inserts, however; if someone had attempted to insert datasets of
        # some dataset type previously, and that rolled back, and we're now
        # trying to insert some more datasets of that same type, it would not
        # be okay to skip the DB summary table insertions because we found
        # entries in the in-memory cache.
        self.get(collection).update(summary)

    def refresh(self, get_dataset_type: Callable[[int], DatasetType]) -> None:
        """Load all collection summary information from the database.

        Parameters
        ----------
        get_dataset_type : `Callable`
            Function that takes an `int` dataset_type_id value and returns a
            `DatasetType` instance.
        """
        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        columns = [
            self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName),
            self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"),
        ]
        fromClause = self._tables.datasetType
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
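        # For concreteness: with governor dimensions ``instrument`` and
        # ``skymap`` and an integer collection key, the statement built above
        # is roughly (table and column names illustrative):
        #
        #     SELECT t.collection_id, t.dataset_type_id, i.instrument, s.skymap
        #     FROM collection_summary_dataset_type AS t
        #     LEFT OUTER JOIN collection_summary_instrument AS i
        #         ON t.collection_id = i.collection_id
        #     LEFT OUTER JOIN collection_summary_skymap AS s
        #         ON t.collection_id = s.collection_id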

        # Run the query and construct CollectionSummary objects from the
        # result rows.  This will never include CHAINED collections or
        # collections with no datasets.
        summaries: dict[Any, CollectionSummary] = {}
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            # Collection key should never be None/NULL; it's what we join on.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the
            # first table we joined.
            datasetType = get_dataset_type(row["dataset_type_id"])
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary()
                summaries[collectionKey] = summary
            # Update the dimensions with the values in this row that aren't
            # None/NULL (many will be, in general, because these enter the
            # query via LEFT OUTER JOIN).
            summary.dataset_types.add(datasetType)
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.governors.setdefault(dimension.name, set()).add(value)
        self._cache = summaries

    def get(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        summary = self._cache.get(collection.key)
        if summary is None:
            # When we load the summary information from the database, we
            # don't create summaries for CHAINED collections; those are
            # created here as needed, and *never* cached - we have no good
            # way to update those summaries when a new dataset is added to a
            # child collection.
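            # For example (collection names hypothetical): for a chain
            # ``defaults`` whose children are ``run_a`` and ``run_b``, the
            # summary returned here is the union of the children's summaries,
            # so it may over-report, but never under-report, what the chain
            # can contain.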

            if collection.type is CollectionType.CHAINED:
                assert isinstance(collection, ChainedCollectionRecord)
                child_summaries = [self.get(self._collections.find(child)) for child in collection.children]
                if child_summaries:
                    summary = CollectionSummary.union(*child_summaries)
                else:
                    summary = CollectionSummary()
            else:
                # Either this collection doesn't have any datasets yet, or
                # the only datasets it has were created by some other process
                # since the last call to refresh.  We assume the former; the
                # user is responsible for calling refresh if they want to
                # read concurrently-written things.  We do remember this in
                # the cache.
                summary = CollectionSummary()
                self._cache[collection.key] = summary
        return summary