Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 97%

84 statements  

coverage.py v7.3.2, created at 2023-10-27 09:43 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30from .... import ddl 

31 

32__all__ = ("CollectionSummaryManager",) 

33 

34from collections.abc import Iterable, Mapping 

35from typing import Any, Generic, TypeVar 

36 

37import sqlalchemy 

38 

39from ...._dataset_type import DatasetType 

40from ...._named import NamedKeyDict, NamedKeyMapping 

41from ....dimensions import GovernorDimension, addDimensionForeignKey 

42from ..._collection_summary import CollectionSummary 

43from ..._collection_type import CollectionType 

44from ...interfaces import ( 

45 ChainedCollectionRecord, 

46 CollectionManager, 

47 CollectionRecord, 

48 Database, 

49 DimensionRecordStorageManager, 

50 StaticTablesContext, 

51) 

52 

53_T = TypeVar("_T") 

54 

55 

56class CollectionSummaryTables(Generic[_T]): 

57 """Structure that holds the table or table specification objects that 

58 summarize the contents of collections. 

59 

60 Parameters 

61 ---------- 

62 datasetType 

63 Table [specification] that summarizes which dataset types are in each 

64 collection. 

65 dimensions 

66 Mapping of table [specifications] that summarize which governor 

67 dimension values are present in the data IDs of each collection. 

68 """ 

69 

70 def __init__( 

71 self, 

72 datasetType: _T, 

73 dimensions: NamedKeyMapping[GovernorDimension, _T], 

74 ): 

75 self.datasetType = datasetType 

76 self.dimensions = dimensions 

77 

78 @classmethod 

79 def makeTableSpecs( 

80 cls, 

81 collections: CollectionManager, 

82 dimensions: DimensionRecordStorageManager, 

83 ) -> CollectionSummaryTables[ddl.TableSpec]: 

84 """Create specifications for all summary tables. 

85 

86 Parameters 

87 ---------- 

88 collections : `CollectionManager`

89 Manager object for the collections in this `Registry`. 

90 dimensions : `DimensionRecordStorageManager` 

91 Manager object for the dimensions in this `Registry`. 

92 

93 Returns 

94 ------- 

95 tables : `CollectionSummaryTables` [ `ddl.TableSpec` ] 

96 Structure containing table specifications. 

97 """ 

98 # Spec for collection_summary_dataset_type. 

99 datasetTypeTableSpec = ddl.TableSpec(fields=[]) 

100 collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE") 

101 datasetTypeTableSpec.fields.add( 

102 ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True) 

103 ) 

104 datasetTypeTableSpec.foreignKeys.append( 

105 ddl.ForeignKeySpec( 

106 "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE" 

107 ) 

108 ) 

109 # Specs for collection_summary_<dimension>. 

110 dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]() 

111 for dimension in dimensions.universe.getGovernorDimensions(): 

112 tableSpec = ddl.TableSpec(fields=[]) 

113 collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE") 

114 addDimensionForeignKey(tableSpec, dimension, primaryKey=True) 

115 dimensionTableSpecs[dimension] = tableSpec 

116 return CollectionSummaryTables( 

117 datasetType=datasetTypeTableSpec, 

118 dimensions=dimensionTableSpecs.freeze(), 

119 ) 

120 
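A rough usage sketch, not part of summaries.py: `collections` and `dimensions` here stand in for the manager objects a live `Registry` already holds, and the governor dimensions that appear depend on the configured dimension universe.

    specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
    # Specification for the collection_summary_dataset_type table.
    print([field.name for field in specs.datasetType.fields])
    # One specification per governor dimension (e.g. "instrument", "skymap").
    for dimension, spec in specs.dimensions.items():
        print(dimension.name, [field.name for field in spec.fields])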

121 

122class CollectionSummaryManager: 

123 """Object that manages the summaries of what dataset types and governor

124 dimension values are present in a collection. 

125 

126 Parameters 

127 ---------- 

128 db : `Database` 

129 Interface to the underlying database engine and namespace. 

130 collections : `CollectionManager`

131 Manager object for the collections in this `Registry`. 

132 dimensions : `DimensionRecordStorageManager` 

133 Manager object for the dimensions in this `Registry`. 

134 tables : `CollectionSummaryTables` 

135 Struct containing the tables that hold collection summaries. 

136 """ 

137 

138 def __init__( 

139 self, 

140 db: Database, 

141 *, 

142 collections: CollectionManager, 

143 dimensions: DimensionRecordStorageManager, 

144 tables: CollectionSummaryTables[sqlalchemy.schema.Table], 

145 ): 

146 self._db = db 

147 self._collections = collections 

148 self._collectionKeyName = collections.getCollectionForeignKeyName() 

149 self._dimensions = dimensions 

150 self._tables = tables 

151 self._cache: dict[Any, CollectionSummary] = {} 

152 

153 @classmethod 

154 def initialize( 

155 cls, 

156 db: Database, 

157 context: StaticTablesContext, 

158 *, 

159 collections: CollectionManager, 

160 dimensions: DimensionRecordStorageManager, 

161 ) -> CollectionSummaryManager: 

162 """Create all summary tables (or check that they have been created), 

163 returning an object to manage them. 

164 

165 Parameters 

166 ---------- 

167 db : `Database` 

168 Interface to the underlying database engine and namespace. 

169 context : `StaticTablesContext` 

170 Context object obtained from `Database.declareStaticTables`; used 

171 to declare any tables that should always be present. 

172 collections : `CollectionManager`

173 Manager object for the collections in this `Registry`. 

174 dimensions : `DimensionRecordStorageManager` 

175 Manager object for the dimensions in this `Registry`. 

176 

177 Returns 

178 ------- 

179 manager : `CollectionSummaryManager` 

180 New manager object for collection summaries. 

181 """ 

182 specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions) 

183 tables = CollectionSummaryTables( 

184 datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType), 

185 dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table]( 

186 { 

187 dimension: context.addTable(f"collection_summary_{dimension.name}", spec) 

188 for dimension, spec in specs.dimensions.items() 

189 } 

190 ).freeze(), 

191 ) 

192 return cls( 

193 db=db, 

194 collections=collections, 

195 dimensions=dimensions, 

196 tables=tables, 

197 ) 

198 
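A minimal wiring sketch, assuming the usual daf_butler pattern of declaring static tables inside a `Database.declareStaticTables` context; `db`, `collections`, and `dimensions` are placeholders for objects the surrounding registry layer already has.

    with db.declareStaticTables(create=True) as context:
        summary_manager = CollectionSummaryManager.initialize(
            db,
            context,
            collections=collections,
            dimensions=dimensions,
        )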

199 def update( 

200 self, 

201 collection: CollectionRecord, 

202 dataset_type_ids: Iterable[int], 

203 summary: CollectionSummary, 

204 ) -> None: 

205 """Update the summary tables to associate the given collection with 

206 the given dataset types and governor dimension values.

207 

208 Parameters 

209 ---------- 

210 collection : `CollectionRecord` 

211 Collection whose summary should be updated. 

212 dataset_type_ids : `~collections.abc.Iterable` [ `int` ] 

213 Integer IDs for the dataset types to associate with this 

214 collection. 

215 summary : `CollectionSummary` 

216 Summary to store. Dataset types must correspond to 

217 ``dataset_type_ids``. 

218 

219 Notes 

220 ----- 

221 This method should only be called inside the transaction context of 

222 another operation that inserts or associates datasets. 

223 """ 

224 self._db.ensure( 

225 self._tables.datasetType, 

226 *[ 

227 { 

228 "dataset_type_id": dataset_type_id, 

229 self._collectionKeyName: collection.key, 

230 } 

231 for dataset_type_id in dataset_type_ids 

232 ], 

233 ) 

234 for dimension, values in summary.governors.items(): 

235 if values:  [235 ↛ 234] line 235 didn't jump to line 234, because the condition on line 235 was never false

236 self._db.ensure( 

237 self._tables.dimensions[dimension], 

238 *[{self._collectionKeyName: collection.key, dimension: v} for v in values], 

239 ) 

240 # Update the in-memory cache, too. These changes will remain even if 

241 # the database inserts above are rolled back by some later exception in 

242 # the same transaction, but that's okay: we never promise that a 

243 # CollectionSummary has _just_ the dataset types and governor dimension 

244 # values that are actually present, only that it is guaranteed to 

245 # contain any dataset types or governor dimension values that _may_ be 

246 # present. 

247 # That guarantee (and the possibility of rollbacks) means we can't get 

248 # away with checking the cache before we try the database inserts, 

249 # however; if someone had attempted to insert datasets of some dataset 

250 # type previously, and that rolled back, and we're now trying to insert 

251 # some more datasets of that same type, it would not be okay to skip 

252 # the DB summary table insertions because we found entries in the 

253 # in-memory cache. 

254 self.get(collection).update(summary) 

255 
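An illustrative call to update, assumed to run inside the transaction of the dataset-insertion code; `summary_manager`, `run_record`, `dataset_type`, and `dataset_type_id` are placeholders for objects that caller already has, and "instrument"/"HSC" are made-up governor values.

    summary = CollectionSummary()
    summary.dataset_types.add(dataset_type)
    summary.governors.setdefault("instrument", set()).add("HSC")
    # Idempotent inserts into the summary tables, then a cache update.
    summary_manager.update(run_record, [dataset_type_id], summary)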

256 def refresh(self, dataset_types: Mapping[int, DatasetType]) -> None: 

257 """Load all collection summary information from the database. 

258 

259 Parameters 

260 ---------- 

261 dataset_types : `~collections.abc.Mapping` [`int`, `DatasetType`] 

262 Mapping from `int` dataset_type_id values to `DatasetType`

263 instances. Summaries are only loaded for dataset types that appear

264 in this mapping. 

265 """ 

266 # Set up the SQL query we'll use to fetch all of the summary 

267 # information at once. 

268 columns = [ 

269 self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName), 

270 self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"), 

271 ] 

272 fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType 

273 for dimension, table in self._tables.dimensions.items(): 

274 columns.append(table.columns[dimension.name].label(dimension.name)) 

275 fromClause = fromClause.join( 

276 table, 

277 onclause=( 

278 self._tables.datasetType.columns[self._collectionKeyName] 

279 == table.columns[self._collectionKeyName] 

280 ), 

281 isouter=True, 

282 ) 

283 sql = sqlalchemy.sql.select(*columns).select_from(fromClause) 

284 # Run the query and construct CollectionSummary objects from the result 

285 # rows. This will never include CHAINED collections or collections 

286 # with no datasets. 

287 summaries: dict[Any, CollectionSummary] = {} 

288 with self._db.query(sql) as sql_result: 

289 sql_rows = sql_result.mappings().fetchall() 

290 for row in sql_rows: 

291 # Collection key should never be None/NULL; it's what we join on. 

292 # Extract that and then turn it into a collection name. 

293 collectionKey = row[self._collectionKeyName] 

294 # dataset_type_id should also never be None/NULL; it's in the first 

295 # table we joined. 

296 if datasetType := dataset_types.get(row["dataset_type_id"]):  [296 ↛ 290] line 296 didn't jump to line 290, because the condition on line 296 was never false

297 # See if we have a summary already for this collection; if not, 

298 # make one. 

299 summary = summaries.get(collectionKey) 

300 if summary is None: 

301 summary = CollectionSummary() 

302 summaries[collectionKey] = summary 

303 # Update the dimensions with the values in this row that 

304# aren't None/NULL (many will be NULL in general, because these

305 # enter the query via LEFT OUTER JOIN). 

306 summary.dataset_types.add(datasetType) 

307 for dimension in self._tables.dimensions: 

308 value = row[dimension.name] 

309 if value is not None: 

310 summary.governors.setdefault(dimension.name, set()).add(value) 

311 self._cache = summaries 

312 
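For reference, the SELECT that refresh() builds has roughly this shape for a universe whose only governor dimension is "instrument" (illustrative only; the collection key column name comes from the CollectionManager):

    # Approximate SQL for the query above; one LEFT OUTER JOIN is added per
    # governor dimension, so governor columns are NULL for collections that
    # have no values recorded for that dimension.
    #
    #   SELECT
    #       collection_summary_dataset_type.collection_id,
    #       collection_summary_dataset_type.dataset_type_id,
    #       collection_summary_instrument.instrument
    #   FROM collection_summary_dataset_type
    #   LEFT OUTER JOIN collection_summary_instrument
    #       ON collection_summary_dataset_type.collection_id
    #          = collection_summary_instrument.collection_id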

313 def get(self, collection: CollectionRecord) -> CollectionSummary: 

314 """Return a summary for the given collection. 

315 

316 Parameters 

317 ---------- 

318 collection : `CollectionRecord` 

319 Record describing the collection for which a summary is to be 

320 retrieved. 

321 

322 Returns 

323 ------- 

324 summary : `CollectionSummary` 

325 Summary of the dataset types and governor dimension values in 

326 this collection. 

327 """ 

328 summary = self._cache.get(collection.key) 

329 if summary is None: 

330 # When we load the summary information from the database, we don't 

331 # create summaries for CHAINED collections; those are created here 

332 # as needed, and *never* cached - we have no good way to update 

333# those summaries when a new dataset is added to a child

334# collection.

335 if collection.type is CollectionType.CHAINED: 

336 assert isinstance(collection, ChainedCollectionRecord) 

337 child_summaries = [self.get(self._collections.find(child)) for child in collection.children] 

338 if child_summaries:  [338 ↛ 341] line 338 didn't jump to line 341, because the condition on line 338 was never false

339 summary = CollectionSummary.union(*child_summaries) 

340 else: 

341 summary = CollectionSummary() 

342 else: 

343 # Either this collection doesn't have any datasets yet, or the 

344 # only datasets it has were created by some other process since 

345 # the last call to refresh. We assume the former; the user is 

346 # responsible for calling refresh if they want to read 

347 # concurrently-written things. We do remember this in the 

348 # cache. 

349 summary = CollectionSummary() 

350 self._cache[collection.key] = summary 

351 return summary
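Finally, a read-side sketch with placeholder names: after refresh(), lookups come from the in-memory cache, while CHAINED collections are summarized on the fly from their children and are never cached.

    summary_manager.refresh(dataset_types_by_id)  # mapping of dataset_type_id to DatasetType
    record = collections.find("HSC/defaults")     # hypothetical CHAINED collection
    summary = summary_manager.get(record)
    if "instrument" in summary.governors:
        print(summary.governors["instrument"])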