Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 97%

81 statements  

coverage.py v7.3.1, created at 2023-10-02 07:59 +0000

1 # This file is part of daf_butler.

2 #

3 # Developed for the LSST Data Management System.

4 # This product includes software developed by the LSST Project

5 # (http://www.lsst.org).

6 # See the COPYRIGHT file at the top-level directory of this distribution

7 # for details of code ownership.

8 #

9 # This software is dual licensed under the GNU General Public License and also

10 # under a 3-clause BSD license. Recipients may choose which of these licenses

11 # to use; please see the files gpl-3.0.txt and/or bsd_license.txt,

12 # respectively. If you choose the GPL option then the following text applies

13 # (but note that there is still no warranty even if you opt for BSD instead):

14 #

15 # This program is free software: you can redistribute it and/or modify

16 # it under the terms of the GNU General Public License as published by

17 # the Free Software Foundation, either version 3 of the License, or

18 # (at your option) any later version.

19 #

20 # This program is distributed in the hope that it will be useful,

21 # but WITHOUT ANY WARRANTY; without even the implied warranty of

22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

23 # GNU General Public License for more details.

24 #

25 # You should have received a copy of the GNU General Public License

26 # along with this program. If not, see <http://www.gnu.org/licenses/>.

27 

28 from __future__ import annotations

29

30 __all__ = ("CollectionSummaryManager",)

31

32 from collections.abc import Iterable, Mapping

33 from typing import Any, Generic, TypeVar

34

35 import sqlalchemy

36

37 from ....core import (

38 DatasetType, 

39 GovernorDimension, 

40 NamedKeyDict, 

41 NamedKeyMapping, 

42 addDimensionForeignKey, 

43 ddl, 

44 )

45 from ..._collection_summary import CollectionSummary

46 from ..._collectionType import CollectionType

47 from ...interfaces import (

48 ChainedCollectionRecord, 

49 CollectionManager, 

50 CollectionRecord, 

51 Database, 

52 DimensionRecordStorageManager, 

53 StaticTablesContext, 

54 )

55 

56 _T = TypeVar("_T")

57 

58 

59 class CollectionSummaryTables(Generic[_T]):

60 """Structure that holds the table or table specification objects that 

61 summarize the contents of collections. 

62 

63 Parameters 

64 ---------- 

65 datasetType : _T

66 Table [specification] that summarizes which dataset types are in each 

67 collection. 

68 dimensions : `NamedKeyMapping` [ `GovernorDimension`, _T ]

69 Mapping of table [specifications] that summarize which governor 

70 dimension values are present in the data IDs of each collection. 

71 """ 

72 

73 def __init__( 

74 self, 

75 datasetType: _T, 

76 dimensions: NamedKeyMapping[GovernorDimension, _T], 

77 ): 

78 self.datasetType = datasetType 

79 self.dimensions = dimensions 

80 

81 @classmethod 

82 def makeTableSpecs( 

83 cls, 

84 collections: CollectionManager, 

85 dimensions: DimensionRecordStorageManager, 

86 ) -> CollectionSummaryTables[ddl.TableSpec]: 

87 """Create specifications for all summary tables. 

88 

89 Parameters 

90 ---------- 

91 collections : `CollectionManager`

92 Manager object for the collections in this `Registry`. 

93 dimensions : `DimensionRecordStorageManager` 

94 Manager object for the dimensions in this `Registry`. 

95 

96 Returns 

97 ------- 

98 tables : `CollectionSummaryTables` [ `ddl.TableSpec` ] 

99 Structure containing table specifications. 

100 """ 

101 # Spec for collection_summary_dataset_type. 

102 datasetTypeTableSpec = ddl.TableSpec(fields=[]) 

103 collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE") 

104 datasetTypeTableSpec.fields.add( 

105 ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True) 

106 ) 

107 datasetTypeTableSpec.foreignKeys.append( 

108 ddl.ForeignKeySpec( 

109 "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE" 

110 ) 

111 ) 

112 # Specs for collection_summary_<dimension>. 

113 dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]() 

114 for dimension in dimensions.universe.getGovernorDimensions(): 

115 tableSpec = ddl.TableSpec(fields=[]) 

116 collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE") 

117 addDimensionForeignKey(tableSpec, dimension, primaryKey=True) 

118 dimensionTableSpecs[dimension] = tableSpec 

119 return CollectionSummaryTables( 

120 datasetType=datasetTypeTableSpec, 

121 dimensions=dimensionTableSpecs.freeze(), 

122 ) 
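
For orientation, the specs built above reduce to small link tables: one row per (collection, dataset type), and, per governor dimension, one row per (collection, dimension value). Below is a minimal standalone sketch of that shape in plain SQLAlchemy; the collection_id key name, the collection.collection_id foreign-key target, and the instrument governor are illustrative assumptions, since the real names come from the CollectionManager and the dimension universe at runtime.

import sqlalchemy

metadata = sqlalchemy.MetaData()

# Assumed shape only: one summary row per (collection, dataset type).
collection_summary_dataset_type = sqlalchemy.Table(
    "collection_summary_dataset_type",
    metadata,
    sqlalchemy.Column(
        "collection_id",  # assumed name for the collection foreign key
        sqlalchemy.BigInteger,
        sqlalchemy.ForeignKey("collection.collection_id", ondelete="CASCADE"),
        primary_key=True,
    ),
    sqlalchemy.Column(
        "dataset_type_id",
        sqlalchemy.BigInteger,
        sqlalchemy.ForeignKey("dataset_type.id", ondelete="CASCADE"),
        primary_key=True,
    ),
)

# One such table exists per governor dimension, e.g. "instrument".
collection_summary_instrument = sqlalchemy.Table(
    "collection_summary_instrument",
    metadata,
    sqlalchemy.Column(
        "collection_id",
        sqlalchemy.BigInteger,
        sqlalchemy.ForeignKey("collection.collection_id", ondelete="CASCADE"),
        primary_key=True,
    ),
    sqlalchemy.Column("instrument", sqlalchemy.String(32), primary_key=True),
)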

123 

124 

125 class CollectionSummaryManager:

126 """Object manages the summaries of what dataset types and governor 

127 dimension values are present in a collection. 

128 

129 Parameters 

130 ---------- 

131 db : `Database` 

132 Interface to the underlying database engine and namespace. 

133 collections : `CollectionManager`

134 Manager object for the collections in this `Registry`. 

135 dimensions : `DimensionRecordStorageManager` 

136 Manager object for the dimensions in this `Registry`. 

137 tables : `CollectionSummaryTables` 

138 Struct containing the tables that hold collection summaries. 

139 """ 

140 

141 def __init__( 

142 self, 

143 db: Database, 

144 *, 

145 collections: CollectionManager, 

146 dimensions: DimensionRecordStorageManager, 

147 tables: CollectionSummaryTables[sqlalchemy.schema.Table], 

148 ): 

149 self._db = db 

150 self._collections = collections 

151 self._collectionKeyName = collections.getCollectionForeignKeyName() 

152 self._dimensions = dimensions 

153 self._tables = tables 

154 self._cache: dict[Any, CollectionSummary] = {} 

155 

156 @classmethod 

157 def initialize( 

158 cls, 

159 db: Database, 

160 context: StaticTablesContext, 

161 *, 

162 collections: CollectionManager, 

163 dimensions: DimensionRecordStorageManager, 

164 ) -> CollectionSummaryManager: 

165 """Create all summary tables (or check that they have been created), 

166 returning an object to manage them. 

167 

168 Parameters 

169 ---------- 

170 db : `Database` 

171 Interface to the underlying database engine and namespace. 

172 context : `StaticTablesContext` 

173 Context object obtained from `Database.declareStaticTables`; used 

174 to declare any tables that should always be present. 

175 collections : `CollectionManager`

176 Manager object for the collections in this `Registry`. 

177 dimensions : `DimensionRecordStorageManager` 

178 Manager object for the dimensions in this `Registry`. 

179 

180 Returns 

181 ------- 

182 manager : `CollectionSummaryManager` 

183 New manager object for collection summaries. 

184 """ 

185 specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions) 

186 tables = CollectionSummaryTables( 

187 datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType), 

188 dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table]( 

189 { 

190 dimension: context.addTable(f"collection_summary_{dimension.name}", spec) 

191 for dimension, spec in specs.dimensions.items() 

192 } 

193 ).freeze(), 

194 ) 

195 return cls( 

196 db=db, 

197 collections=collections, 

198 dimensions=dimensions, 

199 tables=tables, 

200 ) 
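
initialize exercises the Generic[_T] container twice: the same CollectionSummaryTables structure first holds ddl.TableSpec objects from makeTableSpecs, then sqlalchemy.schema.Table objects once context.addTable has materialized each spec. A minimal sketch of that two-phase pattern with stand-in payload types; TablesStruct and materialize are hypothetical names, not daf_butler's API.

from typing import Generic, TypeVar

_T = TypeVar("_T")


class TablesStruct(Generic[_T]):
    """Hold one object per logical table; _T is a spec or a live table."""

    def __init__(self, dataset_type: _T) -> None:
        self.dataset_type = dataset_type


def materialize(specs: TablesStruct[str]) -> TablesStruct[int]:
    # Stand-in for context.addTable(): same structure, new payload type.
    return TablesStruct(dataset_type=len(specs.dataset_type))


specs = TablesStruct(dataset_type="collection_summary_dataset_type")
tables = materialize(specs)  # TablesStruct[int], mirroring spec -> table

The payoff of the pattern is that one class describes the struct in both phases, so the spec-building and table-using code can share attribute names.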

201 

202 def update( 

203 self, 

204 collection: CollectionRecord, 

205 dataset_type_ids: Iterable[int], 

206 summary: CollectionSummary, 

207 ) -> None: 

208 """Update the summary tables to associate the given collection with 

209 the given dataset types and governor dimension values.

210 

211 Parameters 

212 ---------- 

213 collection : `CollectionRecord` 

214 Collection whose summary should be updated. 

215 dataset_type_ids : `~collections.abc.Iterable` [ `int` ] 

216 Integer IDs for the dataset types to associate with this 

217 collection. 

218 summary : `CollectionSummary` 

219 Summary to store. Dataset types must correspond to 

220 ``dataset_type_ids``. 

221 

222 Notes 

223 ----- 

224 This method should only be called inside the transaction context of 

225 another operation that inserts or associates datasets. 

226 """ 

227 self._db.ensure( 

228 self._tables.datasetType, 

229 *[ 

230 { 

231 "dataset_type_id": dataset_type_id, 

232 self._collectionKeyName: collection.key, 

233 } 

234 for dataset_type_id in dataset_type_ids 

235 ], 

236 ) 

237 for dimension, values in summary.governors.items(): 

238 if values: 238 ↛ 237 (line 238 didn't jump to line 237, because the condition on line 238 was never false)

239 self._db.ensure( 

240 self._tables.dimensions[dimension], 

241 *[{self._collectionKeyName: collection.key, dimension: v} for v in values], 

242 ) 

243 # Update the in-memory cache, too. These changes will remain even if 

244 # the database inserts above are rolled back by some later exception in 

245 # the same transaction, but that's okay: we never promise that a 

246 # CollectionSummary has _just_ the dataset types and governor dimension 

247 # values that are actually present, only that it is guaranteed to 

248 # contain any dataset types or governor dimension values that _may_ be 

249 # present. 

250 # That guarantee (and the possibility of rollbacks) means we can't get 

251 # away with checking the cache before we try the database inserts, 

252 # however; if someone had attempted to insert datasets of some dataset 

253 # type previously, and that rolled back, and we're now trying to insert 

254 # some more datasets of that same type, it would not be okay to skip 

255 # the DB summary table insertions because we found entries in the 

256 # in-memory cache. 

257 self.get(collection).update(summary) 
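
Database.ensure above is an idempotent insert: rows that already exist are left alone, which is what makes it safe to call on every dataset insert or association. Assuming it behaves like INSERT ... ON CONFLICT DO NOTHING, here is a minimal standalone sketch of the same effect in plain SQLAlchemy on SQLite, using the assumed table shape from earlier; this is not daf_butler's actual implementation.

import sqlalchemy
from sqlalchemy.dialects.sqlite import insert as sqlite_insert

engine = sqlalchemy.create_engine("sqlite://")
metadata = sqlalchemy.MetaData()
summary = sqlalchemy.Table(
    "collection_summary_dataset_type",
    metadata,
    sqlalchemy.Column("collection_id", sqlalchemy.BigInteger, primary_key=True),
    sqlalchemy.Column("dataset_type_id", sqlalchemy.BigInteger, primary_key=True),
)
metadata.create_all(engine)

rows = [
    {"collection_id": 1, "dataset_type_id": 10},
    {"collection_id": 1, "dataset_type_id": 10},  # duplicate: silently skipped
]
with engine.begin() as connection:
    # ON CONFLICT DO NOTHING makes repeated calls idempotent, so a
    # rolled-back transaction can simply retry the same insert later.
    connection.execute(sqlite_insert(summary).on_conflict_do_nothing(), rows)
    count = connection.execute(
        sqlalchemy.select(sqlalchemy.func.count()).select_from(summary)
    ).scalar_one()
assert count == 1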

258 

259 def refresh(self, dataset_types: Mapping[int, DatasetType]) -> None: 

260 """Load all collection summary information from the database. 

261 

262 Parameters 

263 ---------- 

264 dataset_types : `~collections.abc.Mapping` [`int`, `DatasetType`] 

265 Mapping of an `int` dataset_type_id value to `DatasetType` 

266 instance. Summaries are only loaded for dataset types that appear 

267 in this mapping. 

268 """ 

269 # Set up the SQL query we'll use to fetch all of the summary 

270 # information at once. 

271 columns = [ 

272 self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName), 

273 self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"), 

274 ] 

275 fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType 

276 for dimension, table in self._tables.dimensions.items(): 

277 columns.append(table.columns[dimension.name].label(dimension.name)) 

278 fromClause = fromClause.join( 

279 table, 

280 onclause=( 

281 self._tables.datasetType.columns[self._collectionKeyName] 

282 == table.columns[self._collectionKeyName] 

283 ), 

284 isouter=True, 

285 ) 

286 sql = sqlalchemy.sql.select(*columns).select_from(fromClause) 

287 # Run the query and construct CollectionSummary objects from the result 

288 # rows. This will never include CHAINED collections or collections 

289 # with no datasets. 

290 summaries: dict[Any, CollectionSummary] = {} 

291 with self._db.query(sql) as sql_result: 

292 sql_rows = sql_result.mappings().fetchall() 

293 for row in sql_rows: 

294 # Collection key should never be None/NULL; it's what we join on. 

295 # Extract it and use it as the key for this collection's summary.

296 collectionKey = row[self._collectionKeyName] 

297 # dataset_type_id should also never be None/NULL; it's in the first 

298 # table we joined. 

299 if datasetType := dataset_types.get(row["dataset_type_id"]): 299 ↛ 293 (line 299 didn't jump to line 293, because the condition on line 299 was never false)

300 # See if we have a summary already for this collection; if not, 

301 # make one. 

302 summary = summaries.get(collectionKey) 

303 if summary is None: 

304 summary = CollectionSummary() 

305 summaries[collectionKey] = summary 

306 # Update the dimensions with the values in this row that 

307 # aren't None/NULL (many will be NULL in general, because these

308 # enter the query via LEFT OUTER JOIN). 

309 summary.dataset_types.add(datasetType) 

310 for dimension in self._tables.dimensions: 

311 value = row[dimension.name] 

312 if value is not None: 

313 summary.governors.setdefault(dimension.name, set()).add(value) 

314 self._cache = summaries 
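
The query assembled above is a single SELECT over the dataset-type summary table with one LEFT OUTER JOIN per governor table, so each result row pairs a (collection, dataset type) with at most one value per governor. A standalone sketch that builds the same shape and prints the rendered SQL; as before, the collection_id and instrument names are assumptions.

import sqlalchemy

metadata = sqlalchemy.MetaData()
by_dataset_type = sqlalchemy.Table(
    "collection_summary_dataset_type",
    metadata,
    sqlalchemy.Column("collection_id", sqlalchemy.BigInteger),
    sqlalchemy.Column("dataset_type_id", sqlalchemy.BigInteger),
)
by_instrument = sqlalchemy.Table(
    "collection_summary_instrument",
    metadata,
    sqlalchemy.Column("collection_id", sqlalchemy.BigInteger),
    sqlalchemy.Column("instrument", sqlalchemy.String(32)),
)

sql = sqlalchemy.select(
    by_dataset_type.c.collection_id,
    by_dataset_type.c.dataset_type_id,
    by_instrument.c.instrument,
).select_from(
    by_dataset_type.join(
        by_instrument,
        onclause=by_dataset_type.c.collection_id == by_instrument.c.collection_id,
        # LEFT OUTER JOIN: instrument is NULL for collections with no
        # governor rows, hence the "is not None" checks when reading rows.
        isouter=True,
    )
)
print(sql)  # SELECT ... FROM collection_summary_dataset_type LEFT OUTER JOIN ...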

315 

316 def get(self, collection: CollectionRecord) -> CollectionSummary: 

317 """Return a summary for the given collection. 

318 

319 Parameters 

320 ---------- 

321 collection : `CollectionRecord` 

322 Record describing the collection for which a summary is to be 

323 retrieved. 

324 

325 Returns 

326 ------- 

327 summary : `CollectionSummary` 

328 Summary of the dataset types and governor dimension values in 

329 this collection. 

330 """ 

331 summary = self._cache.get(collection.key) 

332 if summary is None: 

333 # When we load the summary information from the database, we don't 

334 # create summaries for CHAINED collections; those are created here 

335 # as needed, and *never* cached; we have no good way to update

336 # those summaries when a new dataset is added to a child

337 # collection.

338 if collection.type is CollectionType.CHAINED: 

339 assert isinstance(collection, ChainedCollectionRecord) 

340 child_summaries = [self.get(self._collections.find(child)) for child in collection.children] 

341 if child_summaries: 341 ↛ 344 (line 341 didn't jump to line 344, because the condition on line 341 was never false)

342 summary = CollectionSummary.union(*child_summaries) 

343 else: 

344 summary = CollectionSummary() 

345 else: 

346 # Either this collection doesn't have any datasets yet, or the 

347 # only datasets it has were created by some other process since 

348 # the last call to refresh. We assume the former; the user is 

349 # responsible for calling refresh if they want to read 

350 # concurrently-written things. We do remember this in the 

351 # cache. 

352 summary = CollectionSummary() 

353 self._cache[collection.key] = summary 

354 return summary
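
The CHAINED branch above never caches: a chain's summary is recomputed on every call as the union of its children's summaries. A toy sketch of that union semantics; ToySummary is a hypothetical stand-in, since the real CollectionSummary tracks dataset types and per-dimension governor values with a richer API.

class ToySummary:
    """Hypothetical stand-in for CollectionSummary."""

    def __init__(self, dataset_types=(), instruments=()):
        self.dataset_types = set(dataset_types)
        self.instruments = set(instruments)

    @classmethod
    def union(cls, *others):
        # Mirror the idea behind CollectionSummary.union: a chain may
        # contain anything that any of its children may contain.
        result = cls()
        for other in others:
            result.dataset_types |= other.dataset_types
            result.instruments |= other.instruments
        return result


run1 = ToySummary({"raw"}, {"LATISS"})
run2 = ToySummary({"calexp"}, {"LSSTCam"})
chain = ToySummary.union(run1, run2)
assert chain.dataset_types == {"raw", "calexp"}
assert chain.instruments == {"LATISS", "LSSTCam"}

Because the union is conservative (a superset of what is actually present), it preserves the same may-contain guarantee that the cache comments in update rely on.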