Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 91%

103 statements  

coverage.py v7.3.2, created at 2023-12-05 11:05 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from .... import ddl

__all__ = ("CollectionSummaryManager",)

from collections.abc import Callable, Iterable, Mapping
from typing import Any, Generic, TypeVar

import sqlalchemy

from ...._dataset_type import DatasetType
from ...._named import NamedKeyDict, NamedKeyMapping
from ....dimensions import GovernorDimension, addDimensionForeignKey
from ..._caching_context import CachingContext
from ..._collection_summary import CollectionSummary
from ..._collection_type import CollectionType
from ...interfaces import (
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)
from ...wildcards import CollectionWildcard

_T = TypeVar("_T")



class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : `_T`
        Table (or table specification) that summarizes which dataset types
        are in each collection.
    dimensions : `NamedKeyMapping` [ `GovernorDimension`, `_T` ]
        Mapping of tables (or table specifications) that summarize which
        governor dimension values are present in the data IDs of each
        collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """

        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.governor_dimensions:
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )
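
    # Schematically (an illustrative sketch, not the emitted DDL), the specs
    # built above describe tables shaped like:
    #
    #   collection_summary_dataset_type(
    #       <collection FK> PRIMARY KEY,
    #       dataset_type_id PRIMARY KEY
    #           REFERENCES dataset_type (id) ON DELETE CASCADE,
    #   )
    #   collection_summary_<governor>(
    #       <collection FK> PRIMARY KEY,
    #       <governor value FK> PRIMARY KEY,
    #   )
    #
    # where <collection FK> is whatever column `addCollectionForeignKey`
    # chooses; its name depends on the collection manager implementation.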


class CollectionSummaryManager:
    """Object that manages summaries of which dataset types and governor
    dimension values are present in each collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    dataset_type_table : `sqlalchemy.schema.Table`
        Table containing dataset type definitions.
    caching_context : `CachingContext`
        Object controlling caching of information returned by managers.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.schema.Table],
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        self._dataset_type_table = dataset_type_table
        self._caching_context = caching_context

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        dataset_type_table : `sqlalchemy.schema.Table`
            Table containing dataset type definitions.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """

        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
            dataset_type_table=dataset_type_table,
            caching_context=caching_context,
        )
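
    # A minimal wiring sketch (illustrative only; `db`, `collections`,
    # `dimensions`, `dataset_type_table`, and `caching_context` are assumed
    # to be supplied by the surrounding `Registry` construction code):
    #
    #   with db.declareStaticTables(create=True) as context:
    #       manager = CollectionSummaryManager.initialize(
    #           db,
    #           context,
    #           collections=collections,
    #           dimensions=dimensions,
    #           dataset_type_table=dataset_type_table,
    #           caching_context=caching_context,
    #       )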

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `~collections.abc.Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store.  Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """

        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
        for dimension, values in summary.governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )
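
    # A minimal usage sketch (illustrative only; `manager`, `record`,
    # `type_ids`, and `summary` are assumed to exist in the calling
    # dataset-insertion code, which per the Notes above must already hold an
    # open transaction):
    #
    #   with db.transaction():
    #       ...  # insert or associate the datasets themselves
    #       manager.update(record, type_ids, summary)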

    def fetch_summaries(
        self,
        collections: Iterable[CollectionRecord],
        dataset_type_names: Iterable[str] | None,
        dataset_type_factory: Callable[[sqlalchemy.engine.RowMapping], DatasetType],
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries for the given collection records and
        dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [ `CollectionRecord` ]
            Collection records to query.
        dataset_type_names : `~collections.abc.Iterable` [ `str` ] or `None`
            Names of the dataset types to include in the returned summaries.
            If `None`, all dataset types are included.
        dataset_type_factory : `~collections.abc.Callable`
            Callable that takes a dataset_type table row and makes a
            `DatasetType` instance out of it.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [ `Any`, `CollectionSummary` ]
            Collection summaries indexed by collection record key.  This
            mapping also contains summaries for all non-chained collections
            nested inside any chained collections that were passed in.
        """

        summaries: dict[Any, CollectionSummary] = {}
        # Check what we have in cache first.
        if self._caching_context.collection_summaries is not None:
            summaries, missing_keys = self._caching_context.collection_summaries.find_summaries(
                [record.key for record in collections]
            )
            if not missing_keys:
                return summaries
            else:
                collections = [record for record in collections if record.key in missing_keys]

        # Need to expand all chained collections first.
        non_chains: list[CollectionRecord] = []
        chains: dict[CollectionRecord, list[CollectionRecord]] = {}
        for collection in collections:
            if collection.type is CollectionType.CHAINED:
                children = self._collections.resolve_wildcard(
                    CollectionWildcard.from_names([collection.name]),
                    flatten_chains=True,
                    include_chains=False,
                )
                non_chains += children
                chains[collection] = children
            else:
                non_chains.append(collection)

        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        coll_col = self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName)
        dataset_type_id_col = self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id")
        columns = [coll_col, dataset_type_id_col] + list(self._dataset_type_table.columns)
        fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType.join(
            self._dataset_type_table
        )
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
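
        # Schematically (for illustration), each result row is now
        #   (<collection key>, dataset_type_id, <dataset_type columns...>,
        #    <one value column per governor dimension>)
        # where the governor columns may be NULL because their tables enter
        # the query via LEFT OUTER JOINs.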

        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        sql = sql.where(coll_col.in_([coll.key for coll in non_chains]))
        # For caching we need to fetch complete summaries.
        if self._caching_context.collection_summaries is None:
            if dataset_type_names is not None:
                sql = sql.where(self._dataset_type_table.columns["name"].in_(dataset_type_names))

        # Run the query and construct CollectionSummary objects from the
        # result rows.  This will never include CHAINED collections or
        # collections with no datasets.

        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        dataset_type_ids: dict[int, DatasetType] = {}
        for row in sql_rows:
            # The collection key should never be None/NULL; it's what we
            # join on.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the
            # first table we joined.
            dataset_type_id = row["dataset_type_id"]
            if (dataset_type := dataset_type_ids.get(dataset_type_id)) is None:
                dataset_type_ids[dataset_type_id] = dataset_type = dataset_type_factory(row)
            # See if we already have a summary for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary()
                summaries[collectionKey] = summary
            # Update the summary with the values in this row that aren't
            # None/NULL (many will be, in general, because these enter the
            # query via LEFT OUTER JOIN).
            summary.dataset_types.add(dataset_type)
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.governors.setdefault(dimension.name, set()).add(value)

        # Add an empty summary for any missing collection.
        for collection in non_chains:
            if collection.key not in summaries:
                summaries[collection.key] = CollectionSummary()

        # Merge the children's summaries into those of their chains.
        for chain, children in chains.items():
            summaries[chain.key] = CollectionSummary.union(*(summaries[child.key] for child in children))

        if self._caching_context.collection_summaries is not None:
            self._caching_context.collection_summaries.update(summaries)

        return summaries
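
# A minimal usage sketch of `fetch_summaries` (illustrative only; `manager`
# is an initialized CollectionSummaryManager, `records` an iterable of
# CollectionRecord, and `make_dataset_type` a callable matching the
# `dataset_type_factory` signature above):
#
#   summaries = manager.fetch_summaries(records, None, make_dataset_type)
#   for key, summary in summaries.items():
#       print(key, summary.dataset_types, summary.governors)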