Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 99%

106 statements  

coverage.py v7.4.0, created at 2024-01-16 10:43 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("CollectionSummaryManager",)

import logging
from collections.abc import Callable, Iterable, Mapping
from typing import Any, Generic, TypeVar

import sqlalchemy

from .... import ddl
from ...._dataset_type import DatasetType
from ...._named import NamedKeyDict, NamedKeyMapping
from ....dimensions import GovernorDimension, addDimensionForeignKey
from ..._caching_context import CachingContext
from ..._collection_summary import CollectionSummary
from ..._collection_type import CollectionType
from ...interfaces import (
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)
from ...wildcards import CollectionWildcard

_T = TypeVar("_T")


_LOG = logging.getLogger(__name__)


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : _T
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions : `NamedKeyMapping`
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
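        # What follows builds one link table for dataset types plus one per
        # governor dimension; a sketch of the resulting schema (the actual
        # collection key column name comes from the collection manager, e.g.
        # "collection_id", and the governor set depends on the dimension
        # universe):
        #
        #   collection_summary_dataset_type(collection_id PK/FK, dataset_type_id PK/FK)
        #   collection_summary_<governor>(collection_id PK/FK, <governor> PK/FK)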

        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.governor_dimensions:
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )


class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    dataset_type_table : `sqlalchemy.schema.Table`
        Table containing dataset type definitions.
    caching_context : `CachingContext`
        Object controlling caching of information returned by managers.
    """
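
    # Typical lifecycle (a sketch with placeholder names; in practice the
    # Registry wires this manager up through `initialize` during static-table
    # declaration rather than calling the constructor directly):
    #
    #     manager = CollectionSummaryManager.initialize(
    #         db,
    #         context,
    #         collections=collections,
    #         dimensions=dimensions,
    #         dataset_type_table=dataset_type_table,
    #         caching_context=caching_context,
    #     )
    #     manager.update(record, dataset_type_ids, summary)
    #     summaries = manager.fetch_summaries([record], None, factory)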

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.schema.Table],
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        self._dataset_type_table = dataset_type_table
        self._caching_context = caching_context

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        dataset_type_table : `sqlalchemy.schema.Table`
            Table containing dataset type definitions.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
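        # Per this method's contract, `context.addTable` declares each table,
        # creating it if needed or checking that it has already been created,
        # so this is safe to run against an already-initialized database.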

        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
            dataset_type_table=dataset_type_table,
            caching_context=caching_context,
        )

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `~collections.abc.Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store. Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
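        # `Database.ensure` is an idempotent insert: rows that would violate a
        # unique constraint are silently skipped. Re-running this for the same
        # collection is therefore safe, and a summary only ever grows.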

        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
        for dimension, values in summary.governors.items():
            # Coverage note: this condition was never false in the measured
            # test run; it is the one partial branch in this file.
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )

    def fetch_summaries(
        self,
        collections: Iterable[CollectionRecord],
        dataset_type_names: Iterable[str] | None,
        dataset_type_factory: Callable[[sqlalchemy.engine.RowMapping], DatasetType],
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries given their names and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [`CollectionRecord`]
            Collection records to query.
        dataset_type_names : `~collections.abc.Iterable` [`str`]
            Names of dataset types to include in the returned summaries. If
            `None`, all dataset types will be included.
        dataset_type_factory : `Callable`
            Callable that makes a `DatasetType` instance out of a dataset_type
            table row.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
            Collection summaries indexed by collection record key. This
            mapping will also contain all nested non-chained collections of
            the chained collections.
        """
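        # Overall flow: (1) serve whatever we can from the summary cache, (2)
        # flatten chained collections into their non-chained children, (3)
        # fetch the remaining summaries with a single LEFT-JOIN query, (4)
        # backfill empty summaries and recombine the chains, (5) refresh the
        # cache.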

        summaries: dict[Any, CollectionSummary] = {}
        # Check what we have in cache first.
        if self._caching_context.collection_summaries is not None:
            summaries, missing_keys = self._caching_context.collection_summaries.find_summaries(
                [record.key for record in collections]
            )
            if not missing_keys:
                return summaries
            else:
                collections = [record for record in collections if record.key in missing_keys]

        # Need to expand all chained collections first.
        non_chains: list[CollectionRecord] = []
        chains: dict[CollectionRecord, list[CollectionRecord]] = {}
        for collection in collections:
            if collection.type is CollectionType.CHAINED:
                children = self._collections.resolve_wildcard(
                    CollectionWildcard.from_names([collection.name]),
                    flatten_chains=True,
                    include_chains=False,
                )
                non_chains += children
                chains[collection] = children
            else:
                non_chains.append(collection)

        _LOG.debug("Fetching summaries for collections %s.", [record.name for record in non_chains])

        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        coll_col = self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName)
        dataset_type_id_col = self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id")
        columns = [coll_col, dataset_type_id_col] + list(self._dataset_type_table.columns)
        fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType.join(
            self._dataset_type_table
        )
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )

        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        sql = sql.where(coll_col.in_([coll.key for coll in non_chains]))
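        # The statement built above is roughly the following (a sketch only,
        # assuming the collection key column is named "collection_id" and
        # "instrument" is one of the governor dimensions):
        #
        #     SELECT s.collection_id, s.dataset_type_id, dt.*, si.instrument
        #     FROM collection_summary_dataset_type AS s
        #         JOIN dataset_type AS dt ON dt.id = s.dataset_type_id
        #         LEFT OUTER JOIN collection_summary_instrument AS si
        #             ON si.collection_id = s.collection_id
        #     WHERE s.collection_id IN (...)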

        # For caching we need to fetch complete summaries, so the dataset type
        # filter is only applied when caching is disabled.
        if self._caching_context.collection_summaries is None:
            if dataset_type_names is not None:
                sql = sql.where(self._dataset_type_table.columns["name"].in_(dataset_type_names))

        # Run the query and construct CollectionSummary objects from the result
        # rows. This will never include CHAINED collections or collections
        # with no datasets.
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        dataset_type_ids: dict[int, DatasetType] = {}
        for row in sql_rows:
            # Collection key should never be None/NULL; it's what we join on.
            # Extract that and then turn it into a collection name.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the first
            # table we joined.
            dataset_type_id = row["dataset_type_id"]
            if (dataset_type := dataset_type_ids.get(dataset_type_id)) is None:
                dataset_type_ids[dataset_type_id] = dataset_type = dataset_type_factory(row)
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary()
                summaries[collectionKey] = summary
            # Update the dimensions with the values in this row that aren't
            # None/NULL (many will be None in general, because these enter the
            # query via LEFT OUTER JOIN).
            summary.dataset_types.add(dataset_type)
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.governors.setdefault(dimension.name, set()).add(value)

        # Add an empty summary for any missing collection.
        for collection in non_chains:
            if collection.key not in summaries:
                summaries[collection.key] = CollectionSummary()

        # Merge the children's summaries into those of their parent chains.
        for chain, children in chains.items():
            summaries[chain.key] = CollectionSummary.union(*(summaries[child.key] for child in children))
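        # (`CollectionSummary.union` produces a summary whose dataset-type and
        # governor-value sets are the unions of its arguments', so a chain
        # reports everything any of its children contain.)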

        if self._caching_context.collection_summaries is not None:
            self._caching_context.collection_summaries.update(summaries)

        return summaries