Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 99%
107 statements (coverage.py v7.4.4, created at 2024-03-30 09:58 +0000)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from .... import ddl

__all__ = ("CollectionSummaryManager",)

import logging
from collections.abc import Callable, Iterable, Mapping
from typing import Any, Generic, TypeVar

import sqlalchemy

from ...._dataset_type import DatasetType
from ...._named import NamedKeyDict, NamedKeyMapping
from ....dimensions import GovernorDimension, addDimensionForeignKey
from ..._caching_context import CachingContext
from ..._collection_summary import CollectionSummary
from ..._collection_type import CollectionType
from ...interfaces import (
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)
from ...wildcards import CollectionWildcard

_T = TypeVar("_T")


_LOG = logging.getLogger(__name__)


class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : _T
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions : `NamedKeyMapping`
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec(
                "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE"
            )
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.governor_dimensions:
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )
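
# Illustrative sketch (not executed): in the default dimension universe,
# where ``instrument`` and ``skymap`` are the governor dimensions, the specs
# above correspond to summary tables roughly like
#
#   collection_summary_dataset_type(collection_id, dataset_type_id)
#   collection_summary_instrument(collection_id, instrument)
#   collection_summary_skymap(collection_id, skymap)
#
# with composite primary keys and ON DELETE CASCADE foreign keys on every
# column.  The actual collection key column name comes from
# CollectionManager.addCollectionForeignKey, so ``collection_id`` is an
# assumption used for illustration.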


class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in each collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    dataset_type_table : `sqlalchemy.schema.Table`
        Table containing dataset type definitions.
    caching_context : `CachingContext`
        Object controlling caching of information returned by managers.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        tables: CollectionSummaryTables[sqlalchemy.schema.Table],
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._tables = tables
        self._dataset_type_table = dataset_type_table
        self._caching_context = caching_context

    def clone(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        caching_context: CachingContext,
    ) -> CollectionSummaryManager:
        """Make an independent copy of this manager instance bound to new
        instances of `Database` and other managers.

        Parameters
        ----------
        db : `Database`
            New `Database` object to use when instantiating the manager.
        collections : `CollectionManager`
            New `CollectionManager` object to use when instantiating the
            manager.
        caching_context : `CachingContext`
            New `CachingContext` object to use when instantiating the manager.

        Returns
        -------
        instance : `CollectionSummaryManager`
            New manager instance with the same configuration as this instance,
            but bound to a new `Database` object.
        """
        return CollectionSummaryManager(
            db=db,
            collections=collections,
            tables=self._tables,
            dataset_type_table=self._dataset_type_table,
            caching_context=caching_context,
        )

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        dataset_type_table: sqlalchemy.schema.Table,
        caching_context: CachingContext,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        dataset_type_table : `sqlalchemy.schema.Table`
            Table containing dataset type definitions.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict[GovernorDimension, sqlalchemy.schema.Table](
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            tables=tables,
            dataset_type_table=dataset_type_table,
            caching_context=caching_context,
        )
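
    # Illustrative sketch (not executed): ``initialize`` is designed to run
    # while static tables are being declared, roughly like
    #
    #   with db.declareStaticTables(create=True) as context:
    #       manager = CollectionSummaryManager.initialize(
    #           db,
    #           context,
    #           collections=collections,
    #           dimensions=dimensions,
    #           dataset_type_table=dataset_type_table,
    #           caching_context=caching_context,
    #       )
    #
    # The variables other than ``db`` and ``context`` are assumptions standing
    # in for objects the calling dataset manager already holds.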

    def update(
        self,
        collection: CollectionRecord,
        dataset_type_ids: Iterable[int],
        summary: CollectionSummary,
    ) -> None:
        """Update the summary tables to associate the given collection with
        the given dataset types and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        dataset_type_ids : `~collections.abc.Iterable` [ `int` ]
            Integer IDs for the dataset types to associate with this
            collection.
        summary : `CollectionSummary`
            Summary to store.  Dataset types must correspond to
            ``dataset_type_ids``.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        self._db.ensure(
            self._tables.datasetType,
            *[
                {
                    "dataset_type_id": dataset_type_id,
                    self._collectionKeyName: collection.key,
                }
                for dataset_type_id in dataset_type_ids
            ],
        )
        for dimension, values in summary.governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension],
                    *[{self._collectionKeyName: collection.key, dimension: v} for v in values],
                )
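
    # Illustrative sketch (not executed): ``Database.ensure`` inserts rows
    # only if they are not already present, so ``update`` is idempotent and
    # safe to call repeatedly within the caller's transaction.  Recording two
    # dataset types for one collection would issue roughly
    #
    #   db.ensure(
    #       tables.datasetType,
    #       {"dataset_type_id": 1, "collection_id": 42},
    #       {"dataset_type_id": 2, "collection_id": 42},
    #   )
    #
    # where ``collection_id`` stands in for whatever name
    # getCollectionForeignKeyName returns.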

    def fetch_summaries(
        self,
        collections: Iterable[CollectionRecord],
        dataset_type_names: Iterable[str] | None,
        dataset_type_factory: Callable[[sqlalchemy.engine.RowMapping], DatasetType],
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries for the given collections and dataset
        types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [ `CollectionRecord` ]
            Collection records to query.
        dataset_type_names : `~collections.abc.Iterable` [ `str` ] or `None`
            Names of the dataset types to include in the returned summaries.
            If `None`, all dataset types will be included.
        dataset_type_factory : `~collections.abc.Callable`
            Callable that takes a table row and makes a `DatasetType` instance
            from it.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [ `Any`, `CollectionSummary` ]
            Collection summaries indexed by collection record key.  This
            mapping also contains summaries for all nested non-chained
            children of any chained collections that were queried.
        """
        summaries: dict[Any, CollectionSummary] = {}
        # Check what we have in the cache first.
        if self._caching_context.collection_summaries is not None:
            summaries, missing_keys = self._caching_context.collection_summaries.find_summaries(
                [record.key for record in collections]
            )
            if not missing_keys:
                return summaries
            else:
                collections = [record for record in collections if record.key in missing_keys]

        # Need to expand all chained collections first.
        non_chains: list[CollectionRecord] = []
        chains: dict[CollectionRecord, list[CollectionRecord]] = {}
        for collection in collections:
            if collection.type is CollectionType.CHAINED:
                children = self._collections.resolve_wildcard(
                    CollectionWildcard.from_names([collection.name]),
                    flatten_chains=True,
                    include_chains=False,
                )
                non_chains += children
                chains[collection] = children
            else:
                non_chains.append(collection)
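
        # Illustrative sketch: a CHAINED collection "coadds" with RUN members
        # "run1" and "run2" (hypothetical names) would yield
        #
        #   non_chains = [run1_record, run2_record]
        #   chains = {coadds_record: [run1_record, run2_record]}
        #
        # so the query below only ever sees non-chained collections; the
        # chain's summary is reassembled afterwards as the union of its
        # members' summaries.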

        _LOG.debug("Fetching summaries for collections %s.", [record.name for record in non_chains])

        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        coll_col = self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName)
        dataset_type_id_col = self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id")
        columns = [coll_col, dataset_type_id_col] + list(self._dataset_type_table.columns)
        fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType.join(
            self._dataset_type_table
        )
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
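
        # Illustrative sketch of the statement being assembled (the SELECT
        # and WHERE are applied just below), assuming a ``collection_id`` key
        # column and ``instrument``/``skymap`` governor dimensions (both
        # assumptions for illustration):
        #
        #   SELECT cs.collection_id, cs.dataset_type_id, dt.*,
        #          i.instrument, s.skymap
        #   FROM collection_summary_dataset_type cs
        #       JOIN dataset_type dt ON cs.dataset_type_id = dt.id
        #       LEFT OUTER JOIN collection_summary_instrument i
        #           ON cs.collection_id = i.collection_id
        #       LEFT OUTER JOIN collection_summary_skymap s
        #           ON cs.collection_id = s.collection_id
        #   WHERE cs.collection_id IN (...)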

        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        sql = sql.where(coll_col.in_([coll.key for coll in non_chains]))
        # For caching we need to fetch complete summaries, so only apply the
        # dataset type filter when the cache is disabled.
        if self._caching_context.collection_summaries is None:
            if dataset_type_names is not None:
                sql = sql.where(self._dataset_type_table.columns["name"].in_(dataset_type_names))

        # Run the query and construct CollectionSummary objects from the
        # result rows.  This will never include CHAINED collections or
        # collections with no datasets.
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        dataset_type_ids: dict[int, DatasetType] = {}
        for row in sql_rows:
            # The collection key should never be None/NULL; it's the column
            # we join on, so extract it to index the summary mapping.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the
            # first table we joined.
            dataset_type_id = row["dataset_type_id"]
            if (dataset_type := dataset_type_ids.get(dataset_type_id)) is None:
                dataset_type_ids[dataset_type_id] = dataset_type = dataset_type_factory(row)
            # See if we already have a summary for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary()
                summaries[collectionKey] = summary
            # Update the summary with the values in this row that aren't
            # None/NULL (many will be None in general, because these enter
            # the query via LEFT OUTER JOIN).
            summary.dataset_types.add(dataset_type)
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.governors.setdefault(dimension.name, set()).add(value)

        # Add an empty summary for any missing collection.
        for collection in non_chains:
            if collection.key not in summaries:
                summaries[collection.key] = CollectionSummary()

        # Merge the children's summaries into their chains' summaries.
        for chain, children in chains.items():
            summaries[chain.key] = CollectionSummary.union(*(summaries[child.key] for child in children))

        if self._caching_context.collection_summaries is not None:
            self._caching_context.collection_summaries.update(summaries)

        return summaries
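

# Illustrative usage sketch (not executed): given a manager instance and a
# factory that turns a dataset_type table row into a DatasetType (here a
# hypothetical ``make_dataset_type_from_row``),
#
#   summaries = manager.fetch_summaries(
#       [collection_record],
#       dataset_type_names=None,  # None means include all dataset types
#       dataset_type_factory=make_dataset_type_from_row,
#   )
#   summary = summaries[collection_record.key]
#
# A chained collection in the input also yields entries for each of its
# non-chained children, and results are served from (and stored in) the
# collection-summary cache when one is active.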