Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 98%

81 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


from __future__ import annotations

__all__ = (
    "CollectionSummaryManager",
)

from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    TypeVar,
)

import sqlalchemy

from lsst.daf.butler import (
    DatasetType,
    ddl,
    GovernorDimension,
    NamedKeyDict,
    NamedKeyMapping,
    NamedValueSet,
)
from lsst.daf.butler import addDimensionForeignKey
from lsst.daf.butler.registry.interfaces import (
    ChainedCollectionRecord,
    CollectionManager,
    CollectionRecord,
    Database,
    DimensionRecordStorageManager,
    StaticTablesContext,
)
from ..._collectionType import CollectionType
from ...summaries import CollectionSummary, GovernorDimensionRestriction

_T = TypeVar("_T")



class CollectionSummaryTables(Generic[_T]):
    """Structure that holds the table or table-specification objects that
    summarize the contents of collections.

    Parameters
    ----------
    datasetType : _T
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions : `NamedKeyMapping` [ `GovernorDimension`, _T ]
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """
    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        self.datasetType = datasetType
        self.dimensions = dimensions


    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # Spec for collection_summary_dataset_type.
        datasetTypeTableSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE")
        datasetTypeTableSpec.fields.add(
            ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True)
        )
        datasetTypeTableSpec.foreignKeys.append(
            ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",),
                               onDelete="CASCADE")
        )
        # Specs for collection_summary_<dimension>.
        dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for dimension in dimensions.universe.getGovernorDimensions():
            tableSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(tableSpec, dimension, primaryKey=True)
            dimensionTableSpecs[dimension] = tableSpec
        return CollectionSummaryTables(
            datasetType=datasetTypeTableSpec,
            dimensions=dimensionTableSpecs.freeze(),
        )


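# Illustrative sketch (an editorial addition, not part of the original
# module): roughly the DDL that makeTableSpecs implies, assuming governor
# dimensions named ``instrument`` and ``skymap`` and a collection key column
# named ``collection_id`` (both are assumptions here; the actual names come
# from the dimension universe and the CollectionManager):
#
#   CREATE TABLE collection_summary_dataset_type (
#       collection_id BIGINT NOT NULL
#           REFERENCES collection (collection_id) ON DELETE CASCADE,
#       dataset_type_id BIGINT NOT NULL
#           REFERENCES dataset_type (id) ON DELETE CASCADE,
#       PRIMARY KEY (collection_id, dataset_type_id)
#   );
#   CREATE TABLE collection_summary_instrument (
#       collection_id BIGINT NOT NULL
#           REFERENCES collection (collection_id) ON DELETE CASCADE,
#       instrument VARCHAR NOT NULL REFERENCES instrument (name),
#       PRIMARY KEY (collection_id, instrument)
#   );
#
# with one such table per governor dimension.

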

class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in a collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    """
    def __init__(
        self,
        db: Database, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.sql.Table],
    ):
        self._db = db
        self._collections = collections
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        self._cache: Dict[Any, CollectionSummary] = {}


    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict({
                dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                for dimension, spec in specs.dimensions.items()
            }).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
        )


    def update(
        self,
        collection: CollectionRecord,
        datasetType: DatasetType,
        dataset_type_id: int,
        governors: GovernorDimensionRestriction,
    ) -> None:
        """Update the summary tables to associate the given collection with
        a dataset type and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        datasetType : `DatasetType`
            DatasetType instance to associate with this collection.
        dataset_type_id : `int`
            Integer ID for the dataset type to associate with this collection.
        governors : `GovernorDimensionRestriction`
            Mapping from `GovernorDimension` to the sets of values those
            dimensions may have in the data IDs of the datasets in this
            collection.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        self._db.ensure(
            self._tables.datasetType,
            {
                "dataset_type_id": dataset_type_id,
                self._collectionKeyName: collection.key,
            }
        )
        for dimension, values in governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension.name],
                    *[{
                        self._collectionKeyName: collection.key,
                        dimension.name: v,
                    } for v in values],
                )
        # Update the in-memory cache, too.  These changes will remain even if
        # the database inserts above are rolled back by some later exception
        # in the same transaction, but that's okay: we never promise that a
        # CollectionSummary has _just_ the dataset types and governor
        # dimension values that are actually present, only that it is
        # guaranteed to contain any dataset types or governor dimension
        # values that _may_ be present.
        # That guarantee (and the possibility of rollbacks) also means we
        # can't get away with checking the cache before we try the database
        # inserts: if someone had attempted to insert datasets of some
        # dataset type previously, and that rolled back, and we're now trying
        # to insert some more datasets of that same type, it would not be
        # okay to skip the DB summary table insertions because we found
        # entries in the in-memory cache.
        summary = self.get(collection)
        summary.datasetTypes.add(datasetType)
        summary.dimensions.update(governors)

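    # Illustrative sketch (an editorial addition, not part of the original
    # module): per the Notes above, ``update`` is meant to piggy-back on the
    # caller's transaction; all names below are hypothetical:
    #
    #   with db.transaction():
    #       insert_datasets(...)                    # the "real" operation
    #       manager.update(collection_record, dataset_type,
    #                      dataset_type_id, governors)
    #
    # If the outer transaction rolls back, the summary rows vanish with it,
    # while the in-memory cache keeps a superset; that is consistent with
    # the guarantee documented above.
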

    def refresh(self, get_dataset_type: Callable[[int], DatasetType]) -> None:
        """Load all collection summary information from the database.

        Parameters
        ----------
        get_dataset_type : `Callable`
            Function that takes an `int` dataset_type_id value and returns a
            `DatasetType` instance.
        """
        # Set up the SQL query we'll use to fetch all of the summary
        # information at once.
        columns = [
            self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName),
            self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"),
        ]
        fromClause = self._tables.datasetType
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        # Run the query and construct CollectionSummary objects from the
        # result rows.  This will never include CHAINED collections or
        # collections with no datasets.
        summaries: Dict[Any, CollectionSummary] = {}
        for row in self._db.query(sql).mappings():
            # The collection key should never be None/NULL; it's what we
            # join on.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the
            # first table we joined.
            datasetType = get_dataset_type(row["dataset_type_id"])
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary(
                    datasetTypes=NamedValueSet([datasetType]),
                    dimensions=GovernorDimensionRestriction.makeEmpty(self._dimensions.universe),
                )
                summaries[collectionKey] = summary
            else:
                summary.datasetTypes.add(datasetType)
            # Update the dimensions with the values in this row that aren't
            # None/NULL (many will be, in general, because these enter the
            # query via LEFT OUTER JOIN).
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.dimensions.add(dimension, value)
        self._cache = summaries

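    # Illustrative sketch (an editorial addition, not part of the original
    # module): for assumed governor dimensions ``instrument`` and ``skymap``
    # and an assumed collection key column ``collection_id``, the query
    # built above is roughly:
    #
    #   SELECT t.collection_id, t.dataset_type_id, i.instrument, s.skymap
    #   FROM collection_summary_dataset_type AS t
    #       LEFT OUTER JOIN collection_summary_instrument AS i
    #           ON t.collection_id = i.collection_id
    #       LEFT OUTER JOIN collection_summary_skymap AS s
    #           ON t.collection_id = s.collection_id
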

    def get(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        summary = self._cache.get(collection.key)
        if summary is None:
            # When we load the summary information from the database, we
            # don't create summaries for CHAINED collections; those are
            # created here as needed, and *never* cached - we have no good
            # way to update those summaries when a new dataset is added to a
            # child collection.
            if collection.type is CollectionType.CHAINED:
                assert isinstance(collection, ChainedCollectionRecord)
                child_summaries = [
                    self.get(self._collections.find(child))
                    for child in collection.children
                ]
                # Coverage note: this condition was never false in the test
                # suite, so the empty-summary branch below was never
                # exercised.
                if child_summaries:
                    summary = CollectionSummary.union(*child_summaries)
                else:
                    summary = CollectionSummary.makeEmpty(self._dimensions.universe)
            else:
                # Either this collection doesn't have any datasets yet, or
                # the only datasets it has were created by some other process
                # since the last call to refresh.  We assume the former; the
                # user is responsible for calling refresh if they want to
                # read concurrently-written things.  We do remember this in
                # the cache.
                summary = CollectionSummary.makeEmpty(self._dimensions.universe)
                self._cache[collection.key] = summary
        return summary
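

# Illustrative usage sketch (an editorial addition, not part of the original
# module); ``db``, ``context``, ``collections``, ``dimensions``,
# ``get_dataset_type``, and ``collection_record`` are assumed to come from
# the Registry's other managers:
#
#   manager = CollectionSummaryManager.initialize(
#       db, context, collections=collections, dimensions=dimensions,
#   )
#   manager.refresh(get_dataset_type)           # bulk-load all summaries
#   summary = manager.get(collection_record)    # superset of what's present
#   if some_dataset_type in summary.datasetTypes:
#       ...  # the collection *may* contain datasets of this type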