Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 98%

80 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-14 15:54 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("CollectionSummaryManager",) 

25 

26from typing import Any, Callable, Dict, Generic, TypeVar 

27 

28import sqlalchemy 

29from lsst.daf.butler import ( 

30 DatasetType, 

31 GovernorDimension, 

32 NamedKeyDict, 

33 NamedKeyMapping, 

34 NamedValueSet, 

35 addDimensionForeignKey, 

36 ddl, 

37) 

38from lsst.daf.butler.registry.interfaces import ( 

39 ChainedCollectionRecord, 

40 CollectionManager, 

41 CollectionRecord, 

42 Database, 

43 DimensionRecordStorageManager, 

44 StaticTablesContext, 

45) 

46 

47from ..._collectionType import CollectionType 

48from ...summaries import CollectionSummary, GovernorDimensionRestriction 

49 

_T = TypeVar("_T")


class CollectionSummaryTables(Generic[_T]):
    """Struct holding the tables (or table specifications) that summarize
    the contents of collections.

    Parameters
    ----------
    datasetType
        Table [specification] that summarizes which dataset types are in each
        collection.
    dimensions
        Mapping of table [specifications] that summarize which governor
        dimension values are present in the data IDs of each collection.
    """

    def __init__(
        self,
        datasetType: _T,
        dimensions: NamedKeyMapping[GovernorDimension, _T],
    ):
        # Table (or spec) for collection_summary_dataset_type.
        self.datasetType = datasetType
        # Per-governor-dimension tables (or specs), keyed by dimension.
        self.dimensions = dimensions

    @classmethod
    def makeTableSpecs(
        cls,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryTables[ddl.TableSpec]:
        """Create specifications for all summary tables.

        Parameters
        ----------
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        tables : `CollectionSummaryTables` [ `ddl.TableSpec` ]
            Structure containing table specifications.
        """
        # collection_summary_dataset_type: one row per
        # (collection, dataset type) pair, with both columns forming the
        # primary key and cascading deletes from their parent tables.
        typeSpec = ddl.TableSpec(fields=[])
        collections.addCollectionForeignKey(typeSpec, primaryKey=True, onDelete="CASCADE")
        typeSpec.fields.add(ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True))
        typeSpec.foreignKeys.append(
            ddl.ForeignKeySpec("dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE")
        )
        # collection_summary_<dimension>: one table per governor dimension,
        # each row a (collection, dimension value) pair.
        governorSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]()
        for governor in dimensions.universe.getGovernorDimensions():
            governorSpec = ddl.TableSpec(fields=[])
            collections.addCollectionForeignKey(governorSpec, primaryKey=True, onDelete="CASCADE")
            addDimensionForeignKey(governorSpec, governor, primaryKey=True)
            governorSpecs[governor] = governorSpec
        return CollectionSummaryTables(
            datasetType=typeSpec,
            dimensions=governorSpecs.freeze(),
        )

117 

118 

class CollectionSummaryManager:
    """Object that manages the summaries of which dataset types and governor
    dimension values are present in each collection.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    collections : `CollectionManager`
        Manager object for the collections in this `Registry`.
    dimensions : `DimensionRecordStorageManager`
        Manager object for the dimensions in this `Registry`.
    tables : `CollectionSummaryTables`
        Struct containing the tables that hold collection summaries.
    """

    def __init__(
        self,
        db: Database,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        tables: CollectionSummaryTables[sqlalchemy.sql.Table],
    ):
        self._db = db
        self._collections = collections
        # Name of the collection foreign-key column; used both as a SQL
        # column name and as a dict key when building rows to insert.
        self._collectionKeyName = collections.getCollectionForeignKeyName()
        self._dimensions = dimensions
        self._tables = tables
        # In-memory cache of summaries keyed by collection primary key;
        # populated in bulk by refresh() and lazily/incrementally by get()
        # and update().
        self._cache: Dict[Any, CollectionSummary] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> CollectionSummaryManager:
        """Create all summary tables (or check that they have been created),
        returning an object to manage them.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `CollectionSummaryManager`
            New manager object for collection summaries.
        """
        # Turn the table specs into actual (declared) tables, one for the
        # dataset-type summary and one per governor dimension.
        specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)
        tables = CollectionSummaryTables(
            datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType),
            dimensions=NamedKeyDict(
                {
                    dimension: context.addTable(f"collection_summary_{dimension.name}", spec)
                    for dimension, spec in specs.dimensions.items()
                }
            ).freeze(),
        )
        return cls(
            db=db,
            collections=collections,
            dimensions=dimensions,
            tables=tables,
        )

    def update(
        self,
        collection: CollectionRecord,
        datasetType: DatasetType,
        dataset_type_id: int,
        governors: GovernorDimensionRestriction,
    ) -> None:
        """Update the summary tables to associate the given collection with
        a dataset type and governor dimension values.

        Parameters
        ----------
        collection : `CollectionRecord`
            Collection whose summary should be updated.
        datasetType : `DatasetType`
            DatasetType instance to associate with this collection.
        dataset_type_id : `int`
            Integer ID for the dataset type to associate with this collection.
        governors : `GovernorDimensionRestriction`
            Mapping from `GovernorDimension` to sets of values they
            may be associated with in the data IDs of the datasets in this
            collection.

        Notes
        -----
        This method should only be called inside the transaction context of
        another operation that inserts or associates datasets.
        """
        # ensure() inserts only if the row is not already present, so
        # repeated updates for the same association are cheap no-ops.
        self._db.ensure(
            self._tables.datasetType,
            {
                "dataset_type_id": dataset_type_id,
                self._collectionKeyName: collection.key,
            },
        )
        for dimension, values in governors.items():
            if values:
                self._db.ensure(
                    self._tables.dimensions[dimension.name],
                    *[{self._collectionKeyName: collection.key, dimension.name: v} for v in values],
                )
        # Update the in-memory cache, too.  These changes will remain even if
        # the database inserts above are rolled back by some later exception in
        # the same transaction, but that's okay: we never promise that a
        # CollectionSummary has _just_ the dataset types and governor dimension
        # values that are actually present, only that it is guaranteed to
        # contain any dataset types or governor dimension values that _may_ be
        # present.
        # That guarantee (and the possibility of rollbacks) means we can't get
        # away with checking the cache before we try the database inserts,
        # however; if someone had attempted to insert datasets of some dataset
        # type previously, and that rolled back, and we're now trying to insert
        # some more datasets of that same type, it would not be okay to skip
        # the DB summary table insertions because we found entries in the
        # in-memory cache.
        summary = self.get(collection)
        summary.datasetTypes.add(datasetType)
        summary.dimensions.update(governors)

    def refresh(self, get_dataset_type: Callable[[int], DatasetType]) -> None:
        """Load all collection summary information from the database.

        Parameters
        ----------
        get_dataset_type : `Callable`
            Function that takes an `int` dataset_type_id value and returns a
            `DatasetType` instance.
        """
        # Set up the SQL query we'll use to fetch all of the summary
        # information at once: the dataset-type summary table LEFT OUTER
        # JOINed (on the collection key) with every per-dimension table.
        columns = [
            self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName),
            self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"),
        ]
        fromClause = self._tables.datasetType
        for dimension, table in self._tables.dimensions.items():
            columns.append(table.columns[dimension.name].label(dimension.name))
            fromClause = fromClause.join(
                table,
                onclause=(
                    self._tables.datasetType.columns[self._collectionKeyName]
                    == table.columns[self._collectionKeyName]
                ),
                isouter=True,
            )
        sql = sqlalchemy.sql.select(*columns).select_from(fromClause)
        # Run the query and construct CollectionSummary objects from the result
        # rows.  This will never include CHAINED collections or collections
        # with no datasets.
        summaries: Dict[Any, CollectionSummary] = {}
        for row in self._db.query(sql).mappings():
            # Collection key should never be None/NULL; it's what we join on.
            # Extract that and then turn it into a collection name.
            collectionKey = row[self._collectionKeyName]
            # dataset_type_id should also never be None/NULL; it's in the first
            # table we joined.
            datasetType = get_dataset_type(row["dataset_type_id"])
            # See if we have a summary already for this collection; if not,
            # make one.
            summary = summaries.get(collectionKey)
            if summary is None:
                summary = CollectionSummary(
                    datasetTypes=NamedValueSet([datasetType]),
                    dimensions=GovernorDimensionRestriction.makeEmpty(self._dimensions.universe),
                )
                summaries[collectionKey] = summary
            else:
                summary.datasetTypes.add(datasetType)
            # Update the dimensions with the values in this row that aren't
            # None/NULL (many will be in general, because these enter the query
            # via LEFT OUTER JOIN).
            for dimension in self._tables.dimensions:
                value = row[dimension.name]
                if value is not None:
                    summary.dimensions.add(dimension, value)
        # Replace the cache wholesale; anything not seen by the query (e.g.
        # empty collections) will be lazily re-created by get().
        self._cache = summaries

    def get(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        summary = self._cache.get(collection.key)
        if summary is None:
            # When we load the summary information from the database, we don't
            # create summaries for CHAINED collections; those are created here
            # as needed, and *never* cached - we have no good way to update
            # those summaries when a new dataset is added to a child
            # collection.
            if collection.type is CollectionType.CHAINED:
                assert isinstance(collection, ChainedCollectionRecord)
                # A chain's summary is the union of its children's summaries,
                # computed recursively.
                child_summaries = [self.get(self._collections.find(child)) for child in collection.children]
                if child_summaries:
                    summary = CollectionSummary.union(*child_summaries)
                else:
                    summary = CollectionSummary.makeEmpty(self._dimensions.universe)
            else:
                # Either this collection doesn't have any datasets yet, or the
                # only datasets it has were created by some other process since
                # the last call to refresh.  We assume the former; the user is
                # responsible for calling refresh if they want to read
                # concurrently-written things.  We do remember this in the
                # cache.
                summary = CollectionSummary.makeEmpty(self._dimensions.universe)
                self._cache[collection.key] = summary
        return summary