Coverage for python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py: 97%

78 statements  

coverage.py v6.4.4, created at 2022-09-22 02:04 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("CollectionSummaryManager",) 

25 

26from typing import Any, Callable, Dict, Generic, Iterable, TypeVar 

27 

28import sqlalchemy 

29from lsst.daf.butler import ( 

30 DatasetType, 

31 GovernorDimension, 

32 NamedKeyDict, 

33 NamedKeyMapping, 

34 addDimensionForeignKey, 

35 ddl, 

36) 

37from lsst.daf.butler.registry.interfaces import ( 

38 ChainedCollectionRecord, 

39 CollectionManager, 

40 CollectionRecord, 

41 Database, 

42 DimensionRecordStorageManager, 

43 StaticTablesContext, 

44) 

45 

46from ..._collection_summary import CollectionSummary 

47from ..._collectionType import CollectionType 

48 

49_T = TypeVar("_T") 

50 

51 

52class CollectionSummaryTables(Generic[_T]): 

53 """Structure that holds the table or table specification objects that 

54 summarize the contents of collections. 

55 

56 Parameters 

57 ---------- 

58 datasetType 

59 Table [specification] that summarizes which dataset types are in each 

60 collection. 

61 dimensions 

62 Mapping of table [specifications] that summarize which governor 

63 dimension values are present in the data IDs of each collection. 

64 """ 

65 

66 def __init__( 

67 self, 

68 datasetType: _T, 

69 dimensions: NamedKeyMapping[GovernorDimension, _T], 

70 ): 

71 self.datasetType = datasetType 

72 self.dimensions = dimensions 

73 

74 @classmethod 

75 def makeTableSpecs( 

76 cls, 

77 collections: CollectionManager, 

78 dimensions: DimensionRecordStorageManager, 

79 ) -> CollectionSummaryTables[ddl.TableSpec]: 

80 """Create specifications for all summary tables. 

81 

82 Parameters 

83 ---------- 

84 collections : `CollectionManager` 

85 Manager object for the collections in this `Registry`. 

86 dimensions : `DimensionRecordStorageManager` 

87 Manager object for the dimensions in this `Registry`. 

88 

89 Returns 

90 ------- 

91 tables : `CollectionSummaryTables` [ `ddl.TableSpec` ] 

92 Structure containing table specifications. 

93 """ 

94 # Spec for collection_summary_dataset_type. 

95 datasetTypeTableSpec = ddl.TableSpec(fields=[]) 

96 collections.addCollectionForeignKey(datasetTypeTableSpec, primaryKey=True, onDelete="CASCADE") 

97 datasetTypeTableSpec.fields.add( 

98 ddl.FieldSpec("dataset_type_id", dtype=sqlalchemy.BigInteger, primaryKey=True) 

99 ) 

100 datasetTypeTableSpec.foreignKeys.append( 

101 ddl.ForeignKeySpec( 

102 "dataset_type", source=("dataset_type_id",), target=("id",), onDelete="CASCADE" 

103 ) 

104 ) 

105 # Specs for collection_summary_<dimension>. 

106 dimensionTableSpecs = NamedKeyDict[GovernorDimension, ddl.TableSpec]() 

107 for dimension in dimensions.universe.getGovernorDimensions(): 

108 tableSpec = ddl.TableSpec(fields=[]) 

109 collections.addCollectionForeignKey(tableSpec, primaryKey=True, onDelete="CASCADE") 

110 addDimensionForeignKey(tableSpec, dimension, primaryKey=True) 

111 dimensionTableSpecs[dimension] = tableSpec 

112 return CollectionSummaryTables( 

113 datasetType=datasetTypeTableSpec, 

114 dimensions=dimensionTableSpecs.freeze(), 

115 ) 

116 

117 
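For orientation, a sketch of how these specifications might be inspected once built; `collections` and `dimensions` are illustrative names for manager objects assumed to exist in the calling registry code:

# Sketch only: `collections` and `dimensions` are assumed to be CollectionManager
# and DimensionRecordStorageManager instances from the enclosing registry.
specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions)

# collection_summary_dataset_type: composite primary key of the collection
# foreign key plus dataset_type_id, with ON DELETE CASCADE on both.
print([field.name for field in specs.datasetType.fields])

# One collection_summary_<dimension> spec per governor dimension (e.g.
# instrument or skymap), keyed on the collection plus that dimension.
for dimension, spec in specs.dimensions.items():
    print(dimension.name, [field.name for field in spec.fields])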

118class CollectionSummaryManager: 

119 """Object manages the summaries of what dataset types and governor 

120 dimension values are present in a collection. 

121 

122 Parameters 

123 ---------- 

124 db : `Database` 

125 Interface to the underlying database engine and namespace. 

126 collections : `CollectionManager` 

127 Manager object for the collections in this `Registry`. 

128 dimensions : `DimensionRecordStorageManager` 

129 Manager object for the dimensions in this `Registry`. 

130 tables : `CollectionSummaryTables` 

131 Struct containing the tables that hold collection summaries. 

132 """ 

133 

134 def __init__( 

135 self, 

136 db: Database, 

137 *, 

138 collections: CollectionManager, 

139 dimensions: DimensionRecordStorageManager, 

140 tables: CollectionSummaryTables[sqlalchemy.sql.Table], 

141 ): 

142 self._db = db 

143 self._collections = collections 

144 self._collectionKeyName = collections.getCollectionForeignKeyName() 

145 self._dimensions = dimensions 

146 self._tables = tables 

147 self._cache: Dict[Any, CollectionSummary] = {} 

148 

149 @classmethod 

150 def initialize( 

151 cls, 

152 db: Database, 

153 context: StaticTablesContext, 

154 *, 

155 collections: CollectionManager, 

156 dimensions: DimensionRecordStorageManager, 

157 ) -> CollectionSummaryManager: 

158 """Create all summary tables (or check that they have been created), 

159 returning an object to manage them. 

160 

161 Parameters 

162 ---------- 

163 db : `Database` 

164 Interface to the underlying database engine and namespace. 

165 context : `StaticTablesContext` 

166 Context object obtained from `Database.declareStaticTables`; used 

167 to declare any tables that should always be present. 

168 collections : `CollectionManager` 

169 Manager object for the collections in this `Registry`. 

170 dimensions : `DimensionRecordStorageManager` 

171 Manager object for the dimensions in this `Registry`. 

172 

173 Returns 

174 ------- 

175 manager : `CollectionSummaryManager` 

176 New manager object for collection summaries. 

177 """ 

178 specs = CollectionSummaryTables.makeTableSpecs(collections, dimensions) 

179 tables = CollectionSummaryTables( 

180 datasetType=context.addTable("collection_summary_dataset_type", specs.datasetType), 

181 dimensions=NamedKeyDict( 

182 { 

183 dimension: context.addTable(f"collection_summary_{dimension.name}", spec) 

184 for dimension, spec in specs.dimensions.items() 

185 } 

186 ).freeze(), 

187 ) 

188 return cls( 

189 db=db, 

190 collections=collections, 

191 dimensions=dimensions, 

192 tables=tables, 

193 ) 

194 
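A hedged sketch of how a registry layer would obtain this manager during schema declaration; `db`, `collections`, and `dimensions` stand for the objects the enclosing registry already holds, and the `create=True` flag is an assumption about how the schema is being declared:

# Sketch only; the surrounding registry normally drives this.
with db.declareStaticTables(create=True) as context:
    summaries = CollectionSummaryManager.initialize(
        db,
        context,
        collections=collections,
        dimensions=dimensions,
    )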

195 def update( 

196 self, 

197 collection: CollectionRecord, 

198 dataset_type_ids: Iterable[int], 

199 summary: CollectionSummary, 

200 ) -> None: 

201 """Update the summary tables to associate the given collection with 

202 a dataset type and governor dimension values. 

203 

204 Parameters 

205 ---------- 

206 collection : `CollectionRecord` 

207 Collection whose summary should be updated. 

208 dataset_type_ids : `Iterable` [ `int` ] 

209 Integer IDs for the dataset types to associate with this 

210 collection. 

211 summary : `CollectionSummary` 

212 Summary to store. Dataset types must correspond to 

213 ``dataset_type_ids``. 

214 

215 Notes 

216 ----- 

217 This method should only be called inside the transaction context of 

218 another operation that inserts or associates datasets. 

219 """ 

220 self._db.ensure( 

221 self._tables.datasetType, 

222 *[ 

223 { 

224 "dataset_type_id": dataset_type_id, 

225 self._collectionKeyName: collection.key, 

226 } 

227 for dataset_type_id in dataset_type_ids 

228 ], 

229 ) 

230 for dimension, values in summary.governors.items(): 

231 if values:  231 ↛ 230 (line 231 didn't jump to line 230, because the condition on line 231 was never false)

232 self._db.ensure( 

233 self._tables.dimensions[dimension], 

234 *[{self._collectionKeyName: collection.key, dimension: v} for v in values], 

235 ) 

236 # Update the in-memory cache, too. These changes will remain even if 

237 # the database inserts above are rolled back by some later exception in 

238 # the same transaction, but that's okay: we never promise that a 

239 # CollectionSummary has _just_ the dataset types and governor dimension 

240 # values that are actually present, only that it is guaranteed to 

241 # contain any dataset types or governor dimension values that _may_ be 

242 # present. 

243 # That guarantee (and the possibility of rollbacks) means we can't get 

244 # away with checking the cache before we try the database inserts, 

245 # however; if someone had attempted to insert datasets of some dataset 

246 # type previously, and that rolled back, and we're now trying to insert 

247 # some more datasets of that same type, it would not be okay to skip 

248 # the DB summary table insertions because we found entries in the 

249 # in-memory cache. 

250 self.get(collection).update(summary) 

251 
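A minimal sketch of the intended call pattern; `manager`, `collection_record`, `type_ids`, and `summary` are illustrative names for objects the surrounding dataset-insertion code would already have:

# Illustrative only: this must run inside the same transaction that inserts
# or associates the datasets themselves (see the Notes above).
with db.transaction():
    # ... insert or associate the datasets into the collection here ...
    manager.update(collection_record, type_ids, summary)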

252 def refresh(self, get_dataset_type: Callable[[int], DatasetType]) -> None: 

253 """Load all collection summary information from the database. 

254 

255 Parameters 

256 ---------- 

257 get_dataset_type : `Callable` 

258 Function that takes an `int` dataset_type_id value and returns a 

259 `DatasetType` instance. 

260 """ 

261 # Set up the SQL query we'll use to fetch all of the summary 

262 # information at once. 

263 columns = [ 

264 self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName), 

265 self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id"), 

266 ] 

267 fromClause = self._tables.datasetType 

268 for dimension, table in self._tables.dimensions.items(): 

269 columns.append(table.columns[dimension.name].label(dimension.name)) 

270 fromClause = fromClause.join( 

271 table, 

272 onclause=( 

273 self._tables.datasetType.columns[self._collectionKeyName] 

274 == table.columns[self._collectionKeyName] 

275 ), 

276 isouter=True, 

277 ) 

278 sql = sqlalchemy.sql.select(*columns).select_from(fromClause) 

279 # Run the query and construct CollectionSummary objects from the result 

280 # rows. This will never include CHAINED collections or collections 

281 # with no datasets. 

282 summaries: Dict[Any, CollectionSummary] = {} 

283 for row in self._db.query(sql).mappings(): 

284 # Collection key should never be None/NULL; it's what we join on. 

285 # Extract it to use as the key into the summaries mapping below. 

286 collectionKey = row[self._collectionKeyName] 

287 # dataset_type_id should also never be None/NULL; it's in the first 

288 # table we joined. 

289 datasetType = get_dataset_type(row["dataset_type_id"]) 

290 # See if we have a summary already for this collection; if not, 

291 # make one. 

292 summary = summaries.get(collectionKey) 

293 if summary is None: 

294 summary = CollectionSummary() 

295 summaries[collectionKey] = summary 

296 # Update the dimensions with the values in this row that aren't 

297 # None/NULL (many will be None in general, because these enter the query 

298 # via LEFT OUTER JOIN). 

299 summary.dataset_types.add(datasetType) 

300 for dimension in self._tables.dimensions: 

301 value = row[dimension.name] 

302 if value is not None: 

303 summary.governors.setdefault(dimension.name, set()).add(value) 

304 self._cache = summaries 

305 
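In rough terms, the statement assembled above corresponds to SQL of the following shape; the table and column names below assume a collection key called collection_id and a single instrument governor dimension, both of which are illustrative:

# Approximate shape of the generated query (names are illustrative):
#
#   SELECT
#       collection_summary_dataset_type.collection_id,
#       collection_summary_dataset_type.dataset_type_id,
#       collection_summary_instrument.instrument
#   FROM collection_summary_dataset_type
#   LEFT OUTER JOIN collection_summary_instrument
#       ON collection_summary_dataset_type.collection_id
#          = collection_summary_instrument.collection_id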

306 def get(self, collection: CollectionRecord) -> CollectionSummary: 

307 """Return a summary for the given collection. 

308 

309 Parameters 

310 ---------- 

311 collection : `CollectionRecord` 

312 Record describing the collection for which a summary is to be 

313 retrieved. 

314 

315 Returns 

316 ------- 

317 summary : `CollectionSummary` 

318 Summary of the dataset types and governor dimension values in 

319 this collection. 

320 """ 

321 summary = self._cache.get(collection.key) 

322 if summary is None: 

323 # When we load the summary information from the database, we don't 

324 # create summaries for CHAINED collections; those are created here 

325 # as needed, and *never* cached - we have no good way to update 

326 those summaries when a new dataset is added to a child 

327 collection. 

328 if collection.type is CollectionType.CHAINED: 

329 assert isinstance(collection, ChainedCollectionRecord) 

330 child_summaries = [self.get(self._collections.find(child)) for child in collection.children] 

331 if child_summaries:  331 ↛ 334 (line 331 didn't jump to line 334, because the condition on line 331 was never false)

332 summary = CollectionSummary.union(*child_summaries) 

333 else: 

334 summary = CollectionSummary() 

335 else: 

336 # Either this collection doesn't have any datasets yet, or the 

337 # only datasets it has were created by some other process since 

338 # the last call to refresh. We assume the former; the user is 

339 # responsible for calling refresh if they want to read 

340 # concurrently-written things. We do remember this in the 

341 # cache. 

342 summary = CollectionSummary() 

343 self._cache[collection.key] = summary 

344 return summary
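
As a closing usage sketch (names are illustrative): a CHAINED collection's summary is assembled on demand from its children and never cached, and membership in any summary is conservative:

# Illustrative only: `manager` is a CollectionSummaryManager and
# `collections` is its CollectionManager.
chain_record = collections.find("my_chain")   # a CHAINED collection record
summary = manager.get(chain_record)           # union of the children's summaries

# A dataset type missing from the summary is guaranteed absent from the
# collection; one that is present may or may not actually have datasets there.
has_raw = any(dt.name == "raw" for dt in summary.dataset_types)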