Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 84%

97 statements  

coverage.py v6.5.0, created at 2022-12-01 19:54 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

from collections import namedtuple
from contextlib import contextmanager
import copy
from typing import cast, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Type, TYPE_CHECKING

import sqlalchemy

from lsst.daf.butler import DatasetRef, ddl, NamedValueSet, StoredDatastoreItemInfo
from lsst.daf.butler.registry.interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from lsst.daf.butler.registry.opaque import ByNameOpaqueTableStorage
from lsst.daf.butler.registry.bridge.ephemeral import EphemeralDatastoreRegistryBridge

if TYPE_CHECKING:  # coverage: condition is never true at runtime
    from lsst.daf.butler import DimensionUniverse
    from lsst.daf.butler.registry.interfaces import (
        Database,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        StaticTablesContext,
    )

_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ]
)

# This has to be updated on every schema change.
_VERSION = VersionTuple(0, 2, 0)


def _makeTableSpecs(datasets: Type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for the tables used by the monolithic
    datastore bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore."
        ),
        fields=NamedValueSet([
            ddl.FieldSpec(
                name="datastore_name",
                dtype=sqlalchemy.String,
                length=256,
                primaryKey=True,
                nullable=False,
                doc="Name of the Datastore this entry corresponds to.",
            ),
        ]),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )
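

# For orientation, a sketch of the DDL the two specs above roughly describe;
# the exact dataset_id column type and the referenced dataset table actually
# come from the DatasetRecordStorageManager, so this is illustrative rather
# than authoritative:
#
#   CREATE TABLE dataset_location (
#       datastore_name VARCHAR(256) NOT NULL,
#       dataset_id <id-type> NOT NULL,
#       PRIMARY KEY (datastore_name, dataset_id),
#       FOREIGN KEY (dataset_id) REFERENCES dataset (...)
#   );
#   -- dataset_location_trash: identical columns and primary key, but no
#   -- foreign key constraint, so trash rows can outlive their dataset rows.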


class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """
    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> List[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        into a list of dictionaries that match the schema of the tables used
        by this class.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id" keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid the self.check() round trip via queries like
        #   INSERT INTO dataset_location_trash
        #     SELECT datastore_name, dataset_id FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        #   DELETE FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of queries
        # right now (see the sketch after this class).
        rows = self._refsToRows(self.check(refs))
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        byId = {ref.getCheckedId(): ref for ref in refs}
        sql = sqlalchemy.sql.select(
            self._tables.dataset_location.columns.dataset_id
        ).select_from(
            self._tables.dataset_location
        ).where(
            sqlalchemy.sql.and_(
                self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                self._tables.dataset_location.columns.dataset_id.in_(byId.keys())
            )
        )
        for row in self._db.query(sql).fetchall():
            yield byId[row.dataset_id]
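
    # The query built above corresponds to SQL along these lines (a sketch;
    # the placeholders stand in for bound values):
    #   SELECT dataset_id FROM dataset_location
    #   WHERE datastore_name = :datastore_name AND dataset_id IN (:dataset_ids);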

    @contextmanager
    def emptyTrash(self, records_table: Optional[OpaqueTableStorage] = None,
                   record_class: Optional[Type[StoredDatastoreItemInfo]] = None,
                   record_column: Optional[str] = None,
                   ) -> Iterator[Tuple[Iterable[Tuple[DatasetIdRef,
                                                      Optional[StoredDatastoreItemInfo]]],
                                       Optional[Set[str]]]]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:  # coverage: condition was never true
            raise ValueError("This implementation requires a records table.")

        assert isinstance(records_table, ByNameOpaqueTableStorage), \
            f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:  # coverage: condition was never true
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
                         ) -> sqlalchemy.sql.FromClause:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(
                location_table.columns.datastore_name == self.datastoreName
            )

        # SELECT records.* FROM records
        #   JOIN dataset_location_trash
        #     ON records.dataset_id = dataset_location_trash.dataset_id
        #   WHERE dataset_location_trash.datastore_name = :datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run the query and transform the results into a list of dicts that
        # we can later use to delete these rows.
        rows = [dict(**row, datastore_name=self.datastoreName)
                for row in self._db.query(info_in_trash).mappings()]

        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. We can only do this check
        # if the caller provides a column name that can map to multiple
        # refs.
        preserved: Optional[Set[str]] = None
        if record_column is not None:  # coverage: condition was never false
            # Some helper subqueries.
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for values (e.g. paths) that are referenced both by
            # datasets in the trash and by datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(
                items_in_trash.columns[record_column]
            ).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column]
                )
            )
            preserved = {row[record_column]
                         for row in self._db.query(items_to_preserve).mappings()}

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from the datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row))
                   for row in rows)

        # Start the context manager and return the results.
        yield id_info, preserved

        # No exception was raised in the context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"],
                             *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(self._tables.dataset_location_trash, ["dataset_id", "datastore_name"],
                        *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]}
                          for row in rows])
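

# A hedged sketch of the single-round-trip form described in the TODO inside
# MonolithicDatastoreRegistryBridge.moveToTrash, written with plain SQLAlchemy
# Core. This helper is hypothetical (not part of daf_butler) and assumes a
# Database interface that could execute both statements in one transaction.
def _moveToTrashSqlSketch(tables: _TablesTuple, datastoreName: str,
                          datasetIds: Iterable,
                          ) -> Tuple[sqlalchemy.sql.expression.Insert,
                                     sqlalchemy.sql.expression.Delete]:
    # Shared WHERE clause selecting this datastore's rows for the given IDs;
    # the ID type comes from the DatasetRecordStorageManager.
    where = sqlalchemy.sql.and_(
        tables.dataset_location.columns.datastore_name == datastoreName,
        tables.dataset_location.columns.dataset_id.in_(list(datasetIds)),
    )
    # INSERT INTO dataset_location_trash
    #   SELECT datastore_name, dataset_id FROM dataset_location WHERE ...
    copy_to_trash = tables.dataset_location_trash.insert().from_select(
        ["datastore_name", "dataset_id"],
        sqlalchemy.sql.select(
            tables.dataset_location.columns.datastore_name,
            tables.dataset_location.columns.dataset_id,
        ).where(where),
    )
    # DELETE FROM dataset_location WHERE ...
    delete_original = tables.dataset_location.delete().where(where)
    return copy_to_trash, delete_original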


class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type of the dataset ID column.
    """
    def __init__(self, *, db: Database, tables: _TablesTuple,
                 opaque: OpaqueTableStorageManager, universe: DimensionUniverse,
                 datasetIdColumnType: type):
        super().__init__(opaque=opaque, universe=universe, datasetIdColumnType=datasetIdColumnType)
        self._db = db
        self._tables = tables
        self._ephemeral: Dict[str, EphemeralDatastoreRegistryBridge] = {}

    @classmethod
    def initialize(cls, db: Database, context: StaticTablesContext, *,
                   opaque: OpaqueTableStorageManager,
                   datasets: Type[DatasetRecordStorageManager],
                   universe: DimensionUniverse,
                   ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(db=db, tables=cast(_TablesTuple, tables), opaque=opaque, universe=universe,
                   datasetIdColumnType=datasets.getIdColumnType())

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)

    def findDatastores(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = sqlalchemy.sql.select(
            self._tables.dataset_location.columns.datastore_name
        ).select_from(
            self._tables.dataset_location
        ).where(
            self._tables.dataset_location.columns.dataset_id == ref.getCheckedId()
        )
        for row in self._db.query(sql).mappings():
            yield row[self._tables.dataset_location.columns.datastore_name]
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name
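
    # The query above is roughly (a sketch):
    #   SELECT datastore_name FROM dataset_location WHERE dataset_id = :dataset_id;
    # Ephemeral datastores have no rows in that table, so their in-memory
    # bridges are consulted separately in the second loop above.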

    @classmethod
    def currentVersion(cls) -> Optional[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return _VERSION

    def schemaDigest(self) -> Optional[str]:
        # Docstring inherited from VersionedExtension.
        return self._defaultSchemaDigest(self._tables, self._db.dialect)
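

# A hedged usage sketch, not part of this module's API: how a Datastore might
# drive a bridge obtained from the manager. The datastore name and `refs` are
# hypothetical stand-ins, and a real datastore would follow up with
# emptyTrash(), passing its own opaque records table and record class.
def _bridgeUsageSketch(manager: MonolithicDatastoreRegistryBridgeManager,
                       refs: List[DatasetRef]) -> None:
    # Non-ephemeral bridges share the two monolithic tables; passing
    # ephemeral=True would instead return an in-memory bridge.
    bridge = manager.register("FileDatastore@<root>")
    bridge.insert(refs)                  # record that these datasets exist here
    present = list(bridge.check(refs))   # subset the registry actually tracks
    bridge.moveToTrash(present)          # stage those datasets for deletion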