Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 23%


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

import copy
from collections import namedtuple
from contextlib import contextmanager
from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Type, cast

import sqlalchemy

from ...core import NamedValueSet, StoredDatastoreItemInfo, ddl
from ..interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from ..opaque import ByNameOpaqueTableStorage
from .ephemeral import EphemeralDatastoreRegistryBridge

if TYPE_CHECKING:
    from ...core import DimensionUniverse
    from ...core.datastore import DatastoreTransaction
    from ..interfaces import (
        Database,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        StaticTablesContext,
    )

_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ],
)

# This has to be updated on every schema change.
_VERSION = VersionTuple(0, 2, 0)


def _makeTableSpecs(datasets: Type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for the tables used by the monolithic
    datastore bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """

    # We want the dataset_location and dataset_location_trash tables to have
    # the same definition, aside from the behavior of their link to the
    # dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore."
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )
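# Illustration (not executed; the exact dataset_id column type is an
# assumption here, since it comes from datasets.addDatasetForeignKey()):
# the specs above correspond to schema roughly like
#
#     CREATE TABLE dataset_location (
#         datastore_name VARCHAR(256) NOT NULL,
#         dataset_id BIGINT NOT NULL REFERENCES dataset (id),
#         PRIMARY KEY (datastore_name, dataset_id)
#     );
#
# with dataset_location_trash identical except that it omits the foreign key
# constraint, so trash rows can outlive the datasets they point to.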


class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> List[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        into a list of dictionaries that match the schema of the tables used
        by this class.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id"
            keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]
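    # Worked example of the row shape (values hypothetical): for a bridge
    # named "FileDatastore@<root>" and refs with IDs 1 and 2, this returns
    #     [{"datastore_name": "FileDatastore@<root>", "dataset_id": 1},
    #      {"datastore_name": "FileDatastore@<root>", "dataset_id": 2}]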

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef], transaction: Optional[DatastoreTransaction]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid the self.check() call via queries like
        #     INSERT INTO dataset_location_trash
        #         SELECT datastore_name, dataset_id FROM dataset_location
        #         WHERE datastore_name=? AND dataset_id IN (?);
        #     DELETE FROM dataset_location
        #         WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of
        # queries right now.
        rows = self._refsToRows(self.check(refs))
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)
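    # Behavior sketch (IDs hypothetical): self.check() filters the input, so
    # trashing refs {1, 2} when only 1 has a dataset_location row moves just
    # the row for 1; the DELETE and INSERT run atomically inside the single
    # self._db.transaction() block above.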

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        byId = {ref.getCheckedId(): ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.fetchall()
        for row in sql_rows:
            yield byId[row.dataset_id]
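    # Usage sketch (hypothetical caller): check() yields the subset of the
    # given refs known to this datastore, so a membership test reads
    #     present = set(bridge.check(refs))
    #     missing = [ref for ref in refs if ref not in present]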

    @contextmanager
    def emptyTrash(
        self,
        records_table: Optional[OpaqueTableStorage] = None,
        record_class: Optional[Type[StoredDatastoreItemInfo]] = None,
        record_column: Optional[str] = None,
    ) -> Iterator[
        Tuple[Iterable[Tuple[DatasetIdRef, Optional[StoredDatastoreItemInfo]]], Optional[Set[str]]]
    ]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.Select:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # SELECT records.* FROM records
        #     JOIN dataset_location_trash
        #         ON records.dataset_id == dataset_location_trash.dataset_id
        #     WHERE dataset_location_trash.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run the query and transform the results into a list of dicts that
        # we can later use to delete.
        with self._db.query(info_in_trash) as sql_result:
            rows = [dict(**row, datastore_name=self.datastoreName) for row in sql_result.mappings()]

        # It is possible for trashed refs to be linked to artifacts that are
        # still associated with refs that are not to be trashed. We need to
        # be careful to consider those and indicate to the caller that those
        # artifacts should be retained. This check is only possible if the
        # caller provides a column name that can map to multiple refs.
        preserved: Optional[Set[str]] = None
        if record_column is not None:
            # Some helper subqueries.
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced both by datasets in the
            # trash and by datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            with self._db.query(items_to_preserve) as sql_result:
                preserved = {row[record_column] for row in sql_result.mappings()}
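            # Worked example (hypothetical): if trashed ref 1 and live ref 2
            # are stored in the same file and record_column is "path", that
            # shared path matches both subqueries, lands in `preserved`, and
            # tells the caller not to delete the artifact.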

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from the datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start the context manager, returning the results.
        yield id_info, preserved

        # No exception was raised in the context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )
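    # Minimal usage sketch for emptyTrash() (caller names hypothetical;
    # assumes a file-based datastore whose records carry a "path" column):
    #     with bridge.emptyTrash(
    #         records_table=records, record_class=StoredFileInfo, record_column="path"
    #     ) as (trashed, preserved):
    #         for ref, info in trashed:
    #             if preserved is None or info.path not in preserved:
    #                 remove_artifact(info.path)
    #     # On normal exit, the trash and records rows are deleted.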


class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for the dataset ID column.
    registry_schema_version : `VersionTuple`, optional
        Version of the registry schema in use.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasetIdColumnType,
            registry_schema_version=registry_schema_version,
        )
        self._db = db
        self._tables = tables
        self._ephemeral: Dict[str, EphemeralDatastoreRegistryBridge] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: Type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(
            db=db,
            tables=cast(_TablesTuple, tables),
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
            registry_schema_version=registry_schema_version,
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)
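    # For example (datastore names hypothetical): an in-memory datastore gets
    # a process-local ephemeral bridge, while file-based datastores share the
    # monolithic tables:
    #     mem = manager.register("InMemoryDatastore@<butlerRoot>", ephemeral=True)
    #     files = manager.register("FileDatastore@<butlerRoot>")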

    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.datastore_name)
            .select_from(self._tables.dataset_location)
            .where(self._tables.dataset_location.columns.dataset_id == ref.getCheckedId())
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            yield row[self._tables.dataset_location.columns.datastore_name]
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name
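    # Usage sketch (hypothetical): list everywhere a dataset is stored,
    # including any ephemeral datastores registered in this process:
    #     names = list(manager.findDatastores(ref))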

    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return [_VERSION]