Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 27% (101 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

import copy
from collections import namedtuple
from collections.abc import Iterable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, cast

import sqlalchemy

from ...core import NamedValueSet, StoredDatastoreItemInfo, ddl
from ..interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from ..opaque import ByNameOpaqueTableStorage
from .ephemeral import EphemeralDatastoreRegistryBridge

if TYPE_CHECKING:
    from ...core import DimensionUniverse
    from ...core.datastore import DatastoreTransaction
    from ..interfaces import (
        Database,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        StaticTablesContext,
    )

_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ],
)

# This has to be updated on every schema change.
_VERSION = VersionTuple(0, 2, 0)


def _makeTableSpecs(datasets: type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for tables used by the monolithic datastore
    bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables to have
    # the same definition, aside from the behavior of their link to the
    # dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores.  The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore."
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )

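
# A minimal illustrative sketch (not executed anywhere) of the asymmetry the
# deepcopy-plus-foreign-key dance above produces.  ``MyDatasetManager`` is a
# hypothetical `DatasetRecordStorageManager` subclass standing in for the
# real manager, and the ``foreignKeys`` attribute is assumed from
# `ddl.TableSpec`:
#
#     specs = _makeTableSpecs(MyDatasetManager)
#     # Both tables get identical columns (datastore_name + dataset_id)...
#     assert [f.name for f in specs.dataset_location.fields] == [
#         f.name for f in specs.dataset_location_trash.fields
#     ]
#     # ...but only dataset_location constrains dataset_id to the dataset
#     # table, so trash rows can outlive the datasets they point to.
#     assert specs.dataset_location.foreignKeys
#     assert not specs.dataset_location_trash.foreignKeys
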

class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> list[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        to a list of dictionaries that match the schema of the tables used by
        this class.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id"
            keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.id} for ref in refs]

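
    # For illustration (hypothetical names and IDs; the dataset ID type
    # depends on the dataset manager), a bridge named "FileDatastore" would
    # turn two refs into rows like:
    #
    #     [{"datastore_name": "FileDatastore", "dataset_id": 101},
    #      {"datastore_name": "FileDatastore", "dataset_id": 102}]
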

    def ensure(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.ensure(self._tables.dataset_location, *self._refsToRows(refs))

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef], transaction: DatastoreTransaction | None) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid self.check() call via queries like
        #   INSERT INTO dataset_location_trash
        #       SELECT datastore_name, dataset_id FROM dataset_location
        #       WHERE datastore_name=? AND dataset_id IN (?);
        #   DELETE FROM dataset_location
        #       WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of
        # queries right now.
        rows = self._refsToRows(self.check(refs))
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)


    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        byId = {ref.id: ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.fetchall()
        for row in sql_rows:
            yield byId[row.dataset_id]

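
    # Illustrative use (``bridge`` and ``refs`` are hypothetical): because
    # ``check`` yields only the subset of ``refs`` recorded for this
    # datastore, it can cheaply partition refs by presence:
    #
    #     present = set(bridge.check(refs))
    #     missing = set(refs) - present
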

    @contextmanager
    def emptyTrash(
        self,
        records_table: OpaqueTableStorage | None = None,
        record_class: type[StoredDatastoreItemInfo] | None = None,
        record_column: str | None = None,
    ) -> Iterator[tuple[Iterable[tuple[DatasetIdRef, StoredDatastoreItemInfo | None]], set[str] | None]]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.Select:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # The query built below is equivalent to:
        #
        #   SELECT records.* FROM records
        #   JOIN dataset_location_trash
        #       ON records.dataset_id = dataset_location_trash.dataset_id
        #   WHERE dataset_location_trash.datastore_name = <datastoreName>

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table.  Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run the query and transform the results into a list of dicts that
        # we can later use to delete.
        with self._db.query(info_in_trash) as sql_result:
            rows = [dict(row, datastore_name=self.datastoreName) for row in sql_result.mappings()]

        # It is possible for trashed refs to be linked to artifacts that are
        # still associated with refs that are not to be trashed.  We need to
        # be careful to consider those and indicate to the caller that those
        # artifacts should be retained.  We can only do this check if the
        # caller provides a column name that can map to multiple refs.
        preserved: set[str] | None = None
        if record_column is not None:
            # Some helper subqueries.
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced both by datasets in the
            # trash and by datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            with self._db.query(items_to_preserve) as sql_result:
                preserved = {row[record_column] for row in sql_result.mappings()}

        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from the datastore.  The id+info tuple
        # exists solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start the context manager block, returning the results.
        yield id_info, preserved

        # No exception was raised in the context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )

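
    # A minimal sketch of the intended calling pattern.  ``bridge``,
    # ``records``, ``MyInfo`` (a `StoredDatastoreItemInfo` subclass with a
    # ``path`` attribute), and the artifact-deletion step are hypothetical
    # stand-ins for what a `Datastore` implementation would supply:
    #
    #     with bridge.emptyTrash(records, record_class=MyInfo, record_column="path") as result:
    #         trashed, preserved = result
    #         for ref, info in trashed:
    #             if preserved is None or info.path not in preserved:
    #                 ...  # delete the artifact backing ``ref`` from storage
    #     # On a clean exit the bridge deletes the trashed rows from both the
    #     # records table and dataset_location_trash; if the body raises,
    #     # nothing is deleted from either table.
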


class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for the dataset ID column.
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of the registry schema this manager was loaded with.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasetIdColumnType,
            registry_schema_version=registry_schema_version,
        )
        self._db = db
        self._tables = tables
        self._ephemeral: dict[str, EphemeralDatastoreRegistryBridge] = {}


    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(
            db=db,
            tables=cast(_TablesTuple, tables),
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
            registry_schema_version=registry_schema_version,
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)

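
    # For example (names are illustrative), persistent datastores share the
    # two monolithic tables while ephemeral ones get an in-memory bridge
    # cached per name:
    #
    #     disk = manager.register("FileDatastore@<butlerRoot>")
    #     mem = manager.register("InMemoryDatastore@cache", ephemeral=True)
    #     assert manager.register("InMemoryDatastore@cache", ephemeral=True) is mem
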

    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.datastore_name)
            .select_from(self._tables.dataset_location)
            .where(self._tables.dataset_location.columns.dataset_id == ref.id)
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            yield row[self._tables.dataset_location.columns.datastore_name]
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name

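
    # Note the ordering this implies: all persistent datastore names (from
    # the shared dataset_location table) are yielded before any ephemeral
    # ones, e.g. (hypothetical names):
    #
    #     names = list(manager.findDatastores(ref))
    #     # -> ["FileDatastore@<butlerRoot>", "InMemoryDatastore@cache"]
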

    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return [_VERSION]