Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 28%

105 statements  

coverage.py v7.5.0, created at 2024-04-26 02:47 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

from ... import ddl

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

import copy
from collections import namedtuple
from collections.abc import Iterable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, cast

import sqlalchemy

from ..._named import NamedValueSet
from ...datastore.stored_file_info import StoredDatastoreItemInfo
from ..interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from ..opaque import ByNameOpaqueTableStorage
from .ephemeral import EphemeralDatastoreRegistryBridge

if TYPE_CHECKING:
    from ...datastore import DatastoreTransaction
    from ...dimensions import DimensionUniverse
    from ..interfaces import (
        Database,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        StaticTablesContext,
    )

_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ],
)

# This has to be updated on every schema change.
_VERSION = VersionTuple(0, 2, 0)

def _makeTableSpecs(datasets: type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for tables used by the monolithic datastore
    bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore."
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )

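# Illustrative sketch, not part of the original module: ``initialize`` below
# turns these specs into real tables via ``StaticTablesContext.addTableTuple``.
# The resulting schema is roughly equivalent to
#
#     CREATE TABLE dataset_location (
#         datastore_name VARCHAR(256) NOT NULL,
#         dataset_id ...,  -- type and FK added by addDatasetForeignKey
#         PRIMARY KEY (datastore_name, dataset_id)
#     );
#
# with dataset_location_trash identical except that the foreign-key
# constraint on dataset_id is omitted (constraint=False above), so trash
# rows need not reference live dataset entries.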

class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> list[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        to a list of dictionaries that match the schema of the tables used
        by this class.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id"
            keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.id} for ref in refs]
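    # Illustrative example, not part of the original module (the datastore
    # name below is hypothetical): for a bridge constructed with
    # datastoreName="store1", two refs would map to
    #
    #     [
    #         {"datastore_name": "store1", "dataset_id": refs[0].id},
    #         {"datastore_name": "store1", "dataset_id": refs[1].id},
    #     ]
    #
    # i.e. one row per ref, keyed exactly like the compound primary key of
    # the location tables.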

    def ensure(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.ensure(self._tables.dataset_location, *self._refsToRows(refs))

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef], transaction: DatastoreTransaction | None) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid the self.check() call via queries like
        #   INSERT INTO dataset_location_trash
        #       SELECT datastore_name, dataset_id FROM dataset_location
        #       WHERE datastore_name=? AND dataset_id IN (?);
        #   DELETE FROM dataset_location
        #       WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of
        # queries right now.
        rows = self._refsToRows(self.check(refs))
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)
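    # Illustrative sketch, not part of the original module, of the
    # SQLAlchemy equivalent of the TODO above, assuming the Database
    # interface grew support for INSERT ... SELECT (``ids`` is a
    # hypothetical collection of dataset IDs):
    #
    #     selected = sqlalchemy.sql.select(
    #         self._tables.dataset_location.columns.datastore_name,
    #         self._tables.dataset_location.columns.dataset_id,
    #     ).where(
    #         self._tables.dataset_location.columns.datastore_name == self.datastoreName,
    #         self._tables.dataset_location.columns.dataset_id.in_(ids),
    #     )
    #     insert = sqlalchemy.insert(self._tables.dataset_location_trash).from_select(
    #         ["datastore_name", "dataset_id"], selected
    #     )
    #
    # followed by the corresponding DELETE, all in one transaction.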

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        byId = {ref.id: ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.fetchall()
        for row in sql_rows:
            yield byId[row.dataset_id]
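    # Illustrative rendering, not part of the original module (the datastore
    # name is hypothetical): for three refs the query above compiles to
    # roughly
    #
    #     SELECT dataset_id FROM dataset_location
    #     WHERE datastore_name = 'store1'
    #       AND dataset_id IN (:id_1, :id_2, :id_3)
    #
    # and only refs whose IDs come back are yielded, so check() acts as a
    # set intersection with this datastore's stored datasets.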

    @contextmanager
    def emptyTrash(
        self,
        records_table: OpaqueTableStorage | None = None,
        record_class: type[StoredDatastoreItemInfo] | None = None,
        record_column: str | None = None,
    ) -> Iterator[tuple[Iterable[tuple[DatasetIdRef, StoredDatastoreItemInfo | None]], set[str] | None]]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.Select:
            # mypy needs the narrowed type re-established inside the closure.
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # SELECT records.dataset_id, records.path FROM records
        # JOIN dataset_location_trash
        #     ON records.dataset_id == dataset_location_trash.dataset_id
        # WHERE dataset_location_trash.datastore_name = datastoreName

        # We may end up with a ref listed in the trash table that is not
        # listed in the records table.  Such an inconsistency would be
        # missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run the query and transform the results into a list of dicts that
        # we can later use to delete.
        with self._db.query(info_in_trash) as sql_result:
            rows = [dict(row, datastore_name=self.datastoreName) for row in sql_result.mappings()]

        # It is possible for trashed refs to be linked to artifacts that are
        # still associated with refs that are not being trashed.  We need to
        # identify those artifacts and tell the caller to retain them.  This
        # check is only possible if the caller provides a column name that
        # can map to multiple refs.
        preserved: set[str] | None = None
        if record_column is not None:
            # Some helper subqueries.
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced both by datasets in the
            # trash and by datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            with self._db.query(items_to_preserve) as sql_result:
                preserved = {row[record_column] for row in sql_result.mappings()}

        # Convert the results to a tuple of id+info and a record of the
        # artifacts that should not be deleted from the datastore.  The
        # id+info tuple exists solely so that logging can report the
        # relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start the context manager and return the results.
        yield id_info, preserved

        # No exception was raised in the context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )
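    # Illustrative usage sketch, not part of the original module; the
    # variable names and the record class are assumptions:
    #
    #     with bridge.emptyTrash(
    #         records_table=opaque_storage,
    #         record_class=StoredFileInfo,
    #         record_column="path",
    #     ) as (trashed, preserved):
    #         for ref, info in trashed:
    #             if preserved and info.path in preserved:
    #                 continue  # artifact still referenced by a live ref
    #             ...  # delete the artifact from the datastore
    #
    # On a clean exit from the with-block, the bridge removes the records
    # and trash rows; if the block raises, nothing is deleted from the
    # tables.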

class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for the dataset ID column.
    registry_schema_version : `VersionTuple` or `None`, optional
        The version of the registry schema.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasetIdColumnType,
            registry_schema_version=registry_schema_version,
        )
        self._db = db
        self._tables = tables
        self._ephemeral: dict[str, EphemeralDatastoreRegistryBridge] = {}

    def clone(self, *, db: Database, opaque: OpaqueTableStorageManager) -> DatastoreRegistryBridgeManager:
        return MonolithicDatastoreRegistryBridgeManager(
            db=db,
            tables=self._tables,
            opaque=opaque,
            universe=self.universe,
            datasetIdColumnType=self.datasetIdColumnType,
            registry_schema_version=self._registry_schema_version,
        )

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(
            db=db,
            tables=cast(_TablesTuple, tables),
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
            registry_schema_version=registry_schema_version,
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)
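    # Illustrative usage, not part of the original module (the names are
    # hypothetical):
    #
    #     bridge = manager.register("inMemoryDatastore", ephemeral=True)
    #     # -> a shared EphemeralDatastoreRegistryBridge kept in Python memory
    #     bridge = manager.register("fileDatastore")
    #     # -> a MonolithicDatastoreRegistryBridge over the shared tables
    #
    # Non-ephemeral bridges are constructed anew on every call: all of their
    # state lives in the database, so nothing needs to be cached here.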

    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.datastore_name)
            .select_from(self._tables.dataset_location)
            .where(self._tables.dataset_location.columns.dataset_id == ref.id)
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            yield row[self._tables.dataset_location.columns.datastore_name]
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name

    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return [_VERSION]