Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 27%

101 statements  

coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

import copy
from collections import namedtuple
from collections.abc import Iterable, Iterator
from contextlib import contextmanager
from typing import TYPE_CHECKING, cast

import sqlalchemy

from ...core import NamedValueSet, StoredDatastoreItemInfo, ddl
from ..interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from ..opaque import ByNameOpaqueTableStorage
from .ephemeral import EphemeralDatastoreRegistryBridge

if TYPE_CHECKING:
    from ...core import DimensionUniverse
    from ...core.datastore import DatastoreTransaction
    from ..interfaces import (
        Database,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        StaticTablesContext,
    )

_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ],
)

# This has to be updated on every schema change.
_VERSION = VersionTuple(0, 2, 0)



def _makeTableSpecs(datasets: type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for tables used by the monolithic datastore
    bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore."
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )
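
# For reference, the schema produced above looks roughly like this sketch
# (illustrative only; the dataset_id column name and type actually come from
# the DatasetRecordStorageManager, e.g. an integer or UUID):
#
#   CREATE TABLE dataset_location (
#       datastore_name VARCHAR(256) NOT NULL,
#       dataset_id <id-type> NOT NULL REFERENCES dataset (id),
#       PRIMARY KEY (datastore_name, dataset_id)
#   );
#   -- dataset_location_trash is identical, except that it has no
#   -- REFERENCES constraint on dataset_id.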



class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> list[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        into a list of dictionaries that match the schema of the tables used
        by this class.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id"
            keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.id} for ref in refs]


    def ensure(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.ensure(self._tables.dataset_location, *self._refsToRows(refs))

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef], transaction: DatastoreTransaction | None) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid self.check() call via queries like
        #   INSERT INTO dataset_location_trash
        #     SELECT datastore_name, dataset_id FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        #   DELETE FROM dataset_location
        #     WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of
        # queries right now.
        rows = self._refsToRows(self.check(refs))
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)
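
    # If the Database interface ever grows support for INSERT ... SELECT, the
    # TODO above could be expressed in plain SQLAlchemy Core roughly as in
    # this sketch (illustrative only; `connection` and `ids` are hypothetical
    # here, since all SQL in this class goes through the `Database`
    # abstraction):
    #
    #   loc = self._tables.dataset_location
    #   trash = self._tables.dataset_location_trash
    #   sel = sqlalchemy.select(loc.columns.datastore_name, loc.columns.dataset_id).where(
    #       sqlalchemy.and_(
    #           loc.columns.datastore_name == self.datastoreName,
    #           loc.columns.dataset_id.in_(ids),
    #       )
    #   )
    #   connection.execute(
    #       sqlalchemy.insert(trash).from_select(["datastore_name", "dataset_id"], sel)
    #   )
    #   connection.execute(sqlalchemy.delete(loc).where(...))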


    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
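        # The query built below is equivalent to this sketch:
        #   SELECT dataset_id FROM dataset_location
        #   WHERE datastore_name = :datastore_name AND dataset_id IN (:ids)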

        byId = {ref.id: ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.fetchall()
        for row in sql_rows:
            yield byId[row.dataset_id]


    @contextmanager
    def emptyTrash(
        self,
        records_table: OpaqueTableStorage | None = None,
        record_class: type[StoredDatastoreItemInfo] | None = None,
        record_column: str | None = None,
    ) -> Iterator[tuple[Iterable[tuple[DatasetIdRef, StoredDatastoreItemInfo | None]], set[str] | None]]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")


        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.Select:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)


        # The query below is, roughly:
        #   SELECT records.* FROM records
        #     JOIN dataset_location_trash
        #       ON records.dataset_id == dataset_location_trash.dataset_id
        #   WHERE dataset_location_trash.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run the query and transform the results into a list of dicts that
        # we can later use to delete.
        with self._db.query(info_in_trash) as sql_result:
            rows = [dict(row, datastore_name=self.datastoreName) for row in sql_result.mappings()]


        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. We can only do this
        # check if the caller provides a column name that can map to
        # multiple refs.
        preserved: set[str] | None = None
        if record_column is not None:
            # Some helper subqueries.
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")


            # A query for paths that are referenced both by datasets in the
            # trash and by datasets not in the trash.
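            # Roughly, as a sketch (<record_column> is the caller-supplied
            # column name, e.g. a "path" column):
            #   SELECT t.<record_column>
            #   FROM items_not_in_trash AS n
            #     JOIN items_in_trash AS t
            #       ON t.<record_column> = n.<record_column>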

            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            with self._db.query(items_to_preserve) as sql_result:
                preserved = {row[record_column] for row in sql_result.mappings()}


        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from the datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start the context manager and return the results.
        yield (id_info, preserved)

        # No exception was raised in the context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )



class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type of the dataset ID column.
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of the registry schema this extension was loaded with.
    """


    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
        registry_schema_version: VersionTuple | None = None,
    ):
        super().__init__(
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasetIdColumnType,
            registry_schema_version=registry_schema_version,
        )
        self._db = db
        self._tables = tables
        self._ephemeral: dict[str, EphemeralDatastoreRegistryBridge] = {}


    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(
            db=db,
            tables=cast(_TablesTuple, tables),
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
            registry_schema_version=registry_schema_version,
        )


    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)


    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.datastore_name)
            .select_from(self._tables.dataset_location)
            .where(self._tables.dataset_location.columns.dataset_id == ref.id)
        )
        with self._db.query(sql) as sql_result:
            sql_rows = sql_result.mappings().fetchall()
        for row in sql_rows:
            yield row[self._tables.dataset_location.columns.datastore_name]
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name


    @classmethod
    def currentVersions(cls) -> list[VersionTuple]:
        # Docstring inherited from VersionedExtension.
        return [_VERSION]
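
# A minimal end-to-end usage sketch, for orientation (assumes an initialized
# `db`, a `StaticTablesContext`, and `opaque`/`datasets` managers; `refs`,
# `records`, `info_cls`, and `transaction` are illustrative names, not part
# of this module):
#
#   manager = MonolithicDatastoreRegistryBridgeManager.initialize(
#       db, context, opaque=opaque, datasets=datasets, universe=universe
#   )
#   bridge = manager.register("FileDatastore@<root>")
#   bridge.insert(refs)                    # record presence in the datastore
#   bridge.moveToTrash(refs, transaction)  # stage the refs for deletion
#   with bridge.emptyTrash(records_table=records, record_class=info_cls) as (
#       trashed,
#       preserved,
#   ):
#       for ref, info in trashed:
#           ...  # remove artifacts whose paths are not in `preserved`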