Coverage for python/lsst/daf/butler/registry/bridge/monolithic.py: 84%


97 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("MonolithicDatastoreRegistryBridgeManager", "MonolithicDatastoreRegistryBridge")

import copy
from collections import namedtuple
from contextlib import contextmanager
from typing import TYPE_CHECKING, Dict, Iterable, Iterator, List, Optional, Set, Tuple, Type, cast

import sqlalchemy
from lsst.daf.butler import NamedValueSet, StoredDatastoreItemInfo, ddl
from lsst.daf.butler.registry.bridge.ephemeral import EphemeralDatastoreRegistryBridge
from lsst.daf.butler.registry.interfaces import (
    DatasetIdRef,
    DatastoreRegistryBridge,
    DatastoreRegistryBridgeManager,
    FakeDatasetRef,
    OpaqueTableStorage,
    VersionTuple,
)
from lsst.daf.butler.registry.opaque import ByNameOpaqueTableStorage

if TYPE_CHECKING:
    from lsst.daf.butler import DimensionUniverse
    from lsst.daf.butler.registry.interfaces import (
        Database,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        StaticTablesContext,
    )

_TablesTuple = namedtuple(
    "_TablesTuple",
    [
        "dataset_location",
        "dataset_location_trash",
    ],
)

# This has to be updated on every schema change.
_VERSION = VersionTuple(0, 2, 0)


def _makeTableSpecs(datasets: Type[DatasetRecordStorageManager]) -> _TablesTuple:
    """Construct specifications for tables used by the monolithic datastore
    bridge classes.

    Parameters
    ----------
    datasets : subclass of `DatasetRecordStorageManager`
        Manager class for datasets; used only to create foreign key fields.

    Returns
    -------
    specs : `_TablesTuple`
        A named tuple containing `ddl.TableSpec` instances.
    """
    # We want the dataset_location and dataset_location_trash tables
    # to have the same definition, aside from the behavior of their link
    # to the dataset table: the trash table has no foreign key constraint.
    dataset_location_spec = ddl.TableSpec(
        doc=(
            "A table that provides information on whether a dataset is stored in "
            "one or more Datastores. The presence or absence of a record in this "
            "table itself indicates whether the dataset is present in that "
            "Datastore."
        ),
        fields=NamedValueSet(
            [
                ddl.FieldSpec(
                    name="datastore_name",
                    dtype=sqlalchemy.String,
                    length=256,
                    primaryKey=True,
                    nullable=False,
                    doc="Name of the Datastore this entry corresponds to.",
                ),
            ]
        ),
    )
    dataset_location = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location, primaryKey=True)
    dataset_location_trash = copy.deepcopy(dataset_location_spec)
    datasets.addDatasetForeignKey(dataset_location_trash, primaryKey=True, constraint=False)
    return _TablesTuple(
        dataset_location=dataset_location,
        dataset_location_trash=dataset_location_trash,
    )
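# Illustrative note (not in the original comments): after the two
# addDatasetForeignKey calls above, both specs share the composite primary
# key (datastore_name, dataset_id); the trash table merely drops the foreign
# key constraint, so its rows can outlive the dataset records they reference.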


class MonolithicDatastoreRegistryBridge(DatastoreRegistryBridge):
    """An implementation of `DatastoreRegistryBridge` that uses the same two
    tables for all non-ephemeral datastores.

    Parameters
    ----------
    datastoreName : `str`
        Name of the `Datastore` as it should appear in `Registry` tables
        referencing it.
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    """

    def __init__(self, datastoreName: str, *, db: Database, tables: _TablesTuple):
        super().__init__(datastoreName)
        self._db = db
        self._tables = tables

    def _refsToRows(self, refs: Iterable[DatasetIdRef]) -> List[dict]:
        """Transform an iterable of `DatasetRef` or `FakeDatasetRef` objects
        to a list of dictionaries that match the schema of the tables used by
        this class.

        Parameters
        ----------
        refs : `Iterable` [ `DatasetRef` or `FakeDatasetRef` ]
            Datasets to transform.

        Returns
        -------
        rows : `list` [ `dict` ]
            List of dictionaries, with "datastore_name" and "dataset_id"
            keys.
        """
        return [{"datastore_name": self.datastoreName, "dataset_id": ref.getCheckedId()} for ref in refs]
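        # Illustrative example (hypothetical values, not from the original
        # source): a bridge named "FileDatastore" holding a ref whose checked
        # ID is 42 produces the row
        #     {"datastore_name": "FileDatastore", "dataset_id": 42}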

    def insert(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        self._db.insert(self._tables.dataset_location, *self._refsToRows(refs))

    def forget(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        rows = self._refsToRows(self.check(refs))
        self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)

    def moveToTrash(self, refs: Iterable[DatasetIdRef]) -> None:
        # Docstring inherited from DatastoreRegistryBridge
        # TODO: avoid self.check() call via queries like
        #     INSERT INTO dataset_location_trash
        #         SELECT datastore_name, dataset_id FROM dataset_location
        #         WHERE datastore_name=? AND dataset_id IN (?);
        #     DELETE FROM dataset_location
        #         WHERE datastore_name=? AND dataset_id IN (?);
        # ...but the Database interface doesn't support those kinds of queries
        # right now.
        rows = self._refsToRows(self.check(refs))
        with self._db.transaction():
            self._db.delete(self._tables.dataset_location, ["datastore_name", "dataset_id"], *rows)
            self._db.insert(self._tables.dataset_location_trash, *rows)

    def check(self, refs: Iterable[DatasetIdRef]) -> Iterable[DatasetIdRef]:
        # Docstring inherited from DatastoreRegistryBridge
        byId = {ref.getCheckedId(): ref for ref in refs}
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.dataset_id)
            .select_from(self._tables.dataset_location)
            .where(
                sqlalchemy.sql.and_(
                    self._tables.dataset_location.columns.datastore_name == self.datastoreName,
                    self._tables.dataset_location.columns.dataset_id.in_(byId.keys()),
                )
            )
        )
        for row in self._db.query(sql).fetchall():
            yield byId[row.dataset_id]
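        # For illustration, the statement built above renders roughly as
        #     SELECT dataset_id FROM dataset_location
        #     WHERE datastore_name = :name AND dataset_id IN (:ids)
        # so only refs actually present in this datastore are yielded.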

    @contextmanager
    def emptyTrash(
        self,
        records_table: Optional[OpaqueTableStorage] = None,
        record_class: Optional[Type[StoredDatastoreItemInfo]] = None,
        record_column: Optional[str] = None,
    ) -> Iterator[
        Tuple[Iterable[Tuple[DatasetIdRef, Optional[StoredDatastoreItemInfo]]], Optional[Set[str]]]
    ]:
        # Docstring inherited from DatastoreRegistryBridge

        if records_table is None:
            raise ValueError("This implementation requires a records table.")

        assert isinstance(
            records_table, ByNameOpaqueTableStorage
        ), f"Records table must support hidden attributes. Got {type(records_table)}."

        if record_class is None:
            raise ValueError("Record class must be provided if records table is given.")

        # Helper closure to generate the common join+where clause.
        def join_records(
            select: sqlalchemy.sql.Select, location_table: sqlalchemy.schema.Table
        ) -> sqlalchemy.sql.FromClause:
            # mypy needs to be sure
            assert isinstance(records_table, ByNameOpaqueTableStorage)
            return select.select_from(
                records_table._table.join(
                    location_table,
                    onclause=records_table._table.columns.dataset_id == location_table.columns.dataset_id,
                )
            ).where(location_table.columns.datastore_name == self.datastoreName)

        # The query below is equivalent to:
        #     SELECT records.dataset_id, records.path FROM records
        #         JOIN dataset_location_trash
        #             ON records.dataset_id = dataset_location_trash.dataset_id
        #         WHERE dataset_location_trash.datastore_name = datastoreName

        # It's possible that we may end up with a ref listed in the trash
        # table that is not listed in the records table. Such an
        # inconsistency would be missed by this query.
        info_in_trash = join_records(records_table._table.select(), self._tables.dataset_location_trash)

        # Run the query and transform the results into a list of dicts that
        # we can later use to delete.
        rows = [
            dict(**row, datastore_name=self.datastoreName) for row in self._db.query(info_in_trash).mappings()
        ]

        # It is possible for trashed refs to be linked to artifacts that
        # are still associated with refs that are not to be trashed. We
        # need to be careful to consider those and indicate to the caller
        # that those artifacts should be retained. We can only do this check
        # if the caller provides a column name that can map to multiple
        # refs.
        preserved: Optional[Set[str]] = None
        if record_column is not None:
            # Some helper subqueries.
            items_not_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location,
            ).alias("items_not_in_trash")
            items_in_trash = join_records(
                sqlalchemy.sql.select(records_table._table.columns[record_column]),
                self._tables.dataset_location_trash,
            ).alias("items_in_trash")

            # A query for paths that are referenced both by datasets in the
            # trash and by datasets not in the trash.
            items_to_preserve = sqlalchemy.sql.select(items_in_trash.columns[record_column]).select_from(
                items_not_in_trash.join(
                    items_in_trash,
                    onclause=items_in_trash.columns[record_column]
                    == items_not_in_trash.columns[record_column],
                )
            )
            preserved = {row[record_column] for row in self._db.query(items_to_preserve).mappings()}
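            # Worked example (hypothetical values): if a trashed ref and a
            # live ref both point at artifact path "a/b.fits", that path
            # appears in both subqueries above, joins to itself, and thus
            # lands in ``preserved`` so the caller keeps the shared artifact.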


        # Convert results to a tuple of id+info and a record of the artifacts
        # that should not be deleted from the datastore. The id+info tuple is
        # solely to allow logging to report the relevant ID.
        id_info = ((FakeDatasetRef(row["dataset_id"]), record_class.from_record(row)) for row in rows)

        # Start the context manager and return the results.
        yield (id_info, preserved)

        # No exception was raised in the context manager block.
        if not rows:
            return

        # Delete the rows from the records table.
        records_table.delete(["dataset_id"], *[{"dataset_id": row["dataset_id"]} for row in rows])

        # Delete those rows from the trash table.
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": row["dataset_id"], "datastore_name": row["datastore_name"]} for row in rows],
        )
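    # A minimal caller-side sketch (names hypothetical, not from this
    # module): a datastore drains the trash and skips preserved artifacts:
    #     with bridge.emptyTrash(records_table=store, record_class=Info,
    #                            record_column="path") as (trashed, preserved):
    #         for ref, info in trashed:
    #             if preserved is None or info.path not in preserved:
    #                 remove_artifact(info.path)  # hypothetical helper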


class MonolithicDatastoreRegistryBridgeManager(DatastoreRegistryBridgeManager):
    """An implementation of `DatastoreRegistryBridgeManager` that uses the
    same two tables for all non-ephemeral datastores.

    Parameters
    ----------
    db : `Database`
        Object providing a database connection and generic abstractions.
    tables : `_TablesTuple`
        Named tuple containing `sqlalchemy.schema.Table` instances.
    opaque : `OpaqueTableStorageManager`
        Manager object for opaque table storage in the `Registry`.
    universe : `DimensionUniverse`
        All dimensions known to the `Registry`.
    datasetIdColumnType : `type`
        Type for the dataset ID column.
    """

    def __init__(
        self,
        *,
        db: Database,
        tables: _TablesTuple,
        opaque: OpaqueTableStorageManager,
        universe: DimensionUniverse,
        datasetIdColumnType: type,
    ):
        super().__init__(opaque=opaque, universe=universe, datasetIdColumnType=datasetIdColumnType)
        self._db = db
        self._tables = tables
        self._ephemeral: Dict[str, EphemeralDatastoreRegistryBridge] = {}

    @classmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        opaque: OpaqueTableStorageManager,
        datasets: Type[DatasetRecordStorageManager],
        universe: DimensionUniverse,
    ) -> DatastoreRegistryBridgeManager:
        # Docstring inherited from DatastoreRegistryBridgeManager
        tables = context.addTableTuple(_makeTableSpecs(datasets))
        return cls(
            db=db,
            tables=cast(_TablesTuple, tables),
            opaque=opaque,
            universe=universe,
            datasetIdColumnType=datasets.getIdColumnType(),
        )

    def refresh(self) -> None:
        # Docstring inherited from DatastoreRegistryBridgeManager
        # This implementation has no in-Python state that depends on which
        # datastores exist, so there's nothing to do.
        pass

    def register(self, name: str, *, ephemeral: bool = False) -> DatastoreRegistryBridge:
        # Docstring inherited from DatastoreRegistryBridgeManager
        if ephemeral:
            return self._ephemeral.setdefault(name, EphemeralDatastoreRegistryBridge(name))
        return MonolithicDatastoreRegistryBridge(name, db=self._db, tables=self._tables)
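    # Usage sketch (hypothetical datastore name): a datastore typically
    # registers itself once and keeps the returned bridge, e.g.
    #     bridge = manager.register("FileDatastore")
    #     bridge.insert(refs)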


    def findDatastores(self, ref: DatasetIdRef) -> Iterable[str]:
        # Docstring inherited from DatastoreRegistryBridgeManager
        sql = (
            sqlalchemy.sql.select(self._tables.dataset_location.columns.datastore_name)
            .select_from(self._tables.dataset_location)
            .where(self._tables.dataset_location.columns.dataset_id == ref.getCheckedId())
        )
        for row in self._db.query(sql).mappings():
            yield row[self._tables.dataset_location.columns.datastore_name]
        for name, bridge in self._ephemeral.items():
            if ref in bridge:
                yield name
367 

368 @classmethod 

369 def currentVersion(cls) -> Optional[VersionTuple]: 

370 # Docstring inherited from VersionedExtension. 

371 return _VERSION 

372 

373 def schemaDigest(self) -> Optional[str]: 

374 # Docstring inherited from VersionedExtension. 

375 return self._defaultSchemaDigest(self._tables, self._db.dialect)