Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23__all__ = () 

24 

25from abc import abstractmethod 

26from collections import namedtuple 

27import itertools 

28from typing import ( 

29 Any, 

30 Dict, 

31 Generic, 

32 Iterable, 

33 Iterator, 

34 Optional, 

35 Type, 

36 TYPE_CHECKING, 

37 TypeVar, 

38) 

39 

40import sqlalchemy 

41 

42from ...core import DatabaseTimespanRepresentation, ddl, Timespan 

43from .._collectionType import CollectionType 

44from ..interfaces import ( 

45 ChainedCollectionRecord, 

46 CollectionManager, 

47 CollectionRecord, 

48 MissingCollectionError, 

49 RunRecord, 

50) 

51from ..wildcards import CollectionSearch, Ellipsis 

52 

53if TYPE_CHECKING: 53 ↛ 54line 53 didn't jump to line 54, because the condition on line 53 was never true

54 from ..interfaces import Database 

55 

56 

def _makeCollectionForeignKey(
        sourceColumnName: str,
        collectionIdName: str,
        **kwargs: Any) -> ddl.ForeignKeySpec:
    """Build a foreign key specification that points at the collections
    table.

    Parameters
    ----------
    sourceColumnName : `str`
        Name of the column in the referring table.
    collectionIdName : `str`
        Name of the primary key column of the collections table.
    **kwargs
        Extra keyword arguments forwarded unchanged to
        `ddl.ForeignKeySpec`.

    Returns
    -------
    spec : `ddl.ForeignKeySpec`
        Foreign key specification.

    Notes
    -----
    The name of the collections table is hard-coded as "collection", and
    both sides of the key consist of exactly one column.
    """
    referringColumns = (sourceColumnName,)
    referencedColumns = (collectionIdName,)
    return ddl.ForeignKeySpec("collection", source=referringColumns, target=referencedColumns, **kwargs)

83 

84 

# Bundle of the table objects a collection manager works with, one field per
# table: "collection", "run" and "collection_chain".
CollectionTablesTuple = namedtuple(
    "CollectionTablesTuple",
    (
        "collection",
        "run",
        "collection_chain",
    ),
)

86 

87 

def makeRunTableSpec(
        collectionIdName: str,
        collectionIdType: type,
        tsRepr: Type[DatabaseTimespanRepresentation]) -> ddl.TableSpec:
    """Build the table specification for the "run" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the primary key column of the collections table; the same
        name is used for the primary key column of the run table.
    collectionIdType
        Type of the primary key column in the collections table, one of
        the `sqlalchemy` types.
    tsRepr : `type` [ `DatabaseTimespanRepresentation` ]
        Subclass of `DatabaseTimespanRepresentation` that encapsulates how
        timespans are stored in this database.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for run table.

    Notes
    -----
    The identifying column carries the same name in the collections and
    run tables.  The remaining columns ("host" plus whatever ``tsRepr``
    contributes) have fixed names.
    """
    idField = ddl.FieldSpec(collectionIdName, dtype=collectionIdType, primaryKey=True)
    hostField = ddl.FieldSpec("host", dtype=sqlalchemy.String, length=128)
    # Rows in the run table go away together with their collection row.
    collectionKey = _makeCollectionForeignKey(collectionIdName, collectionIdName, onDelete="CASCADE")
    spec = ddl.TableSpec(fields=[idField, hostField], foreignKeys=[collectionKey])
    # Timespan columns are appended as nullable fields.
    for timespanField in tsRepr.makeFieldSpecs(nullable=True):
        spec.fields.add(timespanField)
    return spec

127 

128 

def makeCollectionChainTableSpec(collectionIdName: str, collectionIdType: type) -> ddl.TableSpec:
    """Build the table specification for the "collection_chain" table.

    Parameters
    ----------
    collectionIdName : `str`
        Name of the primary key column of the collections table.
    collectionIdType
        Type of the primary key column in the collections table, one of
        the `sqlalchemy` types.

    Returns
    -------
    spec : `ddl.TableSpec`
        Specification for collection chain table.

    Notes
    -----
    A collection chain is an ordered one-to-many relation between
    collections.  All column names of this table ("parent", "position",
    "child", "dataset_type_name") are fixed, and other code in this module
    relies on them.
    """
    # ("parent", "position") together form the primary key.
    parentField = ddl.FieldSpec("parent", dtype=collectionIdType, primaryKey=True)
    positionField = ddl.FieldSpec("position", dtype=sqlalchemy.SmallInteger, primaryKey=True)
    childField = ddl.FieldSpec("child", dtype=collectionIdType, nullable=False)
    datasetTypeField = ddl.FieldSpec("dataset_type_name", dtype=sqlalchemy.String, length=128,
                                     nullable=True)
    # Only the parent reference cascades on delete.
    parentKey = _makeCollectionForeignKey("parent", collectionIdName, onDelete="CASCADE")
    childKey = _makeCollectionForeignKey("child", collectionIdName)
    return ddl.TableSpec(
        fields=[parentField, positionField, childField, datasetTypeField],
        foreignKeys=[parentKey, childKey],
    )

163 

164 

class DefaultRunRecord(RunRecord):
    """Default `RunRecord` implementation.

    The run table is expected to have the layout produced by
    `makeRunTableSpec`.  Only the name of the identifying (PK) column may
    vary, so it has to be supplied to the constructor.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification. Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Run collection name.
    table : `sqlalchemy.schema.Table`
        Table for run records.
    idColumnName : `str`
        Name of the identifying column in run table.
    host : `str`, optional
        Name of the host where run was produced.
    timespan : `Timespan`, optional
        Timespan for this run.  When omitted, a `Timespan` with both
        ``begin`` and ``end`` set to `None` is used.
    """

    def __init__(self, db: Database, key: Any, name: str, *,
                 table: sqlalchemy.schema.Table,
                 idColumnName: str,
                 host: Optional[str] = None,
                 timespan: Optional[Timespan] = None):
        super().__init__(key=key, name=name, type=CollectionType.RUN)
        self._db = db
        self._table = table
        self._idName = idColumnName
        self._host = host
        self._timespan = Timespan(begin=None, end=None) if timespan is None else timespan

    def update(self, host: Optional[str] = None,
               timespan: Optional[Timespan] = None) -> None:
        # Docstring inherited from RunRecord.
        newTimespan = Timespan(begin=None, end=None) if timespan is None else timespan
        values = {self._idName: self.key, "host": host}
        # Let the database-specific representation add the timespan columns.
        tsRepr = self._db.getTimespanRepresentation()
        tsRepr.update(newTimespan, result=values)
        # NOTE(review): the second argument maps the ID column name to the key
        # *value*; confirm that this is the form `Database.update` expects.
        nUpdated = self._db.update(self._table, {self._idName: self.key}, values)
        if nUpdated != 1:
            raise RuntimeError(f"Run update affected {nUpdated} records; expected exactly one.")
        # In-memory state is changed only after the database update succeeded.
        self._host = host
        self._timespan = newTimespan

    @property
    def host(self) -> Optional[str]:
        # Docstring inherited from RunRecord.
        return self._host

    @property
    def timespan(self) -> Timespan:
        # Docstring inherited from RunRecord.
        return self._timespan

228 

229 

class DefaultChainedCollectionRecord(ChainedCollectionRecord):
    """Default `ChainedCollectionRecord` implementation.

    The chain table is expected to have the layout produced by
    `makeCollectionChainTableSpec`; its column names are fixed and are used
    literally by the methods of this class.

    Parameters
    ----------
    db : `Database`
        Registry database.
    key
        Unique collection ID, can be the same as ``name`` if ``name`` is used
        for identification. Usually this is an integer or string, but can be
        other database-specific type.
    name : `str`
        Collection name.
    table : `sqlalchemy.schema.Table`
        Table for chain relationship records.
    """

    def __init__(self, db: Database, key: Any, name: str, *, table: sqlalchemy.schema.Table):
        super().__init__(key=key, name=name)
        self._db = db
        self._table = table

    def _update(self, manager: CollectionManager, children: CollectionSearch) -> None:
        # Docstring inherited from ChainedCollectionRecord.
        newRows = []
        for childRecord, restriction in children.iterPairs(manager, flattenChains=False):
            # An unrestricted child becomes a single row with a NULL dataset
            # type name; otherwise there is one row per dataset type name.
            datasetTypeNames = (None,) if restriction.names is Ellipsis else restriction.names
            for datasetTypeName in datasetTypeNames:
                newRows.append({
                    "parent": self.key,
                    "child": childRecord.key,
                    # Positions are consecutive, starting from zero.
                    "position": len(newRows),
                    "dataset_type_name": datasetTypeName,
                })
        # Replace all existing rows of this chain within one transaction.
        with self._db.transaction():
            self._db.delete(self._table, ["parent"], {"parent": self.key})
            self._db.insert(self._table, *newRows)

    def _load(self, manager: CollectionManager) -> CollectionSearch:
        # Docstring inherited from ChainedCollectionRecord.
        columns = self._table.columns
        query = sqlalchemy.sql.select([columns.child, columns.dataset_type_name])
        query = query.select_from(self._table)
        query = query.where(columns.parent == self.key)
        query = query.order_by(columns.position)
        # Consecutive rows may name the same collection with different
        # dataset type names; CollectionSearch groups those up for us.
        pairs = []
        for row in self._db.query(query):
            childKey = row[columns.child]
            datasetTypeName = row[columns.dataset_type_name]
            # Any false value (`_update` writes None for an unrestricted
            # child; an empty string is treated the same way) maps to ``...``.
            pairs.append((manager[childKey].name, datasetTypeName or ...))
        return CollectionSearch.fromExpression(pairs)

294 

295 

# Type of the record ID used to index the in-memory record cache.
K = TypeVar("K")


class DefaultCollectionManager(Generic[K], CollectionManager):
    """Default `CollectionManager` implementation.

    This implementation works with the record classes defined in this
    module and shares their assumptions about the database schema.

    Parameters
    ----------
    db : `Database`
        Interface to the underlying database engine and namespace.
    tables : `CollectionTablesTuple`
        Named tuple of SQLAlchemy table objects.
    collectionIdName : `str`
        Name of the column in collections table that identifies it (PK).

    Notes
    -----
    All collection records are kept in an in-memory cache indexed by record
    ID.  The cache is rebuilt from the database when `refresh` is called,
    and `register` and `remove` keep it in step with the changes they make.
    Lookup by name (`_getByName`) is abstract and has to be provided by
    subclasses.
    """

    def __init__(self, db: Database, tables: CollectionTablesTuple, collectionIdName: str):
        self._db = db
        self._tables = tables
        self._collectionIdName = collectionIdName
        # Cached records, indexed by record ID.
        self._records: Dict[K, CollectionRecord] = {}

    def refresh(self) -> None:
        # Docstring inherited from CollectionManager.
        collectionTable = self._tables.collection
        runTable = self._tables.run
        # Outer join, so that non-run collections are returned as well.
        sql = sqlalchemy.sql.select(
            collectionTable.columns + runTable.columns
        ).select_from(
            collectionTable.join(runTable, isouter=True)
        )
        # Gather records in temporaries and replace the cache only once all
        # rows have been processed, for exception safety.
        fetched = []
        chainRecords = []
        tsRepr = self._db.getTimespanRepresentation()
        for row in self._db.query(sql).fetchall():
            collection_id = row[collectionTable.columns[self._collectionIdName]]
            name = row[collectionTable.columns.name]
            collectionType = CollectionType(row["type"])
            record: CollectionRecord
            if collectionType is CollectionType.RUN:
                record = DefaultRunRecord(
                    key=collection_id,
                    name=name,
                    db=self._db,
                    table=runTable,
                    idColumnName=self._collectionIdName,
                    host=row[runTable.columns.host],
                    timespan=tsRepr.extract(row),
                )
            elif collectionType is CollectionType.CHAINED:
                record = DefaultChainedCollectionRecord(
                    db=self._db,
                    key=collection_id,
                    table=self._tables.collection_chain,
                    name=name,
                )
                chainRecords.append(record)
            else:
                record = CollectionRecord(key=collection_id, name=name, type=collectionType)
            fetched.append(record)
        self._setRecordCache(fetched)
        # Chains are refreshed only after the cache holds every record.
        for chainRecord in chainRecords:
            chainRecord.refresh(self)

    def register(self, name: str, type: CollectionType) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        existing = self._getByName(name)
        if existing is not None:
            # Already known; hand back the cached record unchanged.
            return existing
        row, _ = self._db.sync(
            self._tables.collection,
            keys={"name": name},
            compared={"type": int(type)},
            returning=[self._collectionIdName],
        )
        assert row is not None
        collection_id = row[self._collectionIdName]
        record: CollectionRecord
        if type is CollectionType.RUN:
            # Runs additionally need a matching row in the run table.
            tsRepr = self._db.getTimespanRepresentation()
            row, _ = self._db.sync(
                self._tables.run,
                keys={self._collectionIdName: collection_id},
                returning=("host",) + tsRepr.getFieldNames(),
            )
            assert row is not None
            record = DefaultRunRecord(
                db=self._db,
                key=collection_id,
                name=name,
                table=self._tables.run,
                idColumnName=self._collectionIdName,
                host=row["host"],
                timespan=tsRepr.extract(row),
            )
        elif type is CollectionType.CHAINED:
            record = DefaultChainedCollectionRecord(
                db=self._db,
                key=collection_id,
                name=name,
                table=self._tables.collection_chain,
            )
        else:
            record = CollectionRecord(key=collection_id, name=name, type=type)
        self._addCachedRecord(record)
        return record

    def remove(self, name: str) -> None:
        # Docstring inherited from CollectionManager.
        found = self._getByName(name)
        if found is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        # This may raise; the cache is touched only if the delete succeeded.
        self._db.delete(
            self._tables.collection,
            [self._collectionIdName],
            {self._collectionIdName: found.key},
        )
        self._removeCachedRecord(found)

    def find(self, name: str) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        found = self._getByName(name)
        if found is None:
            raise MissingCollectionError(f"No collection with name '{name}' found.")
        return found

    def __getitem__(self, key: Any) -> CollectionRecord:
        # Docstring inherited from CollectionManager.
        try:
            return self._records[key]
        except KeyError as err:
            raise MissingCollectionError(f"Collection with key '{key}' not found.") from err

    def __iter__(self) -> Iterator[CollectionRecord]:
        # Iterate over all cached records.
        for cachedRecord in self._records.values():
            yield cachedRecord

    def _setRecordCache(self, records: Iterable[CollectionRecord]) -> None:
        """Replace the contents of the internal record cache with the given
        records; anything cached before is dropped.
        """
        self._records = {}
        for newRecord in records:
            self._records[newRecord.key] = newRecord

    def _addCachedRecord(self, record: CollectionRecord) -> None:
        """Put a single record into the cache, indexed by its key.
        """
        self._records[record.key] = record

    def _removeCachedRecord(self, record: CollectionRecord) -> None:
        """Drop a single record from the cache.
        """
        del self._records[record.key]

    @abstractmethod
    def _getByName(self, name: str) -> Optional[CollectionRecord]:
        """Look up a collection record by collection name.
        """
        raise NotImplementedError()