Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum") 

25 

26from abc import ABC, abstractmethod 

27import enum 

28from typing import ( 

29 Any, 

30 Iterable, 

31 Iterator, 

32 Optional, 

33 Tuple, 

34 TYPE_CHECKING, 

35) 

36 

37from ...core import ( 

38 DataCoordinate, 

39 DatasetId, 

40 DatasetRef, 

41 DatasetType, 

42 ddl, 

43 SimpleQuery, 

44 Timespan, 

45) 

46from ._versioning import VersionedExtension 

47 

48if TYPE_CHECKING: 48 ↛ 49line 48 didn't jump to line 49, because the condition on line 48 was never true

49 from ..summaries import CollectionSummary 

50 from ._database import Database, StaticTablesContext 

51 from ._dimensions import DimensionRecordStorageManager 

52 from ._collections import CollectionManager, CollectionRecord, RunRecord 

53 

54 

55class DatasetIdGenEnum(enum.Enum): 

56 """This enum is used to specify dataset ID generation options for 

57 ``insert()`` method. 

58 """ 

59 

60 UNIQUE = 0 

61 """Unique mode generates unique ID for each inserted dataset, e.g. 

62 auto-generated by database or random UUID. 

63 """ 

64 

65 DATAID_TYPE = 1 

66 """In this mode ID is computed deterministically from a combination of 

67 dataset type and dataId. 

68 """ 

69 

70 DATAID_TYPE_RUN = 2 

71 """In this mode ID is computed deterministically from a combination of 

72 dataset type, dataId, and run collection name. 

73 """ 

74 

75 

76class DatasetRecordStorage(ABC): 

77 """An interface that manages the records associated with a particular 

78 `DatasetType`. 

79 

80 Parameters 

81 ---------- 

82 datasetType : `DatasetType` 

83 Dataset type whose records this object manages. 

84 """ 

85 def __init__(self, datasetType: DatasetType): 

86 self.datasetType = datasetType 

87 

88 @abstractmethod 

89 def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate], 

90 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]: 

91 """Insert one or more dataset entries into the database. 

92 

93 Parameters 

94 ---------- 

95 run : `RunRecord` 

96 The record object describing the `~CollectionType.RUN` collection 

97 this dataset will be associated with. 

98 dataIds : `Iterable` [ `DataCoordinate` ] 

99 Expanded data IDs (`DataCoordinate` instances) for the 

100 datasets to be added. The dimensions of all data IDs must be the 

101 same as ``self.datasetType.dimensions``. 

102 idMode : `DatasetIdGenEnum` 

103 With `UNIQUE` each new dataset is inserted with its new unique ID. 

104 With non-`UNIQUE` mode ID is computed from some combination of 

105 dataset type, dataId, and run collection name; if the same ID is 

106 already in the database then new record is not inserted. 

107 

108 Returns 

109 ------- 

110 datasets : `Iterable` [ `DatasetRef` ] 

111 References to the inserted datasets. 

112 """ 

113 raise NotImplementedError() 

114 

115 @abstractmethod 

116 def import_(self, run: RunRecord, datasets: Iterable[DatasetRef], 

117 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

118 reuseIds: bool = False) -> Iterator[DatasetRef]: 

119 """Insert one or more dataset entries into the database. 

120 

121 Parameters 

122 ---------- 

123 run : `RunRecord` 

124 The record object describing the `~CollectionType.RUN` collection 

125 this dataset will be associated with. 

126 datasets : `~collections.abc.Iterable` of `DatasetRef` 

127 Datasets to be inserted. Datasets can specify ``id`` attribute 

128 which will be used for inserted datasets. All dataset IDs must 

129 have the same type (`int` or `uuid.UUID`), if type of dataset IDs 

130 does not match type supported by this class then IDs will be 

131 ignored and new IDs will be generated by backend. 

132 idGenerationMode : `DatasetIdGenEnum` 

133 With `UNIQUE` each new dataset is inserted with its new unique ID. 

134 With non-`UNIQUE` mode ID is computed from some combination of 

135 dataset type, dataId, and run collection name; if the same ID is 

136 already in the database then new record is not inserted. 

137 reuseIds : `bool`, optional 

138 If `True` then forces re-use of imported dataset IDs for integer 

139 IDs which are normally generated as auto-incremented; exception 

140 will be raised if imported IDs clash with existing ones. This 

141 option has no effect on the use of globally-unique IDs which are 

142 always re-used (or generated if integer IDs are being imported). 

143 

144 Returns 

145 ------- 

146 datasets : `Iterable` [ `DatasetRef` ] 

147 References to the inserted or existing datasets. 

148 

149 Notes 

150 ----- 

151 The ``datasetType`` and ``run`` attributes of datasets are supposed to 

152 be identical across all datasets but this is not checked and it should 

153 be enforced by higher level registry code. This method does not need 

154 to use those attributes from datasets, only ``dataId`` and ``id`` are 

155 relevant. 

156 """ 

157 raise NotImplementedError() 

158 

159 @abstractmethod 

160 def find(self, collection: CollectionRecord, dataId: DataCoordinate, 

161 timespan: Optional[Timespan] = None) -> Optional[DatasetRef]: 

162 """Search a collection for a dataset with the given data ID. 

163 

164 Parameters 

165 ---------- 

166 collection : `CollectionRecord` 

167 The record object describing the collection to search for the 

168 dataset. May have any `CollectionType`. 

169 dataId: `DataCoordinate` 

170 Complete (but not necessarily expanded) data ID to search with, 

171 with ``dataId.graph == self.datasetType.dimensions``. 

172 timespan : `Timespan`, optional 

173 A timespan that the validity range of the dataset must overlap. 

174 Required if ``collection.type is CollectionType.CALIBRATION``, and 

175 ignored otherwise. 

176 

177 Returns 

178 ------- 

179 ref : `DatasetRef` 

180 A resolved `DatasetRef` (without components populated), or `None` 

181 if no matching dataset was found. 

182 """ 

183 raise NotImplementedError() 

184 

185 @abstractmethod 

186 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

187 """Fully delete the given datasets from the registry. 

188 

189 Parameters 

190 ---------- 

191 datasets : `Iterable` [ `DatasetRef` ] 

192 Datasets to be deleted. All datasets must be resolved and have 

193 the same `DatasetType` as ``self``. 

194 

195 Raises 

196 ------ 

197 AmbiguousDatasetError 

198 Raised if any of the given `DatasetRef` instances is unresolved. 

199 """ 

200 raise NotImplementedError() 

201 

202 @abstractmethod 

203 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

204 """Associate one or more datasets with a collection. 

205 

206 Parameters 

207 ---------- 

208 collection : `CollectionRecord` 

209 The record object describing the collection. ``collection.type`` 

210 must be `~CollectionType.TAGGED`. 

211 datasets : `Iterable` [ `DatasetRef` ] 

212 Datasets to be associated. All datasets must be resolved and have 

213 the same `DatasetType` as ``self``. 

214 

215 Raises 

216 ------ 

217 AmbiguousDatasetError 

218 Raised if any of the given `DatasetRef` instances is unresolved. 

219 

220 Notes 

221 ----- 

222 Associating a dataset with into collection that already contains a 

223 different dataset with the same `DatasetType` and data ID will remove 

224 the existing dataset from that collection. 

225 

226 Associating the same dataset into a collection multiple times is a 

227 no-op, but is still not permitted on read-only databases. 

228 """ 

229 raise NotImplementedError() 

230 

231 @abstractmethod 

232 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

233 """Remove one or more datasets from a collection. 

234 

235 Parameters 

236 ---------- 

237 collection : `CollectionRecord` 

238 The record object describing the collection. ``collection.type`` 

239 must be `~CollectionType.TAGGED`. 

240 datasets : `Iterable` [ `DatasetRef` ] 

241 Datasets to be disassociated. All datasets must be resolved and 

242 have the same `DatasetType` as ``self``. 

243 

244 Raises 

245 ------ 

246 AmbiguousDatasetError 

247 Raised if any of the given `DatasetRef` instances is unresolved. 

248 """ 

249 raise NotImplementedError() 

250 

251 @abstractmethod 

252 def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef], 

253 timespan: Timespan) -> None: 

254 """Associate one or more datasets with a calibration collection and a 

255 validity range within it. 

256 

257 Parameters 

258 ---------- 

259 collection : `CollectionRecord` 

260 The record object describing the collection. ``collection.type`` 

261 must be `~CollectionType.CALIBRATION`. 

262 datasets : `Iterable` [ `DatasetRef` ] 

263 Datasets to be associated. All datasets must be resolved and have 

264 the same `DatasetType` as ``self``. 

265 timespan : `Timespan` 

266 The validity range for these datasets within the collection. 

267 

268 Raises 

269 ------ 

270 AmbiguousDatasetError 

271 Raised if any of the given `DatasetRef` instances is unresolved. 

272 ConflictingDefinitionError 

273 Raised if the collection already contains a different dataset with 

274 the same `DatasetType` and data ID and an overlapping validity 

275 range. 

276 TypeError 

277 Raised if 

278 ``collection.type is not CollectionType.CALIBRATION`` or if 

279 ``self.datasetType.isCalibration() is False``. 

280 """ 

281 raise NotImplementedError() 

282 

283 @abstractmethod 

284 def decertify(self, collection: CollectionRecord, timespan: Timespan, *, 

285 dataIds: Optional[Iterable[DataCoordinate]] = None) -> None: 

286 """Remove or adjust datasets to clear a validity range within a 

287 calibration collection. 

288 

289 Parameters 

290 ---------- 

291 collection : `CollectionRecord` 

292 The record object describing the collection. ``collection.type`` 

293 must be `~CollectionType.CALIBRATION`. 

294 timespan : `Timespan` 

295 The validity range to remove datasets from within the collection. 

296 Datasets that overlap this range but are not contained by it will 

297 have their validity ranges adjusted to not overlap it, which may 

298 split a single dataset validity range into two. 

299 dataIds : `Iterable` [ `DataCoordinate` ], optional 

300 Data IDs that should be decertified within the given validity range 

301 If `None`, all data IDs for ``self.datasetType`` will be 

302 decertified. 

303 

304 Raises 

305 ------ 

306 TypeError 

307 Raised if ``collection.type is not CollectionType.CALIBRATION``. 

308 """ 

309 raise NotImplementedError() 

310 

311 @abstractmethod 

312 def select(self, collection: CollectionRecord, 

313 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

314 id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select, 

315 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

316 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select, 

317 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None, 

318 ) -> Optional[SimpleQuery]: 

319 """Return a SQLAlchemy object that represents a ``SELECT`` query for 

320 this `DatasetType`. 

321 

322 All arguments can either be a value that constrains the query or 

323 the `SimpleQuery.Select` tag object to indicate that the value should 

324 be returned in the columns in the ``SELECT`` clause. The default is 

325 `SimpleQuery.Select`. 

326 

327 Parameters 

328 ---------- 

329 collection : `CollectionRecord` 

330 The record object describing the collection to query. May not be 

331 of type `CollectionType.CHAINED`. 

332 dataId : `DataCoordinate` or `Select` 

333 The data ID to restrict results with, or an instruction to return 

334 the data ID via columns with names 

335 ``self.datasetType.dimensions.names``. 

336 id : `DatasetId`, `Select` or None, 

337 The primary key value for the dataset, an instruction to return it 

338 via a ``id`` column, or `None` to ignore it entirely. 

339 run : `None` or `Select` 

340 If `Select` (default), include the dataset's run key value (as 

341 column labeled with the return value of 

342 ``CollectionManager.getRunForiegnKeyName``). 

343 If `None`, do not include this column (to constrain the run, 

344 pass a `RunRecord` as the ``collection`` argument instead). 

345 timespan : `None`, `Select`, or `Timespan` 

346 If `Select` (default), include the validity range timespan in the 

347 result columns. If a `Timespan` instance, constrain the results to 

348 those whose validity ranges overlap that given timespan. Ignored 

349 unless ``collection.type is CollectionType.CALIBRATION``. 

350 ingestDate : `None`, `Select`, or `Timespan` 

351 If `Select` include the ingest timestamp in the result columns. 

352 If a `Timespan` instance, constrain the results to those whose 

353 ingest times which are inside given timespan and also include 

354 timestamp in the result columns. If `None` (default) then there is 

355 no constraint and timestamp is not returned. 

356 

357 Returns 

358 ------- 

359 query : `SimpleQuery` or `None` 

360 A struct containing the SQLAlchemy object that representing a 

361 simple ``SELECT`` query, or `None` if it is known that there are 

362 no datasets of this `DatasetType` that match the given constraints. 

363 """ 

364 raise NotImplementedError() 

365 

366 datasetType: DatasetType 

367 """Dataset type whose records this object manages (`DatasetType`). 

368 """ 

369 

370 

371class DatasetRecordStorageManager(VersionedExtension): 

372 """An interface that manages the tables that describe datasets. 

373 

374 `DatasetRecordStorageManager` primarily serves as a container and factory 

375 for `DatasetRecordStorage` instances, which each provide access to the 

376 records for a different `DatasetType`. 

377 """ 

378 

379 @classmethod 

380 @abstractmethod 

381 def initialize( 

382 cls, 

383 db: Database, 

384 context: StaticTablesContext, *, 

385 collections: CollectionManager, 

386 dimensions: DimensionRecordStorageManager, 

387 ) -> DatasetRecordStorageManager: 

388 """Construct an instance of the manager. 

389 

390 Parameters 

391 ---------- 

392 db : `Database` 

393 Interface to the underlying database engine and namespace. 

394 context : `StaticTablesContext` 

395 Context object obtained from `Database.declareStaticTables`; used 

396 to declare any tables that should always be present. 

397 collections: `CollectionManager` 

398 Manager object for the collections in this `Registry`. 

399 dimensions : `DimensionRecordStorageManager` 

400 Manager object for the dimensions in this `Registry`. 

401 

402 Returns 

403 ------- 

404 manager : `DatasetRecordStorageManager` 

405 An instance of a concrete `DatasetRecordStorageManager` subclass. 

406 """ 

407 raise NotImplementedError() 

408 

409 @classmethod 

410 @abstractmethod 

411 def getIdColumnType(cls) -> type: 

412 """Return type used for columns storing dataset IDs. 

413 

414 This type is used for columns storing `DatasetRef.id` values, usually 

415 a `type` subclass provided by SQLAlchemy. 

416 

417 Returns 

418 ------- 

419 dtype : `type` 

420 Type used for dataset identification in database. 

421 """ 

422 raise NotImplementedError() 

423 

424 @classmethod 

425 @abstractmethod 

426 def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *, 

427 name: str = "dataset", constraint: bool = True, onDelete: Optional[str] = None, 

428 **kwargs: Any) -> ddl.FieldSpec: 

429 """Add a foreign key (field and constraint) referencing the dataset 

430 table. 

431 

432 Parameters 

433 ---------- 

434 tableSpec : `ddl.TableSpec` 

435 Specification for the table that should reference the dataset 

436 table. Will be modified in place. 

437 name: `str`, optional 

438 A name to use for the prefix of the new field; the full name is 

439 ``{name}_id``. 

440 onDelete: `str`, optional 

441 One of "CASCADE" or "SET NULL", indicating what should happen to 

442 the referencing row if the collection row is deleted. `None` 

443 indicates that this should be an integrity error. 

444 constraint: `bool`, optional 

445 If `False` (`True` is default), add a field that can be joined to 

446 the dataset primary key, but do not add a foreign key constraint. 

447 **kwargs 

448 Additional keyword arguments are forwarded to the `ddl.FieldSpec` 

449 constructor (only the ``name`` and ``dtype`` arguments are 

450 otherwise provided). 

451 

452 Returns 

453 ------- 

454 idSpec : `ddl.FieldSpec` 

455 Specification for the ID field. 

456 """ 

457 raise NotImplementedError() 

458 

459 @abstractmethod 

460 def refresh(self) -> None: 

461 """Ensure all other operations on this manager are aware of any 

462 dataset types that may have been registered by other clients since 

463 it was initialized or last refreshed. 

464 """ 

465 raise NotImplementedError() 

466 

467 def __getitem__(self, name: str) -> DatasetRecordStorage: 

468 """Return the object that provides access to the records associated 

469 with the given `DatasetType` name. 

470 

471 This is simply a convenience wrapper for `find` that raises `KeyError` 

472 when the dataset type is not found. 

473 

474 Returns 

475 ------- 

476 records : `DatasetRecordStorage` 

477 The object representing the records for the given dataset type. 

478 

479 Raises 

480 ------ 

481 KeyError 

482 Raised if there is no dataset type with the given name. 

483 

484 Notes 

485 ----- 

486 Dataset types registered by another client of the same repository since 

487 the last call to `initialize` or `refresh` may not be found. 

488 """ 

489 result = self.find(name) 

490 if result is None: 

491 raise KeyError(f"Dataset type with name '{name}' not found.") 

492 return result 

493 

494 @abstractmethod 

495 def find(self, name: str) -> Optional[DatasetRecordStorage]: 

496 """Return an object that provides access to the records associated with 

497 the given `DatasetType` name, if one exists. 

498 

499 Parameters 

500 ---------- 

501 name : `str` 

502 Name of the dataset type. 

503 

504 Returns 

505 ------- 

506 records : `DatasetRecordStorage` or `None` 

507 The object representing the records for the given dataset type, or 

508 `None` if there are no records for that dataset type. 

509 

510 Notes 

511 ----- 

512 Dataset types registered by another client of the same repository since 

513 the last call to `initialize` or `refresh` may not be found. 

514 """ 

515 raise NotImplementedError() 

516 

517 @abstractmethod 

518 def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]: 

519 """Ensure that this `Registry` can hold records for the given 

520 `DatasetType`, creating new tables as necessary. 

521 

522 Parameters 

523 ---------- 

524 datasetType : `DatasetType` 

525 Dataset type for which a table should created (as necessary) and 

526 an associated `DatasetRecordStorage` returned. 

527 

528 Returns 

529 ------- 

530 records : `DatasetRecordStorage` 

531 The object representing the records for the given dataset type. 

532 inserted : `bool` 

533 `True` if the dataset type did not exist in the registry before. 

534 

535 Notes 

536 ----- 

537 This operation may not be invoked within a `Database.transaction` 

538 context. 

539 """ 

540 raise NotImplementedError() 

541 

542 @abstractmethod 

543 def remove(self, name: str) -> None: 

544 """Remove the dataset type. 

545 

546 Parameters 

547 ---------- 

548 name : `str` 

549 Name of the dataset type. 

550 """ 

551 raise NotImplementedError() 

552 

553 @abstractmethod 

554 def __iter__(self) -> Iterator[DatasetType]: 

555 """Return an iterator over the the dataset types present in this layer. 

556 

557 Notes 

558 ----- 

559 Dataset types registered by another client of the same layer since 

560 the last call to `initialize` or `refresh` may not be included. 

561 """ 

562 raise NotImplementedError() 

563 

564 @abstractmethod 

565 def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]: 

566 """Return a `DatasetRef` for the given dataset primary key 

567 value. 

568 

569 Parameters 

570 ---------- 

571 id : `DatasetId` 

572 Primary key value for the dataset. 

573 

574 Returns 

575 ------- 

576 ref : `DatasetRef` or `None` 

577 Object representing the dataset, or `None` if no dataset with the 

578 given primary key values exists in this layer. 

579 """ 

580 raise NotImplementedError() 

581 

582 @abstractmethod 

583 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

584 """Return a summary for the given collection. 

585 

586 Parameters 

587 ---------- 

588 collection : `CollectionRecord` 

589 Record describing the collection for which a summary is to be 

590 retrieved. 

591 

592 Returns 

593 ------- 

594 summary : `CollectionSummary` 

595 Summary of the dataset types and governor dimension values in 

596 this collection. 

597 """ 

598 raise NotImplementedError()