# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")

from abc import ABC, abstractmethod
import enum
from typing import (
    Any,
    Iterable,
    Iterator,
    Optional,
    Tuple,
    TYPE_CHECKING,
)

from ...core import (
    DataCoordinate,
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    SimpleQuery,
    Timespan,
)
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from ..summaries import CollectionSummary
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager
    from ._collections import CollectionManager, CollectionRecord, RunRecord


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """

75 

76class DatasetRecordStorage(ABC): 

77 """An interface that manages the records associated with a particular 

78 `DatasetType`. 

79 

80 Parameters 

81 ---------- 

82 datasetType : `DatasetType` 

83 Dataset type whose records this object manages. 

84 """ 

85 def __init__(self, datasetType: DatasetType): 

86 self.datasetType = datasetType 

87 

    @abstractmethod
    def insert(self, run: RunRecord, dataIds: Iterable[DataCoordinate],
               idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with its own unique ID.
            With a non-`UNIQUE` mode the ID is computed from a combination of
            dataset type, data ID, and run collection name; if the same ID is
            already in the database the new record is not inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()
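
    # A hedged sketch of a call to ``insert`` on some concrete implementation
    # of this interface; ``storage``, ``run``, and ``expandedDataIds`` are
    # hypothetical names, not part of this module:
    #
    #     refs = list(storage.insert(run, expandedDataIds,
    #                                DatasetIdGenEnum.DATAID_TYPE_RUN))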

    @abstractmethod
    def import_(self, run: RunRecord, datasets: Iterable[DatasetRef],
                idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                reuseIds: bool = False) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            that type does not match the type supported by this class, the
            IDs will be ignored and new IDs will be generated by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with its own unique ID.
            With a non-`UNIQUE` mode the ID is computed from a combination of
            dataset type, data ID, and run collection name; if the same ID is
            already in the database the new record is not inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if imported IDs clash with existing ones. This
            option has no effect on the use of globally-unique IDs, which are
            always re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        supposed to be identical across all of them, but this is not checked
        and should be enforced by higher-level registry code. This method
        does not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
        """
        raise NotImplementedError()
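
    # Sketch only: importing previously exported datasets while keeping their
    # integer IDs; ``storage``, ``run``, and ``exportedRefs`` are hypothetical:
    #
    #     imported = list(storage.import_(run, exportedRefs, reuseIds=True))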

    @abstractmethod
    def find(self, collection: CollectionRecord, dataId: DataCoordinate,
             timespan: Optional[Timespan] = None) -> Optional[DatasetRef]:
        """Search a collection for a dataset with the given data ID.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to search for the
            dataset. May have any `CollectionType`.
        dataId : `DataCoordinate`
            Complete (but not necessarily expanded) data ID to search with,
            with ``dataId.graph == self.datasetType.dimensions``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            Required if ``collection.type is CollectionType.CALIBRATION``, and
            ignored otherwise.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A resolved `DatasetRef` (without components populated), or `None`
            if no matching dataset was found.
        """
        raise NotImplementedError()
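
    # Sketch: searching a CALIBRATION collection requires a timespan; all
    # names here (``storage``, ``calibCollection``, ``dataId``, ``obsSpan``)
    # are hypothetical:
    #
    #     ref = storage.find(calibCollection, dataId, timespan=obsSpan)
    #     if ref is None:
    #         ...  # no dataset with a validity range overlapping obsSpan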

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()
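
    # Sketch pairing the two TAGGED-collection operations; ``storage``,
    # ``taggedCollection``, and ``refs`` are hypothetical:
    #
    #     storage.associate(taggedCollection, refs)
    #     storage.disassociate(taggedCollection, refs)  # undoes the above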

    @abstractmethod
    def certify(self, collection: CollectionRecord, datasets: Iterable[DatasetRef],
                timespan: Timespan) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        TypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(self, collection: CollectionRecord, timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataCoordinate]] = None) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.

        Raises
        ------
        TypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()
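
    # Sketch of the calibration lifecycle described above (all names
    # hypothetical): certify datasets over a validity range, then clear part
    # of it; overlapping ranges are adjusted or split rather than deleted.
    #
    #     storage.certify(calibCollection, refs, validRange)
    #     storage.decertify(calibCollection, obsoleteRange, dataIds=someDataIds)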

    @abstractmethod
    def select(self, *collections: CollectionRecord,
               dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
               id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select,
               run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
               timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
               ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
               ) -> SimpleQuery:
        """Return a SQLAlchemy object that represents a ``SELECT`` query for
        this `DatasetType`.

        All arguments can either be a value that constrains the query or
        the `SimpleQuery.Select` tag object to indicate that the value should
        be returned in the columns in the ``SELECT`` clause. The default is
        `SimpleQuery.Select`.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.
        dataId : `DataCoordinate` or `Select`
            The data ID to restrict results with, or an instruction to return
            the data ID via columns with names
            ``self.datasetType.dimensions.names``.
        id : `DatasetId`, `Select`, or `None`
            The primary key value for the dataset, an instruction to return it
            via an ``id`` column, or `None` to ignore it entirely.
        run : `None` or `Select`
            If `Select` (default), include the dataset's run key value (as a
            column labeled with the return value of
            ``CollectionManager.getRunForeignKeyName``).
            If `None`, do not include this column (to constrain the run,
            pass a `RunRecord` as the ``collection`` argument instead).
        timespan : `None`, `Select`, or `Timespan`
            If `Select` (default), include the validity range timespan in the
            result columns. If a `Timespan` instance, constrain the results
            to those whose validity ranges overlap the given timespan.
            Ignored unless ``collection.type is CollectionType.CALIBRATION``.
        ingestDate : `None`, `Select`, or `Timespan`
            If `Select`, include the ingest timestamp in the result columns.
            If a `Timespan` instance, constrain the results to those whose
            ingest times fall inside the given timespan, and also include the
            timestamp in the result columns. If `None` (default), there is
            no constraint and the timestamp is not returned.

        Returns
        -------
        query : `SimpleQuery`
            A struct containing the SQLAlchemy object representing a simple
            ``SELECT`` query.
        """
        raise NotImplementedError()
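
    # Sketch of the tag-vs-value convention documented above: pass a concrete
    # value to constrain the query, `SimpleQuery.Select` to get a column back,
    # or `None` to drop the column; ``storage`` and ``runRecord`` are
    # hypothetical:
    #
    #     query = storage.select(runRecord,
    #                            dataId=SimpleQuery.Select,  # return data ID columns
    #                            id=SimpleQuery.Select,      # return the id column
    #                            run=None,       # run is fixed by the collection
    #                            timespan=None)  # not a calibration collection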

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext, *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(cls, tableSpec: ddl.TableSpec, *,
                             name: str = "dataset", constraint: bool = True,
                             onDelete: Optional[str] = None,
                             **kwargs: Any) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()
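
    # Sketch: adding a ``dataset_id`` foreign key to another table's spec
    # before it is created; ``tableSpec`` is a hypothetical `ddl.TableSpec`
    # and ``manager`` a concrete subclass of this interface:
    #
    #     idSpec = manager.addDatasetForeignKey(tableSpec, onDelete="CASCADE")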

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise KeyError(f"Dataset type with name '{name}' not found.")
        return result
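
    # Sketch contrasting the two lookup styles; ``manager`` is hypothetical
    # and the dataset type name "flat" is illustrative only:
    #
    #     records = manager["flat"]       # raises KeyError if not found
    #     records = manager.find("flat")  # returns None if not found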

    @abstractmethod
    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()
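
    # Sketch: ``register`` is idempotent and reports whether the dataset type
    # was new; ``manager`` and ``datasetType`` are hypothetical:
    #
    #     records, inserted = manager.register(datasetType)
    #     if inserted:
    #         ...  # first registration of this dataset type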

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterator[DatasetType]:
        """Return an iterator over the dataset types present in this layer.

        Notes
        -----
        Dataset types registered by another client of the same layer since
        the last call to `initialize` or `refresh` may not be included.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()