# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")

import enum
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional, Tuple

import sqlalchemy.sql

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, SimpleQuery, Timespan, ddl
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from ..summaries import CollectionSummary
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
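
    Examples
    --------
    The deterministic modes are conceptually similar to name-based UUIDs; a
    rough sketch of the idea (not the actual implementation) using only the
    standard library::

        import uuid

        # Derive a stable ID from dataset type name, data ID, and run name.
        namespace = uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")
        dataset_id = uuid.uuid5(namespace, "flat;instrument=HSC;run=calib/v1")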

45 """ 

46 

47 UNIQUE = 0 

48 """Unique mode generates unique ID for each inserted dataset, e.g. 

49 auto-generated by database or random UUID. 

50 """ 

51 

52 DATAID_TYPE = 1 

53 """In this mode ID is computed deterministically from a combination of 

54 dataset type and dataId. 

55 """ 

56 

57 DATAID_TYPE_RUN = 2 

58 """In this mode ID is computed deterministically from a combination of 

59 dataset type, dataId, and run collection name. 

60 """ 

61 

62 

63class DatasetRecordStorage(ABC): 

64 """An interface that manages the records associated with a particular 

65 `DatasetType`. 

66 

67 Parameters 

68 ---------- 

69 datasetType : `DatasetType` 

70 Dataset type whose records this object manages. 

71 """ 

72 

73 def __init__(self, datasetType: DatasetType): 

74 self.datasetType = datasetType 

75 

76 @abstractmethod 

77 def insert( 

78 self, 

79 run: RunRecord, 

80 dataIds: Iterable[DataCoordinate], 

81 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

82 ) -> Iterator[DatasetRef]: 

83 """Insert one or more dataset entries into the database. 

84 

85 Parameters 

86 ---------- 

87 run : `RunRecord` 

88 The record object describing the `~CollectionType.RUN` collection 

89 this dataset will be associated with. 

90 dataIds : `Iterable` [ `DataCoordinate` ] 

91 Expanded data IDs (`DataCoordinate` instances) for the 

92 datasets to be added. The dimensions of all data IDs must be the 

93 same as ``self.datasetType.dimensions``. 

94 idMode : `DatasetIdGenEnum` 

95 With `UNIQUE` each new dataset is inserted with its new unique ID. 

96 With non-`UNIQUE` mode ID is computed from some combination of 

97 dataset type, dataId, and run collection name; if the same ID is 

98 already in the database then new record is not inserted. 

99 

100 Returns 

101 ------- 

102 datasets : `Iterable` [ `DatasetRef` ] 

103 References to the inserted datasets. 
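
        Examples
        --------
        A minimal usage sketch; ``storage`` is assumed to be an instance of a
        concrete subclass, with ``run`` and ``dataIds`` obtained elsewhere::

            refs = list(storage.insert(run, dataIds))
            for ref in refs:
                print(ref.id, ref.dataId)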

104 """ 

105 raise NotImplementedError() 

106 

107 @abstractmethod 

108 def import_( 

109 self, 

110 run: RunRecord, 

111 datasets: Iterable[DatasetRef], 

112 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

113 reuseIds: bool = False, 

114 ) -> Iterator[DatasetRef]: 

115 """Insert one or more dataset entries into the database. 

116 

117 Parameters 

118 ---------- 

119 run : `RunRecord` 

120 The record object describing the `~CollectionType.RUN` collection 

121 this dataset will be associated with. 

122 datasets : `~collections.abc.Iterable` of `DatasetRef` 

123 Datasets to be inserted. Datasets can specify ``id`` attribute 

124 which will be used for inserted datasets. All dataset IDs must 

125 have the same type (`int` or `uuid.UUID`), if type of dataset IDs 

126 does not match type supported by this class then IDs will be 

127 ignored and new IDs will be generated by backend. 

128 idGenerationMode : `DatasetIdGenEnum` 

129 With `UNIQUE` each new dataset is inserted with its new unique ID. 

130 With non-`UNIQUE` mode ID is computed from some combination of 

131 dataset type, dataId, and run collection name; if the same ID is 

132 already in the database then new record is not inserted. 

133 reuseIds : `bool`, optional 

134 If `True` then forces re-use of imported dataset IDs for integer 

135 IDs which are normally generated as auto-incremented; exception 

136 will be raised if imported IDs clash with existing ones. This 

137 option has no effect on the use of globally-unique IDs which are 

138 always re-used (or generated if integer IDs are being imported). 

139 

140 Returns 

141 ------- 

142 datasets : `Iterable` [ `DatasetRef` ] 

143 References to the inserted or existing datasets. 

144 

145 Notes 

146 ----- 

147 The ``datasetType`` and ``run`` attributes of datasets are supposed to 

148 be identical across all datasets but this is not checked and it should 

149 be enforced by higher level registry code. This method does not need 

150 to use those attributes from datasets, only ``dataId`` and ``id`` are 

151 relevant. 
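
        Examples
        --------
        A sketch of importing previously resolved references into a run;
        ``storage``, ``run``, and ``refs`` are assumed to exist, and the
        integer IDs carried by ``refs`` are kept as-is::

            imported = list(storage.import_(run, refs, reuseIds=True))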

152 """ 

153 raise NotImplementedError() 

154 

155 @abstractmethod 

156 def find( 

157 self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None 

158 ) -> Optional[DatasetRef]: 

159 """Search a collection for a dataset with the given data ID. 

160 

161 Parameters 

162 ---------- 

163 collection : `CollectionRecord` 

164 The record object describing the collection to search for the 

165 dataset. May have any `CollectionType`. 

166 dataId: `DataCoordinate` 

167 Complete (but not necessarily expanded) data ID to search with, 

168 with ``dataId.graph == self.datasetType.dimensions``. 

169 timespan : `Timespan`, optional 

170 A timespan that the validity range of the dataset must overlap. 

171 Required if ``collection.type is CollectionType.CALIBRATION``, and 

172 ignored otherwise. 

173 

174 Returns 

175 ------- 

176 ref : `DatasetRef` 

177 A resolved `DatasetRef` (without components populated), or `None` 

178 if no matching dataset was found. 
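
        Examples
        --------
        A sketch of a calibration lookup; ``storage``, ``collection``, and
        ``dataId`` are assumed to exist, with ``timespan`` constructed
        elsewhere::

            ref = storage.find(collection, dataId, timespan=timespan)
            if ref is None:
                print("no matching dataset in this collection")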

179 """ 

180 raise NotImplementedError() 

181 

182 @abstractmethod 

183 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

184 """Fully delete the given datasets from the registry. 

185 

186 Parameters 

187 ---------- 

188 datasets : `Iterable` [ `DatasetRef` ] 

189 Datasets to be deleted. All datasets must be resolved and have 

190 the same `DatasetType` as ``self``. 

191 

192 Raises 

193 ------ 

194 AmbiguousDatasetError 

195 Raised if any of the given `DatasetRef` instances is unresolved. 

196 """ 

197 raise NotImplementedError() 

198 

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
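
        Examples
        --------
        A sketch; ``tagged`` is assumed to be a `CollectionRecord` for a
        `~CollectionType.TAGGED` collection and ``refs`` a list of resolved
        dataset references::

            storage.associate(tagged, refs)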

225 """ 

226 raise NotImplementedError() 

227 

228 @abstractmethod 

229 def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

230 """Remove one or more datasets from a collection. 

231 

232 Parameters 

233 ---------- 

234 collection : `CollectionRecord` 

235 The record object describing the collection. ``collection.type`` 

236 must be `~CollectionType.TAGGED`. 

237 datasets : `Iterable` [ `DatasetRef` ] 

238 Datasets to be disassociated. All datasets must be resolved and 

239 have the same `DatasetType` as ``self``. 

240 

241 Raises 

242 ------ 

243 AmbiguousDatasetError 

244 Raised if any of the given `DatasetRef` instances is unresolved. 

245 """ 

246 raise NotImplementedError() 

247 

    @abstractmethod
    def certify(
        self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
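
        Examples
        --------
        A sketch; ``calib`` is assumed to be a record for a
        `~CollectionType.CALIBRATION` collection, with ``refs`` and a
        `Timespan` constructed elsewhere::

            storage.certify(calib, refs, timespan)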

278 """ 

279 raise NotImplementedError() 

280 

281 @abstractmethod 

282 def decertify( 

283 self, 

284 collection: CollectionRecord, 

285 timespan: Timespan, 

286 *, 

287 dataIds: Optional[Iterable[DataCoordinate]] = None, 

288 ) -> None: 

289 """Remove or adjust datasets to clear a validity range within a 

290 calibration collection. 

291 

292 Parameters 

293 ---------- 

294 collection : `CollectionRecord` 

295 The record object describing the collection. ``collection.type`` 

296 must be `~CollectionType.CALIBRATION`. 

297 timespan : `Timespan` 

298 The validity range to remove datasets from within the collection. 

299 Datasets that overlap this range but are not contained by it will 

300 have their validity ranges adjusted to not overlap it, which may 

301 split a single dataset validity range into two. 

302 dataIds : `Iterable` [ `DataCoordinate` ], optional 

303 Data IDs that should be decertified within the given validity range 

304 If `None`, all data IDs for ``self.datasetType`` will be 

305 decertified. 

306 

307 Raises 

308 ------ 

309 CollectionTypeError 

310 Raised if ``collection.type is not CollectionType.CALIBRATION``. 
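
        Examples
        --------
        A sketch clearing a validity range for a single data ID; ``calib``,
        ``timespan``, and ``dataId`` are assumed to exist::

            storage.decertify(calib, timespan, dataIds=[dataId])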

311 """ 

312 raise NotImplementedError() 

313 

314 @abstractmethod 

315 def select( 

316 self, 

317 *collections: CollectionRecord, 

318 dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select, 

319 id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select, 

320 run: SimpleQuery.Select.Or[None] = SimpleQuery.Select, 

321 timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select, 

322 ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None, 

323 ) -> sqlalchemy.sql.Selectable: 

324 """Return a SQLAlchemy object that represents a ``SELECT`` query for 

325 this `DatasetType`. 

326 

327 All arguments can either be a value that constrains the query or 

328 the `SimpleQuery.Select` tag object to indicate that the value should 

329 be returned in the columns in the ``SELECT`` clause. The default is 

330 `SimpleQuery.Select`. 

331 

332 Parameters 

333 ---------- 

334 *collections : `CollectionRecord` 

335 The record object(s) describing the collection(s) to query. May 

336 not be of type `CollectionType.CHAINED`. If multiple collections 

337 are passed, the query will search all of them in an unspecified 

338 order, and all collections must have the same type. 

339 dataId : `DataCoordinate` or `Select` 

340 The data ID to restrict results with, or an instruction to return 

341 the data ID via columns with names 

342 ``self.datasetType.dimensions.names``. 

343 id : `DatasetId`, `Select` or None, 

344 The primary key value for the dataset, an instruction to return it 

345 via a ``id`` column, or `None` to ignore it entirely. 

346 run : `None` or `Select` 

347 If `Select` (default), include the dataset's run key value (as 

348 column labeled with the return value of 

349 ``CollectionManager.getRunForeignKeyName``). 

350 If `None`, do not include this column (to constrain the run, 

351 pass a `RunRecord` as the ``collection`` argument instead). 

352 timespan : `None`, `Select`, or `Timespan` 

353 If `Select` (default), include the validity range timespan in the 

354 result columns. If a `Timespan` instance, constrain the results to 

355 those whose validity ranges overlap that given timespan. Ignored 

356 for collection types other than `~CollectionType.CALIBRATION``, 

357 but `None` should be passed explicitly if a mix of 

358 `~CollectionType.CALIBRATION` and other types are passed in. 

359 ingestDate : `None`, `Select`, or `Timespan` 

360 If `Select` include the ingest timestamp in the result columns. 

361 If a `Timespan` instance, constrain the results to those whose 

362 ingest times which are inside given timespan and also include 

363 timestamp in the result columns. If `None` (default) then there is 

364 no constraint and timestamp is not returned. 

365 

366 Returns 

367 ------- 

368 query : `sqlalchemy.sql.Selectable` 

369 A SQLAlchemy object representing a simple ``SELECT`` query. 
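
        Examples
        --------
        A sketch building a selectable that constrains the data ID and keeps
        the default dataset ID and run columns; ``storage``, ``run_record``,
        and ``dataId`` are assumed to exist::

            sql = storage.select(run_record, dataId=dataId, timespan=None)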

370 """ 

371 raise NotImplementedError() 

372 

373 datasetType: DatasetType 

374 """Dataset type whose records this object manages (`DatasetType`). 

375 """ 

376 

377 

378class DatasetRecordStorageManager(VersionedExtension): 

379 """An interface that manages the tables that describe datasets. 

380 

381 `DatasetRecordStorageManager` primarily serves as a container and factory 

382 for `DatasetRecordStorage` instances, which each provide access to the 

383 records for a different `DatasetType`. 

384 """ 

385 

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
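
        Examples
        --------
        A sketch guarding a deterministic-ID insert on a concrete
        ``manager``::

            if not manager.supportsIdGenerationMode(
                DatasetIdGenEnum.DATAID_TYPE_RUN
            ):
                raise NotImplementedError("deterministic IDs not supported")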

447 """ 

448 raise NotImplementedError() 

449 

450 @classmethod 

451 @abstractmethod 

452 def addDatasetForeignKey( 

453 cls, 

454 tableSpec: ddl.TableSpec, 

455 *, 

456 name: str = "dataset", 

457 constraint: bool = True, 

458 onDelete: Optional[str] = None, 

459 **kwargs: Any, 

460 ) -> ddl.FieldSpec: 

461 """Add a foreign key (field and constraint) referencing the dataset 

462 table. 

463 

464 Parameters 

465 ---------- 

466 tableSpec : `ddl.TableSpec` 

467 Specification for the table that should reference the dataset 

468 table. Will be modified in place. 

469 name: `str`, optional 

470 A name to use for the prefix of the new field; the full name is 

471 ``{name}_id``. 

472 onDelete: `str`, optional 

473 One of "CASCADE" or "SET NULL", indicating what should happen to 

474 the referencing row if the collection row is deleted. `None` 

475 indicates that this should be an integrity error. 

476 constraint: `bool`, optional 

477 If `False` (`True` is default), add a field that can be joined to 

478 the dataset primary key, but do not add a foreign key constraint. 

479 **kwargs 

480 Additional keyword arguments are forwarded to the `ddl.FieldSpec` 

481 constructor (only the ``name`` and ``dtype`` arguments are 

482 otherwise provided). 

483 

484 Returns 

485 ------- 

486 idSpec : `ddl.FieldSpec` 

487 Specification for the ID field. 
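
        Examples
        --------
        A sketch adding a dataset foreign key to a new table specification;
        ``manager`` is a concrete instance and the empty `ddl.TableSpec` is
        illustrative::

            spec = ddl.TableSpec(fields=[])
            idSpec = manager.addDatasetForeignKey(spec, onDelete="CASCADE")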

488 """ 

489 raise NotImplementedError() 

490 

491 @abstractmethod 

492 def refresh(self) -> None: 

493 """Ensure all other operations on this manager are aware of any 

494 dataset types that may have been registered by other clients since 

495 it was initialized or last refreshed. 

496 """ 

497 raise NotImplementedError() 

498 

499 def __getitem__(self, name: str) -> DatasetRecordStorage: 

500 """Return the object that provides access to the records associated 

501 with the given `DatasetType` name. 

502 

503 This is simply a convenience wrapper for `find` that raises `KeyError` 

504 when the dataset type is not found. 

505 

506 Returns 

507 ------- 

508 records : `DatasetRecordStorage` 

509 The object representing the records for the given dataset type. 

510 

511 Raises 

512 ------ 

513 KeyError 

514 Raised if there is no dataset type with the given name. 

515 

516 Notes 

517 ----- 

518 Dataset types registered by another client of the same repository since 

519 the last call to `initialize` or `refresh` may not be found. 
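
        Examples
        --------
        A sketch of the mapping-style lookup on a concrete ``manager``; the
        dataset type name is illustrative::

            try:
                storage = manager["flat"]
            except KeyError:
                print("dataset type not registered")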

520 """ 

521 result = self.find(name) 

522 if result is None: 

523 raise KeyError(f"Dataset type with name '{name}' not found.") 

524 return result 

525 

526 @abstractmethod 

527 def find(self, name: str) -> Optional[DatasetRecordStorage]: 

528 """Return an object that provides access to the records associated with 

529 the given `DatasetType` name, if one exists. 

530 

531 Parameters 

532 ---------- 

533 name : `str` 

534 Name of the dataset type. 

535 

536 Returns 

537 ------- 

538 records : `DatasetRecordStorage` or `None` 

539 The object representing the records for the given dataset type, or 

540 `None` if there are no records for that dataset type. 

541 

542 Notes 

543 ----- 

544 Dataset types registered by another client of the same repository since 

545 the last call to `initialize` or `refresh` may not be found. 

546 """ 

547 raise NotImplementedError() 

548 

    @abstractmethod
    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
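
        Examples
        --------
        A sketch; ``manager`` is a concrete instance and ``datasetType`` a
        `DatasetType` constructed elsewhere::

            storage, inserted = manager.register(datasetType)
            if inserted:
                print(f"registered {storage.datasetType.name}")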

571 """ 

572 raise NotImplementedError() 

573 

574 @abstractmethod 

575 def remove(self, name: str) -> None: 

576 """Remove the dataset type. 

577 

578 Parameters 

579 ---------- 

580 name : `str` 

581 Name of the dataset type. 

582 """ 

583 raise NotImplementedError() 

584 

585 @abstractmethod 

586 def __iter__(self) -> Iterator[DatasetType]: 

587 """Return an iterator over the the dataset types present in this layer. 

588 

589 Notes 

590 ----- 

591 Dataset types registered by another client of the same layer since 

592 the last call to `initialize` or `refresh` may not be included. 

593 """ 

594 raise NotImplementedError() 

595 

596 @abstractmethod 

597 def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]: 

598 """Return a `DatasetRef` for the given dataset primary key 

599 value. 

600 

601 Parameters 

602 ---------- 

603 id : `DatasetId` 

604 Primary key value for the dataset. 

605 

606 Returns 

607 ------- 

608 ref : `DatasetRef` or `None` 

609 Object representing the dataset, or `None` if no dataset with the 

610 given primary key values exists in this layer. 

611 """ 

612 raise NotImplementedError() 

613 

614 @abstractmethod 

615 def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary: 

616 """Return a summary for the given collection. 

617 

618 Parameters 

619 ---------- 

620 collection : `CollectionRecord` 

621 Record describing the collection for which a summary is to be 

622 retrieved. 

623 

624 Returns 

625 ------- 

626 summary : `CollectionSummary` 

627 Summary of the dataset types and governor dimension values in 

628 this collection. 

629 """ 

630 raise NotImplementedError()