Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 69%


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")

import enum
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional, Tuple

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, SimpleQuery, Timespan, ddl
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from ..summaries import CollectionSummary
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    one auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added.  The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode the ID is computed deterministically from
            some combination of dataset type, data ID, and run collection
            name; if that ID is already in the database, no new record is
            inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
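
        Examples
        --------
        A minimal sketch; assumes ``storage`` is a concrete implementation
        obtained from a `DatasetRecordStorageManager`, and that ``run`` and
        ``dataIds`` are already in hand:

        >>> refs = list(storage.insert(run, dataIds))  # doctest: +SKIP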

102 """ 

103 raise NotImplementedError() 

104 

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted.  Datasets may specify the ``id``
            attribute, which will be used for the inserted datasets.  All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            that type does not match the type supported by this class, the
            IDs will be ignored and new IDs will be generated by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE` each new dataset is inserted with a new unique ID.
            With a non-`UNIQUE` mode the ID is computed deterministically from
            some combination of dataset type, data ID, and run collection
            name; if that ID is already in the database, no new record is
            inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if imported IDs clash with existing ones.  This
            option has no effect on globally-unique IDs, which are always
            re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        expected to be identical across all of them, but this is not checked;
        it should be enforced by higher-level registry code.  This method does
        not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
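
        Examples
        --------
        A sketch of re-importing previously exported datasets while
        preserving their integer IDs; ``refs`` is assumed to be an iterable
        of resolved `DatasetRef` instances:

        >>> imported = list(
        ...     storage.import_(run, refs, reuseIds=True)
        ... )  # doctest: +SKIP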

150 """ 

151 raise NotImplementedError() 

152 

    @abstractmethod
    def find(
        self, collection: CollectionRecord, dataId: DataCoordinate, timespan: Optional[Timespan] = None
    ) -> Optional[DatasetRef]:
        """Search a collection for a dataset with the given data ID.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection to search for the
            dataset.  May have any `CollectionType`.
        dataId : `DataCoordinate`
            Complete (but not necessarily expanded) data ID to search with,
            with ``dataId.graph == self.datasetType.dimensions``.
        timespan : `Timespan`, optional
            A timespan that the validity range of the dataset must overlap.
            Required if ``collection.type is CollectionType.CALIBRATION``, and
            ignored otherwise.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A resolved `DatasetRef` (without components populated), or `None`
            if no matching dataset was found.
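
        Examples
        --------
        A sketch, assuming ``collection`` and a complete ``dataId`` are
        already in hand:

        >>> ref = storage.find(collection, dataId)  # doctest: +SKIP
        >>> if ref is None:  # doctest: +SKIP
        ...     print("no matching dataset")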

177 """ 

178 raise NotImplementedError() 

179 

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
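
        Examples
        --------
        A sketch; assumes ``taggedCollection`` is a `~CollectionType.TAGGED`
        collection record and ``refs`` are resolved `DatasetRef` instances:

        >>> storage.associate(taggedCollection, refs)  # doctest: +SKIP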

223 """ 

224 raise NotImplementedError() 

225 

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated.  All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self, collection: CollectionRecord, datasets: Iterable[DatasetRef], timespan: Timespan
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        TypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``
            or if ``self.datasetType.isCalibration() is False``.
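
        Examples
        --------
        A sketch of certifying datasets over a validity range; assumes
        ``calibCollection`` is a `~CollectionType.CALIBRATION` collection
        record and ``t_begin``/``t_end`` are `astropy.time.Time` values:

        >>> storage.certify(
        ...     calibCollection, refs, Timespan(t_begin, t_end)
        ... )  # doctest: +SKIP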

276 """ 

277 raise NotImplementedError() 

278 

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataCoordinate]] = None,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range.  If `None`, all data IDs for ``self.datasetType`` will be
            decertified.

        Raises
        ------
        TypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
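
        Examples
        --------
        A sketch of clearing a validity window for one specific data ID (all
        names assumed):

        >>> storage.decertify(
        ...     calibCollection, Timespan(t_begin, t_end), dataIds=[dataId]
        ... )  # doctest: +SKIP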

309 """ 

310 raise NotImplementedError() 

311 

    @abstractmethod
    def select(
        self,
        *collections: CollectionRecord,
        dataId: SimpleQuery.Select.Or[DataCoordinate] = SimpleQuery.Select,
        id: SimpleQuery.Select.Or[Optional[DatasetId]] = SimpleQuery.Select,
        run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
        timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
        ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
    ) -> SimpleQuery:
        """Return a SQLAlchemy object that represents a ``SELECT`` query for
        this `DatasetType`.

        All arguments can either be a value that constrains the query or
        the `SimpleQuery.Select` tag object to indicate that the value should
        be returned in the columns in the ``SELECT`` clause.  The default is
        `SimpleQuery.Select`.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query.  May
            not be of type `CollectionType.CHAINED`.  If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.
        dataId : `DataCoordinate` or `Select`
            The data ID to restrict results with, or an instruction to return
            the data ID via columns with names
            ``self.datasetType.dimensions.names``.
        id : `DatasetId`, `Select`, or `None`
            The primary key value for the dataset, an instruction to return it
            via an ``id`` column, or `None` to ignore it entirely.
        run : `None` or `Select`
            If `Select` (default), include the dataset's run key value (as a
            column labeled with the return value of
            ``CollectionManager.getRunForeignKeyName``).
            If `None`, do not include this column (to constrain the run,
            pass a `RunRecord` as the ``collection`` argument instead).
        timespan : `None`, `Select`, or `Timespan`
            If `Select` (default), include the validity range timespan in the
            result columns.  If a `Timespan` instance, constrain the results
            to those whose validity ranges overlap that given timespan.
            Ignored unless ``collection.type is CollectionType.CALIBRATION``.
        ingestDate : `None`, `Select`, or `Timespan`
            If `Select`, include the ingest timestamp in the result columns.
            If a `Timespan` instance, constrain the results to those whose
            ingest times lie inside the given timespan, and also include the
            timestamp in the result columns.  If `None` (default), there is
            no constraint and the timestamp is not returned.

        Returns
        -------
        query : `SimpleQuery`
            A struct containing a SQLAlchemy object that represents a
            simple ``SELECT`` query.
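
        Examples
        --------
        A sketch of querying one collection for dataset IDs only, with the
        data ID constrained to a known value and the run column omitted
        (names assumed):

        >>> query = storage.select(collection, dataId=dataId, run=None)  # doctest: +SKIP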

367 """ 

368 raise NotImplementedError() 

369 

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: Optional[str] = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table.  Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted.  `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
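
        Examples
        --------
        A sketch; assumes ``tableSpec`` is a `ddl.TableSpec` under
        construction for a table whose rows must reference datasets:

        >>> idSpec = manager.addDatasetForeignKey(
        ...     tableSpec, onDelete="CASCADE"
        ... )  # doctest: +SKIP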

485 """ 

486 raise NotImplementedError() 

487 

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises `KeyError`
        when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        KeyError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise KeyError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> Optional[DatasetRecordStorage]:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
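
        Examples
        --------
        A sketch; the dataset type name is illustrative:

        >>> storage = manager.find("flat")  # doctest: +SKIP
        >>> if storage is None:  # doctest: +SKIP
        ...     print("dataset type not registered")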

543 """ 

544 raise NotImplementedError() 

545 

    @abstractmethod
    def register(self, datasetType: DatasetType) -> Tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
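
        Examples
        --------
        A sketch; assumes ``datasetType`` has already been constructed:

        >>> storage, inserted = manager.register(datasetType)  # doctest: +SKIP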

568 """ 

569 raise NotImplementedError() 

570 

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def __iter__(self) -> Iterator[DatasetType]:
        """Return an iterator over the dataset types present in this layer.

        Notes
        -----
        Dataset types registered by another client of the same layer since
        the last call to `initialize` or `refresh` may not be included.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> Optional[DatasetRef]:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
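
        Examples
        --------
        A sketch; the UUID value is illustrative:

        >>> import uuid
        >>> ref = manager.getDatasetRef(uuid.UUID(int=0))  # doctest: +SKIP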

608 """ 

609 raise NotImplementedError() 

610 

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()