# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from ... import ddl

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._exceptions import MissingDatasetTypeError
from ..._timespan import Timespan
from ...dimensions import DataCoordinate
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._caching_context import CachingContext
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType


    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the datasets
            to be added. The dimensions of all data IDs must be the same as
            ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed from a
            combination of the dataset type, data ID, and run collection
            name; if a dataset with that ID is already in the database, no
            new record is inserted.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
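
        Examples
        --------
        A minimal sketch of how higher-level registry code might call this
        method; ``storage`` is a concrete instance, and ``run_record`` and
        ``data_ids`` are illustrative names, not part of this interface::

            refs = list(storage.insert(run_record, data_ids))
            # Each returned ref is resolved and tied to the given run.
            assert all(ref.run == run_record.name for ref in refs)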

        """
        raise NotImplementedError()

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            that type does not match the type supported by this class, the
            given IDs are ignored and new IDs are generated by the backend.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        expected to be identical across all of them, but this is not checked
        here; it must be enforced by higher-level registry code. This method
        does not need to use those attributes; only ``dataId`` and ``id``
        are relevant.
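
        Examples
        --------
        A sketch of a bulk import, assuming ``refs`` is a sequence of
        resolved `DatasetRef` objects (e.g. read from an export file) whose
        IDs should be preserved when possible::

            imported = list(storage.import_(run_record, refs))
            # IDs are preserved when their type matches the backend's;
            # otherwise new IDs are generated.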

        """
        raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
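
        Examples
        --------
        A sketch, assuming ``tagged_record`` is a `CollectionRecord` for a
        `~CollectionType.TAGGED` collection and ``refs`` are resolved
        datasets of this storage object's dataset type::

            storage.associate(tagged_record, refs)
            # Repeating the call with the same refs is a harmless no-op.
            storage.associate(tagged_record, refs)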

        """
        raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset
            with the same `DatasetType` and data ID and an overlapping
            validity range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
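
        Examples
        --------
        A sketch of certifying calibration datasets, assuming
        ``calib_record`` is a `CollectionRecord` for a
        `~CollectionType.CALIBRATION` collection, ``refs`` are resolved
        calibration datasets, and ``t_begin``/``t_end`` are
        `astropy.time.Time` values accepted by the `Timespan` constructor::

            storage.certify(
                calib_record,
                refs,
                Timespan(begin=t_begin, end=t_end),
                context=context,
            )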

        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
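
        Examples
        --------
        A sketch that clears an entire validity range for every data ID of
        this dataset type, assuming ``calib_record`` and ``span`` are an
        existing `CollectionRecord` and `Timespan`::

            storage.decertify(calib_record, span, context=context)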

        """
        raise NotImplementedError()

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query
        for this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets`
            for most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
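
        Examples
        --------
        A sketch that builds a query over two collections; ``run_a`` and
        ``run_b`` are illustrative `CollectionRecord` instances of the same
        type, and the column names are assumed to be among those accepted by
        `Query.find_datasets`::

            relation = storage.make_relation(
                run_a, run_b, columns={"dataset_id", "rank"}, context=context
            )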

        """
        raise NotImplementedError()

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`)."""


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of the registry schema.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @abstractmethod
    def clone(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
    ) -> DatasetRecordStorageManager:
        """Make an independent copy of this manager instance bound to new
        instances of `Database` and other managers.

        Parameters
        ----------
        db : `Database`
            New `Database` object to use when instantiating the manager.
        collections : `CollectionManager`
            New `CollectionManager` object to use when instantiating the
            manager.
        dimensions : `DimensionRecordStorageManager`
            New `DimensionRecordStorageManager` object to use when
            instantiating the manager.
        caching_context : `CachingContext`
            New `CachingContext` object to use when instantiating the
            manager.

        Returns
        -------
        instance : `DatasetRecordStorageManager`
            New manager instance with the same configuration as this
            instance, but bound to a new `Database` object.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`;
            used to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is the default), add a field that can be
            joined to the dataset primary key, but do not add a foreign key
            constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the
            `ddl.FieldSpec` constructor (only the ``name`` and ``dtype``
            arguments are otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
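
        Examples
        --------
        A sketch of declaring a table that references the dataset table,
        assuming ``manager`` is a concrete subclass and ``spec`` is an
        existing `ddl.TableSpec` for the referencing table::

            id_field = manager.addDatasetForeignKey(spec, onDelete="CASCADE")
            # With the defaults, ``spec`` now has a ``dataset_id`` field and
            # a foreign key constraint on the dataset table.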

        """
        raise NotImplementedError()

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
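
        Examples
        --------
        A sketch contrasting this indexing with `find`; ``manager`` is a
        concrete instance and the dataset type name is illustrative::

            storage = manager["flat"]  # raises if "flat" is not registered
            maybe_storage = manager.find("flat")  # returns None instead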

        """
        result = self.find(name)
        if result is None:
            raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> bool:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
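
        Examples
        --------
        A sketch, assuming ``manager`` is a concrete instance and
        ``dataset_type`` is a `DatasetType` not yet registered; a second
        call with the same definition returns `False`::

            inserted = manager.register(dataset_type)
            assert inserted
            assert manager.register(dataset_type) is False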

        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> list[DatasetType]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression : `~typing.Any`
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        missing : `list` of `str`, optional
            Dataset type names that were explicitly given (i.e. not regular
            expression patterns) but not found will be appended to this
            list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.

        Returns
        -------
        dataset_types : `list` [ `DatasetType` ]
            A list of resolved dataset types.
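
        Examples
        --------
        A sketch, assuming ``manager`` is a concrete instance; the dataset
        type names here are illustrative::

            missing: list[str] = []
            found = manager.resolve_wildcard(["flat", "bias"], missing=missing)
            # Explicitly named types that are not registered end up in
            # ``missing`` rather than raising.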

        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with
            the given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def fetch_summaries(
        self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch summaries for the given collections and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [ `CollectionRecord` ]
            Collection records to query.
        dataset_types : `~collections.abc.Iterable` [ `DatasetType` ] or `None`
            Dataset types to include in the returned summaries. If `None`,
            all dataset types will be included.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [ `Any`, `CollectionSummary` ]
            Collection summaries indexed by collection record key. This
            mapping will also contain all nested non-chained collections of
            the given chained collections.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        raise NotImplementedError()