Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 94%

76 statements  


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from ... import ddl

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._exceptions import MissingDatasetTypeError
from ..._timespan import Timespan
from ...dimensions import DataCoordinate
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from ...direct_query_driver import QueryJoiner  # new query system, server+direct only
    from .._caching_context import CachingContext
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext  # old registry query system
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added.  The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed deterministically
            from some combination of dataset type, data ID, and run collection
            name; if a record with that ID is already in the database, the new
            record is not inserted.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()
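
    # Illustrative sketch only (not part of the interface): how higher-level
    # registry code might drive ``insert``.  ``storage`` and ``run_record``
    # are hypothetical placeholders for a concrete implementation and a
    # ``RunRecord`` obtained from the collection manager.
    #
    #   refs = list(
    #       storage.insert(
    #           run_record,
    #           expanded_data_ids,
    #           idGenerationMode=DatasetIdGenEnum.UNIQUE,
    #       )
    #   )
    #   # One resolved DatasetRef per input data ID, all in the given run.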

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted.  Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset.  All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            that type does not match the type supported by this class, the
            IDs are ignored and new IDs are generated by the backend.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes should be identical across
        all datasets, but this is not checked here; it must be enforced by
        higher-level registry code.  This method does not need to use those
        attributes; only ``dataId`` and ``id`` are relevant.
        """
        raise NotImplementedError()
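
    # Illustrative sketch only: ``import_`` is what bulk transfer or import
    # code would use when the incoming ``DatasetRef`` objects already carry
    # IDs that should be preserved.  ``storage``, ``run_record``, and
    # ``refs_with_ids`` are hypothetical placeholders:
    #
    #   imported = list(storage.import_(run_record, refs_with_ids))
    #   # Refs whose IDs already exist are returned as-is; new ones are
    #   # inserted.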

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be deleted.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()
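
    # Illustrative sketch only, with hypothetical ``storage``, ``tagged`` (a
    # TAGGED CollectionRecord), and resolved ``refs``:
    #
    #   storage.associate(tagged, refs)
    #   storage.associate(tagged, refs)     # no-op: already present
    #   storage.disassociate(tagged, refs)  # removes them from the tag again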

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated.  All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range.  If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()
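
    # Illustrative sketch only, with hypothetical ``storage``, ``calib`` (a
    # CALIBRATION CollectionRecord), ``ref``, and query ``context``.  A
    # dataset certified over [t1, t4) and then decertified over [t2, t3)
    # ends up with two validity ranges, [t1, t2) and [t3, t4):
    #
    #   storage.certify(calib, [ref], Timespan(t1, t4), context=context)
    #   storage.decertify(calib, Timespan(t2, t3), dataIds=[ref.dataId],
    #                     context=context)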

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `sql.Relation` that represents a query for this
        `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query.  May
            not be of type `CollectionType.CHAINED`.  If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.  Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation.  See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
        """
        raise NotImplementedError()
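
    # Illustrative sketch only, with hypothetical ``storage``, collection
    # records ``run1`` and ``run2``, and query ``context``.  The ``rank``
    # column lets downstream code implement "first matching collection wins"
    # deduplication:
    #
    #   relation = storage.make_relation(
    #       run1, run2, columns={"dataset_id", "run", "rank"}, context=context
    #   )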

    @abstractmethod
    def make_query_joiner(self, collections: Sequence[CollectionRecord], fields: Set[str]) -> QueryJoiner:
        """Make a `..direct_query_driver.QueryJoiner` that represents a search
        for datasets of this type.

        Parameters
        ----------
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Collections to search, in order, after filtering out collections
            with no datasets of this type via collection summaries.
        fields : `~collections.abc.Set` [ `str` ]
            Names of fields to make available in the joiner.  Options include:

            - ``dataset_id`` (UUID)
            - ``run`` (collection name, `str`)
            - ``collection`` (collection name, `str`)
            - ``collection_key`` (collection primary key, manager-dependent)
            - ``timespan`` (validity range, or unbounded for non-calibrations)
            - ``ingest_date`` (time dataset was ingested into repository)

            Dimension keys for the dataset type's required dimensions are
            always included.

        Returns
        -------
        joiner : `..direct_query_driver.QueryJoiner`
            A query-construction object representing a table or subquery.  If
            ``fields`` is empty or ``len(collections) <= 1``, this is
            guaranteed to have rows that are unique over dimension keys.
        """
        raise NotImplementedError()
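
    # Illustrative sketch only, for the newer query system (hypothetical
    # ``storage`` and collection records):
    #
    #   joiner = storage.make_query_joiner(
    #       [run1, run2], fields={"dataset_id", "run", "ingest_date"}
    #   )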

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """
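

# The following is an illustrative, non-normative sketch of how higher-level
# registry code typically drives a concrete ``DatasetRecordStorage``: insert
# datasets into a RUN collection, then certify them into a CALIBRATION
# collection.  All arguments are assumed to come from the collection manager
# and query system; nothing here is part of the interface itself.


def _example_insert_and_certify(
    storage: DatasetRecordStorage,
    run: RunRecord,
    data_ids: Iterable[DataCoordinate],
    calibration: CollectionRecord,
    timespan: Timespan,
    context: SqlQueryContext,
) -> list[DatasetRef]:
    """Sketch: insert datasets and certify them into a calibration collection
    (illustration only).
    """
    refs = list(storage.insert(run, data_ids))
    if storage.datasetType.isCalibration():
        storage.certify(calibration, refs, timespan, context)
    return refs
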

class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of registry schema.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @abstractmethod
    def clone(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
    ) -> DatasetRecordStorageManager:
        """Make an independent copy of this manager instance bound to new
        instances of `Database` and other managers.

        Parameters
        ----------
        db : `Database`
            New `Database` object to use when instantiating the manager.
        collections : `CollectionManager`
            New `CollectionManager` object to use when instantiating the
            manager.
        dimensions : `DimensionRecordStorageManager`
            New `DimensionRecordStorageManager` object to use when
            instantiating the manager.
        caching_context : `CachingContext`
            New `CachingContext` object to use when instantiating the manager.

        Returns
        -------
        instance : `DatasetRecordStorageManager`
            New manager instance with the same configuration as this instance,
            but bound to a new `Database` object.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table.  Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted.  `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()
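
    # Illustrative sketch only: a hypothetical table spec gaining a foreign
    # key into the dataset table.  ``table_spec`` and
    # ``ConcreteDatasetManager`` are placeholders; with the default
    # ``name="dataset"`` the new field is named ``dataset_id``.
    #
    #   field = ConcreteDatasetManager.addDatasetForeignKey(
    #       table_spec, onDelete="CASCADE"
    #   )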

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> bool:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> list[DatasetType]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression : `~typing.Any`
            Expression to resolve.  Will be passed to
            `DatasetTypeWildcard.from_expression`.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.

        Returns
        -------
        dataset_types : `list` [ `DatasetType` ]
            A list of resolved dataset types.
        """
        raise NotImplementedError()

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def fetch_summaries(
        self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries given their names and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [`CollectionRecord`]
            Collection records to query.
        dataset_types : `~collections.abc.Iterable` [`DatasetType`] or `None`
            Dataset types to include in the returned summaries.  If `None`,
            all dataset types are included.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
            Collection summaries indexed by collection record key.  This
            mapping also contains entries for all non-chained collections
            nested within any chained collections that were given.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return type of the ``ingest_date`` column."""
        raise NotImplementedError()
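

# Non-normative usage sketch: how registry-level code might use a concrete
# manager to register a dataset type and insert datasets into a RUN
# collection.  The manager and run record are assumed to come from an
# already-initialized registry; none of this is part of the interface itself.


def _example_register_and_insert(
    manager: DatasetRecordStorageManager,
    dataset_type: DatasetType,
    run: RunRecord,
    data_ids: Iterable[DataCoordinate],
) -> list[DatasetRef]:
    """Sketch: register a dataset type (if needed) and insert datasets
    (illustration only).
    """
    manager.register(dataset_type)
    # ``__getitem__`` raises MissingDatasetTypeError if the type is unknown;
    # ``find`` would return `None` instead.
    storage = manager[dataset_type.name]
    return list(storage.insert(run, data_ids))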