Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 94%

72 statements  

coverage.py v7.4.0, created at 2024-01-16 10:44 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from ... import ddl

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._timespan import Timespan
from ...dimensions import DataCoordinate
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._caching_context import CachingContext
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed deterministically
            from some combination of the dataset type, data ID, and run
            collection name; if a dataset with that ID is already present in
            the database, no new record is inserted.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()
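
    # Illustrative sketch, not part of this interface: the non-`UNIQUE`
    # `DatasetIdGenEnum` modes make repeated inserts idempotent by deriving
    # the dataset ID deterministically instead of minting a random one. The
    # namespace UUID, key encoding, and field order below are assumptions for
    # illustration only, not the algorithm a concrete manager actually uses.
    @staticmethod
    def _example_deterministic_id(
        dataset_type: str, data_id: Mapping[str, Any], run: str
    ) -> Any:
        import uuid

        # Hypothetical namespace; a real implementation would fix its own.
        namespace = uuid.uuid5(uuid.NAMESPACE_DNS, "example.org")
        # Sort the data ID so the same coordinate always hashes identically.
        parts = [dataset_type, run]
        parts += [f"{k}={data_id[k]}" for k in sorted(data_id)]
        return uuid.uuid5(namespace, ";".join(parts))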

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be inserted. Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            that type does not match the type supported by this class, the
            IDs are ignored and new IDs are generated by the backend.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        expected to be identical across all of them, but this is not checked
        here; it must be enforced by higher-level registry code. This method
        does not use those attributes: only ``dataId`` and ``id`` are
        relevant.
132 """ 

133 raise NotImplementedError() 

134 

135 @abstractmethod 

136 def delete(self, datasets: Iterable[DatasetRef]) -> None: 

137 """Fully delete the given datasets from the registry. 

138 

139 Parameters 

140 ---------- 

141 datasets : `~collections.abc.Iterable` [ `DatasetRef` ] 

142 Datasets to be deleted. All datasets must be resolved and have 

143 the same `DatasetType` as ``self``. 

144 

145 Raises 

146 ------ 

147 AmbiguousDatasetError 

148 Raised if any of the given `DatasetRef` instances is unresolved. 

149 """ 

150 raise NotImplementedError() 

151 

152 @abstractmethod 

153 def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None: 

154 """Associate one or more datasets with a collection. 

155 

156 Parameters 

157 ---------- 

158 collection : `CollectionRecord` 

159 The record object describing the collection. ``collection.type`` 

160 must be `~CollectionType.TAGGED`. 

161 datasets : `~collections.abc.Iterable` [ `DatasetRef` ] 

162 Datasets to be associated. All datasets must be resolved and have 

163 the same `DatasetType` as ``self``. 

164 

165 Raises 

166 ------ 

167 AmbiguousDatasetError 

168 Raised if any of the given `DatasetRef` instances is unresolved. 

169 

170 Notes 

171 ----- 

        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``
            or if ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()
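
    # Illustrative sketch, not part of this interface: how `decertify`
    # adjusts a single validity range. Half-open ``(begin, end)`` integer
    # tuples stand in for `Timespan` here (an assumption for illustration).
    # A range that strictly contains the cleared one is split in two, partial
    # overlaps are truncated, and full containment removes the range, e.g.
    # ``_example_clear_range((0, 10), (3, 5)) == [(0, 3), (5, 10)]``.
    @staticmethod
    def _example_clear_range(
        existing: tuple[int, int], cleared: tuple[int, int]
    ) -> list[tuple[int, int]]:
        pieces = []
        if existing[0] < cleared[0]:
            # Keep the part that ends where the cleared range begins.
            pieces.append((existing[0], min(existing[1], cleared[0])))
        if existing[1] > cleared[1]:
            # Keep the part that begins where the cleared range ends.
            pieces.append((max(existing[0], cleared[1]), existing[1]))
        return pieces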

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
285 """Return a `sql.Relation` that represents a query for for this 

286 `DatasetType` in one or more collections. 

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
        """
        raise NotImplementedError()
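
    # Illustrative sketch, not part of this interface: what the ``rank``
    # column enables. Given collections in search order, a later find-first
    # step keeps only the lowest-rank match for each data ID. Plain strings
    # and dicts stand in for the SQL relation machinery (an assumption for
    # illustration only).
    @staticmethod
    def _example_find_first(
        search_order: list[str], contents: Mapping[str, Set[str]]
    ) -> dict[str, int]:
        best: dict[str, int] = {}
        for rank, collection in enumerate(search_order):
            for data_id in contents.get(collection, set()):
                # An earlier collection (lower rank) always wins.
                best.setdefault(data_id, rank)
        return best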

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of the registry schema.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()
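
    # Illustrative sketch, not part of this interface: the shape of what a
    # concrete `addDatasetForeignKey` produces. `types.SimpleNamespace`
    # stands in for ``ddl.FieldSpec``/``ddl.ForeignKeySpec`` (an assumption;
    # the real ddl constructors differ), but the ``{name}_id`` naming and the
    # ``constraint``/``onDelete`` handling follow the documented behavior.
    @staticmethod
    def _example_dataset_fk(
        name: str = "dataset", constraint: bool = True, onDelete: str | None = None
    ) -> tuple[Any, Any]:
        from types import SimpleNamespace

        field = SimpleNamespace(name=f"{name}_id", dtype="uuid")
        if not constraint:
            # Just a joinable column; integrity is not enforced.
            return field, None
        fk = SimpleNamespace(
            table="dataset", source=(field.name,), target=("id",), onDelete=onDelete
        )
        return field, fk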

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` (a `KeyError` subclass) when the dataset
        type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
        return result
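
    # Usage sketch (assumes ``manager`` is a concrete instance obtained from
    # `initialize`, and "raw" is a hypothetical dataset type name): `find`
    # returns `None` for unknown names, while indexing raises.
    @staticmethod
    def _example_lookup(manager: DatasetRecordStorageManager) -> DatasetRecordStorage:
        if manager.find("raw") is None:
            # Another client may have registered it since we last looked.
            manager.refresh()
        # Raises MissingDatasetTypeError if "raw" is still unknown.
        return manager["raw"]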

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> bool:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> list[DatasetType]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression : `~typing.Any`
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.

        Returns
        -------
        dataset_types : `list` [ `DatasetType` ]
            A list of resolved dataset types.
        """
        raise NotImplementedError()
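
    # Illustrative sketch, not part of this interface: the kind of dispatch
    # `resolve_wildcard` performs, with a plain name-to-DatasetType mapping
    # standing in for the registry (an assumption; the real implementation
    # delegates to `DatasetTypeWildcard.from_expression`).
    @staticmethod
    def _example_resolve(
        expression: Any, known: dict[str, DatasetType], missing: list[str] | None = None
    ) -> list[DatasetType]:
        import re

        if expression is ...:
            # Ellipsis matches every known dataset type.
            return list(known.values())
        if isinstance(expression, DatasetType):
            return [expression]
        if isinstance(expression, re.Pattern):
            return [t for n, t in known.items() if expression.fullmatch(n)]
        if expression in known:
            return [known[expression]]
        if missing is not None:
            # An explicit name that was not found is reported to the caller.
            missing.append(expression)
        return []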

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def fetch_summaries(
        self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries given their names and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [`CollectionRecord`]
            Collection records to query.
        dataset_types : `~collections.abc.Iterable` [`DatasetType`] or `None`
            Dataset types to include in the returned summaries. If `None`,
            all dataset types will be included.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [`Any`, `CollectionSummary`]
            Collection summaries indexed by collection record key. This
            mapping will also contain entries for all nested non-chained
            collections of the given chained collections.
623 """ 

624 raise NotImplementedError() 

625 

626 @abstractmethod 

627 def ingest_date_dtype(self) -> type: 

628 """Return type of the ``ingest_date`` column.""" 

629 raise NotImplementedError()