Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 94%

66 statements  

coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ...core import DataCoordinate, DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType, Timespan, ddl
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed from some
            combination of the dataset type, data ID, and run collection
            name; if that ID is already in the database, the new record is
            not inserted.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()
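
    # A minimal usage sketch, not part of the interface: ``storage`` stands
    # for a concrete `DatasetRecordStorage`, and ``run``/``data_ids`` for a
    # resolved `RunRecord` and expanded `DataCoordinate` instances obtained
    # elsewhere (all three names are placeholders):
    #
    #     refs = list(storage.insert(run, data_ids, DatasetIdGenEnum.UNIQUE))
    #     assert all(ref.run == run.name for ref in refs)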

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class, the IDs will be ignored and new IDs will be generated
            by the backend.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        expected to be identical across all of them, but this is not checked;
        it should be enforced by higher-level registry code. This method does
        not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
        """
        raise NotImplementedError()
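
    # Sketch only (hypothetical ``storage``, ``run``, and ``refs`` objects):
    # importing pre-resolved refs, e.g. when copying datasets between
    # repositories, keeps their existing IDs when the backend supports the
    # ID type:
    #
    #     imported = list(storage.import_(run, refs))
    #     # With a UUID-capable backend, IDs round-trip unchanged:
    #     assert {ref.id for ref in imported} == {ref.id for ref in refs}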

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset
            with the same `DatasetType` and data ID and an overlapping
            validity range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()
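
    # Sketch only (hypothetical ``storage``, ``calib_collection``,
    # ``bias_refs``, and ``context`` objects): certifying calibrations over a
    # date range; `Timespan` accepts `astropy.time.Time` endpoints:
    #
    #     from astropy.time import Time
    #
    #     span = Timespan(Time("2023-01-01", scale="tai"),
    #                     Time("2023-06-01", scale="tai"))
    #     storage.certify(calib_collection, bias_refs, span, context)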

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query for
        this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
        """
        raise NotImplementedError()
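
    # Sketch only (hypothetical ``storage``, collection records ``run_a`` and
    # ``run_b``, and ``context``): building a relation that searches two RUN
    # collections and records which one each dataset came from via ``rank``:
    #
    #     relation = storage.make_relation(
    #         run_a,
    #         run_b,
    #         columns={"dataset_id", "run", "rank"},
    #         context=context,
    #     )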

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()
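
    # Sketch only: guarding an operation against an unsupported ID generation
    # mode (``manager`` is a concrete subclass instance; the enum member is
    # the same `DatasetIdGenEnum.UNIQUE` used as the `insert` default above):
    #
    #     if not manager.supportsIdGenerationMode(DatasetIdGenEnum.UNIQUE):
    #         raise NotImplementedError("backend cannot generate unique IDs")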

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()
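
    # Sketch only, under the assumption that `ddl.TableSpec` can be built
    # with a ``fields`` argument as in other manager implementations: adding
    # a ``dataset_id`` column and foreign key to a table spec under
    # construction (``manager`` is a concrete subclass):
    #
    #     spec = ddl.TableSpec(fields=[])
    #     id_field = manager.addDatasetForeignKey(spec, onDelete="CASCADE")
    #     # ``spec`` now has a ``dataset_id`` field described by ``id_field``.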

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` when the dataset type is not found.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()
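
    # Sketch only (``manager`` is a concrete subclass instance; "calexp" is
    # just an illustrative dataset type name): ``find`` returns `None` for
    # unknown dataset types, while indexing raises:
    #
    #     storage = manager.find("calexp")
    #     if storage is None:
    #         ...  # not registered (or not seen since the last refresh())
    #     storage = manager["calexp"]  # raises MissingDatasetTypeError instead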

    @abstractmethod
    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()
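
    # Sketch only (hypothetical ``manager`` and ``dataset_type`` objects):
    # ``register`` is idempotent, so the ``inserted`` flag distinguishes a
    # first-time registration from a no-op:
    #
    #     storage, inserted = manager.register(dataset_type)
    #     if inserted:
    #         print(f"registered {storage.datasetType.name}")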

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = False,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset
            type names as well. If `False`, never apply patterns to
            components. If `None`, apply patterns to components only if
            their parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support
            is deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component
            dataset is matched. In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates that
            the parent composite dataset type itself was matched.
        """
        raise NotImplementedError()
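
    # Sketch only (hypothetical ``manager``; "raw" and the pattern are
    # illustrative): resolving every registered dataset type via the
    # ellipsis wildcard, and a mixed expression with a ``missing`` list:
    #
    #     import re
    #
    #     everything = manager.resolve_wildcard(...)
    #     missing: list[str] = []
    #     matched = manager.resolve_wildcard(
    #         ["raw", re.compile(r"^calexp.*")], missing=missing
    #     )
    #     # ``missing`` now holds explicitly named types that were not found.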

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()
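
    # Sketch only (hypothetical ``manager`` and ``dataset_id`` objects):
    # resolving a bare dataset ID back to a full reference:
    #
    #     ref = manager.getDatasetRef(dataset_id)
    #     if ref is not None:
    #         print(ref.datasetType.name, ref.dataId)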

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        raise NotImplementedError()