Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 94%

70 statements  

coverage.py v7.3.2, created at 2023-10-27 09:44 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from ... import ddl

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._timespan import Timespan
from ...dimensions import DataCoordinate
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the datasets to
            be added. The dimensions of all data IDs must be the same as
            ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `~DatasetIdGenEnum.UNIQUE`, each new dataset is inserted with
            a new unique ID. With a non-unique mode, the ID is computed
            deterministically from a combination of the dataset type, data ID,
            and run collection name; if a record with the same ID is already
            in the database, the new record is not inserted.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
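
        Examples
        --------
        A minimal sketch of intended use against a hypothetical concrete
        implementation; ``storage`` (a `DatasetRecordStorage`),
        ``run_record`` (a `RunRecord`), and ``data_ids`` (expanded
        `DataCoordinate` instances) are assumed to exist::

            refs = list(
                storage.insert(
                    run_record,
                    data_ids,
                    idGenerationMode=DatasetIdGenEnum.UNIQUE,
                )
            )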

96 """ 

97 raise NotImplementedError() 

98 

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the ID type does not match the type supported by this class, the
            given IDs are ignored and new IDs are generated by the backend.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        expected to be identical across all of them, but this is not checked;
        it should be enforced by higher-level registry code. This method does
        not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
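
        Examples
        --------
        A minimal sketch of intended use; ``storage``, ``run_record``, and a
        list of resolved ``refs`` (e.g. exported from another repository) are
        assumed to exist::

            imported = list(storage.import_(run_record, refs))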

131 """ 

132 raise NotImplementedError() 

133 

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
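
        Examples
        --------
        A minimal sketch of intended use; ``storage``, a
        `~CollectionType.TAGGED` collection record ``tagged_record``, and
        resolved ``refs`` are assumed to exist::

            storage.associate(tagged_record, refs)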

177 """ 

178 raise NotImplementedError() 

179 

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
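
        Examples
        --------
        A minimal sketch of intended use; ``storage``, a
        `~CollectionType.CALIBRATION` collection record ``calib_record``,
        resolved ``refs``, a `SqlQueryContext` ``context``, and
        `astropy.time.Time` values ``start_time`` and ``end_time`` are
        assumed to exist::

            storage.certify(
                calib_record,
                refs,
                Timespan(begin=start_time, end=end_time),
                context=context,
            )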

234 """ 

235 raise NotImplementedError() 

236 

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
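
        Examples
        --------
        A minimal sketch of intended use, clearing a validity range for all
        data IDs of this dataset type; ``storage``, ``calib_record``,
        ``span`` (a `Timespan`), and ``context`` are assumed to exist::

            storage.decertify(calib_record, span, context=context)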

268 """ 

269 raise NotImplementedError() 

270 

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query for
        this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
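
        Examples
        --------
        A minimal sketch of intended use; ``storage``, two non-chained
        collection records ``run_a`` and ``run_b``, and ``context`` are
        assumed to exist, and the column names shown are illustrative (see
        `Query.find_datasets` for the supported set)::

            relation = storage.make_relation(
                run_a,
                run_b,
                columns={"dataset_id", "run", "rank"},
                context=context,
            )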

304 """ 

305 raise NotImplementedError() 

306 

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a column type provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
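
        Examples
        --------
        A minimal sketch of intended use; ``manager_cls`` (a concrete
        `DatasetRecordStorageManager` subclass) and ``spec`` (a
        `ddl.TableSpec` under construction) are assumed to exist::

            id_spec = manager_cls.addDatasetForeignKey(
                spec, name="dataset", onDelete="CASCADE"
            )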

428 """ 

429 raise NotImplementedError() 

430 

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
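
        Examples
        --------
        A minimal sketch of intended use; ``manager`` (a concrete
        `DatasetRecordStorageManager`) and a registered dataset type named
        ``"raw"`` are assumed to exist::

            storage = manager["raw"]  # raises if the name is not found
            maybe_storage = manager.find("raw")  # returns None instead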

460 """ 

461 result = self.find(name) 

462 if result is None: 

463 raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.") 

464 return result 

465 

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
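
        Examples
        --------
        A minimal sketch of intended use; ``manager`` and a `DatasetType`
        instance ``dataset_type`` are assumed to exist::

            storage, inserted = manager.register(dataset_type)
            if inserted:
                print(f"Registered new dataset type {dataset_type.name}.")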

511 """ 

512 raise NotImplementedError() 

513 

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = False,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset
            type names as well. If `False`, never apply patterns to
            components. If `None`, apply patterns to components only if their
            parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support
            is deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component
            dataset is matched. In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `str` | `None` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates that
            the parent composite dataset type itself was matched.
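
        Examples
        --------
        A minimal sketch of intended use, resolving every dataset type whose
        name starts with ``"calexp"``; ``manager`` is assumed to exist::

            import re

            matches = manager.resolve_wildcard(re.compile("calexp.*"))
            for dataset_type, component_names in matches.items():
                print(dataset_type.name, component_names)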

567 """ 

568 raise NotImplementedError() 

569 

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        raise NotImplementedError()