Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 94%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from ... import ddl

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._timespan import Timespan
from ...dimensions import DataCoordinate
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._caching_context import CachingContext
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager

class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the datasets to
            be added. The dimensions of all data IDs must be the same as
            ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed deterministically
            from some combination of the dataset type, data ID, and run
            collection name; if a dataset with the same ID is already in the
            database, no new record is inserted.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()
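
    # Usage sketch (illustrative only; ``storage``, ``run``, and ``data_ids``
    # are hypothetical objects obtained from a concrete manager and registry,
    # and the deterministic mode name assumes `DatasetIdGenEnum` as imported
    # above):
    #
    #     refs = list(storage.insert(run, data_ids))
    #     refs = list(
    #         storage.insert(run, data_ids, DatasetIdGenEnum.DATAID_TYPE_RUN)
    #     )  # deterministic IDs: re-running skips records that already exist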

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be inserted. Each dataset may specify an ``id``
            attribute to be used for the inserted record. All dataset IDs
            must have the same type (`int` or `uuid.UUID`); if that type does
            not match the type supported by this class, the given IDs are
            ignored and new IDs are generated by the backend.

        Returns
        -------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        expected to be identical across all datasets, but this is not checked
        here and should be enforced by higher-level registry code. This
        method does not need to use those attributes; only ``dataId`` and
        ``id`` are relevant.
        """
        raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()
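
    # Tagging sketch of the semantics described above (hypothetical
    # ``storage``, ``tagged_record``, and ``refs`` objects):
    #
    #     storage.associate(tagged_record, refs)     # tag datasets
    #     storage.disassociate(tagged_record, refs)  # and untag them again
    #
    # Tagging a second dataset with the same data ID replaces the first in
    # that collection, so the last ``associate`` call wins.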

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()
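
    # Validity-range sketch (hypothetical ``storage``, ``calib_record``,
    # ``refs``, and ``context``; ``t0``/``t1``/``ta``/``tb`` are assumed
    # astropy ``Time`` endpoints for the `Timespan` imported above):
    #
    #     storage.certify(calib_record, refs, Timespan(begin=t0, end=t1),
    #                     context)
    #     # Clearing a sub-range splits any overlapping validity range in two:
    #     storage.decertify(calib_record, Timespan(begin=ta, end=tb),
    #                       context=context)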

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query for
        this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
        """
        raise NotImplementedError()
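
    # Query sketch (hypothetical ``storage``, collection records ``a`` and
    # ``b``, and a ``SqlQueryContext``; the column names other than ``rank``
    # are assumptions): the ``rank`` column records which collection each
    # dataset came from (0 for ``a``, 1 for ``b``):
    #
    #     relation = storage.make_relation(
    #         a, b, columns={"dataset_id", "run", "rank"}, context=context
    #     )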

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()
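
    # Schema sketch (hypothetical ``manager`` and table spec; the exact
    # `ddl.TableSpec` constructor arguments are an assumption): a table gains
    # a ``dataset_id`` column plus a foreign key referencing the dataset
    # table:
    #
    #     spec = ddl.TableSpec(fields=[])
    #     id_field = manager.addDatasetForeignKey(spec, onDelete="CASCADE")
    #     # ``spec`` now includes a ``dataset_id`` field; rows referencing a
    #     # deleted dataset would be removed via ON DELETE CASCADE.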

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` (a subclass of `KeyError`) when the dataset
        type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()
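
    # Lookup sketch (hypothetical ``manager``; the dataset type name is just
    # an example): ``find`` returns `None` for unknown names, while indexing
    # raises instead:
    #
    #     storage = manager.find("calexp")
    #     if storage is None:
    #         ...  # not registered, or not seen since the last refresh()
    #     storage = manager["calexp"]  # raises MissingDatasetTypeError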

    @abstractmethod
    def register(self, datasetType: DatasetType) -> bool:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()
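
    # Registration sketch (hypothetical ``manager`` and ``universe``; the
    # `DatasetType` constructor arguments shown are assumptions):
    #
    #     dataset_type = DatasetType(
    #         "calexp", dimensions=["instrument", "visit", "detector"],
    #         storageClass="ExposureF", universe=universe,
    #     )
    #     if manager.register(dataset_type):
    #         ...  # first registration; backing tables were just created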

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = False,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components.
            If `None`, apply patterns to components only if their parent
            datasets were not matched by the expression. Fully-specified
            component datasets (`str` or `DatasetType` instances) are always
            included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support is
            deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component dataset
            is matched. In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates that
            the parent composite dataset type was matched.
        """
        raise NotImplementedError()
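
    # Wildcard sketch (hypothetical ``manager``; passing a list of patterns
    # and names is assumed to be accepted by
    # `DatasetTypeWildcard.from_expression`):
    #
    #     import re
    #     missing: list[str] = []
    #     resolved = manager.resolve_wildcard(
    #         [re.compile(r"calexp.*"), "raw"], missing=missing
    #     )
    #     # ``resolved`` maps each matched DatasetType to matched component
    #     # names (None means the parent composite itself); explicit names
    #     # such as "raw" that are not found are appended to ``missing``.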

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def fetch_summaries(
        self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries given their records and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [ `CollectionRecord` ]
            Collection records to query.
        dataset_types : `~collections.abc.Iterable` [ `DatasetType` ] or `None`
            Dataset types to include in the returned summaries. If `None`,
            all dataset types will be included.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [ `Any`, `CollectionSummary` ]
            Collection summaries indexed by collection record key. This
            mapping will also contain all nested non-chained collections of
            the given chained collections.
        """
        raise NotImplementedError()
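
    # Summary sketch (hypothetical ``manager`` and collection ``records``;
    # the ``key`` attribute used as the mapping index is an assumption):
    # fetching in bulk also pulls in the children of chained collections:
    #
    #     summaries = manager.fetch_summaries(records)
    #     for record in records:
    #         summary = summaries[record.key]
    #         ...  # inspect the summary's dataset types and governor values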

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        raise NotImplementedError()