Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 88%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ...core import DataCoordinate, DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType, Timespan, ddl
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed from some
            combination of the dataset type, data ID, and run collection
            name; if that ID is already in the database, the new record is
            not inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
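
        Examples
        --------
        A minimal usage sketch; ``storage``, ``run``, and ``data_ids`` are
        hypothetical stand-ins for a concrete subclass instance, an existing
        `RunRecord`, and pre-expanded `DataCoordinate` instances::

            # Insert with the default UNIQUE ID generation mode and collect
            # the resolved references.
            refs = list(storage.insert(run, data_ids))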

85 """ 

86 raise NotImplementedError() 

87 

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            that type does not match the type supported by this class, the
            given IDs are ignored and new IDs are generated by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed from some
            combination of the dataset type, data ID, and run collection
            name; if that ID is already in the database, the new record is
            not inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if imported IDs clash with existing ones. This
            option has no effect on globally-unique IDs, which are always
            re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        expected to be identical across all datasets, but this is not
        checked; it should be enforced by higher-level registry code. This
        method does not need to use those attributes; only ``dataId`` and
        ``id`` are relevant.
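
        Examples
        --------
        A sketch of a typical import; ``storage``, ``run``, and ``refs`` are
        hypothetical stand-ins for a concrete subclass instance, an existing
        `RunRecord`, and `DatasetRef` instances exported from another
        repository::

            # Re-use the exported integer IDs where the backend supports it.
            imported = list(storage.import_(run, refs, reuseIds=True))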

133 """ 

134 raise NotImplementedError() 

135 

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
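
        Examples
        --------
        A sketch assuming a hypothetical ``storage`` instance, a
        ``tagged_collection`` record of type TAGGED, and resolved ``refs``::

            # Tag the datasets; repeating this call is a no-op.
            storage.associate(tagged_collection, refs)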

179 """ 

180 raise NotImplementedError() 

181 

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
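
        Examples
        --------
        A sketch assuming hypothetical ``storage``, ``calib_collection`` (a
        CALIBRATION collection record), resolved ``refs``, a ``context`` from
        an open query, and ``t_start``/``t_end`` endpoint times; `Timespan`
        is imported by this module::

            # Declare the datasets valid over the given range.
            storage.certify(
                calib_collection,
                refs,
                Timespan(begin=t_start, end=t_end),
                context=context,
            )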

236 """ 

237 raise NotImplementedError() 

238 

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
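
        Examples
        --------
        A sketch assuming the same hypothetical objects as in `certify`, plus
        two hypothetical data IDs::

            # Clear the validity range for two specific data IDs only.
            storage.decertify(
                calib_collection,
                Timespan(begin=t_start, end=t_end),
                dataIds=[data_id1, data_id2],
                context=context,
            )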

270 """ 

271 raise NotImplementedError() 

272 

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query for
        this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
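
        Examples
        --------
        A sketch assuming hypothetical ``storage``, two collection records
        ``run1`` and ``run2``, and an open ``context``; the ``dataset_id``
        column name is an assumption here, while ``rank`` is documented
        above::

            # Query both runs, asking for dataset IDs and the search rank.
            relation = storage.make_relation(
                run1, run2, columns={"dataset_id", "rank"}, context=context
            )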

306 """ 

307 raise NotImplementedError() 

308 

309 datasetType: DatasetType 

310 """Dataset type whose records this object manages (`DatasetType`). 

311 """ 

312 

313 

class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
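
        Examples
        --------
        A sketch of how a concrete subclass (here a hypothetical
        ``MyDatasetRecordStorageManager``) might be constructed while the
        static schema is being declared; ``db``, ``collections``, and
        ``dimensions`` are assumed to exist already::

            with db.declareStaticTables(create=True) as context:
                manager = MyDatasetRecordStorageManager.initialize(
                    db,
                    context,
                    collections=collections,
                    dimensions=dimensions,
                )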

356 """ 

357 raise NotImplementedError() 

358 

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
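
        Examples
        --------
        A sketch that adds a ``dataset_id`` column with a cascading delete to
        a new table specification; ``MyManager`` is a hypothetical concrete
        subclass, and the empty `ddl.TableSpec` is purely illustrative::

            # The new field is named "dataset_id" (from the default
            # name="dataset").
            table_spec = ddl.TableSpec(fields=[])
            id_spec = MyManager.addDatasetForeignKey(
                table_spec, onDelete="CASCADE"
            )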

430 """ 

431 raise NotImplementedError() 

432 

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
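
        Examples
        --------
        A sketch assuming a hypothetical ``manager`` instance and a
        registered ``"calexp"`` dataset type::

            # Equivalent to manager.find("calexp"), but raises instead of
            # returning None when the dataset type is missing.
            storage = manager["calexp"]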

462 """ 

463 result = self.find(name) 

464 if result is None: 

465 raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.") 

466 return result 

467 

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
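
        Examples
        --------
        A sketch assuming a hypothetical ``manager`` instance and an
        already-constructed ``dataset_type``::

            # Idempotent: ``inserted`` reports whether this call created it.
            storage, inserted = manager.register(dataset_type)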

513 """ 

514 raise NotImplementedError() 

515 

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components. If
            `None` (default), apply patterns to components only if their
            parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support is
            deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component dataset
            is matched. In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates that
            the parent composite dataset type itself was matched.
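
        Examples
        --------
        A sketch assuming a hypothetical ``manager`` with registered dataset
        types whose names start with ``"calexp"``::

            import re

            # Map each matched parent dataset type to its matched components
            # (None stands for the parent itself).
            matches = manager.resolve_wildcard(re.compile("calexp.*"))
            for dataset_type, components in matches.items():
                print(dataset_type.name, components)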

569 """ 

570 raise NotImplementedError() 

571 

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
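
        Examples
        --------
        A sketch assuming a hypothetical ``manager`` in a repository that
        uses UUID-based dataset IDs (the zero UUID is illustrative)::

            import uuid

            ref = manager.getDatasetRef(uuid.UUID(int=0))
            if ref is None:
                print("no such dataset")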

587 """ 

588 raise NotImplementedError() 

589 

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        raise NotImplementedError()