Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 94% (66 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ...core import DataCoordinate, DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType, Timespan, ddl
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the datasets
            to be added.  The dimensions of all data IDs must be the same as
            ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed from some
            combination of the dataset type, data ID, and run collection
            name; if that ID is already present in the database, the new
            record is not inserted.

        Returns
        -------
        datasets : `~collections.abc.Iterator` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()
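
    # A minimal usage sketch (an assumption, not part of this interface):
    # given a concrete ``storage: DatasetRecordStorage`` for some dataset
    # type, a ``run_record: RunRecord``, and ``data_ids`` (an iterable of
    # expanded DataCoordinates), higher-level registry code might drive
    # ``insert`` roughly like this:
    #
    #     refs = list(
    #         storage.insert(
    #             run_record,
    #             data_ids,
    #             idGenerationMode=DatasetIdGenEnum.UNIQUE,
    #         )
    #     )
    #     # Each returned DatasetRef is resolved, i.e. ``ref.id`` is set.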

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted.  Each dataset may specify an ``id``
            attribute, which will be used for the inserted dataset.  All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class, the IDs will be ignored and new IDs will be generated
            by the backend.

        Returns
        -------
        datasets : `~collections.abc.Iterator` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        supposed to be identical across all of them, but this is not checked;
        it should be enforced by higher-level registry code.  This method
        does not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
        """
        raise NotImplementedError()
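
    # Hypothetical sketch (names assumed for illustration): ``import_`` is
    # the entry point for copying datasets between repositories, preserving
    # existing IDs where the backend supports them:
    #
    #     incoming = [...]  # DatasetRefs exported from another repository
    #     imported = list(storage.import_(run_record, incoming))
    #     # Refs whose IDs were already present are returned as-is, so
    #     # ``imported`` may mix pre-existing and newly inserted datasets.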

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be deleted.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()
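
    # Illustrative sketch (assumed names): tagging previously inserted
    # datasets into a TAGGED collection, given a ``tagged_record:
    # CollectionRecord`` whose type is CollectionType.TAGGED:
    #
    #     storage.associate(tagged_record, refs)
    #     # Re-running the call with the same refs is a no-op (on a
    #     # writeable database).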

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated.  All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated.  All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
        """
        raise NotImplementedError()
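
    # A hedged usage sketch (``calib_record``, ``refs``, and ``ctx`` are
    # assumed to exist): certifying a batch of calibration datasets as valid
    # for one month, using an astropy-time-backed Timespan:
    #
    #     from astropy.time import Time
    #
    #     validity = Timespan(
    #         begin=Time("2023-01-01", scale="tai"),
    #         end=Time("2023-02-01", scale="tai"),
    #     )
    #     storage.certify(calib_record, refs, validity, context=ctx)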

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection.  ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range.  If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()
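
    # Sketch (assumed names, mirroring the certify example above): clearing
    # the first week of January for every data ID of this dataset type:
    #
    #     week = Timespan(
    #         begin=Time("2023-01-01", scale="tai"),
    #         end=Time("2023-01-08", scale="tai"),
    #     )
    #     storage.decertify(calib_record, week, context=ctx)
    #     # A dataset whose validity range strictly contains this week is
    #     # split into two ranges; one that merely overlaps it is trimmed.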

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query for
        this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query.  May
            not be of type `CollectionType.CHAINED`.  If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type.  Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation.  See `Query.find_datasets`
            for most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
        """
        raise NotImplementedError()
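
    # Sketch of a caller (all names assumed): building a relation that
    # searches two RUN collections and records which one matched via the
    # calculated ``rank`` column:
    #
    #     relation = storage.make_relation(
    #         run_record_a,
    #         run_record_b,
    #         columns={"dataset_id", "run", "rank"},
    #         context=ctx,
    #     )
    #     # The relation can then be composed with other relations and
    #     # executed via the query context.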

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, and is
        usually a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table.  Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted.  `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()
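
    # Sketch (hypothetical concrete manager class ``SomeDatasetManager`` and
    # an empty table spec): an extension that stores per-dataset rows might
    # wire its table to the dataset table like this:
    #
    #     tableSpec = ddl.TableSpec(fields=[])
    #     idSpec = SomeDatasetManager.addDatasetForeignKey(
    #         tableSpec, onDelete="CASCADE"
    #     )
    #     # ``tableSpec`` now has a ``dataset_id`` field (plus a foreign key
    #     # constraint) and ``idSpec`` describes that field.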

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        result = self.find(name)
        if result is None:
            raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.")
        return result

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type,
            or `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()
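
    # Usage sketch (``manager`` assumed to be a concrete instance, "calexp"
    # an assumed dataset type name): ``find`` returns None for unknown names,
    # while indexing raises:
    #
    #     storage = manager.find("calexp")
    #     if storage is None:
    #         ...  # handle the missing dataset type
    #     storage = manager["calexp"]  # raises MissingDatasetTypeError instead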

    @abstractmethod
    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()
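
    # Sketch (``manager`` and ``dataset_type`` assumed to exist): ``register``
    # is idempotent, so callers typically only inspect the second element:
    #
    #     storage, inserted = manager.register(dataset_type)
    #     if inserted:
    #         ...  # first registration; e.g. log the new dataset type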

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression
            Expression to resolve.  Will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset
            type names as well.  If `False`, never apply patterns to
            components.  If `None` (default), apply patterns to components
            only if their parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support
            is deprecated.  This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component
            dataset is matched.  In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `None`, `str` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates the
            parent composite dataset type was matched.
        """
        raise NotImplementedError()
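
    # Sketch (assumed names): resolving a mix of explicit names and regular
    # expression patterns, collecting explicit names that were not found:
    #
    #     import re
    #
    #     not_found: list[str] = []
    #     resolved = manager.resolve_wildcard(
    #         ["calexp", re.compile(r"deepCoadd.*")],
    #         missing=not_found,
    #     )
    #     for dataset_type, component_names in resolved.items():
    #         ...  # None in component_names means the parent itself matched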

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        raise NotImplementedError()