Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 57%

112 statements  

coverage.py v6.5.0, created at 2023-01-07 10:08 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum")

import enum
import uuid
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Set
from typing import TYPE_CHECKING, Any

from lsst.daf.relation import Relation

from ...core import DataCoordinate, DatasetId, DatasetRef, DatasetType, Timespan, ddl
from .._exceptions import MissingDatasetTypeError
from ._versioning import VersionedExtension

if TYPE_CHECKING:
    from .._collection_summary import CollectionSummary
    from ..queries import SqlQueryContext
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetIdGenEnum(enum.Enum):
    """This enum is used to specify dataset ID generation options for the
    ``insert()`` method.
    """

    UNIQUE = 0
    """Unique mode generates a unique ID for each inserted dataset, e.g.
    auto-generated by the database or a random UUID.
    """

    DATAID_TYPE = 1
    """In this mode the ID is computed deterministically from a combination
    of dataset type and data ID.
    """

    DATAID_TYPE_RUN = 2
    """In this mode the ID is computed deterministically from a combination
    of dataset type, data ID, and run collection name.
    """


class DatasetIdFactory:
    """Factory for dataset IDs (UUIDs).

    For now the logic is hard-coded and is controlled by the user-provided
    value of `DatasetIdGenEnum`. In the future we may implement configurable
    logic that can guess the `DatasetIdGenEnum` value from other parameters.
    """

    NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
    """Namespace UUID used for UUID5 generation. Do not change. This was
    produced by ``uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")``.
    """

    def makeDatasetId(
        self,
        run: str,
        datasetType: DatasetType,
        dataId: DataCoordinate,
        idGenerationMode: DatasetIdGenEnum,
    ) -> uuid.UUID:
        """Generate a dataset ID for a dataset.

        Parameters
        ----------
        run : `str`
            Name of the RUN collection for the dataset.
        datasetType : `DatasetType`
            Dataset type.
        dataId : `DataCoordinate`
            Expanded data ID for the dataset.
        idGenerationMode : `DatasetIdGenEnum`
            ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
            UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
            deterministic UUID5-type ID based on a dataset type name and
            ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
            deterministic UUID5-type ID based on a dataset type name, run
            collection name, and ``dataId``.

        Returns
        -------
        datasetId : `uuid.UUID`
            Dataset identifier.
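
        Examples
        --------
        A minimal sketch of the deterministic modes, assuming a resolved
        ``datasetType`` and an expanded ``dataId`` are already in hand (both
        hypothetical here); the UUID5 payload is a sorted, comma-separated
        ``key=value`` string::

            factory = DatasetIdFactory()
            id1 = factory.makeDatasetId(
                "HSC/runs/test", datasetType, dataId, DatasetIdGenEnum.DATAID_TYPE_RUN
            )
            id2 = factory.makeDatasetId(
                "HSC/runs/test", datasetType, dataId, DatasetIdGenEnum.DATAID_TYPE_RUN
            )
            # The deterministic mode reproduces the same UUID for the same
            # inputs.
            assert id1 == id2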

109 """ 

        if idGenerationMode is DatasetIdGenEnum.UNIQUE:
            return uuid.uuid4()
        else:
            # WARNING: If you modify this code make sure that the order of
            # items in the `items` list below never changes.
            items: list[tuple[str, str]] = []
            if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
                items = [
                    ("dataset_type", datasetType.name),
                ]
            elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
                items = [
                    ("dataset_type", datasetType.name),
                    ("run", run),
                ]
            else:
                raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

            for name, value in sorted(dataId.byName().items()):
                items.append((name, str(value)))
            data = ",".join(f"{key}={value}" for key, value in items)
            return uuid.uuid5(self.NS_UUID, data)


class DatasetRecordStorage(ABC):
    """An interface that manages the records associated with a particular
    `DatasetType`.

    Parameters
    ----------
    datasetType : `DatasetType`
        Dataset type whose records this object manages.
    """

    def __init__(self, datasetType: DatasetType):
        self.datasetType = datasetType

    @abstractmethod
    def insert(
        self,
        run: RunRecord,
        dataIds: Iterable[DataCoordinate],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        dataIds : `Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the
            datasets to be added. The dimensions of all data IDs must be the
            same as ``self.datasetType.dimensions``.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed deterministically
            from some combination of dataset type, data ID, and run collection
            name; if the same ID is already in the database then a new record
            is not inserted.

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted datasets.
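
        Examples
        --------
        A minimal sketch, assuming ``storage`` is a concrete instance,
        ``run`` is the `RunRecord` for an existing RUN collection, and
        ``data_ids`` are expanded `DataCoordinate` instances (all
        hypothetical here)::

            # Insert with randomly-generated (UNIQUE) dataset IDs.
            refs = list(storage.insert(run, data_ids))
            for ref in refs:
                print(ref.id, ref.dataId)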

175 """ 

176 raise NotImplementedError() 

177 

    @abstractmethod
    def import_(
        self,
        run: RunRecord,
        datasets: Iterable[DatasetRef],
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> Iterator[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            this dataset will be associated with.
        datasets : `~collections.abc.Iterable` of `DatasetRef`
            Datasets to be inserted. Datasets may specify the ``id``
            attribute, which will be used for the inserted datasets. All
            dataset IDs must have the same type (`int` or `uuid.UUID`); if
            the type of the dataset IDs does not match the type supported by
            this class then the IDs will be ignored and new IDs will be
            generated by the backend.
        idGenerationMode : `DatasetIdGenEnum`
            With `UNIQUE`, each new dataset is inserted with a new unique ID.
            With the non-`UNIQUE` modes, the ID is computed deterministically
            from some combination of dataset type, data ID, and run collection
            name; if the same ID is already in the database then a new record
            is not inserted.
        reuseIds : `bool`, optional
            If `True`, force re-use of imported dataset IDs for integer IDs,
            which are normally generated as auto-incremented; an exception
            will be raised if imported IDs clash with existing ones. This
            option has no effect on the use of globally-unique IDs, which are
            always re-used (or generated if integer IDs are being imported).

        Returns
        -------
        datasets : `Iterable` [ `DatasetRef` ]
            References to the inserted or existing datasets.

        Notes
        -----
        The ``datasetType`` and ``run`` attributes of the given datasets are
        supposed to be identical across all of them, but this is not checked
        and should be enforced by higher-level registry code. This method
        does not need to use those attributes; only ``dataId`` and ``id`` are
        relevant.
        """
        raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be deleted. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
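
        Examples
        --------
        A minimal sketch, assuming ``storage`` is a concrete instance,
        ``tagged`` is the `CollectionRecord` for a TAGGED collection, and
        ``refs`` are resolved `DatasetRef` instances (all hypothetical
        here)::

            storage.associate(tagged, refs)
            # Associating the same refs again is a no-op.
            storage.associate(tagged, refs)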

269 """ 

270 raise NotImplementedError() 

271 

    @abstractmethod
    def disassociate(self, collection: CollectionRecord, datasets: Iterable[DatasetRef]) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must be resolved and
            have the same `DatasetType` as ``self``.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        """
        raise NotImplementedError()

    @abstractmethod
    def certify(
        self,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        context: SqlQueryContext,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must be resolved and have
            the same `DatasetType` as ``self``.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given `DatasetRef` instances is unresolved.
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset with
            the same `DatasetType` and data ID and an overlapping validity
            range.
        CollectionTypeError
            Raised if
            ``collection.type is not CollectionType.CALIBRATION`` or if
            ``self.datasetType.isCalibration() is False``.
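
        Examples
        --------
        A minimal sketch, assuming ``storage`` manages a calibration dataset
        type and that ``calib`` (a CALIBRATION `CollectionRecord`), ``refs``,
        and ``context`` are already in hand (all hypothetical here)::

            from astropy.time import Time

            from lsst.daf.butler import Timespan

            timespan = Timespan(Time("2023-01-01", scale="tai"),
                                Time("2023-02-01", scale="tai"))
            storage.certify(calib, refs, timespan, context=context)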

326 """ 

327 raise NotImplementedError() 

328 

    @abstractmethod
    def decertify(
        self,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        dataIds: Iterable[DataCoordinate] | None = None,
        context: SqlQueryContext,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        dataIds : `Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``self.datasetType`` will be
            decertified.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables
            and relation engines for this query.

        Raises
        ------
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
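
        Examples
        --------
        A minimal sketch, continuing the hypothetical names from the
        `certify` example; clearing January for every data ID of this
        dataset type splits any validity range spanning the month boundary::

            january = Timespan(Time("2023-01-01", scale="tai"),
                               Time("2023-02-01", scale="tai"))
            storage.decertify(calib, january, context=context)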

360 """ 

361 raise NotImplementedError() 

362 

    @abstractmethod
    def make_relation(
        self,
        *collections: CollectionRecord,
        columns: Set[str],
        context: SqlQueryContext,
    ) -> Relation:
        """Return a `~lsst.daf.relation.Relation` that represents a query for
        this `DatasetType` in one or more collections.

        Parameters
        ----------
        *collections : `CollectionRecord`
            The record object(s) describing the collection(s) to query. May
            not be of type `CollectionType.CHAINED`. If multiple collections
            are passed, the query will search all of them in an unspecified
            order, and all collections must have the same type. Must include
            at least one collection.
        columns : `~collections.abc.Set` [ `str` ]
            Columns to include in the relation. See `Query.find_datasets` for
            most options, but this method supports one more:

            - ``rank``: a calculated integer column holding the index of the
              collection the dataset was found in, within the ``collections``
              sequence given.
        context : `SqlQueryContext`
            The object that manages database connections, temporary tables and
            relation engines for this query.

        Returns
        -------
        relation : `~lsst.daf.relation.Relation`
            Representation of the query.
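
        Examples
        --------
        A minimal sketch, assuming ``storage``, two RUN collection records
        ``run1`` and ``run2``, and a ``context`` are in hand; the column
        names other than ``rank`` are illustrative (all hypothetical here)::

            relation = storage.make_relation(
                run1, run2, columns={"dataset_id", "rank"}, context=context
            )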

396 """ 

397 raise NotImplementedError() 

398 

    datasetType: DatasetType
    """Dataset type whose records this object manages (`DatasetType`).
    """


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.
    """

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def getIdColumnType(cls) -> type:
        """Return the type used for columns storing dataset IDs.

        This type is used for columns storing `DatasetRef.id` values, usually
        a `type` subclass provided by SQLAlchemy.

        Returns
        -------
        dtype : `type`
            Type used for dataset identification in the database.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def supportsIdGenerationMode(cls, mode: DatasetIdGenEnum) -> bool:
        """Test whether the given dataset ID generation mode is supported by
        `insert`.

        Parameters
        ----------
        mode : `DatasetIdGenEnum`
            Enum value for the mode to test.

        Returns
        -------
        supported : `bool`
            Whether the given mode is supported.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
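
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance; the
        table and field names are illustrative::

            import sqlalchemy

            from lsst.daf.butler import ddl

            tableSpec = ddl.TableSpec(
                fields=[ddl.FieldSpec("value", dtype=sqlalchemy.String, length=16)]
            )
            # Adds a "dataset_id" field and a foreign key constraint that
            # cascades deletes from the dataset table.
            idSpec = manager.addDatasetForeignKey(tableSpec, onDelete="CASCADE")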

514 """ 

515 raise NotImplementedError() 

516 

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    def __getitem__(self, name: str) -> DatasetRecordStorage:
        """Return the object that provides access to the records associated
        with the given `DatasetType` name.

        This is simply a convenience wrapper for `find` that raises
        `MissingDatasetTypeError` when the dataset type is not found.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
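
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance and a
        ``"flat"`` dataset type has been registered (both hypothetical
        here)::

            storage = manager["flat"]
            print(storage.datasetType)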

546 """ 

547 result = self.find(name) 

548 if result is None: 

549 raise MissingDatasetTypeError(f"Dataset type with name '{name}' not found.") 

550 return result 

551 

    @abstractmethod
    def find(self, name: str) -> DatasetRecordStorage | None:
        """Return an object that provides access to the records associated
        with the given `DatasetType` name, if one exists.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.

        Returns
        -------
        records : `DatasetRecordStorage` or `None`
            The object representing the records for the given dataset type, or
            `None` if there are no records for that dataset type.

        Notes
        -----
        Dataset types registered by another client of the same repository
        since the last call to `initialize` or `refresh` may not be found.
        """
        raise NotImplementedError()

    @abstractmethod
    def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        datasetType : `DatasetType`
            Dataset type for which a table should be created (as necessary)
            and an associated `DatasetRecordStorage` returned.

        Returns
        -------
        records : `DatasetRecordStorage`
            The object representing the records for the given dataset type.
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
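
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance and
        ``flat_type`` is a `DatasetType` built against the repository's
        dimension universe (both hypothetical here)::

            storage, inserted = manager.register(flat_type)
            if inserted:
                print(f"Registered new dataset type {storage.datasetType.name}")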

597 """ 

598 raise NotImplementedError() 

599 

    @abstractmethod
    def remove(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        components: bool | None = None,
        missing: list[str] | None = None,
        explicit_only: bool = False,
        components_deprecated: bool = True,
    ) -> dict[DatasetType, list[str | None]]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components. If
            `None` (default), apply patterns to components only if their
            parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.
        components_deprecated : `bool`, optional
            If `True`, this is a context in which component dataset support is
            deprecated. This will result in a deprecation warning when
            ``components=True`` or ``components=None`` and a component dataset
            is matched. In the future this will become an error.

        Returns
        -------
        dataset_types : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ]
            A mapping with resolved dataset types as keys and lists of
            matched component names as values, where `None` indicates that the
            parent composite dataset type itself was matched.
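
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance for a
        repository containing ``flat`` and ``bias`` dataset types
        (hypothetical here)::

            import re

            missing: list[str] = []
            resolved = manager.resolve_wildcard(
                [re.compile("fla.*"), "bias", "nonexistent"], missing=missing
            )
            # ``resolved`` maps each matched DatasetType to matched component
            # names; ``missing`` should now be ["nonexistent"].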

653 """ 

654 raise NotImplementedError() 

655 

    @abstractmethod
    def getDatasetRef(self, id: DatasetId) -> DatasetRef | None:
        """Return a `DatasetRef` for the given dataset primary key
        value.

        Parameters
        ----------
        id : `DatasetId`
            Primary key value for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            Object representing the dataset, or `None` if no dataset with the
            given primary key value exists in this layer.
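
        Examples
        --------
        A minimal sketch, assuming ``manager`` is a concrete instance
        (hypothetical here)::

            import uuid

            dataset_id = uuid.uuid4()  # stands in for a real dataset ID
            ref = manager.getDatasetRef(dataset_id)
            if ref is None:
                print("No such dataset in this layer.")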

671 """ 

672 raise NotImplementedError() 

673 

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()