Coverage for python/lsst/daf/butler/registry/interfaces/_datasets.py: 77%

78 statements  


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

from ... import ddl

__all__ = ("DatasetRecordStorageManager",)

from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any

from ..._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ..._dataset_type import DatasetType
from ..._exceptions import DatasetTypeError, DatasetTypeNotSupportedError
from ..._timespan import Timespan
from ...dimensions import DataCoordinate
from ...queries import QueryFactoryFunction
from ...queries.tree import AnyDatasetFieldName
from ._versioning import VersionedExtension, VersionTuple

if TYPE_CHECKING:
    from ...direct_query_driver import SqlJoinsBuilder  # new query system, server+direct only
    from .._caching_context import CachingContext
    from .._collection_summary import CollectionSummary
    from ._collections import CollectionManager, CollectionRecord, RunRecord
    from ._database import Database, StaticTablesContext
    from ._dimensions import DimensionRecordStorageManager


class DatasetRecordStorageManager(VersionedExtension):
    """An interface that manages the tables that describe datasets.

    `DatasetRecordStorageManager` primarily serves as a container and factory
    for `DatasetRecordStorage` instances, which each provide access to the
    records for a different `DatasetType`.

    Parameters
    ----------
    registry_schema_version : `VersionTuple` or `None`, optional
        Version of the registry schema.
    """

    def __init__(self, *, registry_schema_version: VersionTuple | None = None) -> None:
        super().__init__(registry_schema_version=registry_schema_version)

    @abstractmethod
    def clone(
        self,
        *,
        db: Database,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
    ) -> DatasetRecordStorageManager:
        """Make an independent copy of this manager instance bound to new
        instances of `Database` and other managers.

        Parameters
        ----------
        db : `Database`
            New `Database` object to use when instantiating the manager.
        collections : `CollectionManager`
            New `CollectionManager` object to use when instantiating the
            manager.
        dimensions : `DimensionRecordStorageManager`
            New `DimensionRecordStorageManager` object to use when
            instantiating the manager.
        caching_context : `CachingContext`
            New `CachingContext` object to use when instantiating the manager.

        Returns
        -------
        instance : `DatasetRecordStorageManager`
            New manager instance with the same configuration as this instance,
            but bound to a new `Database` object.
        """
        raise NotImplementedError()
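
    # A minimal usage sketch (all names below are hypothetical stand-ins for
    # objects constructed by the caller) of rebinding a manager to a second
    # database connection while keeping its configuration:
    #
    #     new_manager = manager.clone(
    #         db=new_db,
    #         collections=new_collections,
    #         dimensions=new_dimensions,
    #         caching_context=new_caching_context,
    #     )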

    @abstractmethod
    def preload_cache(self) -> None:
        """Fetch data from the database and use it to pre-populate caches to
        speed up later operations.
        """
        raise NotImplementedError()

    @classmethod
    @abstractmethod
    def initialize(
        cls,
        db: Database,
        context: StaticTablesContext,
        *,
        collections: CollectionManager,
        dimensions: DimensionRecordStorageManager,
        caching_context: CachingContext,
        registry_schema_version: VersionTuple | None = None,
    ) -> DatasetRecordStorageManager:
        """Construct an instance of the manager.

        Parameters
        ----------
        db : `Database`
            Interface to the underlying database engine and namespace.
        context : `StaticTablesContext`
            Context object obtained from `Database.declareStaticTables`; used
            to declare any tables that should always be present.
        collections : `CollectionManager`
            Manager object for the collections in this `Registry`.
        dimensions : `DimensionRecordStorageManager`
            Manager object for the dimensions in this `Registry`.
        caching_context : `CachingContext`
            Object controlling caching of information returned by managers.
        registry_schema_version : `VersionTuple` or `None`
            Schema version of this extension as defined in the registry.

        Returns
        -------
        manager : `DatasetRecordStorageManager`
            An instance of a concrete `DatasetRecordStorageManager` subclass.
        """
        raise NotImplementedError()
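
    # A rough sketch of how a concrete manager is typically brought up while
    # the static schema is being declared. ``MyDatasetRecordStorageManager``
    # is a hypothetical subclass, and the ``declareStaticTables`` call and
    # surrounding objects are assumptions about the calling code, not part of
    # this interface.
    #
    #     with db.declareStaticTables(create=True) as context:
    #         datasets = MyDatasetRecordStorageManager.initialize(
    #             db,
    #             context,
    #             collections=collections,
    #             dimensions=dimensions,
    #             caching_context=caching_context,
    #         )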

    @classmethod
    @abstractmethod
    def addDatasetForeignKey(
        cls,
        tableSpec: ddl.TableSpec,
        *,
        name: str = "dataset",
        constraint: bool = True,
        onDelete: str | None = None,
        **kwargs: Any,
    ) -> ddl.FieldSpec:
        """Add a foreign key (field and constraint) referencing the dataset
        table.

        Parameters
        ----------
        tableSpec : `ddl.TableSpec`
            Specification for the table that should reference the dataset
            table. Will be modified in place.
        name : `str`, optional
            A name to use for the prefix of the new field; the full name is
            ``{name}_id``.
        constraint : `bool`, optional
            If `False` (`True` is default), add a field that can be joined to
            the dataset primary key, but do not add a foreign key constraint.
        onDelete : `str`, optional
            One of "CASCADE" or "SET NULL", indicating what should happen to
            the referencing row if the dataset row is deleted. `None`
            indicates that this should be an integrity error.
        **kwargs
            Additional keyword arguments are forwarded to the `ddl.FieldSpec`
            constructor (only the ``name`` and ``dtype`` arguments are
            otherwise provided).

        Returns
        -------
        idSpec : `ddl.FieldSpec`
            Specification for the ID field.
        """
        raise NotImplementedError()
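
    # An illustrative sketch (hypothetical calling code): a concrete manager
    # class is assumed, and the ``ddl.TableSpec`` construction shown here is
    # schematic rather than copied from real callers.
    #
    #     spec = ddl.TableSpec(fields=[])
    #     id_field = MyDatasetRecordStorageManager.addDatasetForeignKey(
    #         spec, onDelete="CASCADE"
    #     )
    #     # ``spec`` now has an ``id_field.name == "dataset_id"`` column plus
    #     # a foreign key constraint targeting the dataset table.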

    @abstractmethod
    def refresh(self) -> None:
        """Ensure all other operations on this manager are aware of any
        dataset types that may have been registered by other clients since
        it was initialized or last refreshed.
        """
        raise NotImplementedError()

    @abstractmethod
    def get_dataset_type(self, name: str) -> DatasetType:
        """Look up a dataset type by name.

        Parameters
        ----------
        name : `str`
            Name of a parent dataset type.

        Returns
        -------
        dataset_type : `DatasetType`
            The object representing the dataset type with the given name.

        Raises
        ------
        MissingDatasetTypeError
            Raised if there is no dataset type with the given name.
        """
        raise NotImplementedError()

    def conform_exact_dataset_type(self, dataset_type: DatasetType | str) -> DatasetType:
        """Conform a value that may be a dataset type or dataset type name to
        the registered dataset type, while checking that the dataset type is
        not a component and (if a `DatasetType` instance is given) has the
        exact same definition in the registry.

        Parameters
        ----------
        dataset_type : `str` or `DatasetType`
            Dataset type object or name.

        Returns
        -------
        dataset_type : `DatasetType`
            The corresponding registered dataset type.

        Raises
        ------
        DatasetTypeError
            Raised if ``dataset_type`` is a component, or if its definition
            does not exactly match the registered dataset type.
        MissingDatasetTypeError
            Raised if this dataset type is not registered at all.
        """
        if isinstance(dataset_type, DatasetType):
            dataset_type_name = dataset_type.name
            given_dataset_type = dataset_type
        else:
            dataset_type_name = dataset_type
            given_dataset_type = None
        # Reject component dataset types; only parent dataset types have
        # registry records of their own.
        parent_name, component = DatasetType.splitDatasetTypeName(dataset_type_name)
        if component is not None:
            raise DatasetTypeNotSupportedError(
                f"Component dataset {dataset_type_name!r} is not supported in this context."
            )
        registered_dataset_type = self.get_dataset_type(dataset_type_name)
        # If a full DatasetType was passed in, it must match the registered
        # definition exactly, not just by name.
        if given_dataset_type is not None and registered_dataset_type != given_dataset_type:
            raise DatasetTypeError(
                f"Given dataset type {given_dataset_type} is not identical to the "
                f"registered one {registered_dataset_type}."
            )
        return registered_dataset_type
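
    # A minimal usage sketch (``manager`` is a hypothetical concrete
    # instance). Both calls below return the registered definition; passing a
    # full `DatasetType` additionally checks that it matches the registry
    # exactly, and component names are rejected.
    #
    #     dt = manager.conform_exact_dataset_type("raw")
    #     dt = manager.conform_exact_dataset_type(dt)
    #     manager.conform_exact_dataset_type("raw.wcs")  # raises: component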

    @abstractmethod
    def register_dataset_type(self, dataset_type: DatasetType) -> bool:
        """Ensure that this `Registry` can hold records for the given
        `DatasetType`, creating new tables as necessary.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type for which tables should be created (as necessary).

        Returns
        -------
        inserted : `bool`
            `True` if the dataset type did not exist in the registry before.

        Notes
        -----
        This operation may not be invoked within a `Database.transaction`
        context.
        """
        raise NotImplementedError()

    @abstractmethod
    def remove_dataset_type(self, name: str) -> None:
        """Remove the dataset type.

        Parameters
        ----------
        name : `str`
            Name of the dataset type.
        """
        raise NotImplementedError()

    @abstractmethod
    def resolve_wildcard(
        self,
        expression: Any,
        missing: list[str] | None = None,
        explicit_only: bool = False,
    ) -> list[DatasetType]:
        """Resolve a dataset type wildcard expression.

        Parameters
        ----------
        expression : `~typing.Any`
            Expression to resolve. Will be passed to
            `DatasetTypeWildcard.from_expression`.
        missing : `list` of `str`, optional
            String dataset type names that were explicitly given (i.e. not
            regular expression patterns) but not found will be appended to
            this list, if it is provided.
        explicit_only : `bool`, optional
            If `True`, require explicit `DatasetType` instances or `str`
            names, with `re.Pattern` instances deprecated and ``...``
            prohibited.

        Returns
        -------
        dataset_types : `list` [ `DatasetType` ]
            A list of resolved dataset types.
        """
        raise NotImplementedError()
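
    # A schematic sketch of the kinds of expressions this accepts, per the
    # docstring above; ``manager`` is a hypothetical concrete instance.
    #
    #     import re
    #
    #     missing: list[str] = []
    #     dataset_types = manager.resolve_wildcard(
    #         ["raw", re.compile("^calexp"), "no_such_type"],
    #         missing=missing,
    #     )
    #     # "no_such_type" ends up in ``missing`` rather than raising, and
    #     # every registered name matching the pattern is returned.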

    @abstractmethod
    def get_dataset_refs(self, ids: list[DatasetId]) -> list[DatasetRef]:
        """Return a `DatasetRef` for each of the given dataset UUID values.

        Parameters
        ----------
        ids : `list` [ `DatasetId` ]
            List of UUID instances to look up.

        Returns
        -------
        refs : `list` [ `DatasetRef` ]
            A list containing a `DatasetRef` for each of the given UUIDs that
            was found in the database. If a dataset was not found, no error
            is raised; it is simply not included in the list. The returned
            datasets are in no particular order.
        """
        raise NotImplementedError()

    @abstractmethod
    def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
        """Return a summary for the given collection.

        Parameters
        ----------
        collection : `CollectionRecord`
            Record describing the collection for which a summary is to be
            retrieved.

        Returns
        -------
        summary : `CollectionSummary`
            Summary of the dataset types and governor dimension values in
            this collection.
        """
        raise NotImplementedError()

    @abstractmethod
    def fetch_summaries(
        self,
        collections: Iterable[CollectionRecord],
        dataset_types: Iterable[DatasetType] | Iterable[str] | None = None,
    ) -> Mapping[Any, CollectionSummary]:
        """Fetch collection summaries given their names and dataset types.

        Parameters
        ----------
        collections : `~collections.abc.Iterable` [ `CollectionRecord` ]
            Collection records to query.
        dataset_types : `~collections.abc.Iterable` [ `DatasetType` ] or \
                `None`
            Dataset types to include in the returned summaries. If `None`,
            all dataset types will be included.

        Returns
        -------
        summaries : `~collections.abc.Mapping` [ `typing.Any`, \
                `CollectionSummary` ]
            Collection summaries indexed by collection record key. This
            mapping will also contain all nested non-chained collections of
            the chained collections.
        """
        raise NotImplementedError()
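
    # Sketch (hypothetical concrete instances) of fetching summaries for a
    # set of collections in one call instead of per-collection lookups:
    #
    #     summaries = manager.fetch_summaries(
    #         [run_record, tagged_record],
    #         dataset_types=["raw", "calexp"],
    #     )
    #     for key, summary in summaries.items():
    #         ...  # e.g. inspect the dataset types recorded in the summary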

    @abstractmethod
    def fetch_run_dataset_ids(self, run: RunRecord) -> list[DatasetId]:
        """Return the IDs of all datasets in the given ``RUN`` collection.

        Parameters
        ----------
        run : `RunRecord`
            Record describing the collection.

        Returns
        -------
        dataset_ids : `list` [ `uuid.UUID` ]
            List of dataset IDs.
        """
        raise NotImplementedError()

    @abstractmethod
    def ingest_date_dtype(self) -> type:
        """Return the type of the ``ingest_date`` column."""
        raise NotImplementedError()

    @abstractmethod
    def insert(
        self,
        dataset_type_name: str,
        run: RunRecord,
        data_ids: Iterable[DataCoordinate],
        id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> list[DatasetRef]:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        dataset_type_name : `str`
            Name of the dataset type.
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ]
            Expanded data IDs (`DataCoordinate` instances) for the datasets
            to be added. The dimensions of all data IDs must be the same as
            ``dataset_type.dimensions``.
        id_generation_mode : `DatasetIdGenEnum`
            With `~DatasetIdGenEnum.UNIQUE` each new dataset is inserted with
            its own unique ID. With a non-`~DatasetIdGenEnum.UNIQUE` mode the
            ID is computed from some combination of dataset type, data ID,
            and run collection name; if the same ID is already in the
            database then the new record is not inserted.

        Returns
        -------
        datasets : `list` [ `DatasetRef` ]
            References to the inserted datasets.
        """
        raise NotImplementedError()
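
    # A minimal sketch (hypothetical ``manager``, ``run_record``, and
    # ``data_id`` objects) of inserting one dataset into a RUN collection:
    #
    #     (ref,) = manager.insert(
    #         "raw",
    #         run_record,
    #         [data_id],
    #         id_generation_mode=DatasetIdGenEnum.UNIQUE,
    #     )
    #     assert ref.run == run_record.name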

    @abstractmethod
    def import_(self, run: RunRecord, refs: list[DatasetRef], assume_new: bool = False) -> None:
        """Insert one or more dataset entries into the database.

        Parameters
        ----------
        run : `RunRecord`
            The record object describing the `~CollectionType.RUN` collection
            these datasets will be associated with.
        refs : `list` [ `DatasetRef` ]
            List of datasets to be inserted. All of the ``DatasetRef``
            ``run`` attributes must match the ``run`` parameter.
        assume_new : `bool`, optional
            If `True`, assume all datasets are new and skip conflict
            resolution logic.
        """
        raise NotImplementedError()

    @abstractmethod
    def delete(self, datasets: Iterable[DatasetId | DatasetRef]) -> None:
        """Fully delete the given datasets from the registry.

        Parameters
        ----------
        datasets : `~collections.abc.Iterable` [ `DatasetId` or `DatasetRef` ]
            Datasets to be deleted. If `DatasetRef` instances are passed,
            only the `DatasetRef.id` attribute is used.
        """
        raise NotImplementedError()

    @abstractmethod
    def associate(
        self, dataset_type: DatasetType, collection: CollectionRecord, datasets: Iterable[DatasetRef]
    ) -> None:
        """Associate one or more datasets with a collection.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of all datasets.
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must have the same
            `DatasetType` as ``dataset_type``, but this is not checked.

        Notes
        -----
        Associating a dataset into a collection that already contains a
        different dataset with the same `DatasetType` and data ID will remove
        the existing dataset from that collection.

        Associating the same dataset into a collection multiple times is a
        no-op, but is still not permitted on read-only databases.
        """
        raise NotImplementedError()

    @abstractmethod
    def disassociate(
        self, dataset_type: DatasetType, collection: CollectionRecord, datasets: Iterable[DatasetRef]
    ) -> None:
        """Remove one or more datasets from a collection.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of all datasets.
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.TAGGED`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be disassociated. All datasets must have the same
            `DatasetType` as ``dataset_type``, but this is not checked.
        """
        raise NotImplementedError()
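
    # Sketch of the tag/untag round trip implied by the two methods above;
    # ``manager``, ``dataset_type``, ``tagged_record``, and ``refs`` are all
    # hypothetical stand-ins for real objects.
    #
    #     manager.associate(dataset_type, tagged_record, refs)
    #     manager.disassociate(dataset_type, tagged_record, refs)
    #     # After this pair, the TAGGED collection no longer contains
    #     # ``refs``; the datasets themselves still exist in their RUN.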

    @abstractmethod
    def certify(
        self,
        dataset_type: DatasetType,
        collection: CollectionRecord,
        datasets: Iterable[DatasetRef],
        timespan: Timespan,
        query_func: QueryFactoryFunction,
    ) -> None:
        """Associate one or more datasets with a calibration collection and a
        validity range within it.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of all datasets.
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        datasets : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to be associated. All datasets must have the same
            `DatasetType` as ``dataset_type``, but this is not checked.
        timespan : `Timespan`
            The validity range for these datasets within the collection.
        query_func : `QueryFactoryFunction`
            Function returning a context manager that sets up a `Query`
            object for querying the registry (that is, a function equivalent
            to ``Butler.query()``).

        Raises
        ------
        ConflictingDefinitionError
            Raised if the collection already contains a different dataset
            with the same `DatasetType` and data ID and an overlapping
            validity range.
        DatasetTypeError
            Raised if ``dataset_type.isCalibration() is False``.
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()

    @abstractmethod
    def decertify(
        self,
        dataset_type: DatasetType,
        collection: CollectionRecord,
        timespan: Timespan,
        *,
        data_ids: Iterable[DataCoordinate] | None = None,
        query_func: QueryFactoryFunction,
    ) -> None:
        """Remove or adjust datasets to clear a validity range within a
        calibration collection.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of all datasets.
        collection : `CollectionRecord`
            The record object describing the collection. ``collection.type``
            must be `~CollectionType.CALIBRATION`.
        timespan : `Timespan`
            The validity range to remove datasets from within the collection.
            Datasets that overlap this range but are not contained by it will
            have their validity ranges adjusted to not overlap it, which may
            split a single dataset validity range into two.
        data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ], optional
            Data IDs that should be decertified within the given validity
            range. If `None`, all data IDs for ``dataset_type`` in
            ``collection`` will be decertified.
        query_func : `QueryFactoryFunction`
            Function returning a context manager that sets up a `Query`
            object for querying the registry (that is, a function equivalent
            to ``Butler.query()``).

        Raises
        ------
        DatasetTypeError
            Raised if ``dataset_type.isCalibration() is False``.
        CollectionTypeError
            Raised if ``collection.type is not CollectionType.CALIBRATION``.
        """
        raise NotImplementedError()
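
    # Sketch of certifying a calibration over a validity range and later
    # clearing part of that range; all names below are hypothetical, and the
    # ``Timespan(begin=..., end=...)`` construction is assumed from how
    # timespans are built elsewhere in daf_butler.
    #
    #     manager.certify(
    #         bias_type, calib_record, bias_refs,
    #         Timespan(begin=t_start, end=t_end),
    #         query_func=query_func,
    #     )
    #     # Carve the middle out of the range: overlapping-but-not-contained
    #     # validity ranges are trimmed, possibly splitting one range in two.
    #     manager.decertify(
    #         bias_type, calib_record,
    #         Timespan(begin=t_mid1, end=t_mid2),
    #         query_func=query_func,
    #     )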

    @abstractmethod
    def make_joins_builder(
        self,
        dataset_type: DatasetType,
        collections: Sequence[CollectionRecord],
        fields: Set[AnyDatasetFieldName],
        is_union: bool = False,
    ) -> SqlJoinsBuilder:
        """Make a `lsst.daf.butler.direct_query_driver.SqlJoinsBuilder`
        that represents a search for datasets of this type.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Type of dataset to query for.
        collections : `~collections.abc.Sequence` [ `CollectionRecord` ]
            Collections to search, in order, after filtering out collections
            with no datasets of this type via collection summaries.
        fields : `~collections.abc.Set` [ `str` ]
            Names of fields to make available in the builder. Options
            include:

            - ``dataset_id`` (UUID)
            - ``run`` (collection name, `str`)
            - ``collection`` (collection name, `str`)
            - ``collection_key`` (collection primary key, manager-dependent)
            - ``timespan`` (validity range, or unbounded for non-calibrations)
            - ``ingest_date`` (time the dataset was ingested into the
              repository)

            Dimension keys for the dataset type's required dimensions are
            always included.
        is_union : `bool`, optional
            If `True`, this search is being joined in as part of one term in
            a union over all dataset types. This causes fields to be added
            to the builder via the special ``...`` instead of the dataset
            type name.

        Returns
        -------
        builder : `lsst.daf.butler.direct_query_driver.SqlJoinsBuilder`
            A query-construction object representing a table or subquery.
        """
        raise NotImplementedError()
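
    # Schematic call (hypothetical inputs) showing how a query driver might
    # ask for a dataset search with specific fields exposed:
    #
    #     builder = manager.make_joins_builder(
    #         dataset_type,
    #         [run_record, calib_record],
    #         fields={"dataset_id", "run", "timespan"},
    #     )
    #     # ``builder`` can then be combined with dimension tables when the
    #     # direct query driver assembles the full SQL query.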

    @abstractmethod
    def refresh_collection_summaries(self, dataset_type: DatasetType) -> None:
        """Make sure that collection summaries for this dataset type are
        consistent with the contents of the dataset tables.

        Parameters
        ----------
        dataset_type : `DatasetType`
            Dataset type whose summary entries should be refreshed.
        """
        raise NotImplementedError()