Coverage for python/lsst/daf/butler/registries/sql.py: 14%

444 statements  

coverage.py v7.2.5, created at 2023-05-15 00:10 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("SqlRegistry",) 

25 

26import contextlib 

27import logging 

28import warnings 

29from typing import ( 

30 TYPE_CHECKING, 

31 Any, 

32 Dict, 

33 Iterable, 

34 Iterator, 

35 List, 

36 Literal, 

37 Mapping, 

38 Optional, 

39 Sequence, 

40 Set, 

41 Tuple, 

42 Union, 

43) 

44 

45import sqlalchemy 

46from lsst.resources import ResourcePathExpression 

47from lsst.utils.iteration import ensure_iterable 

48 

49from ..core import ( 

50 Config, 

51 DataCoordinate, 

52 DataCoordinateIterable, 

53 DataId, 

54 DatasetAssociation, 

55 DatasetId, 

56 DatasetRef, 

57 DatasetType, 

58 Dimension, 

59 DimensionConfig, 

60 DimensionElement, 

61 DimensionGraph, 

62 DimensionRecord, 

63 DimensionUniverse, 

64 NamedKeyMapping, 

65 NameLookupMapping, 

66 Progress, 

67 StorageClassFactory, 

68 Timespan, 

69 ddl, 

70) 

71from ..core.utils import transactional 

72from ..registry import ( 

73 ArgumentError, 

74 CollectionExpressionError, 

75 CollectionSummary, 

76 CollectionType, 

77 CollectionTypeError, 

78 ConflictingDefinitionError, 

79 DataIdValueError, 

80 DatasetTypeError, 

81 DimensionNameError, 

82 InconsistentDataIdError, 

83 NoDefaultCollectionError, 

84 OrphanedRecordError, 

85 Registry, 

86 RegistryConfig, 

87 RegistryDefaults, 

88 queries, 

89) 

90from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord 

91from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes 

92from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard 

93 

94if TYPE_CHECKING: 94 ↛ 95: line 94 didn't jump to line 95, because the condition on line 94 was never true

95 from .._butlerConfig import ButlerConfig 

96 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager 

97 

98 

99_LOG = logging.getLogger(__name__) 

100 

101 

102class SqlRegistry(Registry): 

103 """Registry implementation based on SQLAlchemy. 

104 

105 Parameters 

106 ---------- 

107 database : `Database` 

108 Database instance used to store Registry data. 

109 defaults : `RegistryDefaults` 

110 Default collection search path and/or output `~CollectionType.RUN` 

111 collection. 

112 managers : `RegistryManagerInstances` 

113 All the managers required for this registry. 

114 """ 

115 

116 defaultConfigFile: Optional[str] = None 

117 """Path to configuration defaults. Accessed within the ``configs`` resource 

118 or relative to a search path. Can be `None` if no defaults are specified. 

119 """ 

120 

121 @classmethod 

122 def createFromConfig( 

123 cls, 

124 config: Optional[Union[RegistryConfig, str]] = None, 

125 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

126 butlerRoot: Optional[ResourcePathExpression] = None, 

127 ) -> Registry: 

128 """Create registry database and return `SqlRegistry` instance. 

129 

130 This method initializes database contents; the database must be empty 

131 prior to calling this method. 

132 

133 Parameters 

134 ---------- 

135 config : `RegistryConfig` or `str`, optional 

136 Registry configuration; if missing, the default configuration will 

137 be loaded from registry.yaml. 

138 dimensionConfig : `DimensionConfig` or `str`, optional 

139 Dimensions configuration; if missing, the default configuration 

140 will be loaded from dimensions.yaml. 

141 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional 

142 Path to the repository root this `SqlRegistry` will manage. 

143 

144 Returns 

145 ------- 

146 registry : `SqlRegistry` 

147 A new `SqlRegistry` instance. 

148 """ 

149 config = cls.forceRegistryConfig(config) 

150 config.replaceRoot(butlerRoot) 

151 

152 if isinstance(dimensionConfig, str): 

153 dimensionConfig = DimensionConfig(dimensionConfig) 

154 elif dimensionConfig is None: 

155 dimensionConfig = DimensionConfig() 

156 elif not isinstance(dimensionConfig, DimensionConfig): 

157 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

158 

159 DatabaseClass = config.getDatabaseClass() 

160 database = DatabaseClass.fromUri( 

161 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace") 

162 ) 

163 managerTypes = RegistryManagerTypes.fromConfig(config) 

164 managers = managerTypes.makeRepo(database, dimensionConfig) 

165 return cls(database, RegistryDefaults(), managers) 

166 
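A minimal usage sketch for `createFromConfig` (illustrative only; the configuration path and repository root below are hypothetical, and the target database must be empty):

    from lsst.daf.butler.registries.sql import SqlRegistry

    # Both arguments are optional; a string config is coerced to a
    # RegistryConfig, and a default DimensionConfig is used when omitted.
    registry = SqlRegistry.createFromConfig(
        config="registry.yaml",        # hypothetical RegistryConfig file
        butlerRoot="/path/to/repo",    # hypothetical repository root
    )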

167 @classmethod 

168 def fromConfig( 

169 cls, 

170 config: Union[ButlerConfig, RegistryConfig, Config, str], 

171 butlerRoot: Optional[ResourcePathExpression] = None, 

172 writeable: bool = True, 

173 defaults: Optional[RegistryDefaults] = None, 

174 ) -> Registry: 

175 """Create `Registry` subclass instance from `config`. 

176 

177 Registry database must be initialized prior to calling this method. 

178 

179 Parameters 

180 ---------- 

181 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

182 Registry configuration. 

183 butlerRoot : `lsst.resources.ResourcePathExpression`, optional 

184 Path to the repository root this `Registry` will manage. 

185 writeable : `bool`, optional 

186 If `True` (default) create a read-write connection to the database. 

187 defaults : `RegistryDefaults`, optional 

188 Default collection search path and/or output `~CollectionType.RUN` 

189 collection. 

190 

191 Returns 

192 ------- 

193 registry : `SqlRegistry` (subclass) 

194 A new `SqlRegistry` subclass instance. 

195 """ 

196 config = cls.forceRegistryConfig(config) 

197 config.replaceRoot(butlerRoot) 

198 DatabaseClass = config.getDatabaseClass() 

199 database = DatabaseClass.fromUri( 

200 str(config.connectionString), 

201 origin=config.get("origin", 0), 

202 namespace=config.get("namespace"), 

203 writeable=writeable, 

204 ) 

205 managerTypes = RegistryManagerTypes.fromConfig(config) 

206 with database.session(): 

207 managers = managerTypes.loadRepo(database) 

208 if defaults is None: 

209 defaults = RegistryDefaults() 

210 return cls(database, defaults, managers) 

211 
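Connecting to an existing repository goes through `fromConfig`; a minimal sketch (paths are hypothetical, read-only connection):

    from lsst.daf.butler.registries.sql import SqlRegistry

    registry = SqlRegistry.fromConfig(
        "registry.yaml",               # hypothetical registry configuration
        butlerRoot="/path/to/repo",    # hypothetical repository root
        writeable=False,
    )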

212 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

213 self._db = database 

214 self._managers = managers 

215 self.storageClasses = StorageClassFactory() 

216 # Intentionally invoke property setter to initialize defaults. This 

217 # can only be done after most of the rest of Registry has already been 

218 # initialized, and must be done before the property getter is used. 

219 self.defaults = defaults 

220 # In the future DatasetIdFactory may become configurable and this 

221 # instance will need to be shared with datasets manager. 

222 self.datasetIdFactory = DatasetIdFactory() 

223 

224 def __str__(self) -> str: 

225 return str(self._db) 

226 

227 def __repr__(self) -> str: 

228 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

229 

230 def isWriteable(self) -> bool: 

231 # Docstring inherited from lsst.daf.butler.registry.Registry 

232 return self._db.isWriteable() 

233 

234 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

235 # Docstring inherited from lsst.daf.butler.registry.Registry 

236 if defaults is None: 

237 # No need to copy, because `RegistryDefaults` is immutable; we 

238 # effectively copy on write. 

239 defaults = self.defaults 

240 return type(self)(self._db, defaults, self._managers) 

241 

242 @property 

243 def dimensions(self) -> DimensionUniverse: 

244 # Docstring inherited from lsst.daf.butler.registry.Registry 

245 return self._managers.dimensions.universe 

246 

247 def refresh(self) -> None: 

248 # Docstring inherited from lsst.daf.butler.registry.Registry 

249 with self._db.transaction(): 

250 self._managers.refresh() 

251 

252 @contextlib.contextmanager 

253 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

254 # Docstring inherited from lsst.daf.butler.registry.Registry 

255 try: 

256 with self._db.transaction(savepoint=savepoint): 

257 yield 

258 except BaseException: 

259 # TODO: this clears the caches sometimes when we wouldn't actually 

260 # need to. Can we avoid that? 

261 self._managers.dimensions.clearCaches() 

262 raise 

263 

264 def resetConnectionPool(self) -> None: 

265 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

266 

267 This operation is useful when using the registry with fork-based 

268 multiprocessing. To use the registry across a fork boundary, one has to 

269 make sure that there are no currently active connections (no session or 

270 transaction is in progress) and that the connection pool is reset using 

271 this method. It should be called by the child process immediately 

272 after the fork. 

273 """ 

274 self._db._engine.dispose() 

275 
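A sketch of the fork pattern described above (illustrative; assumes `registry` is an existing `SqlRegistry` and that no session or transaction is active at fork time):

    import os

    pid = os.fork()
    if pid == 0:
        # Child process: discard connections inherited from the parent
        # before touching the registry again.
        registry.resetConnectionPool()
        # ... child-side registry work ...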

276 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

277 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

278 other data repository client. 

279 

280 Opaque table records can be added via `insertOpaqueData`, retrieved via 

281 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

282 

283 Parameters 

284 ---------- 

285 tableName : `str` 

286 Logical name of the opaque table. This may differ from the 

287 actual name used in the database by a prefix and/or suffix. 

288 spec : `ddl.TableSpec` 

289 Specification for the table to be added. 

290 """ 

291 self._managers.opaque.register(tableName, spec) 

292 

293 @transactional 

294 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

295 """Insert records into an opaque table. 

296 

297 Parameters 

298 ---------- 

299 tableName : `str` 

300 Logical name of the opaque table. Must match the name used in a 

301 previous call to `registerOpaqueTable`. 

302 data 

303 Each additional positional argument is a dictionary that represents 

304 a single row to be added. 

305 """ 

306 self._managers.opaque[tableName].insert(*data) 

307 

308 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

309 """Retrieve records from an opaque table. 

310 

311 Parameters 

312 ---------- 

313 tableName : `str` 

314 Logical name of the opaque table. Must match the name used in a 

315 previous call to `registerOpaqueTable`. 

316 where 

317 Additional keyword arguments are interpreted as equality 

318 constraints that restrict the returned rows (combined with AND); 

319 keyword arguments are column names and values are the values they 

320 must have. 

321 

322 Yields 

323 ------ 

324 row : `dict` 

325 A dictionary representing a single result row. 

326 """ 

327 yield from self._managers.opaque[tableName].fetch(**where) 

328 

329 @transactional 

330 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

331 """Remove records from an opaque table. 

332 

333 Parameters 

334 ---------- 

335 tableName : `str` 

336 Logical name of the opaque table. Must match the name used in a 

337 previous call to `registerOpaqueTable`. 

338 where 

339 Additional keyword arguments are interpreted as equality 

340 constraints that restrict the deleted rows (combined with AND); 

341 keyword arguments are column names and values are the values they 

342 must have. 

343 """ 

344 self._managers.opaque[tableName].delete(where.keys(), where) 

345 
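A round-trip sketch for the opaque-table methods above, given an existing writeable `registry` (illustrative; the table name and rows are made up, and the `ddl.TableSpec`/`ddl.FieldSpec` construction is shown only schematically):

    import sqlalchemy
    from lsst.daf.butler.core import ddl

    spec = ddl.TableSpec(
        fields=[
            ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
            ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
        ]
    )
    registry.registerOpaqueTable("my_datastore_records", spec)
    registry.insertOpaqueData("my_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
    registry.deleteOpaqueData("my_datastore_records", dataset_id=1)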

346 def registerCollection( 

347 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None 

348 ) -> bool: 

349 # Docstring inherited from lsst.daf.butler.registry.Registry 

350 _, registered = self._managers.collections.register(name, type, doc=doc) 

351 return registered 

352 

353 def getCollectionType(self, name: str) -> CollectionType: 

354 # Docstring inherited from lsst.daf.butler.registry.Registry 

355 return self._managers.collections.find(name).type 

356 

357 def _get_collection_record(self, name: str) -> CollectionRecord: 

358 # Docstring inherited from lsst.daf.butler.registry.Registry 

359 return self._managers.collections.find(name) 

360 

361 def registerRun(self, name: str, doc: Optional[str] = None) -> bool: 

362 # Docstring inherited from lsst.daf.butler.registry.Registry 

363 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

364 return registered 

365 

366 @transactional 

367 def removeCollection(self, name: str) -> None: 

368 # Docstring inherited from lsst.daf.butler.registry.Registry 

369 self._managers.collections.remove(name) 

370 

371 def getCollectionChain(self, parent: str) -> tuple[str, ...]: 

372 # Docstring inherited from lsst.daf.butler.registry.Registry 

373 record = self._managers.collections.find(parent) 

374 if record.type is not CollectionType.CHAINED: 

375 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

376 assert isinstance(record, ChainedCollectionRecord) 

377 return record.children 

378 

379 @transactional 

380 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

381 # Docstring inherited from lsst.daf.butler.registry.Registry 

382 record = self._managers.collections.find(parent) 

383 if record.type is not CollectionType.CHAINED: 

384 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

385 assert isinstance(record, ChainedCollectionRecord) 

386 children = CollectionWildcard.from_expression(children).require_ordered() 

387 if children != record.children or flatten: 

388 record.update(self._managers.collections, children, flatten=flatten) 

389 
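Putting the collection methods together, a sketch of building a CHAINED collection on an existing writeable `registry` (collection names are hypothetical):

    from lsst.daf.butler.registry import CollectionType

    registry.registerRun("HSC/runs/example")
    registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
    registry.setCollectionChain("HSC/defaults", ["HSC/runs/example"])
    assert registry.getCollectionChain("HSC/defaults") == ("HSC/runs/example",)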

390 def getCollectionParentChains(self, collection: str) -> Set[str]: 

391 # Docstring inherited from lsst.daf.butler.registry.Registry 

392 return { 

393 record.name 

394 for record in self._managers.collections.getParentChains( 

395 self._managers.collections.find(collection).key 

396 ) 

397 } 

398 

399 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

400 # Docstring inherited from lsst.daf.butler.registry.Registry 

401 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

402 

403 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

404 # Docstring inherited from lsst.daf.butler.registry.Registry 

405 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

406 

407 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

408 # Docstring inherited from lsst.daf.butler.registry.Registry 

409 record = self._managers.collections.find(collection) 

410 return self._managers.datasets.getCollectionSummary(record) 

411 

412 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

413 # Docstring inherited from lsst.daf.butler.registry.Registry 

414 _, inserted = self._managers.datasets.register(datasetType) 

415 return inserted 

416 

417 def removeDatasetType(self, name: str) -> None: 

418 # Docstring inherited from lsst.daf.butler.registry.Registry 

419 self._managers.datasets.remove(name) 

420 

421 def getDatasetType(self, name: str) -> DatasetType: 

422 # Docstring inherited from lsst.daf.butler.registry.Registry 

423 parent_name, component = DatasetType.splitDatasetTypeName(name) 

424 storage = self._managers.datasets[parent_name] 

425 if component is None: 

426 return storage.datasetType 

427 else: 

428 return storage.datasetType.makeComponentDatasetType(component) 

429 

430 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: 

431 # Docstring inherited from lsst.daf.butler.registry.Registry 

432 return self._managers.datasets.supportsIdGenerationMode(mode) 

433 

434 def findDataset( 

435 self, 

436 datasetType: Union[DatasetType, str], 

437 dataId: Optional[DataId] = None, 

438 *, 

439 collections: Any = None, 

440 timespan: Optional[Timespan] = None, 

441 **kwargs: Any, 

442 ) -> Optional[DatasetRef]: 

443 # Docstring inherited from lsst.daf.butler.registry.Registry 

444 if isinstance(datasetType, DatasetType): 

445 parent_name, component = datasetType.nameAndComponent() 

446 else: 

447 parent_name, component = DatasetType.splitDatasetTypeName(datasetType) 

448 storage = self._managers.datasets[parent_name] 

449 dataId = DataCoordinate.standardize( 

450 dataId, 

451 graph=storage.datasetType.dimensions, 

452 universe=self.dimensions, 

453 defaults=self.defaults.dataId, 

454 **kwargs, 

455 ) 

456 if collections is None: 

457 if not self.defaults.collections: 

458 raise NoDefaultCollectionError( 

459 "No collections provided to findDataset, and no defaults from registry construction." 

460 ) 

461 collections = self.defaults.collections 

462 collections = CollectionWildcard.from_expression(collections) 

463 collections.require_ordered() 

464 for collectionRecord in self._managers.collections.resolve_wildcard(collections): 

465 if collectionRecord.type is CollectionType.CALIBRATION and ( 

466 not storage.datasetType.isCalibration() or timespan is None 

467 ): 

468 continue 

469 result = storage.find(collectionRecord, dataId, timespan=timespan) 

470 if result is not None: 

471 if component is not None: 

472 return result.makeComponentRef(component) 

473 return result 

474 

475 return None 

476 
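A lookup sketch using `findDataset` on an existing `registry` (dataset type name, data ID values, and collection are hypothetical; `None` is returned when nothing matches):

    ref = registry.findDataset(
        "raw",
        instrument="HSC",
        exposure=903334,
        detector=10,
        collections=["HSC/raw/all"],
    )
    if ref is not None:
        print(ref.id, ref.run, ref.dataId)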

477 @transactional 

478 def insertDatasets( 

479 self, 

480 datasetType: Union[DatasetType, str], 

481 dataIds: Iterable[DataId], 

482 run: Optional[str] = None, 

483 expand: bool = True, 

484 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

485 ) -> List[DatasetRef]: 

486 # Docstring inherited from lsst.daf.butler.registry.Registry 

487 if isinstance(datasetType, DatasetType): 

488 storage = self._managers.datasets.find(datasetType.name) 

489 if storage is None: 

490 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") 

491 else: 

492 storage = self._managers.datasets.find(datasetType) 

493 if storage is None: 

494 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.") 

495 if run is None: 

496 if self.defaults.run is None: 

497 raise NoDefaultCollectionError( 

498 "No run provided to insertDatasets, and no default from registry construction." 

499 ) 

500 run = self.defaults.run 

501 runRecord = self._managers.collections.find(run) 

502 if runRecord.type is not CollectionType.RUN: 

503 raise CollectionTypeError( 

504 f"Given collection is of type {runRecord.type.name}; RUN collection required." 

505 ) 

506 assert isinstance(runRecord, RunRecord) 

507 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

508 if expand: 

509 expandedDataIds = [ 

510 self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

511 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs") 

512 ] 

513 else: 

514 expandedDataIds = [ 

515 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds 

516 ] 

517 try: 

518 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

519 if self._managers.obscore: 

520 self._managers.obscore.add_datasets(refs) 

521 except sqlalchemy.exc.IntegrityError as err: 

522 raise ConflictingDefinitionError( 

523 f"A database constraint failure was triggered by inserting " 

524 f"one or more datasets of type {storage.datasetType} into " 

525 f"collection '{run}'. " 

526 f"This probably means a dataset with the same data ID " 

527 f"and dataset type already exists, but it may also mean a " 

528 f"dimension row is missing." 

529 ) from err 

530 return refs 

531 
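A sketch of registering a dataset type and inserting datasets into a RUN collection on an existing writeable `registry` (all names, dimensions, and data ID values are hypothetical):

    from lsst.daf.butler.core import DatasetType

    dataset_type = DatasetType(
        "my_catalog",
        dimensions=["instrument", "visit", "detector"],
        storageClass="SourceCatalog",        # hypothetical storage class name
        universe=registry.dimensions,
    )
    registry.registerDatasetType(dataset_type)
    registry.registerRun("u/example/run")
    (ref,) = registry.insertDatasets(
        dataset_type,
        [{"instrument": "HSC", "visit": 1234, "detector": 10}],
        run="u/example/run",
    )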

532 @transactional 

533 def _importDatasets( 

534 self, 

535 datasets: Iterable[DatasetRef], 

536 expand: bool = True, 

537 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

538 reuseIds: bool = False, 

539 ) -> List[DatasetRef]: 

540 # Docstring inherited from lsst.daf.butler.registry.Registry 

541 datasets = list(datasets) 

542 if not datasets: 

543 # nothing to do 

544 return [] 

545 

546 # find dataset type 

547 datasetTypes = set(dataset.datasetType for dataset in datasets) 

548 if len(datasetTypes) != 1: 

549 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}") 

550 datasetType = datasetTypes.pop() 

551 

552 # get storage handler for this dataset type 

553 storage = self._managers.datasets.find(datasetType.name) 

554 if storage is None: 

555 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") 

556 

557 # find run name 

558 runs = set(dataset.run for dataset in datasets) 

559 if len(runs) != 1: 

560 raise ValueError(f"Multiple run names in input datasets: {runs}") 

561 run = runs.pop() 

562 if run is None: 

563 if self.defaults.run is None: 

564 raise NoDefaultCollectionError( 

565 "No run provided to ingestDatasets, and no default from registry construction." 

566 ) 

567 run = self.defaults.run 

568 

569 runRecord = self._managers.collections.find(run) 

570 if runRecord.type is not CollectionType.RUN: 

571 raise CollectionTypeError( 

572 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};" 

573 " RUN collection required." 

574 ) 

575 assert isinstance(runRecord, RunRecord) 

576 

577 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

578 if expand: 

579 expandedDatasets = [ 

580 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

581 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs") 

582 ] 

583 else: 

584 expandedDatasets = [ 

585 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

586 for dataset in datasets 

587 ] 

588 

589 try: 

590 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

591 if self._managers.obscore: 

592 self._managers.obscore.add_datasets(refs) 

593 except sqlalchemy.exc.IntegrityError as err: 

594 raise ConflictingDefinitionError( 

595 f"A database constraint failure was triggered by inserting " 

596 f"one or more datasets of type {storage.datasetType} into " 

597 f"collection '{run}'. " 

598 f"This probably means a dataset with the same data ID " 

599 f"and dataset type already exists, but it may also mean a " 

600 f"dimension row is missing." 

601 ) from err 

602 return refs 

603 

604 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

605 # Docstring inherited from lsst.daf.butler.registry.Registry 

606 return self._managers.datasets.getDatasetRef(id) 

607 

608 @transactional 

609 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

610 # Docstring inherited from lsst.daf.butler.registry.Registry 

611 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

612 for datasetType, refsForType in progress.iter_item_chunks( 

613 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type" 

614 ): 

615 storage = self._managers.datasets[datasetType.name] 

616 try: 

617 storage.delete(refsForType) 

618 except sqlalchemy.exc.IntegrityError as err: 

619 raise OrphanedRecordError( 

620 "One or more datasets is still present in one or more Datastores." 

621 ) from err 

622 

623 @transactional 

624 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

625 # Docstring inherited from lsst.daf.butler.registry.Registry 

626 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

627 collectionRecord = self._managers.collections.find(collection) 

628 if collectionRecord.type is not CollectionType.TAGGED: 

629 raise CollectionTypeError( 

630 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED." 

631 ) 

632 for datasetType, refsForType in progress.iter_item_chunks( 

633 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type" 

634 ): 

635 storage = self._managers.datasets[datasetType.name] 

636 try: 

637 storage.associate(collectionRecord, refsForType) 

638 if self._managers.obscore: 

639 # If a TAGGED collection is being monitored by ObsCore 

640 # manager then we may need to save the dataset. 

641 self._managers.obscore.associate(refsForType, collectionRecord) 

642 except sqlalchemy.exc.IntegrityError as err: 

643 raise ConflictingDefinitionError( 

644 f"Constraint violation while associating dataset of type {datasetType.name} with " 

645 f"collection {collection}. This probably means that one or more datasets with the same " 

646 f"dataset type and data ID already exist in the collection, but it may also indicate " 

647 f"that the datasets do not exist." 

648 ) from err 

649 

650 @transactional 

651 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

652 # Docstring inherited from lsst.daf.butler.registry.Registry 

653 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

654 collectionRecord = self._managers.collections.find(collection) 

655 if collectionRecord.type is not CollectionType.TAGGED: 

656 raise CollectionTypeError( 

657 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED." 

658 ) 

659 for datasetType, refsForType in progress.iter_item_chunks( 

660 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type" 

661 ): 

662 storage = self._managers.datasets[datasetType.name] 

663 storage.disassociate(collectionRecord, refsForType) 

664 if self._managers.obscore: 

665 self._managers.obscore.disassociate(refsForType, collectionRecord) 

666 
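A tagging sketch for the two methods above (the TAGGED collection name is hypothetical; `registry` is an existing writeable `SqlRegistry` and `refs` an iterable of existing, resolved `DatasetRef` objects):

    from lsst.daf.butler.registry import CollectionType

    registry.registerCollection("u/example/tagged", CollectionType.TAGGED)
    registry.associate("u/example/tagged", refs)     # add the datasets to the tag
    registry.disassociate("u/example/tagged", refs)  # and remove them again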

667 @transactional 

668 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

669 # Docstring inherited from lsst.daf.butler.registry.Registry 

670 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

671 collectionRecord = self._managers.collections.find(collection) 

672 for datasetType, refsForType in progress.iter_item_chunks( 

673 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type" 

674 ): 

675 storage = self._managers.datasets[datasetType.name] 

676 storage.certify(collectionRecord, refsForType, timespan) 

677 

678 @transactional 

679 def decertify( 

680 self, 

681 collection: str, 

682 datasetType: Union[str, DatasetType], 

683 timespan: Timespan, 

684 *, 

685 dataIds: Optional[Iterable[DataId]] = None, 

686 ) -> None: 

687 # Docstring inherited from lsst.daf.butler.registry.Registry 

688 collectionRecord = self._managers.collections.find(collection) 

689 if isinstance(datasetType, str): 

690 storage = self._managers.datasets[datasetType] 

691 else: 

692 storage = self._managers.datasets[datasetType.name] 

693 standardizedDataIds = None 

694 if dataIds is not None: 

695 standardizedDataIds = [ 

696 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds 

697 ] 

698 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

699 
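A calibration-collection sketch for `certify`/`decertify` (collection name, dataset refs, and dates are hypothetical; `registry` is an existing writeable `SqlRegistry`):

    import astropy.time
    from lsst.daf.butler.core import Timespan
    from lsst.daf.butler.registry import CollectionType

    registry.registerCollection("HSC/calib", CollectionType.CALIBRATION)
    validity = Timespan(
        astropy.time.Time("2023-01-01", scale="tai"),
        astropy.time.Time("2023-07-01", scale="tai"),
    )
    registry.certify("HSC/calib", flat_refs, validity)   # flat_refs: existing DatasetRefs
    registry.decertify("HSC/calib", "flat", validity)    # withdraw them again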

700 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

701 """Return an object that allows a new `Datastore` instance to 

702 communicate with this `Registry`. 

703 

704 Returns 

705 ------- 

706 manager : `DatastoreRegistryBridgeManager` 

707 Object that mediates communication between this `Registry` and its 

708 associated datastores. 

709 """ 

710 return self._managers.datastores 

711 

712 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

713 # Docstring inherited from lsst.daf.butler.registry.Registry 

714 return self._managers.datastores.findDatastores(ref) 

715 

716 def expandDataId( 

717 self, 

718 dataId: Optional[DataId] = None, 

719 *, 

720 graph: Optional[DimensionGraph] = None, 

721 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

722 withDefaults: bool = True, 

723 **kwargs: Any, 

724 ) -> DataCoordinate: 

725 # Docstring inherited from lsst.daf.butler.registry.Registry 

726 if not withDefaults: 

727 defaults = None 

728 else: 

729 defaults = self.defaults.dataId 

730 try: 

731 standardized = DataCoordinate.standardize( 

732 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs 

733 ) 

734 except KeyError as exc: 

735 # This means either kwargs have some odd name or required 

736 # dimension is missing. 

737 raise DimensionNameError(str(exc)) from exc 

738 if standardized.hasRecords(): 

739 return standardized 

740 if records is None: 

741 records = {} 

742 elif isinstance(records, NamedKeyMapping): 

743 records = records.byName() 

744 else: 

745 records = dict(records) 

746 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

747 records.update(dataId.records.byName()) 

748 keys = standardized.byName() 

749 for element in standardized.graph.primaryKeyTraversalOrder: 

750 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

751 if record is ...: 

752 if isinstance(element, Dimension) and keys.get(element.name) is None: 

753 if element in standardized.graph.required: 

754 raise DimensionNameError( 

755 f"No value or null value for required dimension {element.name}." 

756 ) 

757 keys[element.name] = None 

758 record = None 

759 else: 

760 storage = self._managers.dimensions[element] 

761 dataIdSet = DataCoordinateIterable.fromScalar( 

762 DataCoordinate.standardize(keys, graph=element.graph) 

763 ) 

764 fetched = tuple(storage.fetch(dataIdSet)) 

765 try: 

766 (record,) = fetched 

767 except ValueError: 

768 record = None 

769 records[element.name] = record 

770 if record is not None: 

771 for d in element.implied: 

772 value = getattr(record, d.name) 

773 if keys.setdefault(d.name, value) != value: 

774 raise InconsistentDataIdError( 

775 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

776 f"but {element.name} implies {d.name}={value!r}." 

777 ) 

778 else: 

779 if element in standardized.graph.required: 

780 raise DataIdValueError( 

781 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

782 ) 

783 if element.alwaysJoin: 

784 raise InconsistentDataIdError( 

785 f"Could not fetch record for element {element.name} via keys {keys}, ", 

786 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

787 "related.", 

788 ) 

789 for d in element.implied: 

790 keys.setdefault(d.name, None) 

791 records.setdefault(d.name, None) 

792 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

793 
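A sketch of expanding a minimal data ID on an existing `registry` so that its dimension records become available (dimension values are hypothetical):

    data_id = registry.expandDataId({"instrument": "HSC", "detector": 10})
    assert data_id.hasRecords()
    detector_record = data_id.records["detector"]   # DimensionRecord, or None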

794 def insertDimensionData( 

795 self, 

796 element: Union[DimensionElement, str], 

797 *data: Union[Mapping[str, Any], DimensionRecord], 

798 conform: bool = True, 

799 replace: bool = False, 

800 skip_existing: bool = False, 

801 ) -> None: 

802 # Docstring inherited from lsst.daf.butler.registry.Registry 

803 if conform: 

804 if isinstance(element, str): 

805 element = self.dimensions[element] 

806 records = [ 

807 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data 

808 ] 

809 else: 

810 # Ignore typing since caller said to trust them with conform=False. 

811 records = data # type: ignore 

812 storage = self._managers.dimensions[element] # type: ignore 

813 storage.insert(*records, replace=replace, skip_existing=skip_existing) 

814 

815 def syncDimensionData( 

816 self, 

817 element: Union[DimensionElement, str], 

818 row: Union[Mapping[str, Any], DimensionRecord], 

819 conform: bool = True, 

820 update: bool = False, 

821 ) -> Union[bool, Dict[str, Any]]: 

822 # Docstring inherited from lsst.daf.butler.registry.Registry 

823 if conform: 

824 if isinstance(element, str): 

825 element = self.dimensions[element] 

826 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

827 else: 

828 # Ignore typing since caller said to trust them with conform=False. 

829 record = row # type: ignore 

830 storage = self._managers.dimensions[element] # type: ignore 

831 return storage.sync(record, update=update) 

832 
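A sketch of adding dimension records with the two methods above (record contents are hypothetical and depend on the configured dimension universe; `registry` is an existing writeable `SqlRegistry`):

    registry.insertDimensionData(
        "instrument",
        {"name": "DummyCam", "visit_max": 1_000_000, "exposure_max": 1_000_000, "detector_max": 100},
    )
    # syncDimensionData inserts the record if needed and reports whether
    # anything was inserted or updated.
    inserted = registry.syncDimensionData(
        "physical_filter",
        {"instrument": "DummyCam", "name": "dummy_r", "band": "r"},
    )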

833 def queryDatasetTypes( 

834 self, 

835 expression: Any = ..., 

836 *, 

837 components: Optional[bool] = None, 

838 missing: Optional[List[str]] = None, 

839 ) -> Iterable[DatasetType]: 

840 # Docstring inherited from lsst.daf.butler.registry.Registry 

841 wildcard = DatasetTypeWildcard.from_expression(expression) 

842 composition_dict = self._managers.datasets.resolve_wildcard( 

843 wildcard, 

844 components=components, 

845 missing=missing, 

846 ) 

847 result: list[DatasetType] = [] 

848 for parent_dataset_type, components_for_parent in composition_dict.items(): 

849 result.extend( 

850 parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type 

851 for c in components_for_parent 

852 ) 

853 return result 

854 

855 def queryCollections( 

856 self, 

857 expression: Any = ..., 

858 datasetType: Optional[DatasetType] = None, 

859 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(), 

860 flattenChains: bool = False, 

861 includeChains: Optional[bool] = None, 

862 ) -> Sequence[str]: 

863 # Docstring inherited from lsst.daf.butler.registry.Registry 

864 

865 # Right now the datasetType argument is completely ignored, but that 

866 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

867 # ticket will take care of that. 

868 try: 

869 wildcard = CollectionWildcard.from_expression(expression) 

870 except TypeError as exc: 

871 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc 

872 collectionTypes = ensure_iterable(collectionTypes) 

873 return [ 

874 record.name 

875 for record in self._managers.collections.resolve_wildcard( 

876 wildcard, 

877 collection_types=frozenset(collectionTypes), 

878 flatten_chains=flattenChains, 

879 include_chains=includeChains, 

880 ) 

881 ] 

882 
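A sketch for the two wildcard-driven introspection methods above, given an existing `registry` (the patterns and collection prefix are hypothetical):

    import re

    from lsst.daf.butler.registry import CollectionType

    # All dataset types whose names start with "calexp".
    dataset_types = list(registry.queryDatasetTypes(re.compile(r"calexp.*")))

    # All RUN collections matching a hypothetical "HSC/runs/" glob.
    runs = registry.queryCollections("HSC/runs/*", collectionTypes=CollectionType.RUN)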

883 def _makeQueryBuilder( 

884 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = () 

885 ) -> queries.QueryBuilder: 

886 """Return a `QueryBuilder` instance capable of constructing and 

887 managing more complex queries than those obtainable via `Registry` 

888 interfaces. 

889 

890 This is an advanced interface; downstream code should prefer 

891 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

892 are sufficient. 

893 

894 Parameters 

895 ---------- 

896 summary : `queries.QuerySummary` 

897 Object describing and categorizing the full set of dimensions that 

898 will be included in the query. 

899 doomed_by : `Iterable` of `str`, optional 

900 A list of diagnostic messages that indicate why the query is going 

901 to yield no results and should not even be executed. If an empty 

902 container (default) the query will be executed unless other code 

903 determines that it is doomed. 

904 

905 Returns 

906 ------- 

907 builder : `queries.QueryBuilder` 

908 Object that can be used to construct and perform advanced queries. 

909 """ 

910 return queries.QueryBuilder( 

911 summary, 

912 backend=queries.SqlQueryBackend(self._db, self._managers), 

913 doomed_by=doomed_by, 

914 ) 

915 

916 def _standardize_query_dataset_args( 

917 self, 

918 datasets: Any, 

919 collections: Any, 

920 components: bool | None, 

921 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain", 

922 *, 

923 doomed_by: list[str], 

924 ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]: 

925 """Preprocess dataset arguments passed to query* methods. 

926 

927 Parameters 

928 ---------- 

929 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these 

930 Expression identifying dataset types. See `queryDatasetTypes` for 

931 details. 

932 collections : `str`, `re.Pattern`, or iterable of these 

933 Expression identifying collections to be searched. See 

934 `queryCollections` for details. 

935 components : `bool`, optional 

936 If `True`, apply all expression patterns to component dataset type 

937 names as well. If `False`, never apply patterns to components. 

938 If `None` (default), apply patterns to components only if their 

939 parent datasets were not matched by the expression. 

940 Fully-specified component datasets (`str` or `DatasetType` 

941 instances) are always included. 

942 

943 Values other than `False` are deprecated, and only `False` will be 

944 supported after v26. After v27 this argument will be removed 

945 entirely. 

946 mode : `str`, optional 

947 The way in which datasets are being used in this query; one of: 

948 

949 - "find_first": this is a query for the first dataset in an 

950 ordered list of collections. Prohibits collection wildcards, 

951 but permits dataset type wildcards. 

952 

953 - "find_all": this is a query for all datasets in all matched 

954 collections. Permits collection and dataset type wildcards. 

955 

956 - "constrain": this is a query for something other than datasets, 

957 with results constrained by dataset existence. Permits 

958 collection wildcards and prohibits ``...`` as a dataset type 

959 wildcard. 

960 doomed_by : `list` [ `str` ] 

961 List to append messages indicating why the query is doomed to 

962 yield no results. 

963 

964 Returns 

965 ------- 

966 composition : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ] 

967 Dictionary mapping parent dataset type to `list` of components 

968 matched for that dataset type (or `None` for the parent itself). 

969 collections : `CollectionWildcard` or `None` 

970 Processed collection expression. 

971 """ 

972 composition: dict[DatasetType, list[str | None]] = {} 

973 if datasets is not None: 

974 if not collections: 

975 if not self.defaults.collections: 

976 raise NoDefaultCollectionError("No collections, and no registry default collections.") 

977 collections = self.defaults.collections 

978 else: 

979 collections = CollectionWildcard.from_expression(collections) 

980 if mode == "find_first" and collections.patterns: 

981 raise TypeError( 

982 f"Collection pattern(s) {collections.patterns} not allowed in this context." 

983 ) 

984 missing: list[str] = [] 

985 composition = self._managers.datasets.resolve_wildcard( 

986 datasets, components=components, missing=missing, explicit_only=(mode == "constrain") 

987 ) 

988 if missing and mode == "constrain": 

989 # After v26 this should raise MissingDatasetTypeError, to be 

990 # implemented on DM-36303. 

991 warnings.warn( 

992 f"Dataset type(s) {missing} are not registered; this will be an error after v26.", 

993 FutureWarning, 

994 ) 

995 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing) 

996 elif collections: 

997 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.") 

998 return composition, collections 

999 

1000 def queryDatasets( 

1001 self, 

1002 datasetType: Any, 

1003 *, 

1004 collections: Any = None, 

1005 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1006 dataId: Optional[DataId] = None, 

1007 where: Optional[str] = None, 

1008 findFirst: bool = False, 

1009 components: Optional[bool] = None, 

1010 bind: Optional[Mapping[str, Any]] = None, 

1011 check: bool = True, 

1012 **kwargs: Any, 

1013 ) -> queries.DatasetQueryResults: 

1014 # Docstring inherited from lsst.daf.butler.registry.Registry 

1015 doomed_by: list[str] = [] 

1016 data_id = self.expandDataId(dataId, **kwargs) 

1017 dataset_composition, collections = self._standardize_query_dataset_args( 

1018 datasetType, 

1019 collections, 

1020 components, 

1021 mode="find_first" if findFirst else "find_all", 

1022 doomed_by=doomed_by, 

1023 ) 

1024 parent_results: list[queries.ParentDatasetQueryResults] = [] 

1025 for parent_dataset_type, components_for_parent in dataset_composition.items(): 

1026 # The full set of dimensions in the query is the combination of 

1027 # those needed for the DatasetType and those explicitly requested, 

1028 # if any. 

1029 dimension_names = set(parent_dataset_type.dimensions.names) 

1030 if dimensions is not None: 

1031 dimension_names.update(self.dimensions.extract(dimensions).names) 

1032 # Construct the summary structure needed to construct a 

1033 # QueryBuilder. 

1034 summary = queries.QuerySummary( 

1035 requested=DimensionGraph(self.dimensions, names=dimension_names), 

1036 dataId=data_id, 

1037 expression=where, 

1038 bind=bind, 

1039 defaults=self.defaults.dataId, 

1040 check=check, 

1041 datasets=[parent_dataset_type], 

1042 ) 

1043 builder = self._makeQueryBuilder(summary) 

1044 # Add the dataset subquery to the query, telling the QueryBuilder 

1045 # to include the rank of the selected collection in the results 

1046 # only if we need to findFirst. Note that if any of the 

1047 # collections are actually wildcard expressions, and 

1048 # findFirst=True, this will raise TypeError for us. 

1049 builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst) 

1050 query = builder.finish() 

1051 parent_results.append( 

1052 queries.ParentDatasetQueryResults( 

1053 self._db, query, datasetType=parent_dataset_type, components=components_for_parent 

1054 ) 

1055 ) 

1056 if not parent_results: 

1057 doomed_by.extend( 

1058 f"No registered dataset type matching {t!r} found, so no matching datasets can " 

1059 "exist in any collection." 

1060 for t in ensure_iterable(datasetType) 

1061 ) 

1062 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by) 

1063 elif len(parent_results) == 1: 

1064 return parent_results[0] 

1065 else: 

1066 return queries.ChainedDatasetQueryResults(parent_results) 

1067 
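A query sketch for `queryDatasets` on an existing `registry` (dataset type, collections, and the `where` expression are hypothetical):

    refs = registry.queryDatasets(
        "calexp",
        collections=["HSC/runs/*"],      # wildcards allowed when findFirst=False
        where="instrument = 'HSC' AND visit = 1234",
        findFirst=False,
    )
    for ref in refs:
        print(ref.dataId, ref.run)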

1068 def queryDataIds( 

1069 self, 

1070 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], 

1071 *, 

1072 dataId: Optional[DataId] = None, 

1073 datasets: Any = None, 

1074 collections: Any = None, 

1075 where: Optional[str] = None, 

1076 components: Optional[bool] = None, 

1077 bind: Optional[Mapping[str, Any]] = None, 

1078 check: bool = True, 

1079 **kwargs: Any, 

1080 ) -> queries.DataCoordinateQueryResults: 

1081 # Docstring inherited from lsst.daf.butler.registry.Registry 

1082 dimensions = ensure_iterable(dimensions) 

1083 requestedDimensions = self.dimensions.extract(dimensions) 

1084 doomed_by: list[str] = [] 

1085 data_id = self.expandDataId(dataId, **kwargs) 

1086 dataset_composition, collections = self._standardize_query_dataset_args( 

1087 datasets, collections, components, doomed_by=doomed_by 

1088 ) 

1089 

1090 def query_factory( 

1091 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None 

1092 ) -> queries.Query: 

1093 """Construct the Query object that generates query results.""" 

1094 summary = queries.QuerySummary( 

1095 requested=requestedDimensions, 

1096 dataId=data_id, 

1097 expression=where, 

1098 bind=bind, 

1099 defaults=self.defaults.dataId, 

1100 check=check, 

1101 datasets=dataset_composition.keys(), 

1102 order_by=order_by, 

1103 limit=limit, 

1104 ) 

1105 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by) 

1106 for datasetType in dataset_composition: 

1107 builder.joinDataset(datasetType, collections, isResult=False) 

1108 return builder.finish() 

1109 

1110 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions) 

1111 
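A companion sketch for `queryDataIds`, constraining data IDs by the existence of a dataset (names and values are hypothetical):

    data_ids = registry.queryDataIds(
        ["visit", "detector"],
        datasets="raw",
        collections="HSC/raw/all",
        where="instrument = 'HSC'",
    )
    for data_id in data_ids:
        print(data_id["visit"], data_id["detector"])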

1112 def queryDimensionRecords( 

1113 self, 

1114 element: Union[DimensionElement, str], 

1115 *, 

1116 dataId: Optional[DataId] = None, 

1117 datasets: Any = None, 

1118 collections: Any = None, 

1119 where: Optional[str] = None, 

1120 components: Optional[bool] = None, 

1121 bind: Optional[Mapping[str, Any]] = None, 

1122 check: bool = True, 

1123 **kwargs: Any, 

1124 ) -> queries.DimensionRecordQueryResults: 

1125 # Docstring inherited from lsst.daf.butler.registry.Registry 

1126 if not isinstance(element, DimensionElement): 

1127 try: 

1128 element = self.dimensions[element] 

1129 except KeyError as e: 

1130 raise DimensionNameError( 

1131 f"No such dimension '{element}', available dimensions: " 

1132 + str(self.dimensions.getStaticElements()) 

1133 ) from e 

1134 dataIds = self.queryDataIds( 

1135 element.graph, 

1136 dataId=dataId, 

1137 datasets=datasets, 

1138 collections=collections, 

1139 where=where, 

1140 components=components, 

1141 bind=bind, 

1142 check=check, 

1143 **kwargs, 

1144 ) 

1145 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element]) 

1146 
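And a sketch for `queryDimensionRecords` (dimension values are hypothetical; the record fields shown follow the default dimension universe):

    for record in registry.queryDimensionRecords("detector", dataId={"instrument": "HSC"}):
        print(record.id, record.full_name)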

1147 def queryDatasetAssociations( 

1148 self, 

1149 datasetType: Union[str, DatasetType], 

1150 collections: Any = ..., 

1151 *, 

1152 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1153 flattenChains: bool = False, 

1154 ) -> Iterator[DatasetAssociation]: 

1155 # Docstring inherited from lsst.daf.butler.registry.Registry 

1156 if collections is None: 

1157 if not self.defaults.collections: 

1158 raise NoDefaultCollectionError( 

1159 "No collections provided to findDataset, and no defaults from registry construction." 

1160 ) 

1161 collections = self.defaults.collections 

1162 collections = CollectionWildcard.from_expression(collections) 

1163 TimespanReprClass = self._db.getTimespanRepresentation() 

1164 if isinstance(datasetType, str): 

1165 storage = self._managers.datasets[datasetType] 

1166 else: 

1167 storage = self._managers.datasets[datasetType.name] 

1168 for collectionRecord in self._managers.collections.resolve_wildcard( 

1169 collections, 

1170 collection_types=frozenset(collectionTypes), 

1171 flatten_chains=flattenChains, 

1172 ): 

1173 query = storage.select(collectionRecord) 

1174 with self._db.query(query) as sql_result: 

1175 sql_mappings = sql_result.mappings().fetchall() 

1176 for row in sql_mappings: 

1177 dataId = DataCoordinate.fromRequiredValues( 

1178 storage.datasetType.dimensions, 

1179 tuple(row[name] for name in storage.datasetType.dimensions.required.names), 

1180 ) 

1181 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1182 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False) 

1183 if collectionRecord.type is CollectionType.CALIBRATION: 

1184 timespan = TimespanReprClass.extract(row) 

1185 else: 

1186 timespan = None 

1187 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1188 

1189 storageClasses: StorageClassFactory 

1190 """All storage classes known to the registry (`StorageClassFactory`). 

1191 """