Coverage for python/lsst/daf/butler/registries/sql.py: 14%

440 statements  

coverage.py v6.5.0, created at 2022-10-07 02:47 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("SqlRegistry",) 

25 

26import contextlib 

27import logging 

28import warnings 

29from typing import ( 

30 TYPE_CHECKING, 

31 Any, 

32 Dict, 

33 Iterable, 

34 Iterator, 

35 List, 

36 Literal, 

37 Mapping, 

38 Optional, 

39 Sequence, 

40 Set, 

41 Tuple, 

42 Union, 

43) 

44 

45import sqlalchemy 

46from lsst.resources import ResourcePathExpression 

47from lsst.utils.iteration import ensure_iterable 

48 

49from ..core import ( 

50 Config, 

51 DataCoordinate, 

52 DataCoordinateIterable, 

53 DataId, 

54 DatasetAssociation, 

55 DatasetId, 

56 DatasetRef, 

57 DatasetType, 

58 Dimension, 

59 DimensionConfig, 

60 DimensionElement, 

61 DimensionGraph, 

62 DimensionRecord, 

63 DimensionUniverse, 

64 NamedKeyMapping, 

65 NameLookupMapping, 

66 Progress, 

67 StorageClassFactory, 

68 Timespan, 

69 ddl, 

70) 

71from ..core.utils import transactional 

72from ..registry import ( 

73 ArgumentError, 

74 CollectionExpressionError, 

75 CollectionSummary, 

76 CollectionType, 

77 CollectionTypeError, 

78 ConflictingDefinitionError, 

79 DataIdValueError, 

80 DatasetTypeError, 

81 DimensionNameError, 

82 InconsistentDataIdError, 

83 NoDefaultCollectionError, 

84 OrphanedRecordError, 

85 Registry, 

86 RegistryConfig, 

87 RegistryDefaults, 

88 queries, 

89) 

90from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord 

91from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes 

92from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard 

93 

94if TYPE_CHECKING: 94 ↛ 95 (line 94 didn't jump to line 95, because the condition on line 94 was never true)

95 from .._butlerConfig import ButlerConfig 

96 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager 

97 

98 

99_LOG = logging.getLogger(__name__) 

100 

101 

102class SqlRegistry(Registry): 

103 """Registry implementation based on SQLAlchemy. 

104 

105 Parameters 

106 ---------- 

107 database : `Database` 

108 Database instance to store Registry. 

109 defaults : `RegistryDefaults` 

110 Default collection search path and/or output `~CollectionType.RUN` 

111 collection. 

112 managers : `RegistryManagerInstances` 

113 All the managers required for this registry. 

114 """ 

115 

116 defaultConfigFile: Optional[str] = None 

117 """Path to configuration defaults. Accessed within the ``configs`` resource 

118 or relative to a search path. Can be `None` if no defaults are specified. 

119 """ 

120 

121 @classmethod 

122 def createFromConfig( 

123 cls, 

124 config: Optional[Union[RegistryConfig, str]] = None, 

125 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

126 butlerRoot: Optional[ResourcePathExpression] = None, 

127 ) -> Registry: 

128 """Create registry database and return `SqlRegistry` instance. 

129 

130 This method initializes database contents; the database must be empty 

131 prior to calling this method. 

132 

133 Parameters 

134 ---------- 

135 config : `RegistryConfig` or `str`, optional 

136 Registry configuration; if missing, the default configuration will 

137 be loaded from registry.yaml. 

138 dimensionConfig : `DimensionConfig` or `str`, optional 

139 Dimensions configuration; if missing, the default configuration 

140 will be loaded from dimensions.yaml. 

141 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional 

142 Path to the repository root this `SqlRegistry` will manage. 

143 

144 Returns 

145 ------- 

146 registry : `SqlRegistry` 

147 A new `SqlRegistry` instance. 

148 """ 

149 config = cls.forceRegistryConfig(config) 

150 config.replaceRoot(butlerRoot) 

151 

152 if isinstance(dimensionConfig, str): 

153 dimensionConfig = DimensionConfig(dimensionConfig) 

154 elif dimensionConfig is None: 

155 dimensionConfig = DimensionConfig() 

156 elif not isinstance(dimensionConfig, DimensionConfig): 

157 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

158 

159 DatabaseClass = config.getDatabaseClass() 

160 database = DatabaseClass.fromUri( 

161 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace") 

162 ) 

163 managerTypes = RegistryManagerTypes.fromConfig(config) 

164 managers = managerTypes.makeRepo(database, dimensionConfig) 

165 return cls(database, RegistryDefaults(), managers) 
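    # Illustrative sketch, not part of the original source: initializing a
    # brand-new repository database. Passing ``config=None`` would load the
    # default registry.yaml; the SQLite URL and the "db" config key used
    # below are assumptions for the example.
    #
    #   config = RegistryConfig()
    #   config["db"] = "sqlite:///example_repo.sqlite3"  # assumed key name
    #   registry = SqlRegistry.createFromConfig(config)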

166 

167 @classmethod 

168 def fromConfig( 

169 cls, 

170 config: Union[ButlerConfig, RegistryConfig, Config, str], 

171 butlerRoot: Optional[ResourcePathExpression] = None, 

172 writeable: bool = True, 

173 defaults: Optional[RegistryDefaults] = None, 

174 ) -> Registry: 

175 """Create `Registry` subclass instance from `config`. 

176 

177 Registry database must be initialized prior to calling this method. 

178 

179 Parameters 

180 ---------- 

181 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

182 Registry configuration. 

183 butlerRoot : `lsst.resources.ResourcePathExpression`, optional 

184 Path to the repository root this `Registry` will manage. 

185 writeable : `bool`, optional 

186 If `True` (default) create a read-write connection to the database. 

187 defaults : `RegistryDefaults`, optional 

188 Default collection search path and/or output `~CollectionType.RUN` 

189 collection. 

190 

191 Returns 

192 ------- 

193 registry : `SqlRegistry` (subclass) 

194 A new `SqlRegistry` subclass instance. 

195 """ 

196 config = cls.forceRegistryConfig(config) 

197 config.replaceRoot(butlerRoot) 

198 DatabaseClass = config.getDatabaseClass() 

199 database = DatabaseClass.fromUri( 

200 str(config.connectionString), 

201 origin=config.get("origin", 0), 

202 namespace=config.get("namespace"), 

203 writeable=writeable, 

204 ) 

205 managerTypes = RegistryManagerTypes.fromConfig(config) 

206 managers = managerTypes.loadRepo(database) 

207 if defaults is None: 

208 defaults = RegistryDefaults() 

209 return cls(database, defaults, managers) 
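    # Illustrative sketch, not part of the original source: opening an
    # already-initialized repository read-only. The config path and the
    # RegistryDefaults keyword name are assumptions.
    #
    #   registry = SqlRegistry.fromConfig(
    #       "/repo/butler.yaml",  # hypothetical path
    #       writeable=False,
    #       defaults=RegistryDefaults(collections=["refcats"]),
    #   )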

210 

211 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

212 self._db = database 

213 self._managers = managers 

214 self.storageClasses = StorageClassFactory() 

215 # Intentionally invoke property setter to initialize defaults. This 

216 # can only be done after most of the rest of Registry has already been 

217 # initialized, and must be done before the property getter is used. 

218 self.defaults = defaults 

219 # In the future DatasetIdFactory may become configurable and this 

220 # instance will need to be shared with datasets manager. 

221 self.datasetIdFactory = DatasetIdFactory() 

222 

223 def __str__(self) -> str: 

224 return str(self._db) 

225 

226 def __repr__(self) -> str: 

227 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

228 

229 def isWriteable(self) -> bool: 

230 # Docstring inherited from lsst.daf.butler.registry.Registry 

231 return self._db.isWriteable() 

232 

233 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

234 # Docstring inherited from lsst.daf.butler.registry.Registry 

235 if defaults is None: 

236 # No need to copy, because `RegistryDefaults` is immutable; we 

237 # effectively copy on write. 

238 defaults = self.defaults 

239 return type(self)(self._db, defaults, self._managers) 

240 

241 @property 

242 def dimensions(self) -> DimensionUniverse: 

243 # Docstring inherited from lsst.daf.butler.registry.Registry 

244 return self._managers.dimensions.universe 

245 

246 def refresh(self) -> None: 

247 # Docstring inherited from lsst.daf.butler.registry.Registry 

248 self._managers.refresh() 

249 

250 @contextlib.contextmanager 

251 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

252 # Docstring inherited from lsst.daf.butler.registry.Registry 

253 try: 

254 with self._db.transaction(savepoint=savepoint): 

255 yield 

256 except BaseException: 

257 # TODO: this clears the caches sometimes when we wouldn't actually 

258 # need to. Can we avoid that? 

259 self._managers.dimensions.clearCaches() 

260 raise 
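    # Illustrative sketch, not part of the original source: grouping several
    # writes so they commit or roll back together; savepoint=True nests
    # inside an enclosing transaction if one is already open. The collection
    # name and dimension record are hypothetical.
    #
    #   with registry.transaction(savepoint=True):
    #       registry.registerRun("u/example/run")
    #       registry.insertDimensionData("instrument", {"name": "DummyCam"})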

261 

262 def resetConnectionPool(self) -> None: 

263 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

264 

265 This operation is useful when using the registry with fork-based 

266 multiprocessing. To use the registry across a fork boundary, make sure 

267 that there are no currently active connections (no session or 

268 transaction in progress) and that the connection pool has been reset 

269 using this method. It should be called by the child process 

270 immediately after the fork. 

271 """ 

272 self._db._engine.dispose() 
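    # Illustrative sketch, not part of the original source: with fork-based
    # multiprocessing, drop connections inherited from the parent before
    # touching the registry in each child.
    #
    #   def child_worker(registry: SqlRegistry) -> None:
    #       registry.resetConnectionPool()  # first thing after the fork
    #       ...                             # safe to query from here on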

273 

274 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

275 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

276 other data repository client. 

277 

278 Opaque table records can be added via `insertOpaqueData`, retrieved via 

279 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

280 

281 Parameters 

282 ---------- 

283 tableName : `str` 

284 Logical name of the opaque table. This may differ from the 

285 actual name used in the database by a prefix and/or suffix. 

286 spec : `ddl.TableSpec` 

287 Specification for the table to be added. 

288 """ 

289 self._managers.opaque.register(tableName, spec) 

290 

291 @transactional 

292 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

293 """Insert records into an opaque table. 

294 

295 Parameters 

296 ---------- 

297 tableName : `str` 

298 Logical name of the opaque table. Must match the name used in a 

299 previous call to `registerOpaqueTable`. 

300 data 

301 Each additional positional argument is a dictionary that represents 

302 a single row to be added. 

303 """ 

304 self._managers.opaque[tableName].insert(*data) 

305 

306 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

307 """Retrieve records from an opaque table. 

308 

309 Parameters 

310 ---------- 

311 tableName : `str` 

312 Logical name of the opaque table. Must match the name used in a 

313 previous call to `registerOpaqueTable`. 

314 where 

315 Additional keyword arguments are interpreted as equality 

316 constraints that restrict the returned rows (combined with AND); 

317 keyword arguments are column names and values are the values they 

318 must have. 

319 

320 Yields 

321 ------ 

322 row : `dict` 

323 A dictionary representing a single result row. 

324 """ 

325 yield from self._managers.opaque[tableName].fetch(**where) 

326 

327 @transactional 

328 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

329 """Remove records from an opaque table. 

330 

331 Parameters 

332 ---------- 

333 tableName : `str` 

334 Logical name of the opaque table. Must match the name used in a 

335 previous call to `registerOpaqueTable`. 

336 where 

337 Additional keyword arguments are interpreted as equality 

338 constraints that restrict the deleted rows (combined with AND); 

339 keyword arguments are column names and values are the values they 

340 must have. 

341 """ 

342 self._managers.opaque[tableName].delete(where.keys(), where) 
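    # Illustrative sketch, not part of the original source: the opaque-table
    # round trip used by datastore clients. The table name, column layout,
    # and ddl.FieldSpec arguments are rough assumptions.
    #
    #   spec = ddl.TableSpec(fields=[
    #       ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #       ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    #   ])
    #   registry.registerOpaqueTable("example_records", spec)
    #   registry.insertOpaqueData("example_records", {"dataset_id": 1, "path": "a/b.fits"})
    #   rows = list(registry.fetchOpaqueData("example_records", dataset_id=1))
    #   registry.deleteOpaqueData("example_records", dataset_id=1)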

343 

344 def registerCollection( 

345 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None 

346 ) -> bool: 

347 # Docstring inherited from lsst.daf.butler.registry.Registry 

348 _, registered = self._managers.collections.register(name, type, doc=doc) 

349 return registered 

350 

351 def getCollectionType(self, name: str) -> CollectionType: 

352 # Docstring inherited from lsst.daf.butler.registry.Registry 

353 return self._managers.collections.find(name).type 

354 

355 def _get_collection_record(self, name: str) -> CollectionRecord: 

356 # Docstring inherited from lsst.daf.butler.registry.Registry 

357 return self._managers.collections.find(name) 

358 

359 def registerRun(self, name: str, doc: Optional[str] = None) -> bool: 

360 # Docstring inherited from lsst.daf.butler.registry.Registry 

361 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

362 return registered 

363 

364 @transactional 

365 def removeCollection(self, name: str) -> None: 

366 # Docstring inherited from lsst.daf.butler.registry.Registry 

367 self._managers.collections.remove(name) 

368 

369 def getCollectionChain(self, parent: str) -> tuple[str, ...]: 

370 # Docstring inherited from lsst.daf.butler.registry.Registry 

371 record = self._managers.collections.find(parent) 

372 if record.type is not CollectionType.CHAINED: 

373 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

374 assert isinstance(record, ChainedCollectionRecord) 

375 return record.children 

376 

377 @transactional 

378 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

379 # Docstring inherited from lsst.daf.butler.registry.Registry 

380 record = self._managers.collections.find(parent) 

381 if record.type is not CollectionType.CHAINED: 

382 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

383 assert isinstance(record, ChainedCollectionRecord) 

384 children = CollectionWildcard.from_expression(children).require_ordered() 

385 if children != record.children or flatten: 

386 record.update(self._managers.collections, children, flatten=flatten) 
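    # Illustrative sketch, not part of the original source: building a
    # CHAINED collection whose members are searched in order (collection
    # names are hypothetical).
    #
    #   registry.registerCollection("u/example/chain", CollectionType.CHAINED)
    #   registry.setCollectionChain("u/example/chain", ["u/example/runB", "u/example/runA"])
    #   registry.getCollectionChain("u/example/chain")
    #   # -> ("u/example/runB", "u/example/runA")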

387 

388 def getCollectionParentChains(self, collection: str) -> Set[str]: 

389 # Docstring inherited from lsst.daf.butler.registry.Registry 

390 return { 

391 record.name 

392 for record in self._managers.collections.getParentChains( 

393 self._managers.collections.find(collection).key 

394 ) 

395 } 

396 

397 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

398 # Docstring inherited from lsst.daf.butler.registry.Registry 

399 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

400 

401 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

402 # Docstring inherited from lsst.daf.butler.registry.Registry 

403 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

404 

405 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

406 # Docstring inherited from lsst.daf.butler.registry.Registry 

407 record = self._managers.collections.find(collection) 

408 return self._managers.datasets.getCollectionSummary(record) 

409 

410 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

411 # Docstring inherited from lsst.daf.butler.registry.Registry 

412 _, inserted = self._managers.datasets.register(datasetType) 

413 return inserted 

414 

415 def removeDatasetType(self, name: str) -> None: 

416 # Docstring inherited from lsst.daf.butler.registry.Registry 

417 self._managers.datasets.remove(name) 

418 

419 def getDatasetType(self, name: str) -> DatasetType: 

420 # Docstring inherited from lsst.daf.butler.registry.Registry 

421 parent_name, component = DatasetType.splitDatasetTypeName(name) 

422 storage = self._managers.datasets[parent_name] 

423 if component is None: 

424 return storage.datasetType 

425 else: 

426 return storage.datasetType.makeComponentDatasetType(component) 

427 

428 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: 

429 # Docstring inherited from lsst.daf.butler.registry.Registry 

430 return self._managers.datasets.supportsIdGenerationMode(mode) 

431 

432 def findDataset( 

433 self, 

434 datasetType: Union[DatasetType, str], 

435 dataId: Optional[DataId] = None, 

436 *, 

437 collections: Any = None, 

438 timespan: Optional[Timespan] = None, 

439 **kwargs: Any, 

440 ) -> Optional[DatasetRef]: 

441 # Docstring inherited from lsst.daf.butler.registry.Registry 

442 if isinstance(datasetType, DatasetType): 

443 parent_name, component = datasetType.nameAndComponent() 

444 else: 

445 parent_name, component = DatasetType.splitDatasetTypeName(datasetType) 

446 storage = self._managers.datasets[parent_name] 

447 dataId = DataCoordinate.standardize( 

448 dataId, 

449 graph=storage.datasetType.dimensions, 

450 universe=self.dimensions, 

451 defaults=self.defaults.dataId, 

452 **kwargs, 

453 ) 

454 if collections is None: 

455 if not self.defaults.collections: 

456 raise NoDefaultCollectionError( 

457 "No collections provided to findDataset, and no defaults from registry construction." 

458 ) 

459 collections = self.defaults.collections 

460 collections = CollectionWildcard.from_expression(collections) 

461 collections.require_ordered() 

462 for collectionRecord in self._managers.collections.resolve_wildcard(collections): 

463 if collectionRecord.type is CollectionType.CALIBRATION and ( 

464 not storage.datasetType.isCalibration() or timespan is None 

465 ): 

466 continue 

467 result = storage.find(collectionRecord, dataId, timespan=timespan) 

468 if result is not None: 

469 if component is not None: 

470 return result.makeComponentRef(component) 

471 return result 

472 

473 return None 
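    # Illustrative sketch, not part of the original source: resolving a
    # single dataset by dataset type, data ID keys, and an ordered collection
    # search. The dataset type, dimension names, and collections are
    # hypothetical.
    #
    #   ref = registry.findDataset(
    #       "raw",
    #       instrument="DummyCam", exposure=101, detector=1,
    #       collections=["u/example/runB", "u/example/runA"],
    #   )
    #   if ref is None:
    #       ...  # not found in any of the given collections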

474 

475 @transactional 

476 def insertDatasets( 

477 self, 

478 datasetType: Union[DatasetType, str], 

479 dataIds: Iterable[DataId], 

480 run: Optional[str] = None, 

481 expand: bool = True, 

482 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

483 ) -> List[DatasetRef]: 

484 # Docstring inherited from lsst.daf.butler.registry.Registry 

485 if isinstance(datasetType, DatasetType): 

486 storage = self._managers.datasets.find(datasetType.name) 

487 if storage is None: 

488 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") 

489 else: 

490 storage = self._managers.datasets.find(datasetType) 

491 if storage is None: 

492 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.") 

493 if run is None: 

494 if self.defaults.run is None: 

495 raise NoDefaultCollectionError( 

496 "No run provided to insertDatasets, and no default from registry construction." 

497 ) 

498 run = self.defaults.run 

499 runRecord = self._managers.collections.find(run) 

500 if runRecord.type is not CollectionType.RUN: 

501 raise CollectionTypeError( 

502 f"Given collection is of type {runRecord.type.name}; RUN collection required." 

503 ) 

504 assert isinstance(runRecord, RunRecord) 

505 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

506 if expand: 

507 expandedDataIds = [ 

508 self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

509 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs") 

510 ] 

511 else: 

512 expandedDataIds = [ 

513 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds 

514 ] 

515 try: 

516 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

517 if self._managers.obscore: 

518 self._managers.obscore.add_datasets(refs) 

519 except sqlalchemy.exc.IntegrityError as err: 

520 raise ConflictingDefinitionError( 

521 f"A database constraint failure was triggered by inserting " 

522 f"one or more datasets of type {storage.datasetType} into " 

523 f"collection '{run}'. " 

524 f"This probably means a dataset with the same data ID " 

525 f"and dataset type already exists, but it may also mean a " 

526 f"dimension row is missing." 

527 ) from err 

528 return refs 
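    # Illustrative sketch, not part of the original source: registering new
    # datasets in a RUN collection. The dataset type, data ID, and run name
    # are hypothetical; the corresponding dimension rows must already exist.
    #
    #   refs = registry.insertDatasets(
    #       "example_catalog",
    #       dataIds=[{"instrument": "DummyCam", "visit": 42}],
    #       run="u/example/runA",
    #   )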

529 

530 @transactional 

531 def _importDatasets( 

532 self, 

533 datasets: Iterable[DatasetRef], 

534 expand: bool = True, 

535 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

536 reuseIds: bool = False, 

537 ) -> List[DatasetRef]: 

538 # Docstring inherited from lsst.daf.butler.registry.Registry 

539 datasets = list(datasets) 

540 if not datasets: 

541 # nothing to do 

542 return [] 

543 

544 # find dataset type 

545 datasetTypes = set(dataset.datasetType for dataset in datasets) 

546 if len(datasetTypes) != 1: 

547 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}") 

548 datasetType = datasetTypes.pop() 

549 

550 # get storage handler for this dataset type 

551 storage = self._managers.datasets.find(datasetType.name) 

552 if storage is None: 

553 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") 

554 

555 # find run name 

556 runs = set(dataset.run for dataset in datasets) 

557 if len(runs) != 1: 

558 raise ValueError(f"Multiple run names in input datasets: {runs}") 

559 run = runs.pop() 

560 if run is None: 

561 if self.defaults.run is None: 

562 raise NoDefaultCollectionError( 

563 "No run provided to _importDatasets, and no default from registry construction." 

564 ) 

565 run = self.defaults.run 

566 

567 runRecord = self._managers.collections.find(run) 

568 if runRecord.type is not CollectionType.RUN: 

569 raise CollectionTypeError( 

570 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};" 

571 " RUN collection required." 

572 ) 

573 assert isinstance(runRecord, RunRecord) 

574 

575 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

576 if expand: 

577 expandedDatasets = [ 

578 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

579 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs") 

580 ] 

581 else: 

582 expandedDatasets = [ 

583 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

584 for dataset in datasets 

585 ] 

586 

587 try: 

588 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

589 if self._managers.obscore: 

590 self._managers.obscore.add_datasets(refs) 

591 except sqlalchemy.exc.IntegrityError as err: 

592 raise ConflictingDefinitionError( 

593 f"A database constraint failure was triggered by inserting " 

594 f"one or more datasets of type {storage.datasetType} into " 

595 f"collection '{run}'. " 

596 f"This probably means a dataset with the same data ID " 

597 f"and dataset type already exists, but it may also mean a " 

598 f"dimension row is missing." 

599 ) from err 

600 return refs 

601 

602 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

603 # Docstring inherited from lsst.daf.butler.registry.Registry 

604 return self._managers.datasets.getDatasetRef(id) 

605 

606 @transactional 

607 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

608 # Docstring inherited from lsst.daf.butler.registry.Registry 

609 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

610 for datasetType, refsForType in progress.iter_item_chunks( 

611 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type" 

612 ): 

613 storage = self._managers.datasets[datasetType.name] 

614 try: 

615 storage.delete(refsForType) 

616 except sqlalchemy.exc.IntegrityError as err: 

617 raise OrphanedRecordError( 

618 "One or more datasets are still present in one or more Datastores." 

619 ) from err 

620 

621 @transactional 

622 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

623 # Docstring inherited from lsst.daf.butler.registry.Registry 

624 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

625 collectionRecord = self._managers.collections.find(collection) 

626 if collectionRecord.type is not CollectionType.TAGGED: 

627 raise CollectionTypeError( 

628 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED." 

629 ) 

630 for datasetType, refsForType in progress.iter_item_chunks( 

631 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type" 

632 ): 

633 storage = self._managers.datasets[datasetType.name] 

634 try: 

635 storage.associate(collectionRecord, refsForType) 

636 if self._managers.obscore: 

637 # If a TAGGED collection is being monitored by ObsCore 

638 # manager then we may need to save the dataset. 

639 self._managers.obscore.associate(refsForType, collectionRecord) 

640 except sqlalchemy.exc.IntegrityError as err: 

641 raise ConflictingDefinitionError( 

642 f"Constraint violation while associating dataset of type {datasetType.name} with " 

643 f"collection {collection}. This probably means that one or more datasets with the same " 

644 f"dataset type and data ID already exist in the collection, but it may also indicate " 

645 f"that the datasets do not exist." 

646 ) from err 

647 

648 @transactional 

649 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

650 # Docstring inherited from lsst.daf.butler.registry.Registry 

651 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

652 collectionRecord = self._managers.collections.find(collection) 

653 if collectionRecord.type is not CollectionType.TAGGED: 

654 raise CollectionTypeError( 

655 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED." 

656 ) 

657 for datasetType, refsForType in progress.iter_item_chunks( 

658 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type" 

659 ): 

660 storage = self._managers.datasets[datasetType.name] 

661 storage.disassociate(collectionRecord, refsForType) 

662 if self._managers.obscore: 

663 self._managers.obscore.disassociate(refsForType, collectionRecord) 

664 

665 @transactional 

666 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

667 # Docstring inherited from lsst.daf.butler.registry.Registry 

668 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

669 collectionRecord = self._managers.collections.find(collection) 

670 for datasetType, refsForType in progress.iter_item_chunks( 

671 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type" 

672 ): 

673 storage = self._managers.datasets[datasetType.name] 

674 storage.certify(collectionRecord, refsForType, timespan) 

675 

676 @transactional 

677 def decertify( 

678 self, 

679 collection: str, 

680 datasetType: Union[str, DatasetType], 

681 timespan: Timespan, 

682 *, 

683 dataIds: Optional[Iterable[DataId]] = None, 

684 ) -> None: 

685 # Docstring inherited from lsst.daf.butler.registry.Registry 

686 collectionRecord = self._managers.collections.find(collection) 

687 if isinstance(datasetType, str): 

688 storage = self._managers.datasets[datasetType] 

689 else: 

690 storage = self._managers.datasets[datasetType.name] 

691 standardizedDataIds = None 

692 if dataIds is not None: 

693 standardizedDataIds = [ 

694 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds 

695 ] 

696 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 
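    # Illustrative sketch, not part of the original source: declaring a
    # validity range for calibration datasets in a CALIBRATION collection and
    # later retracting part of it. The collection, refs, and timespan
    # endpoints are hypothetical.
    #
    #   registry.registerCollection("DummyCam/calib", CollectionType.CALIBRATION)
    #   registry.certify("DummyCam/calib", bias_refs, Timespan(begin, end))
    #   registry.decertify("DummyCam/calib", "bias", Timespan(mid, end))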

697 

698 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

699 """Return an object that allows a new `Datastore` instance to 

700 communicate with this `Registry`. 

701 

702 Returns 

703 ------- 

704 manager : `DatastoreRegistryBridgeManager` 

705 Object that mediates communication between this `Registry` and its 

706 associated datastores. 

707 """ 

708 return self._managers.datastores 

709 

710 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

711 # Docstring inherited from lsst.daf.butler.registry.Registry 

712 return self._managers.datastores.findDatastores(ref) 

713 

714 def expandDataId( 

715 self, 

716 dataId: Optional[DataId] = None, 

717 *, 

718 graph: Optional[DimensionGraph] = None, 

719 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

720 withDefaults: bool = True, 

721 **kwargs: Any, 

722 ) -> DataCoordinate: 

723 # Docstring inherited from lsst.daf.butler.registry.Registry 

724 if not withDefaults: 

725 defaults = None 

726 else: 

727 defaults = self.defaults.dataId 

728 try: 

729 standardized = DataCoordinate.standardize( 

730 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs 

731 ) 

732 except KeyError as exc: 

733 # This means that either a kwarg has an unexpected name or a 

734 # required dimension is missing. 

735 raise DimensionNameError(str(exc)) from exc 

736 if standardized.hasRecords(): 

737 return standardized 

738 if records is None: 

739 records = {} 

740 elif isinstance(records, NamedKeyMapping): 

741 records = records.byName() 

742 else: 

743 records = dict(records) 

744 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

745 records.update(dataId.records.byName()) 

746 keys = standardized.byName() 

747 for element in standardized.graph.primaryKeyTraversalOrder: 

748 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

749 if record is ...: 

750 if isinstance(element, Dimension) and keys.get(element.name) is None: 

751 if element in standardized.graph.required: 

752 raise DimensionNameError( 

753 f"No value or null value for required dimension {element.name}." 

754 ) 

755 keys[element.name] = None 

756 record = None 

757 else: 

758 storage = self._managers.dimensions[element] 

759 dataIdSet = DataCoordinateIterable.fromScalar( 

760 DataCoordinate.standardize(keys, graph=element.graph) 

761 ) 

762 fetched = tuple(storage.fetch(dataIdSet)) 

763 try: 

764 (record,) = fetched 

765 except ValueError: 

766 record = None 

767 records[element.name] = record 

768 if record is not None: 

769 for d in element.implied: 

770 value = getattr(record, d.name) 

771 if keys.setdefault(d.name, value) != value: 

772 raise InconsistentDataIdError( 

773 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

774 f"but {element.name} implies {d.name}={value!r}." 

775 ) 

776 else: 

777 if element in standardized.graph.required: 

778 raise DataIdValueError( 

779 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

780 ) 

781 if element.alwaysJoin: 

782 raise InconsistentDataIdError( 

783 f"Could not fetch record for element {element.name} via keys {keys}, " 

784 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

785 "related.", 

786 ) 

787 for d in element.implied: 

788 keys.setdefault(d.name, None) 

789 records.setdefault(d.name, None) 

790 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 
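    # Illustrative sketch, not part of the original source: expanding a
    # minimal data ID so that implied dimension values and records become
    # available. Dimension names and values are hypothetical.
    #
    #   data_id = registry.expandDataId(instrument="DummyCam", exposure=101)
    #   data_id.hasRecords()        # True; dimension records are attached
    #   data_id["physical_filter"]  # implied value filled in from the records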

791 

792 def insertDimensionData( 

793 self, 

794 element: Union[DimensionElement, str], 

795 *data: Union[Mapping[str, Any], DimensionRecord], 

796 conform: bool = True, 

797 replace: bool = False, 

798 skip_existing: bool = False, 

799 ) -> None: 

800 # Docstring inherited from lsst.daf.butler.registry.Registry 

801 if conform: 

802 if isinstance(element, str): 

803 element = self.dimensions[element] 

804 records = [ 

805 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data 

806 ] 

807 else: 

808 # Ignore typing since caller said to trust them with conform=False. 

809 records = data # type: ignore 

810 storage = self._managers.dimensions[element] # type: ignore 

811 storage.insert(*records, replace=replace, skip_existing=skip_existing) 
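    # Illustrative sketch, not part of the original source: inserting
    # dimension records. The record fields depend on the configured dimension
    # universe and are assumptions here.
    #
    #   registry.insertDimensionData("instrument", {"name": "DummyCam"})
    #   registry.insertDimensionData(
    #       "physical_filter",
    #       {"instrument": "DummyCam", "name": "d-r", "band": "r"},
    #   )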

812 

813 def syncDimensionData( 

814 self, 

815 element: Union[DimensionElement, str], 

816 row: Union[Mapping[str, Any], DimensionRecord], 

817 conform: bool = True, 

818 update: bool = False, 

819 ) -> Union[bool, Dict[str, Any]]: 

820 # Docstring inherited from lsst.daf.butler.registry.Registry 

821 if conform: 

822 if isinstance(element, str): 

823 element = self.dimensions[element] 

824 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

825 else: 

826 # Ignore typing since caller said to trust them with conform=False. 

827 record = row # type: ignore 

828 storage = self._managers.dimensions[element] # type: ignore 

829 return storage.sync(record, update=update) 
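    # Illustrative sketch, not part of the original source: idempotently
    # ensuring a single dimension record exists; a falsy return is assumed to
    # mean the row was already present and unchanged.
    #
    #   inserted = registry.syncDimensionData("instrument", {"name": "DummyCam"})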

830 

831 def queryDatasetTypes( 

832 self, 

833 expression: Any = ..., 

834 *, 

835 components: Optional[bool] = None, 

836 missing: Optional[List[str]] = None, 

837 ) -> Iterable[DatasetType]: 

838 # Docstring inherited from lsst.daf.butler.registry.Registry 

839 wildcard = DatasetTypeWildcard.from_expression(expression) 

840 composition_dict = self._managers.datasets.resolve_wildcard( 

841 wildcard, 

842 components=components, 

843 missing=missing, 

844 ) 

845 result: list[DatasetType] = [] 

846 for parent_dataset_type, components_for_parent in composition_dict.items(): 

847 result.extend( 

848 parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type 

849 for c in components_for_parent 

850 ) 

851 return result 

852 

853 def queryCollections( 

854 self, 

855 expression: Any = ..., 

856 datasetType: Optional[DatasetType] = None, 

857 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(), 

858 flattenChains: bool = False, 

859 includeChains: Optional[bool] = None, 

860 ) -> Sequence[str]: 

861 # Docstring inherited from lsst.daf.butler.registry.Registry 

862 

863 # Right now the datasetType argument is completely ignored, but that 

864 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

865 # ticket will take care of that. 

866 try: 

867 wildcard = CollectionWildcard.from_expression(expression) 

868 except TypeError as exc: 

869 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc 

870 collectionTypes = ensure_iterable(collectionTypes) 

871 return [ 

872 record.name 

873 for record in self._managers.collections.resolve_wildcard( 

874 wildcard, 

875 collection_types=frozenset(collectionTypes), 

876 flatten_chains=flattenChains, 

877 include_chains=includeChains, 

878 ) 

879 ] 
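    # Illustrative sketch, not part of the original source: listing RUN
    # collections that match a regular expression, flattening any chains
    # encountered along the way (the pattern is hypothetical).
    #
    #   import re
    #   names = registry.queryCollections(
    #       re.compile(r"u/example/.*"),
    #       collectionTypes=CollectionType.RUN,
    #       flattenChains=True,
    #   )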

880 

881 def _makeQueryBuilder( 

882 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = () 

883 ) -> queries.QueryBuilder: 

884 """Return a `QueryBuilder` instance capable of constructing and 

885 managing more complex queries than those obtainable via `Registry` 

886 interfaces. 

887 

888 This is an advanced interface; downstream code should prefer 

889 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

890 are sufficient. 

891 

892 Parameters 

893 ---------- 

894 summary : `queries.QuerySummary` 

895 Object describing and categorizing the full set of dimensions that 

896 will be included in the query. 

897 doomed_by : `Iterable` of `str`, optional 

898 A list of diagnostic messages that indicate why the query is going 

899 to yield no results and should not even be executed. If an empty 

900 container (default) the query will be executed unless other code 

901 determines that it is doomed. 

902 

903 Returns 

904 ------- 

905 builder : `queries.QueryBuilder` 

906 Object that can be used to construct and perform advanced queries. 

907 """ 

908 return queries.QueryBuilder( 

909 summary, 

910 backend=queries.SqlQueryBackend(self._db, self._managers), 

911 doomed_by=doomed_by, 

912 ) 

913 

914 def _standardize_query_dataset_args( 

915 self, 

916 datasets: Any, 

917 collections: Any, 

918 components: bool | None, 

919 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain", 

920 *, 

921 doomed_by: list[str], 

922 ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]: 

923 """Preprocess dataset arguments passed to query* methods. 

924 

925 Parameters 

926 ---------- 

927 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these 

928 Expression identifying dataset types. See `queryDatasetTypes` for 

929 details. 

930 collections : `str`, `re.Pattern`, or iterable of these 

931 Expression identifying collections to be searched. See 

932 `queryCollections` for details. 

933 components : `bool`, optional 

934 If `True`, apply all expression patterns to component dataset type 

935 names as well. If `False`, never apply patterns to components. 

936 If `None` (default), apply patterns to components only if their 

937 parent datasets were not matched by the expression. 

938 Fully-specified component datasets (`str` or `DatasetType` 

939 instances) are always included. 

940 

941 Values other than `False` are deprecated, and only `False` will be 

942 supported after v26. After v27 this argument will be removed 

943 entirely. 

944 mode : `str`, optional 

945 The way in which datasets are being used in this query; one of: 

946 

947 - "find_first": this is a query for the first dataset in an 

948 ordered list of collections. Prohibits collection wildcards, 

949 but permits dataset type wildcards. 

950 

951 - "find_all": this is a query for all datasets in all matched 

952 collections. Permits collection and dataset type wildcards. 

953 

954 - "constrain": this is a query for something other than datasets, 

955 with results constrained by dataset existence. Permits 

956 collection wildcards and prohibits ``...`` as a dataset type 

957 wildcard. 

958 doomed_by : `list` [ `str` ] 

959 List to append messages indicating why the query is doomed to 

960 yield no results. 

961 

962 Returns 

963 ------- 

964 composition : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ] 

965 Dictionary mapping parent dataset type to `list` of components 

966 matched for that dataset type (or `None` for the parent itself). 

967 collections : `CollectionWildcard` or `None` 

968 Processed collection expression. 

969 """ 

970 composition: dict[DatasetType, list[str | None]] = {} 

971 if datasets is not None: 

972 if not collections: 

973 if not self.defaults.collections: 

974 raise NoDefaultCollectionError("No collections, and no registry default collections.") 

975 collections = self.defaults.collections 

976 else: 

977 collections = CollectionWildcard.from_expression(collections) 

978 if mode == "find_first" and collections.patterns: 

979 raise TypeError( 

980 f"Collection pattern(s) {collections.patterns} not allowed in this context." 

981 ) 

982 missing: list[str] = [] 

983 composition = self._managers.datasets.resolve_wildcard( 

984 datasets, components=components, missing=missing, explicit_only=(mode == "constrain") 

985 ) 

986 if missing and mode == "constrain": 

987 # After v26 this should raise MissingDatasetTypeError, to be 

988 # implemented on DM-36303. 

989 warnings.warn( 

990 f"Dataset type(s) {missing} are not registered; this will be an error after v26.", 

991 FutureWarning, 

992 ) 

993 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing) 

994 elif collections: 

995 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.") 

996 return composition, collections 

997 

998 def queryDatasets( 

999 self, 

1000 datasetType: Any, 

1001 *, 

1002 collections: Any = None, 

1003 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1004 dataId: Optional[DataId] = None, 

1005 where: Optional[str] = None, 

1006 findFirst: bool = False, 

1007 components: Optional[bool] = None, 

1008 bind: Optional[Mapping[str, Any]] = None, 

1009 check: bool = True, 

1010 **kwargs: Any, 

1011 ) -> queries.DatasetQueryResults: 

1012 # Docstring inherited from lsst.daf.butler.registry.Registry 

1013 doomed_by: list[str] = [] 

1014 data_id = self.expandDataId(dataId, **kwargs) 

1015 dataset_composition, collections = self._standardize_query_dataset_args( 

1016 datasetType, 

1017 collections, 

1018 components, 

1019 mode="find_first" if findFirst else "find_all", 

1020 doomed_by=doomed_by, 

1021 ) 

1022 parent_results: list[queries.ParentDatasetQueryResults] = [] 

1023 for parent_dataset_type, components_for_parent in dataset_composition.items(): 

1024 # The full set of dimensions in the query is the combination of 

1025 # those needed for the DatasetType and those explicitly requested, 

1026 # if any. 

1027 dimension_names = set(parent_dataset_type.dimensions.names) 

1028 if dimensions is not None: 

1029 dimension_names.update(self.dimensions.extract(dimensions).names) 

1030 # Construct the summary structure needed to construct a 

1031 # QueryBuilder. 

1032 summary = queries.QuerySummary( 

1033 requested=DimensionGraph(self.dimensions, names=dimension_names), 

1034 dataId=data_id, 

1035 expression=where, 

1036 bind=bind, 

1037 defaults=self.defaults.dataId, 

1038 check=check, 

1039 datasets=[parent_dataset_type], 

1040 ) 

1041 builder = self._makeQueryBuilder(summary) 

1042 # Add the dataset subquery to the query, telling the QueryBuilder 

1043 # to include the rank of the selected collection in the results 

1044 # only if we need to findFirst. Note that if any of the 

1045 # collections are actually wildcard expressions, and 

1046 # findFirst=True, this will raise TypeError for us. 

1047 builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst) 

1048 query = builder.finish() 

1049 parent_results.append( 

1050 queries.ParentDatasetQueryResults( 

1051 self._db, query, datasetType=parent_dataset_type, components=components_for_parent 

1052 ) 

1053 ) 

1054 if not parent_results: 

1055 doomed_by.extend( 

1056 f"No registered dataset type matching {t!r} found, so no matching datasets can " 

1057 "exist in any collection." 

1058 for t in ensure_iterable(datasetType) 

1059 ) 

1060 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by) 

1061 elif len(parent_results) == 1: 

1062 return parent_results[0] 

1063 else: 

1064 return queries.ChainedDatasetQueryResults(parent_results) 
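    # Illustrative sketch, not part of the original source: a find-first
    # dataset search constrained by a user expression. The dataset type,
    # collection, and where clause are hypothetical.
    #
    #   refs = registry.queryDatasets(
    #       "example_catalog",
    #       collections=["u/example/chain"],
    #       where="instrument = 'DummyCam' AND visit > 100",
    #       findFirst=True,
    #   )
    #   for ref in refs:
    #       ...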

1065 

1066 def queryDataIds( 

1067 self, 

1068 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], 

1069 *, 

1070 dataId: Optional[DataId] = None, 

1071 datasets: Any = None, 

1072 collections: Any = None, 

1073 where: Optional[str] = None, 

1074 components: Optional[bool] = None, 

1075 bind: Optional[Mapping[str, Any]] = None, 

1076 check: bool = True, 

1077 **kwargs: Any, 

1078 ) -> queries.DataCoordinateQueryResults: 

1079 # Docstring inherited from lsst.daf.butler.registry.Registry 

1080 dimensions = ensure_iterable(dimensions) 

1081 requestedDimensions = self.dimensions.extract(dimensions) 

1082 doomed_by: list[str] = [] 

1083 data_id = self.expandDataId(dataId, **kwargs) 

1084 dataset_composition, collections = self._standardize_query_dataset_args( 

1085 datasets, collections, components, doomed_by=doomed_by 

1086 ) 

1087 

1088 def query_factory( 

1089 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None 

1090 ) -> queries.Query: 

1091 """Construct the Query object that generates query results.""" 

1092 summary = queries.QuerySummary( 

1093 requested=requestedDimensions, 

1094 dataId=data_id, 

1095 expression=where, 

1096 bind=bind, 

1097 defaults=self.defaults.dataId, 

1098 check=check, 

1099 datasets=dataset_composition.keys(), 

1100 order_by=order_by, 

1101 limit=limit, 

1102 ) 

1103 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by) 

1104 for datasetType in dataset_composition: 

1105 builder.joinDataset(datasetType, collections, isResult=False) 

1106 return builder.finish() 

1107 

1108 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions) 
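    # Illustrative sketch, not part of the original source: enumerating data
    # IDs for a set of dimensions, constrained to those for which a dataset
    # exists in the given collections. All names are hypothetical.
    #
    #   data_ids = registry.queryDataIds(
    #       ["visit", "detector"],
    #       datasets="example_catalog",
    #       collections=["u/example/runA"],
    #   )
    #   for data_id in data_ids:
    #       ...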

1109 

1110 def queryDimensionRecords( 

1111 self, 

1112 element: Union[DimensionElement, str], 

1113 *, 

1114 dataId: Optional[DataId] = None, 

1115 datasets: Any = None, 

1116 collections: Any = None, 

1117 where: Optional[str] = None, 

1118 components: Optional[bool] = None, 

1119 bind: Optional[Mapping[str, Any]] = None, 

1120 check: bool = True, 

1121 **kwargs: Any, 

1122 ) -> queries.DimensionRecordQueryResults: 

1123 # Docstring inherited from lsst.daf.butler.registry.Registry 

1124 if not isinstance(element, DimensionElement): 

1125 try: 

1126 element = self.dimensions[element] 

1127 except KeyError as e: 

1128 raise DimensionNameError( 

1129 f"No such dimension '{element}'; available dimensions: " 

1130 + str(self.dimensions.getStaticElements()) 

1131 ) from e 

1132 dataIds = self.queryDataIds( 

1133 element.graph, 

1134 dataId=dataId, 

1135 datasets=datasets, 

1136 collections=collections, 

1137 where=where, 

1138 components=components, 

1139 bind=bind, 

1140 check=check, 

1141 **kwargs, 

1142 ) 

1143 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element]) 
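    # Illustrative sketch, not part of the original source: fetching
    # dimension records with a constraint (the expression and values are
    # hypothetical).
    #
    #   records = registry.queryDimensionRecords(
    #       "detector",
    #       where="instrument = 'DummyCam'",
    #   )
    #   for record in records:
    #       ...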

1144 

1145 def queryDatasetAssociations( 

1146 self, 

1147 datasetType: Union[str, DatasetType], 

1148 collections: Any = ..., 

1149 *, 

1150 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1151 flattenChains: bool = False, 

1152 ) -> Iterator[DatasetAssociation]: 

1153 # Docstring inherited from lsst.daf.butler.registry.Registry 

1154 if collections is None: 

1155 if not self.defaults.collections: 

1156 raise NoDefaultCollectionError( 

1157 "No collections provided to queryDatasetAssociations, and no defaults from registry construction." 

1158 ) 

1159 collections = self.defaults.collections 

1160 collections = CollectionWildcard.from_expression(collections) 

1161 TimespanReprClass = self._db.getTimespanRepresentation() 

1162 if isinstance(datasetType, str): 

1163 storage = self._managers.datasets[datasetType] 

1164 else: 

1165 storage = self._managers.datasets[datasetType.name] 

1166 for collectionRecord in self._managers.collections.resolve_wildcard( 

1167 collections, 

1168 collection_types=frozenset(collectionTypes), 

1169 flatten_chains=flattenChains, 

1170 ): 

1171 query = storage.select(collectionRecord) 

1172 for row in self._db.query(query).mappings(): 

1173 dataId = DataCoordinate.fromRequiredValues( 

1174 storage.datasetType.dimensions, 

1175 tuple(row[name] for name in storage.datasetType.dimensions.required.names), 

1176 ) 

1177 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1178 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False) 

1179 if collectionRecord.type is CollectionType.CALIBRATION: 

1180 timespan = TimespanReprClass.extract(row) 

1181 else: 

1182 timespan = None 

1183 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1184 

1185 storageClasses: StorageClassFactory 

1186 """All storage classes known to the registry (`StorageClassFactory`). 

1187 """