Coverage for python/lsst/daf/butler/registries/sql.py: 14%

447 statements  

coverage.py v6.5.0, created at 2023-01-05 10:36 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("SqlRegistry",) 

25 

26import contextlib 

27import logging 

28import warnings 

29from typing import ( 

30 TYPE_CHECKING, 

31 Any, 

32 Dict, 

33 Iterable, 

34 Iterator, 

35 List, 

36 Literal, 

37 Mapping, 

38 Optional, 

39 Sequence, 

40 Set, 

41 Tuple, 

42 Union, 

43) 

44 

45import sqlalchemy 

46from lsst.resources import ResourcePathExpression 

47from lsst.utils.iteration import ensure_iterable 

48 

49from ..core import ( 

50 Config, 

51 DataCoordinate, 

52 DataCoordinateIterable, 

53 DataId, 

54 DatasetAssociation, 

55 DatasetId, 

56 DatasetRef, 

57 DatasetType, 

58 Dimension, 

59 DimensionConfig, 

60 DimensionElement, 

61 DimensionGraph, 

62 DimensionRecord, 

63 DimensionUniverse, 

64 NamedKeyMapping, 

65 NameLookupMapping, 

66 Progress, 

67 StorageClassFactory, 

68 Timespan, 

69 ddl, 

70) 

71from ..core.utils import transactional 

72from ..registry import ( 

73 ArgumentError, 

74 CollectionExpressionError, 

75 CollectionSummary, 

76 CollectionType, 

77 CollectionTypeError, 

78 ConflictingDefinitionError, 

79 DataIdValueError, 

80 DatasetTypeError, 

81 DimensionNameError, 

82 InconsistentDataIdError, 

83 NoDefaultCollectionError, 

84 OrphanedRecordError, 

85 Registry, 

86 RegistryConfig, 

87 RegistryDefaults, 

88 queries, 

89) 

90from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord 

91from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes 

92from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard 

93 

94if TYPE_CHECKING:  # coverage: line 94 didn't jump to line 95, because the condition on line 94 was never true

95 from .._butlerConfig import ButlerConfig 

96 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager 

97 

98 

99_LOG = logging.getLogger(__name__) 

100 

101 

102class SqlRegistry(Registry): 

103 """Registry implementation based on SQLAlchemy. 

104 

105 Parameters 

106 ---------- 

107 database : `Database` 

108 Database instance in which to store the Registry contents.

109 defaults : `RegistryDefaults` 

110 Default collection search path and/or output `~CollectionType.RUN` 

111 collection. 

112 managers : `RegistryManagerInstances` 

113 All the managers required for this registry. 

114 """ 

115 

116 defaultConfigFile: Optional[str] = None 

117 """Path to configuration defaults. Accessed within the ``configs`` resource 

118 or relative to a search path. Can be `None` if no defaults are specified.

119 """ 

120 

121 @classmethod 

122 def createFromConfig( 

123 cls, 

124 config: Optional[Union[RegistryConfig, str]] = None, 

125 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

126 butlerRoot: Optional[ResourcePathExpression] = None, 

127 ) -> Registry: 

128 """Create registry database and return `SqlRegistry` instance. 

129 

130 This method initializes database contents; the database must be empty

131 prior to calling this method. 

132 

133 Parameters 

134 ---------- 

135 config : `RegistryConfig` or `str`, optional 

136 Registry configuration; if missing, the default configuration will

137 be loaded from ``registry.yaml``.

138 dimensionConfig : `DimensionConfig` or `str`, optional 

139 Dimension configuration; if missing, the default configuration

140 will be loaded from ``dimensions.yaml``.

141 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional 

142 Path to the repository root this `SqlRegistry` will manage. 

143 

144 Returns 

145 ------- 

146 registry : `SqlRegistry` 

147 A new `SqlRegistry` instance. 

148 """ 

149 config = cls.forceRegistryConfig(config) 

150 config.replaceRoot(butlerRoot) 

151 

152 if isinstance(dimensionConfig, str): 

153 dimensionConfig = DimensionConfig(dimensionConfig) 

154 elif dimensionConfig is None: 

155 dimensionConfig = DimensionConfig() 

156 elif not isinstance(dimensionConfig, DimensionConfig): 

157 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

158 

159 DatabaseClass = config.getDatabaseClass() 

160 database = DatabaseClass.fromUri( 

161 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace") 

162 ) 

163 managerTypes = RegistryManagerTypes.fromConfig(config) 

164 managers = managerTypes.makeRepo(database, dimensionConfig) 

165 return cls(database, RegistryDefaults(), managers) 

166 
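A minimal sketch of using `createFromConfig` to initialize a brand-new registry database; the SQLite connection string, repository root, and the use of the ``db`` key to hold the connection string are illustrative assumptions, not part of this module:

    from lsst.daf.butler.registries.sql import SqlRegistry
    from lsst.daf.butler.registry import RegistryConfig

    config = RegistryConfig()
    config["db"] = "sqlite:///demo_registry.sqlite3"  # assumed connection-string key
    registry = SqlRegistry.createFromConfig(config, butlerRoot="/tmp/demo_repo")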

167 @classmethod 

168 def fromConfig( 

169 cls, 

170 config: Union[ButlerConfig, RegistryConfig, Config, str], 

171 butlerRoot: Optional[ResourcePathExpression] = None, 

172 writeable: bool = True, 

173 defaults: Optional[RegistryDefaults] = None, 

174 ) -> Registry: 

175 """Create `Registry` subclass instance from `config`. 

176 

177 Registry database must be initialized prior to calling this method. 

178 

179 Parameters 

180 ---------- 

181 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

182 Registry configuration 

183 butlerRoot : `lsst.resources.ResourcePathExpression`, optional 

184 Path to the repository root this `Registry` will manage. 

185 writeable : `bool`, optional 

186 If `True` (default) create a read-write connection to the database. 

187 defaults : `RegistryDefaults`, optional 

188 Default collection search path and/or output `~CollectionType.RUN` 

189 collection. 

190 

191 Returns 

192 ------- 

193 registry : `SqlRegistry` (subclass) 

194 A new `SqlRegistry` subclass instance. 

195 """ 

196 config = cls.forceRegistryConfig(config) 

197 config.replaceRoot(butlerRoot) 

198 DatabaseClass = config.getDatabaseClass() 

199 database = DatabaseClass.fromUri( 

200 str(config.connectionString), 

201 origin=config.get("origin", 0), 

202 namespace=config.get("namespace"), 

203 writeable=writeable, 

204 ) 

205 managerTypes = RegistryManagerTypes.fromConfig(config) 

206 with database.session(): 

207 managers = managerTypes.loadRepo(database) 

208 if defaults is None: 

209 defaults = RegistryDefaults() 

210 return cls(database, defaults, managers) 

211 
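A companion sketch for `fromConfig`, which connects to an existing, already-initialized registry database; the configuration path and default collection name are illustrative:

    from lsst.daf.butler.registries.sql import SqlRegistry
    from lsst.daf.butler.registry import RegistryDefaults

    registry = SqlRegistry.fromConfig(
        "registry.yaml",  # illustrative configuration file
        writeable=True,
        defaults=RegistryDefaults(collections=["u/demo/defaults"]),  # illustrative
    )
    print(registry.isWriteable(), registry.dimensions)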

212 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

213 self._db = database 

214 self._managers = managers 

215 self.storageClasses = StorageClassFactory() 

216 # Intentionally invoke property setter to initialize defaults. This 

217 # can only be done after most of the rest of Registry has already been 

218 # initialized, and must be done before the property getter is used. 

219 self.defaults = defaults 

220 # In the future DatasetIdFactory may become configurable and this 

221 # instance will need to be shared with datasets manager. 

222 self.datasetIdFactory = DatasetIdFactory() 

223 

224 def __str__(self) -> str: 

225 return str(self._db) 

226 

227 def __repr__(self) -> str: 

228 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

229 

230 def isWriteable(self) -> bool: 

231 # Docstring inherited from lsst.daf.butler.registry.Registry 

232 return self._db.isWriteable() 

233 

234 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

235 # Docstring inherited from lsst.daf.butler.registry.Registry 

236 if defaults is None: 

237 # No need to copy, because `RegistryDefaults` is immutable; we 

238 # effectively copy on write. 

239 defaults = self.defaults 

240 return type(self)(self._db, defaults, self._managers) 

241 

242 @property 

243 def dimensions(self) -> DimensionUniverse: 

244 # Docstring inherited from lsst.daf.butler.registry.Registry 

245 return self._managers.dimensions.universe 

246 

247 def refresh(self) -> None: 

248 # Docstring inherited from lsst.daf.butler.registry.Registry 

249 with self._db.transaction(): 

250 self._managers.refresh() 

251 

252 @contextlib.contextmanager 

253 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

254 # Docstring inherited from lsst.daf.butler.registry.Registry 

255 try: 

256 with self._db.transaction(savepoint=savepoint): 

257 yield 

258 except BaseException: 

259 # TODO: this clears the caches sometimes when we wouldn't actually 

260 # need to. Can we avoid that? 

261 self._managers.dimensions.clearCaches() 

262 raise 

263 
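The context manager above groups several registry operations into a single database transaction so that a failure rolls everything back. A minimal sketch, assuming ``registry`` was obtained as in the `fromConfig` sketch above and that the collection names are illustrative:

    with registry.transaction(savepoint=True):
        registry.registerRun("u/demo/run1")
        registry.registerCollection("u/demo/tag")  # defaults to a TAGGED collection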

264 def resetConnectionPool(self) -> None: 

265 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

266 

267 This operation is useful when using the registry with fork-based

268 multiprocessing. To use the registry across a fork boundary, ensure that

269 there are no active connections (no session or transaction in

270 progress) before forking, and reset the connection pool with this

271 method in the child process immediately

272 after the fork.

273 """ 

274 self._db._engine.dispose() 

275 
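A sketch of the fork-based multiprocessing pattern described in the docstring, assuming ``registry`` exists and no session or transaction is open when the fork happens:

    import multiprocessing

    def worker(registry):
        # Child process: discard connections inherited from the parent
        # before issuing any queries.
        registry.resetConnectionPool()
        print(list(registry.queryCollections()))

    ctx = multiprocessing.get_context("fork")
    proc = ctx.Process(target=worker, args=(registry,))
    proc.start()
    proc.join()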

276 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

277 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

278 other data repository client. 

279 

280 Opaque table records can be added via `insertOpaqueData`, retrieved via 

281 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

282 

283 Parameters 

284 ---------- 

285 tableName : `str` 

286 Logical name of the opaque table. This may differ from the 

287 actual name used in the database by a prefix and/or suffix. 

288 spec : `ddl.TableSpec` 

289 Specification for the table to be added. 

290 """ 

291 self._managers.opaque.register(tableName, spec) 

292 

293 @transactional 

294 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

295 """Insert records into an opaque table. 

296 

297 Parameters 

298 ---------- 

299 tableName : `str` 

300 Logical name of the opaque table. Must match the name used in a 

301 previous call to `registerOpaqueTable`. 

302 data 

303 Each additional positional argument is a dictionary that represents 

304 a single row to be added. 

305 """ 

306 self._managers.opaque[tableName].insert(*data) 

307 

308 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

309 """Retrieve records from an opaque table. 

310 

311 Parameters 

312 ---------- 

313 tableName : `str` 

314 Logical name of the opaque table. Must match the name used in a 

315 previous call to `registerOpaqueTable`. 

316 where 

317 Additional keyword arguments are interpreted as equality 

318 constraints that restrict the returned rows (combined with AND); 

319 keyword arguments are column names and values are the values they 

320 must have. 

321 

322 Yields 

323 ------ 

324 row : `dict` 

325 A dictionary representing a single result row. 

326 """ 

327 yield from self._managers.opaque[tableName].fetch(**where) 

328 

329 @transactional 

330 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

331 """Remove records from an opaque table. 

332 

333 Parameters 

334 ---------- 

335 tableName : `str` 

336 Logical name of the opaque table. Must match the name used in a 

337 previous call to `registerOpaqueTable`. 

338 where 

339 Additional keyword arguments are interpreted as equality 

340 constraints that restrict the deleted rows (combined with AND); 

341 keyword arguments are column names and values are the values they 

342 must have. 

343 """ 

344 self._managers.opaque[tableName].delete(where.keys(), where) 

345 
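A round-trip sketch for the opaque-table methods above; the table name and column layout are illustrative, and the `ddl.FieldSpec` arguments are assumed from the ``ddl`` module imported at the top of this file:

    import sqlalchemy
    from lsst.daf.butler import ddl

    spec = ddl.TableSpec(
        fields=[
            ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
            ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
        ]
    )
    registry.registerOpaqueTable("demo_datastore_records", spec)
    registry.insertOpaqueData("demo_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    rows = list(registry.fetchOpaqueData("demo_datastore_records", dataset_id=1))
    registry.deleteOpaqueData("demo_datastore_records", dataset_id=1)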

346 def registerCollection( 

347 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None 

348 ) -> bool: 

349 # Docstring inherited from lsst.daf.butler.registry.Registry 

350 _, registered = self._managers.collections.register(name, type, doc=doc) 

351 return registered 

352 

353 def getCollectionType(self, name: str) -> CollectionType: 

354 # Docstring inherited from lsst.daf.butler.registry.Registry 

355 return self._managers.collections.find(name).type 

356 

357 def _get_collection_record(self, name: str) -> CollectionRecord: 

358 # Docstring inherited from lsst.daf.butler.registry.Registry 

359 return self._managers.collections.find(name) 

360 

361 def registerRun(self, name: str, doc: Optional[str] = None) -> bool: 

362 # Docstring inherited from lsst.daf.butler.registry.Registry 

363 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

364 return registered 

365 

366 @transactional 

367 def removeCollection(self, name: str) -> None: 

368 # Docstring inherited from lsst.daf.butler.registry.Registry 

369 self._managers.collections.remove(name) 

370 

371 def getCollectionChain(self, parent: str) -> tuple[str, ...]: 

372 # Docstring inherited from lsst.daf.butler.registry.Registry 

373 record = self._managers.collections.find(parent) 

374 if record.type is not CollectionType.CHAINED: 

375 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

376 assert isinstance(record, ChainedCollectionRecord) 

377 return record.children 

378 

379 @transactional 

380 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

381 # Docstring inherited from lsst.daf.butler.registry.Registry 

382 record = self._managers.collections.find(parent) 

383 if record.type is not CollectionType.CHAINED: 

384 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

385 assert isinstance(record, ChainedCollectionRecord) 

386 children = CollectionWildcard.from_expression(children).require_ordered() 

387 if children != record.children or flatten: 

388 record.update(self._managers.collections, children, flatten=flatten) 

389 
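A sketch tying together the collection methods above: create a RUN and a CHAINED collection, then point the chain at the run (collection names are illustrative):

    from lsst.daf.butler.registry import CollectionType

    registry.registerRun("u/demo/run1")
    registry.registerCollection("u/demo/chain", CollectionType.CHAINED)
    registry.setCollectionChain("u/demo/chain", ["u/demo/run1"])
    assert registry.getCollectionChain("u/demo/chain") == ("u/demo/run1",)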

390 def getCollectionParentChains(self, collection: str) -> Set[str]: 

391 # Docstring inherited from lsst.daf.butler.registry.Registry 

392 return { 

393 record.name 

394 for record in self._managers.collections.getParentChains( 

395 self._managers.collections.find(collection).key 

396 ) 

397 } 

398 

399 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

400 # Docstring inherited from lsst.daf.butler.registry.Registry 

401 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

402 

403 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

404 # Docstring inherited from lsst.daf.butler.registry.Registry 

405 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

406 

407 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

408 # Docstring inherited from lsst.daf.butler.registry.Registry 

409 record = self._managers.collections.find(collection) 

410 return self._managers.datasets.getCollectionSummary(record) 

411 

412 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

413 # Docstring inherited from lsst.daf.butler.registry.Registry 

414 _, inserted = self._managers.datasets.register(datasetType) 

415 return inserted 

416 

417 def removeDatasetType(self, name: str) -> None: 

418 # Docstring inherited from lsst.daf.butler.registry.Registry 

419 self._managers.datasets.remove(name) 

420 

421 def getDatasetType(self, name: str) -> DatasetType: 

422 # Docstring inherited from lsst.daf.butler.registry.Registry 

423 parent_name, component = DatasetType.splitDatasetTypeName(name) 

424 storage = self._managers.datasets[parent_name] 

425 if component is None: 

426 return storage.datasetType 

427 else: 

428 return storage.datasetType.makeComponentDatasetType(component) 

429 

430 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: 

431 # Docstring inherited from lsst.daf.butler.registry.Registry 

432 return self._managers.datasets.supportsIdGenerationMode(mode) 

433 

434 def findDataset( 

435 self, 

436 datasetType: Union[DatasetType, str], 

437 dataId: Optional[DataId] = None, 

438 *, 

439 collections: Any = None, 

440 timespan: Optional[Timespan] = None, 

441 **kwargs: Any, 

442 ) -> Optional[DatasetRef]: 

443 # Docstring inherited from lsst.daf.butler.registry.Registry 

444 storage_class: str | None = None 

445 if isinstance(datasetType, DatasetType): 

446 parent_name, component = datasetType.nameAndComponent() 

447 if component is None: 

448 storage_class = datasetType.storageClass_name 

449 else: 

450 parent_name, component = DatasetType.splitDatasetTypeName(datasetType) 

451 storage = self._managers.datasets[parent_name] 

452 dataId = DataCoordinate.standardize( 

453 dataId, 

454 graph=storage.datasetType.dimensions, 

455 universe=self.dimensions, 

456 defaults=self.defaults.dataId, 

457 **kwargs, 

458 ) 

459 if collections is None: 

460 if not self.defaults.collections: 

461 raise NoDefaultCollectionError( 

462 "No collections provided to findDataset, and no defaults from registry construction." 

463 ) 

464 collections = self.defaults.collections 

465 collections = CollectionWildcard.from_expression(collections) 

466 collections.require_ordered() 

467 for collectionRecord in self._managers.collections.resolve_wildcard(collections): 

468 if collectionRecord.type is CollectionType.CALIBRATION and ( 

469 not storage.datasetType.isCalibration() or timespan is None 

470 ): 

471 continue 

472 result = storage.find(collectionRecord, dataId, timespan=timespan, storage_class=storage_class) 

473 if result is not None: 

474 if component is not None: 

475 return result.makeComponentRef(component) 

476 return result 

477 

478 return None 

479 
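A sketch of `findDataset` lookups, including the calibration case that requires a timespan; the dataset types, data ID values, and collections below are illustrative:

    from astropy.time import Time
    from lsst.daf.butler import Timespan

    ref = registry.findDataset(
        "raw", instrument="DemoCam", exposure=1, detector=10, collections="u/demo/run1"
    )
    bias = registry.findDataset(
        "bias",
        instrument="DemoCam",
        detector=10,
        collections="DemoCam/calib",
        timespan=Timespan(Time("2020-01-01", scale="tai"), Time("2020-01-02", scale="tai")),
    )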

480 @transactional 

481 def insertDatasets( 

482 self, 

483 datasetType: Union[DatasetType, str], 

484 dataIds: Iterable[DataId], 

485 run: Optional[str] = None, 

486 expand: bool = True, 

487 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

488 ) -> List[DatasetRef]: 

489 # Docstring inherited from lsst.daf.butler.registry.Registry 

490 if isinstance(datasetType, DatasetType): 

491 storage = self._managers.datasets.find(datasetType.name) 

492 if storage is None: 

493 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") 

494 else: 

495 storage = self._managers.datasets.find(datasetType) 

496 if storage is None: 

497 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.") 

498 if run is None: 

499 if self.defaults.run is None: 

500 raise NoDefaultCollectionError( 

501 "No run provided to insertDatasets, and no default from registry construction." 

502 ) 

503 run = self.defaults.run 

504 runRecord = self._managers.collections.find(run) 

505 if runRecord.type is not CollectionType.RUN: 

506 raise CollectionTypeError( 

507 f"Given collection is of type {runRecord.type.name}; RUN collection required." 

508 ) 

509 assert isinstance(runRecord, RunRecord) 

510 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

511 if expand: 

512 expandedDataIds = [ 

513 self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

514 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs") 

515 ] 

516 else: 

517 expandedDataIds = [ 

518 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds 

519 ] 

520 try: 

521 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

522 if self._managers.obscore: 

523 self._managers.obscore.add_datasets(refs) 

524 except sqlalchemy.exc.IntegrityError as err: 

525 raise ConflictingDefinitionError( 

526 f"A database constraint failure was triggered by inserting " 

527 f"one or more datasets of type {storage.datasetType} into " 

528 f"collection '{run}'. " 

529 f"This probably means a dataset with the same data ID " 

530 f"and dataset type already exists, but it may also mean a " 

531 f"dimension row is missing." 

532 ) from err 

533 return refs 

534 
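A sketch of registering a dataset type and inserting datasets into a RUN collection; the dimension names, storage class, and data ID values are illustrative, and the corresponding dimension records are assumed to exist already:

    from lsst.daf.butler import DatasetType

    datasetType = DatasetType(
        "demo_catalog",
        dimensions=registry.dimensions.extract(["instrument", "visit", "detector"]),
        storageClass="DataFrame",  # illustrative storage class name
    )
    registry.registerDatasetType(datasetType)
    registry.registerRun("u/demo/run1")
    refs = registry.insertDatasets(
        datasetType,
        dataIds=[{"instrument": "DemoCam", "visit": 42, "detector": 10}],
        run="u/demo/run1",
    )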

535 @transactional 

536 def _importDatasets( 

537 self, 

538 datasets: Iterable[DatasetRef], 

539 expand: bool = True, 

540 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

541 reuseIds: bool = False, 

542 ) -> List[DatasetRef]: 

543 # Docstring inherited from lsst.daf.butler.registry.Registry 

544 datasets = list(datasets) 

545 if not datasets: 

546 # nothing to do 

547 return [] 

548 

549 # find dataset type 

550 datasetTypes = set(dataset.datasetType for dataset in datasets) 

551 if len(datasetTypes) != 1: 

552 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}") 

553 datasetType = datasetTypes.pop() 

554 

555 # get storage handler for this dataset type 

556 storage = self._managers.datasets.find(datasetType.name) 

557 if storage is None: 

558 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") 

559 

560 # find run name 

561 runs = set(dataset.run for dataset in datasets) 

562 if len(runs) != 1: 

563 raise ValueError(f"Multiple run names in input datasets: {runs}") 

564 run = runs.pop() 

565 if run is None: 

566 if self.defaults.run is None: 

567 raise NoDefaultCollectionError( 

568 "No run provided to ingestDatasets, and no default from registry construction." 

569 ) 

570 run = self.defaults.run 

571 

572 runRecord = self._managers.collections.find(run) 

573 if runRecord.type is not CollectionType.RUN: 

574 raise CollectionTypeError( 

575 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};" 

576 " RUN collection required." 

577 ) 

578 assert isinstance(runRecord, RunRecord) 

579 

580 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

581 if expand: 

582 expandedDatasets = [ 

583 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

584 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs") 

585 ] 

586 else: 

587 expandedDatasets = [ 

588 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

589 for dataset in datasets 

590 ] 

591 

592 try: 

593 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

594 if self._managers.obscore: 

595 self._managers.obscore.add_datasets(refs) 

596 except sqlalchemy.exc.IntegrityError as err: 

597 raise ConflictingDefinitionError( 

598 f"A database constraint failure was triggered by inserting " 

599 f"one or more datasets of type {storage.datasetType} into " 

600 f"collection '{run}'. " 

601 f"This probably means a dataset with the same data ID " 

602 f"and dataset type already exists, but it may also mean a " 

603 f"dimension row is missing." 

604 ) from err 

605 return refs 

606 

607 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

608 # Docstring inherited from lsst.daf.butler.registry.Registry 

609 return self._managers.datasets.getDatasetRef(id) 

610 

611 @transactional 

612 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

613 # Docstring inherited from lsst.daf.butler.registry.Registry 

614 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

615 for datasetType, refsForType in progress.iter_item_chunks( 

616 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type" 

617 ): 

618 storage = self._managers.datasets[datasetType.name] 

619 try: 

620 storage.delete(refsForType) 

621 except sqlalchemy.exc.IntegrityError as err: 

622 raise OrphanedRecordError( 

623 "One or more datasets is still present in one or more Datastores." 

624 ) from err 

625 

626 @transactional 

627 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

628 # Docstring inherited from lsst.daf.butler.registry.Registry 

629 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

630 collectionRecord = self._managers.collections.find(collection) 

631 if collectionRecord.type is not CollectionType.TAGGED: 

632 raise CollectionTypeError( 

633 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED." 

634 ) 

635 for datasetType, refsForType in progress.iter_item_chunks( 

636 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type" 

637 ): 

638 storage = self._managers.datasets[datasetType.name] 

639 try: 

640 storage.associate(collectionRecord, refsForType) 

641 if self._managers.obscore: 

642 # If a TAGGED collection is being monitored by ObsCore 

643 # manager then we may need to save the dataset. 

644 self._managers.obscore.associate(refsForType, collectionRecord) 

645 except sqlalchemy.exc.IntegrityError as err: 

646 raise ConflictingDefinitionError( 

647 f"Constraint violation while associating dataset of type {datasetType.name} with " 

648 f"collection {collection}. This probably means that one or more datasets with the same " 

649 f"dataset type and data ID already exist in the collection, but it may also indicate " 

650 f"that the datasets do not exist." 

651 ) from err 

652 

653 @transactional 

654 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

655 # Docstring inherited from lsst.daf.butler.registry.Registry 

656 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

657 collectionRecord = self._managers.collections.find(collection) 

658 if collectionRecord.type is not CollectionType.TAGGED: 

659 raise CollectionTypeError( 

660 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED." 

661 ) 

662 for datasetType, refsForType in progress.iter_item_chunks( 

663 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type" 

664 ): 

665 storage = self._managers.datasets[datasetType.name] 

666 storage.disassociate(collectionRecord, refsForType) 

667 if self._managers.obscore: 

668 self._managers.obscore.disassociate(refsForType, collectionRecord) 

669 
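A sketch of tagging datasets into a TAGGED collection and removing them again; the names are illustrative, and ``refs`` is assumed to come from a query such as `queryDatasets`:

    from lsst.daf.butler.registry import CollectionType

    registry.registerCollection("u/demo/tagged", CollectionType.TAGGED)
    refs = list(registry.queryDatasets("demo_catalog", collections="u/demo/run1"))
    registry.associate("u/demo/tagged", refs)
    registry.disassociate("u/demo/tagged", refs)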

670 @transactional 

671 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

672 # Docstring inherited from lsst.daf.butler.registry.Registry 

673 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

674 collectionRecord = self._managers.collections.find(collection) 

675 for datasetType, refsForType in progress.iter_item_chunks( 

676 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type" 

677 ): 

678 storage = self._managers.datasets[datasetType.name] 

679 storage.certify(collectionRecord, refsForType, timespan) 

680 

681 @transactional 

682 def decertify( 

683 self, 

684 collection: str, 

685 datasetType: Union[str, DatasetType], 

686 timespan: Timespan, 

687 *, 

688 dataIds: Optional[Iterable[DataId]] = None, 

689 ) -> None: 

690 # Docstring inherited from lsst.daf.butler.registry.Registry 

691 collectionRecord = self._managers.collections.find(collection) 

692 if isinstance(datasetType, str): 

693 storage = self._managers.datasets[datasetType] 

694 else: 

695 storage = self._managers.datasets[datasetType.name] 

696 standardizedDataIds = None 

697 if dataIds is not None: 

698 standardizedDataIds = [ 

699 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds 

700 ] 

701 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

702 
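A sketch of certifying datasets into a CALIBRATION collection for a validity range and later decertifying part of that range; the collection, dataset type, dates, and the ``bias_refs`` list (assumed to come from a prior query) are illustrative:

    from astropy.time import Time
    from lsst.daf.butler import Timespan
    from lsst.daf.butler.registry import CollectionType

    registry.registerCollection("DemoCam/calib", CollectionType.CALIBRATION)
    validity = Timespan(Time("2020-01-01", scale="tai"), Time("2021-01-01", scale="tai"))
    registry.certify("DemoCam/calib", bias_refs, validity)  # bias_refs: assumed DatasetRefs
    registry.decertify(
        "DemoCam/calib",
        "bias",
        Timespan(Time("2020-06-01", scale="tai"), Time("2021-01-01", scale="tai")),
    )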

703 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

704 """Return an object that allows a new `Datastore` instance to 

705 communicate with this `Registry`. 

706 

707 Returns 

708 ------- 

709 manager : `DatastoreRegistryBridgeManager` 

710 Object that mediates communication between this `Registry` and its 

711 associated datastores. 

712 """ 

713 return self._managers.datastores 

714 

715 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

716 # Docstring inherited from lsst.daf.butler.registry.Registry 

717 return self._managers.datastores.findDatastores(ref) 

718 

719 def expandDataId( 

720 self, 

721 dataId: Optional[DataId] = None, 

722 *, 

723 graph: Optional[DimensionGraph] = None, 

724 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

725 withDefaults: bool = True, 

726 **kwargs: Any, 

727 ) -> DataCoordinate: 

728 # Docstring inherited from lsst.daf.butler.registry.Registry 

729 if not withDefaults: 

730 defaults = None 

731 else: 

732 defaults = self.defaults.dataId 

733 try: 

734 standardized = DataCoordinate.standardize( 

735 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs 

736 ) 

737 except KeyError as exc: 

738 # This means either kwargs have some odd name or required 

739 # dimension is missing. 

740 raise DimensionNameError(str(exc)) from exc 

741 if standardized.hasRecords(): 

742 return standardized 

743 if records is None: 

744 records = {} 

745 elif isinstance(records, NamedKeyMapping): 

746 records = records.byName() 

747 else: 

748 records = dict(records) 

749 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

750 records.update(dataId.records.byName()) 

751 keys = standardized.byName() 

752 for element in standardized.graph.primaryKeyTraversalOrder: 

753 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

754 if record is ...: 

755 if isinstance(element, Dimension) and keys.get(element.name) is None: 

756 if element in standardized.graph.required: 

757 raise DimensionNameError( 

758 f"No value or null value for required dimension {element.name}." 

759 ) 

760 keys[element.name] = None 

761 record = None 

762 else: 

763 storage = self._managers.dimensions[element] 

764 dataIdSet = DataCoordinateIterable.fromScalar( 

765 DataCoordinate.standardize(keys, graph=element.graph) 

766 ) 

767 fetched = tuple(storage.fetch(dataIdSet)) 

768 try: 

769 (record,) = fetched 

770 except ValueError: 

771 record = None 

772 records[element.name] = record 

773 if record is not None: 

774 for d in element.implied: 

775 value = getattr(record, d.name) 

776 if keys.setdefault(d.name, value) != value: 

777 raise InconsistentDataIdError( 

778 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

779 f"but {element.name} implies {d.name}={value!r}." 

780 ) 

781 else: 

782 if element in standardized.graph.required: 

783 raise DataIdValueError( 

784 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

785 ) 

786 if element.alwaysJoin: 

787 raise InconsistentDataIdError( 

788 f"Could not fetch record for element {element.name} via keys {keys}, ", 

789 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

790 "related.", 

791 ) 

792 for d in element.implied: 

793 keys.setdefault(d.name, None) 

794 records.setdefault(d.name, None) 

795 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

796 
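A sketch of `expandDataId`, which fills in implied dimension values and attaches dimension records to a minimal data ID; the instrument and detector values are illustrative:

    data_id = registry.expandDataId(instrument="DemoCam", detector=10)
    print(data_id.hasRecords())          # True once records are attached
    print(data_id.records["detector"])   # the fetched detector dimension record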

797 def insertDimensionData( 

798 self, 

799 element: Union[DimensionElement, str], 

800 *data: Union[Mapping[str, Any], DimensionRecord], 

801 conform: bool = True, 

802 replace: bool = False, 

803 skip_existing: bool = False, 

804 ) -> None: 

805 # Docstring inherited from lsst.daf.butler.registry.Registry 

806 if conform: 

807 if isinstance(element, str): 

808 element = self.dimensions[element] 

809 records = [ 

810 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data 

811 ] 

812 else: 

813 # Ignore typing since caller said to trust them with conform=False. 

814 records = data # type: ignore 

815 storage = self._managers.dimensions[element] # type: ignore 

816 storage.insert(*records, replace=replace, skip_existing=skip_existing) 

817 

818 def syncDimensionData( 

819 self, 

820 element: Union[DimensionElement, str], 

821 row: Union[Mapping[str, Any], DimensionRecord], 

822 conform: bool = True, 

823 update: bool = False, 

824 ) -> Union[bool, Dict[str, Any]]: 

825 # Docstring inherited from lsst.daf.butler.registry.Registry 

826 if conform: 

827 if isinstance(element, str): 

828 element = self.dimensions[element] 

829 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

830 else: 

831 # Ignore typing since caller said to trust them with conform=False. 

832 record = row # type: ignore 

833 storage = self._managers.dimensions[element] # type: ignore 

834 return storage.sync(record, update=update) 

835 
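A sketch of inserting and syncing dimension records; the exact record fields depend on the dimension configuration, so the field names shown here are assumptions:

    registry.insertDimensionData("instrument", {"name": "DemoCam", "detector_max": 4})
    registry.syncDimensionData(
        "detector",
        {"instrument": "DemoCam", "id": 1, "full_name": "S00"},
        update=True,
    )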

836 def queryDatasetTypes( 

837 self, 

838 expression: Any = ..., 

839 *, 

840 components: Optional[bool] = None, 

841 missing: Optional[List[str]] = None, 

842 ) -> Iterable[DatasetType]: 

843 # Docstring inherited from lsst.daf.butler.registry.Registry 

844 wildcard = DatasetTypeWildcard.from_expression(expression) 

845 composition_dict = self._managers.datasets.resolve_wildcard( 

846 wildcard, 

847 components=components, 

848 missing=missing, 

849 ) 

850 result: list[DatasetType] = [] 

851 for parent_dataset_type, components_for_parent in composition_dict.items(): 

852 result.extend( 

853 parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type 

854 for c in components_for_parent 

855 ) 

856 return result 

857 

858 def queryCollections( 

859 self, 

860 expression: Any = ..., 

861 datasetType: Optional[DatasetType] = None, 

862 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(), 

863 flattenChains: bool = False, 

864 includeChains: Optional[bool] = None, 

865 ) -> Sequence[str]: 

866 # Docstring inherited from lsst.daf.butler.registry.Registry 

867 

868 # Right now the datasetType argument is completely ignored, but that

869 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

870 # ticket will take care of that. 

871 try: 

872 wildcard = CollectionWildcard.from_expression(expression) 

873 except TypeError as exc: 

874 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc 

875 collectionTypes = ensure_iterable(collectionTypes) 

876 return [ 

877 record.name 

878 for record in self._managers.collections.resolve_wildcard( 

879 wildcard, 

880 collection_types=frozenset(collectionTypes), 

881 flatten_chains=flattenChains, 

882 include_chains=includeChains, 

883 ) 

884 ] 

885 

886 def _makeQueryBuilder( 

887 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = () 

888 ) -> queries.QueryBuilder: 

889 """Return a `QueryBuilder` instance capable of constructing and 

890 managing more complex queries than those obtainable via `Registry` 

891 interfaces. 

892 

893 This is an advanced interface; downstream code should prefer 

894 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

895 are sufficient. 

896 

897 Parameters 

898 ---------- 

899 summary : `queries.QuerySummary` 

900 Object describing and categorizing the full set of dimensions that 

901 will be included in the query. 

902 doomed_by : `Iterable` of `str`, optional 

903 A list of diagnostic messages that indicate why the query is going 

904 to yield no results and should not even be executed. If an empty 

905 container (default) the query will be executed unless other code 

906 determines that it is doomed. 

907 

908 Returns 

909 ------- 

910 builder : `queries.QueryBuilder` 

911 Object that can be used to construct and perform advanced queries. 

912 """ 

913 return queries.QueryBuilder( 

914 summary, 

915 backend=queries.SqlQueryBackend(self._db, self._managers), 

916 doomed_by=doomed_by, 

917 ) 

918 

919 def _standardize_query_dataset_args( 

920 self, 

921 datasets: Any, 

922 collections: Any, 

923 components: bool | None, 

924 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain", 

925 *, 

926 doomed_by: list[str], 

927 ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]: 

928 """Preprocess dataset arguments passed to query* methods. 

929 

930 Parameters 

931 ---------- 

932 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these 

933 Expression identifying dataset types. See `queryDatasetTypes` for 

934 details. 

935 collections : `str`, `re.Pattern`, or iterable of these 

936 Expression identifying collections to be searched. See 

937 `queryCollections` for details. 

938 components : `bool`, optional 

939 If `True`, apply all expression patterns to component dataset type 

940 names as well. If `False`, never apply patterns to components. 

941 If `None` (default), apply patterns to components only if their 

942 parent datasets were not matched by the expression. 

943 Fully-specified component datasets (`str` or `DatasetType` 

944 instances) are always included. 

945 

946 Values other than `False` are deprecated, and only `False` will be 

947 supported after v26. After v27 this argument will be removed 

948 entirely. 

949 mode : `str`, optional 

950 The way in which datasets are being used in this query; one of: 

951 

952 - "find_first": this is a query for the first dataset in an 

953 ordered list of collections. Prohibits collection wildcards, 

954 but permits dataset type wildcards. 

955 

956 - "find_all": this is a query for all datasets in all matched 

957 collections. Permits collection and dataset type wildcards. 

958 

959 - "constrain": this is a query for something other than datasets, 

960 with results constrained by dataset existence. Permits 

961 collection wildcards and prohibits ``...`` as a dataset type 

962 wildcard. 

963 doomed_by : `list` [ `str` ] 

964 List to which messages are appended indicating why the query is

965 doomed to yield no results.

966 

967 Returns 

968 ------- 

969 composition : `dict` [ `DatasetType`, `list` [ `str` or `None` ] ]

970 Dictionary mapping parent dataset type to `list` of components 

971 matched for that dataset type (or `None` for the parent itself). 

972 collections : `CollectionWildcard` 

973 Processed collection expression. 

974 """ 

975 composition: dict[DatasetType, list[str | None]] = {} 

976 if datasets is not None: 

977 if not collections: 

978 if not self.defaults.collections: 

979 raise NoDefaultCollectionError("No collections, and no registry default collections.") 

980 collections = self.defaults.collections 

981 else: 

982 collections = CollectionWildcard.from_expression(collections) 

983 if mode == "find_first" and collections.patterns: 

984 raise TypeError( 

985 f"Collection pattern(s) {collections.patterns} not allowed in this context." 

986 ) 

987 missing: list[str] = [] 

988 composition = self._managers.datasets.resolve_wildcard( 

989 datasets, components=components, missing=missing, explicit_only=(mode == "constrain") 

990 ) 

991 if missing and mode == "constrain": 

992 # After v26 this should raise MissingDatasetTypeError, to be 

993 # implemented on DM-36303. 

994 warnings.warn( 

995 f"Dataset type(s) {missing} are not registered; this will be an error after v26.", 

996 FutureWarning, 

997 ) 

998 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing) 

999 elif collections: 

1000 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.") 

1001 return composition, collections 

1002 

1003 def queryDatasets( 

1004 self, 

1005 datasetType: Any, 

1006 *, 

1007 collections: Any = None, 

1008 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1009 dataId: Optional[DataId] = None, 

1010 where: Optional[str] = None, 

1011 findFirst: bool = False, 

1012 components: Optional[bool] = None, 

1013 bind: Optional[Mapping[str, Any]] = None, 

1014 check: bool = True, 

1015 **kwargs: Any, 

1016 ) -> queries.DatasetQueryResults: 

1017 # Docstring inherited from lsst.daf.butler.registry.Registry 

1018 doomed_by: list[str] = [] 

1019 data_id = self.expandDataId(dataId, **kwargs) 

1020 dataset_composition, collections = self._standardize_query_dataset_args( 

1021 datasetType, 

1022 collections, 

1023 components, 

1024 mode="find_first" if findFirst else "find_all", 

1025 doomed_by=doomed_by, 

1026 ) 

1027 parent_results: list[queries.ParentDatasetQueryResults] = [] 

1028 for parent_dataset_type, components_for_parent in dataset_composition.items(): 

1029 # The full set of dimensions in the query is the combination of 

1030 # those needed for the DatasetType and those explicitly requested, 

1031 # if any. 

1032 dimension_names = set(parent_dataset_type.dimensions.names) 

1033 if dimensions is not None: 

1034 dimension_names.update(self.dimensions.extract(dimensions).names) 

1035 # Construct the summary structure needed to construct a 

1036 # QueryBuilder. 

1037 summary = queries.QuerySummary( 

1038 requested=DimensionGraph(self.dimensions, names=dimension_names), 

1039 dataId=data_id, 

1040 expression=where, 

1041 bind=bind, 

1042 defaults=self.defaults.dataId, 

1043 check=check, 

1044 datasets=[parent_dataset_type], 

1045 ) 

1046 builder = self._makeQueryBuilder(summary) 

1047 # Add the dataset subquery to the query, telling the QueryBuilder 

1048 # to include the rank of the selected collection in the results 

1049 # only if we need to findFirst. Note that if any of the 

1050 # collections are actually wildcard expressions, and 

1051 # findFirst=True, this will raise TypeError for us. 

1052 builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst) 

1053 query = builder.finish() 

1054 parent_results.append( 

1055 queries.ParentDatasetQueryResults( 

1056 self._db, query, datasetType=parent_dataset_type, components=components_for_parent 

1057 ) 

1058 ) 

1059 if not parent_results: 

1060 doomed_by.extend( 

1061 f"No registered dataset type matching {t!r} found, so no matching datasets can " 

1062 "exist in any collection." 

1063 for t in ensure_iterable(datasetType) 

1064 ) 

1065 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by) 

1066 elif len(parent_results) == 1: 

1067 return parent_results[0] 

1068 else: 

1069 return queries.ChainedDatasetQueryResults(parent_results) 

1070 
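A sketch of typical `queryDatasets` calls, including a ``where`` expression with a bind parameter; the dataset type, collection, and values are illustrative:

    refs = registry.queryDatasets(
        "demo_catalog",
        collections="u/demo/run1",
        where="instrument = 'DemoCam' AND visit = my_visit",
        bind={"my_visit": 42},
        findFirst=True,
    )
    for ref in refs:
        print(ref.dataId, ref.run)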

1071 def queryDataIds( 

1072 self, 

1073 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], 

1074 *, 

1075 dataId: Optional[DataId] = None, 

1076 datasets: Any = None, 

1077 collections: Any = None, 

1078 where: Optional[str] = None, 

1079 components: Optional[bool] = None, 

1080 bind: Optional[Mapping[str, Any]] = None, 

1081 check: bool = True, 

1082 **kwargs: Any, 

1083 ) -> queries.DataCoordinateQueryResults: 

1084 # Docstring inherited from lsst.daf.butler.registry.Registry 

1085 dimensions = ensure_iterable(dimensions) 

1086 requestedDimensions = self.dimensions.extract(dimensions) 

1087 doomed_by: list[str] = [] 

1088 data_id = self.expandDataId(dataId, **kwargs) 

1089 dataset_composition, collections = self._standardize_query_dataset_args( 

1090 datasets, collections, components, doomed_by=doomed_by 

1091 ) 

1092 

1093 def query_factory( 

1094 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None 

1095 ) -> queries.Query: 

1096 """Construct the Query object that generates query results.""" 

1097 summary = queries.QuerySummary( 

1098 requested=requestedDimensions, 

1099 dataId=data_id, 

1100 expression=where, 

1101 bind=bind, 

1102 defaults=self.defaults.dataId, 

1103 check=check, 

1104 datasets=dataset_composition.keys(), 

1105 order_by=order_by, 

1106 limit=limit, 

1107 ) 

1108 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by) 

1109 for datasetType in dataset_composition: 

1110 builder.joinDataset(datasetType, collections, isResult=False) 

1111 return builder.finish() 

1112 

1113 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions) 

1114 
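A companion sketch for `queryDataIds`, constraining the returned data IDs by dataset existence in a collection (again with illustrative names):

    data_ids = registry.queryDataIds(
        ["visit", "detector"],
        datasets="demo_catalog",
        collections="u/demo/run1",
        where="instrument = 'DemoCam'",
    )
    for data_id in data_ids.expanded():
        print(data_id)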

1115 def queryDimensionRecords( 

1116 self, 

1117 element: Union[DimensionElement, str], 

1118 *, 

1119 dataId: Optional[DataId] = None, 

1120 datasets: Any = None, 

1121 collections: Any = None, 

1122 where: Optional[str] = None, 

1123 components: Optional[bool] = None, 

1124 bind: Optional[Mapping[str, Any]] = None, 

1125 check: bool = True, 

1126 **kwargs: Any, 

1127 ) -> queries.DimensionRecordQueryResults: 

1128 # Docstring inherited from lsst.daf.butler.registry.Registry 

1129 if not isinstance(element, DimensionElement): 

1130 try: 

1131 element = self.dimensions[element] 

1132 except KeyError as e: 

1133 raise DimensionNameError( 

1134 f"No such dimension '{element}', available dimensions: " 

1135 + str(self.dimensions.getStaticElements()) 

1136 ) from e 

1137 dataIds = self.queryDataIds( 

1138 element.graph, 

1139 dataId=dataId, 

1140 datasets=datasets, 

1141 collections=collections, 

1142 where=where, 

1143 components=components, 

1144 bind=bind, 

1145 check=check, 

1146 **kwargs, 

1147 ) 

1148 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element]) 

1149 
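And a short sketch for `queryDimensionRecords`, which returns the dimension records themselves rather than data IDs; the ``full_name`` field is only illustrative of what a detector record might carry:

    for record in registry.queryDimensionRecords("detector", where="instrument = 'DemoCam'"):
        print(record.full_name)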

1150 def queryDatasetAssociations( 

1151 self, 

1152 datasetType: Union[str, DatasetType], 

1153 collections: Any = ..., 

1154 *, 

1155 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1156 flattenChains: bool = False, 

1157 ) -> Iterator[DatasetAssociation]: 

1158 # Docstring inherited from lsst.daf.butler.registry.Registry 

1159 if collections is None: 

1160 if not self.defaults.collections: 

1161 raise NoDefaultCollectionError( 

1162 "No collections provided to findDataset, and no defaults from registry construction." 

1163 ) 

1164 collections = self.defaults.collections 

1165 collections = CollectionWildcard.from_expression(collections) 

1166 TimespanReprClass = self._db.getTimespanRepresentation() 

1167 if isinstance(datasetType, str): 

1168 storage = self._managers.datasets[datasetType] 

1169 else: 

1170 storage = self._managers.datasets[datasetType.name] 

1171 for collectionRecord in self._managers.collections.resolve_wildcard( 

1172 collections, 

1173 collection_types=frozenset(collectionTypes), 

1174 flatten_chains=flattenChains, 

1175 ): 

1176 query = storage.select(collectionRecord) 

1177 with self._db.query(query) as sql_result: 

1178 sql_mappings = sql_result.mappings().fetchall() 

1179 for row in sql_mappings: 

1180 dataId = DataCoordinate.fromRequiredValues( 

1181 storage.datasetType.dimensions, 

1182 tuple(row[name] for name in storage.datasetType.dimensions.required.names), 

1183 ) 

1184 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1185 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False) 

1186 if collectionRecord.type is CollectionType.CALIBRATION: 

1187 timespan = TimespanReprClass.extract(row) 

1188 else: 

1189 timespan = None 

1190 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1191 

1192 storageClasses: StorageClassFactory 

1193 """All storage classes known to the registry (`StorageClassFactory`). 

1194 """