Coverage for python/lsst/daf/butler/registries/sql.py: 13%

469 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("SqlRegistry",)

import contextlib
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union

import sqlalchemy
from lsst.resources import ResourcePathExpression
from lsst.utils.iteration import ensure_iterable

from ..core import (
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
    ddl,
)
from ..core.utils import transactional
from ..registry import (
    CollectionSearch,
    CollectionType,
    ConflictingDefinitionError,
    InconsistentDataIdError,
    OrphanedRecordError,
    Registry,
    RegistryConfig,
    RegistryDefaults,
    queries,
)
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
from ..registry.queries import Query
from ..registry.summaries import CollectionSummary
from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager


_LOG = logging.getLogger(__name__)


class SqlRegistry(Registry):
    """Registry implementation based on SQLAlchemy.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        All the managers required for this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    @classmethod
    def createFromConfig(
        cls,
        config: Optional[Union[RegistryConfig, str]] = None,
        dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
        butlerRoot: Optional[ResourcePathExpression] = None,
    ) -> Registry:
        """Create registry database and return `SqlRegistry` instance.

        This method initializes the database contents; the database must be
        empty prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration; if missing, default configuration will
            be loaded from registry.yaml.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration; if missing, default configuration
            will be loaded from dimensions.yaml.
        butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
            Path to the repository root this `SqlRegistry` will manage.

        Returns
        -------
        registry : `SqlRegistry`
            A new `SqlRegistry` instance.
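
        Examples
        --------
        A minimal sketch of creating a new repository database; the
        configuration paths shown are illustrative, not part of this API::

            registry = SqlRegistry.createFromConfig(
                "registry.yaml", dimensionConfig="dimensions.yaml", butlerRoot="/path/to/repo"
            )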

        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)

        if isinstance(dimensionConfig, str):
            dimensionConfig = DimensionConfig(dimensionConfig)
        elif dimensionConfig is None:
            dimensionConfig = DimensionConfig()
        elif not isinstance(dimensionConfig, DimensionConfig):
            raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")

        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.makeRepo(database, dimensionConfig)
        return cls(database, RegistryDefaults(), managers)

    @classmethod
    def fromConfig(
        cls,
        config: Union[ButlerConfig, RegistryConfig, Config, str],
        butlerRoot: Optional[ResourcePathExpression] = None,
        writeable: bool = True,
        defaults: Optional[RegistryDefaults] = None,
    ) -> Registry:
        """Create `Registry` subclass instance from `config`.

        Registry database must be initialized prior to calling this method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `lsst.resources.ResourcePathExpression`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output `~CollectionType.RUN`
            collection.

        Returns
        -------
        registry : `SqlRegistry` (subclass)
            A new `SqlRegistry` subclass instance.
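
        Examples
        --------
        A minimal sketch of connecting to an existing repository; the
        configuration path and the `RegistryDefaults` arguments shown are
        illustrative::

            defaults = RegistryDefaults(collections=["refcats", "HSC/raw/all"])
            registry = SqlRegistry.fromConfig("butler.yaml", defaults=defaults, writeable=False)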

        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString),
            origin=config.get("origin", 0),
            namespace=config.get("namespace"),
            writeable=writeable,
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.loadRepo(database)
        if defaults is None:
            defaults = RegistryDefaults()
        return cls(database, defaults, managers)

    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults. This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"SqlRegistry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return type(self)(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.dimensions.universe

    def refresh(self) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise

    def resetConnectionPool(self) -> None:
        """Reset SQLAlchemy connection pool for `SqlRegistry` database.

        This operation is useful when using registry with fork-based
        multiprocessing. To use registry across a fork boundary, one has to
        make sure that there are no currently active connections (no session
        or transaction is in progress) and that the connection pool is reset
        using this method. This method should be called by the child process
        immediately after the fork.
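
        Examples
        --------
        A minimal sketch of fork-based use, assuming ``registry`` is an
        existing `SqlRegistry`; the worker function and process setup are
        illustrative, not part of this API::

            import multiprocessing

            def worker(registry: SqlRegistry) -> None:
                # The child process must not reuse connections inherited from
                # the parent, so reset the pool before touching the database.
                registry.resetConnectionPool()
                registry.refresh()

            process = multiprocessing.Process(target=worker, args=(registry,))
            process.start()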

        """
        self._db._engine.dispose()

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved via
        `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
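
        Examples
        --------
        A minimal sketch, assuming ``spec`` is a `ddl.TableSpec` built
        elsewhere and ``registry`` is a writeable `SqlRegistry`; the table and
        column names are illustrative::

            registry.registerOpaqueTable("datastore_records", spec)
            registry.insertOpaqueData(
                "datastore_records", {"dataset_id": 1, "path": "relative/path.fits"}
            )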

        """
        self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that represents
            a single row to be added.
        """
        self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
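
        Examples
        --------
        A minimal sketch; the table and column names are illustrative::

            for row in registry.fetchOpaqueData("datastore_records", dataset_id=1):
                print(row["path"])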

        """
        yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._managers.opaque[tableName].delete(where.keys(), where)

    def registerCollection(
        self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
    ) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, type, doc=doc)
        return registered

    def getCollectionType(self, name: str) -> CollectionType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name).type

    def _get_collection_record(self, name: str) -> CollectionRecord:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name)

    def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
        return registered

    @transactional
    def removeCollection(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)

    def getCollectionParentChains(self, collection: str) -> Set[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return {
            record.name
            for record in self._managers.collections.getParentChains(
                self._managers.collections.find(collection).key
            )
        }

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets[name].datasetType

    def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.supportsIdGenerationMode(mode)

    def findDataset(
        self,
        datasetType: Union[DatasetType, str],
        dataId: Optional[DataId] = None,
        *,
        collections: Any = None,
        timespan: Optional[Timespan] = None,
        **kwargs: Any,
    ) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets[datasetType.name]
        else:
            storage = self._managers.datasets[datasetType]
        dataId = DataCoordinate.standardize(
            dataId,
            graph=storage.datasetType.dimensions,
            universe=self.dimensions,
            defaults=self.defaults.dataId,
            **kwargs,
        )
        if collections is None:
            if not self.defaults.collections:
                raise TypeError(
                    "No collections provided to findDataset, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        else:
            collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._managers.collections):
            if collectionRecord.type is CollectionType.CALIBRATION and (
                not storage.datasetType.isCalibration() or timespan is None
            ):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan)
            if result is not None:
                return result

        return None

    @transactional
    def insertDatasets(
        self,
        datasetType: Union[DatasetType, str],
        dataIds: Iterable[DataId],
        run: Optional[str] = None,
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise TypeError(
                    "No run provided to insertDatasets, and no default from registry construction."
                )
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
        assert isinstance(runRecord, RunRecord)
        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDataIds = [
                self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDataIds = [
                DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
            ]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs

    @transactional
    def _importDatasets(
        self,
        datasets: Iterable[DatasetRef],
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        datasets = list(datasets)
        if not datasets:
            # nothing to do
            return []

        # find dataset type
        datasetTypes = set(dataset.datasetType for dataset in datasets)
        if len(datasetTypes) != 1:
            raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
        datasetType = datasetTypes.pop()

        # get storage handler for this dataset type
        storage = self._managers.datasets.find(datasetType.name)
        if storage is None:
            raise LookupError(f"DatasetType '{datasetType}' has not been registered.")

        # find run name
        runs = set(dataset.run for dataset in datasets)
        if len(runs) != 1:
            raise ValueError(f"Multiple run names in input datasets: {runs}")
        run = runs.pop()
        if run is None:
            if self.defaults.run is None:
                raise TypeError(
                    "No run provided to ingestDatasets, and no default from registry construction."
                )
            run = self.defaults.run

        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(
                f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
                " RUN collection required."
            )
        assert isinstance(runRecord, RunRecord)

        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDatasets = [
                dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
                for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDatasets = [
                DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
                for dataset in datasets
            ]

        try:
            refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs

    def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError(
                    "One or more datasets is still present in one or more Datastores."
                ) from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(
                f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
            )
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.disassociate(collectionRecord, refsForType)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(
        self,
        collection: str,
        datasetType: Union[str, DatasetType],
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataId]] = None,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [
                DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
            ]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(
        self,
        dataId: Optional[DataId] = None,
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
        withDefaults: bool = True,
        **kwargs: Any,
    ) -> DataCoordinate:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        standardized = DataCoordinate.standardize(
            dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
        )
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise LookupError(f"No value or null value for required dimension {element.name}.")
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)

    def insertDimensionData(
        self,
        element: Union[DimensionElement, str],
        *data: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        replace: bool = False,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [
                row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
            ]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records, replace=replace)

    def syncDimensionData(
        self,
        element: Union[DimensionElement, str],
        row: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        update: bool = False,
    ) -> Union[bool, Dict[str, Any]]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record, update=update)

    def queryDatasetTypes(
        self,
        expression: Any = ...,
        *,
        components: Optional[bool] = None,
        missing: Optional[List[str]] = None,
    ) -> Iterator[DatasetType]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if wildcard is Ellipsis:
            for datasetType in self._managers.datasets:
                # The dataset type can no longer be a component
                yield datasetType
                if components:
                    # Automatically create the component dataset types
                    try:
                        componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
                    except KeyError as err:
                        _LOG.warning(
                            f"Could not load storage class {err} for {datasetType.name}; "
                            "if it has components they will not be included in query results."
                        )
                    else:
                        yield from componentsForDatasetType
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._managers.datasets.find(name)
            done.add(name)
            if storage is None:
                if missing is not None:
                    missing.append(name)
            else:
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # datasets that we might want to match, but only if their parents
            # didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._managers.datasets:
                # Components are not stored in registry so expand them here
                allDatasetTypes = [registeredDatasetType]
                try:
                    allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
                except KeyError as err:
                    _LOG.warning(
                        f"Could not load storage class {err} for {registeredDatasetType.name}; "
                        "if it has components they will not be included in query results."
                    )
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

    def queryCollections(
        self,
        expression: Any = ...,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
        includeChains: Optional[bool] = None,
    ) -> Iterator[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Right now the datasetType argument is completely ignored, but that
        # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
        # ticket will take care of that.
        query = CollectionQuery.fromExpression(expression)
        collectionTypes = ensure_iterable(collectionTypes)
        for record in query.iter(
            self._managers.collections,
            collectionTypes=frozenset(collectionTypes),
            flattenChains=flattenChains,
            includeChains=includeChains,
        ):
            yield record.name

    def _makeQueryBuilder(
        self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
    ) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.
        doomed_by : `Iterable` of `str`, optional
            A list of diagnostic messages that indicate why the query is going
            to yield no results and should not even be executed. If an empty
            container (default) the query will be executed unless other code
            determines that it is doomed.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced queries.
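
        Examples
        --------
        A minimal sketch mirroring the pattern used by `queryDatasets` below;
        the dimension names are illustrative and other `QuerySummary`
        arguments are assumed to be optional::

            summary = queries.QuerySummary(
                requested=DimensionGraph(self.dimensions, names={"instrument", "visit"}),
            )
            builder = self._makeQueryBuilder(summary)
            query = builder.finish()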

        """
        return queries.QueryBuilder(
            summary,
            queries.RegistryManagers(
                collections=self._managers.collections,
                dimensions=self._managers.dimensions,
                datasets=self._managers.datasets,
                TimespanReprClass=self._db.getTimespanRepresentation(),
            ),
            doomed_by=doomed_by,
        )

    def queryDatasets(
        self,
        datasetType: Any,
        *,
        collections: Any = None,
        dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
        dataId: Optional[DataId] = None,
        where: Optional[str] = None,
        findFirst: bool = False,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DatasetQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Standardize the collections expression.
        if collections is None:
            if not self.defaults.collections:
                raise TypeError(
                    "No collections provided to queryDatasets, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        elif findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwargs)

        # We can only query directly if given a non-component DatasetType
        # instance. If we were given an expression or str or a component
        # DatasetType instance, we'll populate this dict, recurse, and return.
        # If we already have a non-component DatasetType, it will remain None
        # and we'll run the query directly.
        composition: Optional[
            Dict[
                DatasetType,  # parent dataset type
                List[Optional[str]],  # component name, or None for parent
            ]
        ] = None
        if not isinstance(datasetType, DatasetType):
            # We were given a dataset type expression (which may be as simple
            # as a str). Loop over all matching datasets, delegating handling
            # of the `components` argument to queryDatasetTypes, as we populate
            # the composition dict.
            composition = defaultdict(list)
            for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
                parentName, componentName = trueDatasetType.nameAndComponent()
                if componentName is not None:
                    parentDatasetType = self.getDatasetType(parentName)
                    composition.setdefault(parentDatasetType, []).append(componentName)
                else:
                    composition.setdefault(trueDatasetType, []).append(None)
            if not composition:
                return queries.ChainedDatasetQueryResults(
                    [],
                    doomed_by=[
                        f"No registered dataset type matching {t!r} found."
                        for t in ensure_iterable(datasetType)
                    ],
                )
        elif datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # The composition dict will have exactly one item.
            parentName, componentName = datasetType.nameAndComponent()
            parentDatasetType = self.getDatasetType(parentName)
            composition = {parentDatasetType: [componentName]}
        if composition is not None:
            # We need to recurse. Do that once for each parent dataset type.
            chain = []
            for parentDatasetType, componentNames in composition.items():
                parentResults = self.queryDatasets(
                    parentDatasetType,
                    collections=collections,
                    dimensions=dimensions,
                    dataId=standardizedDataId,
                    where=where,
                    bind=bind,
                    findFirst=findFirst,
                    check=check,
                )
                assert isinstance(
                    parentResults, queries.ParentDatasetQueryResults
                ), "Should always be true if passing in a DatasetType instance, and we are."
                chain.append(parentResults.withComponents(componentNames))
            return queries.ChainedDatasetQueryResults(chain)
        # If we get here, there's no need to recurse (or we are already
        # recursing; there can only ever be one level of recursion).

        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
            datasets=[datasetType],
        )
        builder = self._makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if we
        # need to findFirst. Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
        query = builder.finish()
        return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)

    def queryDataIds(
        self,
        dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DataCoordinateQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        dimensions = ensure_iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwargs)
        standardizedDatasetTypes = set()
        requestedDimensions = self.dimensions.extract(dimensions)
        missing: List[str] = []
        if datasets is not None:
            if not collections:
                if not self.defaults.collections:
                    raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
                collections = self.defaults.collections
            else:
                # Preprocess collections expression in case the original
                # included single-pass iterators (we'll want to use it multiple
                # times below).
                collections = CollectionQuery.fromExpression(collections)
            for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
                # If any matched dataset type is a component, just operate on
                # its parent instead, because Registry doesn't know anything
                # about what components exist, and here (unlike queryDatasets)
                # we don't care about returning them.
                parentDatasetTypeName, componentName = datasetType.nameAndComponent()
                if componentName is not None:
                    datasetType = self.getDatasetType(parentDatasetTypeName)
                standardizedDatasetTypes.add(datasetType)
        elif collections:
            raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

        def query_factory(
            order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
        ) -> Query:
            """Construct the Query object that generates query results."""
            summary = queries.QuerySummary(
                requested=requestedDimensions,
                dataId=standardizedDataId,
                expression=where,
                bind=bind,
                defaults=self.defaults.dataId,
                check=check,
                datasets=standardizedDatasetTypes,
                order_by=order_by,
                limit=limit,
            )
            builder = self._makeQueryBuilder(
                summary, doomed_by=[f"Dataset type {name} is not registered." for name in missing]
            )
            for datasetType in standardizedDatasetTypes:
                builder.joinDataset(
                    datasetType,
                    collections,
                    isResult=False,
                )
            return builder.finish()

        return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)

    def queryDimensionRecords(
        self,
        element: Union[DimensionElement, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DimensionRecordQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not isinstance(element, DimensionElement):
            try:
                element = self.dimensions[element]
            except KeyError as e:
                raise KeyError(
                    f"No such dimension '{element}', available dimensions: "
                    + str(self.dimensions.getStaticElements())
                ) from e
        dataIds = self.queryDataIds(
            element.graph,
            dataId=dataId,
            datasets=datasets,
            collections=collections,
            where=where,
            components=components,
            bind=bind,
            check=check,
            **kwargs,
        )
        return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])

    def queryDatasetAssociations(
        self,
        datasetType: Union[str, DatasetType],
        collections: Any = ...,
        *,
        collectionTypes: Iterable[CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
    ) -> Iterator[DatasetAssociation]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if collections is None:
            if not self.defaults.collections:
                raise TypeError(
                    "No collections provided to queryDatasetAssociations, and no defaults from registry "
                    "construction."
                )
            collections = self.defaults.collections
        else:
            collections = CollectionQuery.fromExpression(collections)
        TimespanReprClass = self._db.getTimespanRepresentation()
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        for collectionRecord in collections.iter(
            self._managers.collections,
            collectionTypes=frozenset(collectionTypes),
            flattenChains=flattenChains,
        ):
            query = storage.select(collectionRecord)
            for row in self._db.query(query.combine()).mappings():
                dataId = DataCoordinate.fromRequiredValues(
                    storage.datasetType.dimensions,
                    tuple(row[name] for name in storage.datasetType.dimensions.required.names),
                )
                runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
                ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
                if collectionRecord.type is CollectionType.CALIBRATION:
                    timespan = TimespanReprClass.extract(row)
                else:
                    timespan = None
                yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """