Coverage for python/lsst/daf/butler/registries/sql.py: 13%

471 statements  

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("SqlRegistry",)

import contextlib
import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Mapping, Optional, Set, Tuple, Union

import sqlalchemy
from lsst.resources import ResourcePathExpression
from lsst.utils.iteration import ensure_iterable

from ..core import (
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
    ddl,
)
from ..core.utils import transactional
from ..registry import (
    CollectionSearch,
    CollectionType,
    ConflictingDefinitionError,
    InconsistentDataIdError,
    OrphanedRecordError,
    Registry,
    RegistryConfig,
    RegistryDefaults,
    queries,
)
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
from ..registry.queries import Query
from ..registry.summaries import CollectionSummary
from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager


_LOG = logging.getLogger(__name__)


class SqlRegistry(Registry):
    """Registry implementation based on SQLAlchemy.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        All the managers required for this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    @classmethod
    def createFromConfig(
        cls,
        config: Optional[Union[RegistryConfig, str]] = None,
        dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
        butlerRoot: Optional[ResourcePathExpression] = None,
    ) -> Registry:
        """Create registry database and return `SqlRegistry` instance.

        This method initializes the database contents; the database must be
        empty prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration; if missing, the default configuration
            will be loaded from registry.yaml.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration; if missing, the default configuration
            will be loaded from dimensions.yaml.
        butlerRoot : convertible to `lsst.resources.ResourcePath`, optional
            Path to the repository root this `SqlRegistry` will manage.

        Returns
        -------
        registry : `SqlRegistry`
            A new `SqlRegistry` instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)

        if isinstance(dimensionConfig, str):
            dimensionConfig = DimensionConfig(config)
        elif dimensionConfig is None:
            dimensionConfig = DimensionConfig()
        elif not isinstance(dimensionConfig, DimensionConfig):
            raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")

        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace")
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.makeRepo(database, dimensionConfig)
        return cls(database, RegistryDefaults(), managers)

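    # Example (hypothetical sketch, not taken from this module): creating a
    # brand-new repository database from configuration. The file names
    # "registry.yaml"/"dimensions.yaml" and the root path "/tmp/repo" are
    # assumptions for illustration only.
    #
    #     registry = SqlRegistry.createFromConfig(
    #         "registry.yaml", dimensionConfig="dimensions.yaml", butlerRoot="/tmp/repo"
    #     )
    #     assert registry.isWriteable()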

    @classmethod
    def fromConfig(
        cls,
        config: Union[ButlerConfig, RegistryConfig, Config, str],
        butlerRoot: Optional[ResourcePathExpression] = None,
        writeable: bool = True,
        defaults: Optional[RegistryDefaults] = None,
    ) -> Registry:
        """Create `Registry` subclass instance from `config`.

        Registry database must be initialized prior to calling this method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `lsst.resources.ResourcePathExpression`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output `~CollectionType.RUN`
            collection.

        Returns
        -------
        registry : `SqlRegistry` (subclass)
            A new `SqlRegistry` subclass instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(
            str(config.connectionString),
            origin=config.get("origin", 0),
            namespace=config.get("namespace"),
            writeable=writeable,
        )
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.loadRepo(database)
        if defaults is None:
            defaults = RegistryDefaults()
        return cls(database, defaults, managers)

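    # Example (hypothetical sketch): connecting read-only to an existing
    # repository with a default collection search path. The collection name
    # "HSC/defaults" is an assumption for illustration.
    #
    #     registry = SqlRegistry.fromConfig(
    #         "registry.yaml",
    #         writeable=False,
    #         defaults=RegistryDefaults(collections=["HSC/defaults"]),
    #     )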

    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults. This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"SqlRegistry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return type(self)(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.dimensions.universe

    def refresh(self) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise

    def resetConnectionPool(self) -> None:
        """Reset SQLAlchemy connection pool for `SqlRegistry` database.

        This operation is useful when using the registry with fork-based
        multiprocessing. To use a registry across a fork boundary, make sure
        there are no currently active connections (no session or transaction
        in progress) and reset the connection pool with this method; it should
        be called by the child process immediately after the fork.
        """
        self._db._engine.dispose()

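    # Example (hypothetical sketch): using the registry with fork-based
    # multiprocessing. The ``do_work`` function and ``registry`` variable are
    # assumptions for illustration.
    #
    #     import os
    #
    #     pid = os.fork()
    #     if pid == 0:
    #         registry.resetConnectionPool()  # child resets the pool before any queries
    #         do_work(registry)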

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved via
        `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
        """
        self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that represents
            a single row to be added.
        """
        self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
        """
        yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._managers.opaque[tableName].delete(where.keys(), where)

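    # Example (hypothetical sketch): registering an opaque table and round-
    # tripping one record through it. The table name, column names, and the
    # exact TableSpec/FieldSpec arguments are assumptions for illustration.
    #
    #     spec = ddl.TableSpec(
    #         fields=[
    #             ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #             ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
    #         ]
    #     )
    #     registry.registerOpaqueTable("datastore_records", spec)
    #     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=1)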

    def registerCollection(
        self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None
    ) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, type, doc=doc)
        return registered

    def getCollectionType(self, name: str) -> CollectionType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name).type

    def _get_collection_record(self, name: str) -> CollectionRecord:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name)

    def registerRun(self, name: str, doc: Optional[str] = None) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc)
        return registered

    @transactional
    def removeCollection(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)

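    # Example (hypothetical sketch): building a CHAINED collection that
    # searches a RUN and a TAGGED collection in order. The collection names
    # are assumptions for illustration.
    #
    #     registry.registerRun("HSC/runs/w_2022_01")
    #     registry.registerCollection("HSC/tagged", CollectionType.TAGGED)
    #     registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
    #     registry.setCollectionChain("HSC/defaults", ["HSC/runs/w_2022_01", "HSC/tagged"])
    #     print(registry.getCollectionChain("HSC/defaults"))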

    def getCollectionParentChains(self, collection: str) -> Set[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return {
            record.name
            for record in self._managers.collections.getParentChains(
                self._managers.collections.find(collection).key
            )
        }

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets[name].datasetType

    def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.supportsIdGenerationMode(mode)

    def findDataset(
        self,
        datasetType: Union[DatasetType, str],
        dataId: Optional[DataId] = None,
        *,
        collections: Any = None,
        timespan: Optional[Timespan] = None,
        **kwargs: Any,
    ) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets[datasetType.name]
        else:
            storage = self._managers.datasets[datasetType]
        dataId = DataCoordinate.standardize(
            dataId,
            graph=storage.datasetType.dimensions,
            universe=self.dimensions,
            defaults=self.defaults.dataId,
            **kwargs,
        )
        if collections is None:
            if not self.defaults.collections:
                raise TypeError(
                    "No collections provided to findDataset, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        else:
            collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._managers.collections):
            if collectionRecord.type is CollectionType.CALIBRATION and (
                not storage.datasetType.isCalibration() or timespan is None
            ):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan)
            if result is not None:
                return result

        return None

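    # Example (hypothetical sketch): looking up a single dataset by dataset
    # type and data ID. The dataset type name, data ID keys, and collection
    # name are assumptions for illustration.
    #
    #     ref = registry.findDataset(
    #         "calexp",
    #         instrument="HSC",
    #         visit=903334,
    #         detector=42,
    #         collections=["HSC/runs/w_2022_01"],
    #     )
    #     if ref is not None:
    #         print(ref.id, ref.run)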

    @transactional
    def insertDatasets(
        self,
        datasetType: Union[DatasetType, str],
        dataIds: Iterable[DataId],
        run: Optional[str] = None,
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise TypeError(
                    "No run provided to insertDatasets, and no default from registry construction."
                )
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
        assert isinstance(runRecord, RunRecord)
        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDataIds = [
                self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDataIds = [
                DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds
            ]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs

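    # Example (hypothetical sketch): registering a dataset type and inserting
    # one dataset into a RUN collection. The dataset type, dimensions, storage
    # class, and data ID values are assumptions for illustration.
    #
    #     datasetType = DatasetType(
    #         "calexp",
    #         dimensions=["instrument", "visit", "detector"],
    #         storageClass="ExposureF",
    #         universe=registry.dimensions,
    #     )
    #     registry.registerDatasetType(datasetType)
    #     registry.registerRun("HSC/runs/w_2022_01")
    #     (ref,) = registry.insertDatasets(
    #         datasetType,
    #         [dict(instrument="HSC", visit=903334, detector=42)],
    #         run="HSC/runs/w_2022_01",
    #     )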

    @transactional
    def _importDatasets(
        self,
        datasets: Iterable[DatasetRef],
        expand: bool = True,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        datasets = list(datasets)
        if not datasets:
            # nothing to do
            return []

        # find dataset type
        datasetTypes = set(dataset.datasetType for dataset in datasets)
        if len(datasetTypes) != 1:
            raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
        datasetType = datasetTypes.pop()

        # get storage handler for this dataset type
        storage = self._managers.datasets.find(datasetType.name)
        if storage is None:
            raise LookupError(f"DatasetType '{datasetType}' has not been registered.")

        # find run name
        runs = set(dataset.run for dataset in datasets)
        if len(runs) != 1:
            raise ValueError(f"Multiple run names in input datasets: {runs}")
        run = runs.pop()
        if run is None:
            if self.defaults.run is None:
                raise TypeError(
                    "No run provided to _importDatasets, and no default from registry construction."
                )
            run = self.defaults.run

        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(
                f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
                " RUN collection required."
            )
        assert isinstance(runRecord, RunRecord)

        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDatasets = [
                dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
                for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")
            ]
        else:
            expandedDatasets = [
                DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
                for dataset in datasets
            ]

        try:
            refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"A database constraint failure was triggered by inserting "
                f"one or more datasets of type {storage.datasetType} into "
                f"collection '{run}'. "
                f"This probably means a dataset with the same data ID "
                f"and dataset type already exists, but it may also mean a "
                f"dimension row is missing."
            ) from err
        return refs

    def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Removing datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError(
                    "One or more datasets is still present in one or more Datastores."
                ) from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Associating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(
                f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED."
            )
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.disassociate(collectionRecord, refsForType)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(
            DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type"
        ):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(
        self,
        collection: str,
        datasetType: Union[str, DatasetType],
        timespan: Timespan,
        *,
        dataIds: Optional[Iterable[DataId]] = None,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [
                DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds
            ]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(
        self,
        dataId: Optional[DataId] = None,
        *,
        graph: Optional[DimensionGraph] = None,
        records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
        withDefaults: bool = True,
        **kwargs: Any,
    ) -> DataCoordinate:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        standardized = DataCoordinate.standardize(
            dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs
        )
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise LookupError(f"No value or null value for required dimension {element.name}.")
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)

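    # Example (hypothetical sketch): expanding a minimal data ID so that it
    # carries its dimension records. The data ID values are assumptions for
    # illustration.
    #
    #     dataId = registry.expandDataId(instrument="HSC", visit=903334, detector=42)
    #     print(dataId.records["detector"])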

    def insertDimensionData(
        self,
        element: Union[DimensionElement, str],
        *data: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        replace: bool = False,
    ) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [
                row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data
            ]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records, replace=replace)

    def syncDimensionData(
        self,
        element: Union[DimensionElement, str],
        row: Union[Mapping[str, Any], DimensionRecord],
        conform: bool = True,
        update: bool = False,
    ) -> Union[bool, Dict[str, Any]]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record, update=update)

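    # Example (hypothetical sketch): inserting and syncing dimension records.
    # The instrument name, detector values, and record field names are
    # assumptions for illustration (actual fields depend on the dimension
    # universe configuration).
    #
    #     registry.insertDimensionData("instrument", {"name": "HSC", "detector_max": 200})
    #     registry.syncDimensionData(
    #         "detector", {"instrument": "HSC", "id": 42, "full_name": "1_53"}, update=True
    #     )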

    def queryDatasetTypes(
        self,
        expression: Any = ...,
        *,
        components: Optional[bool] = None,
        missing: Optional[List[str]] = None,
    ) -> Iterator[DatasetType]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        unknownComponentsMessage = (
            "Could not find definition for storage class %s for dataset type %r;"
            " if it has components they will not be included in dataset type query results."
        )
        if wildcard is Ellipsis:
            for datasetType in self._managers.datasets:
                # The dataset type can no longer be a component
                yield datasetType
                if components:
                    # Automatically create the component dataset types
                    try:
                        componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
                    except KeyError as err:
                        _LOG.warning(unknownComponentsMessage, err, datasetType.name)
                    else:
                        yield from componentsForDatasetType
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._managers.datasets.find(name)
            done.add(name)
            if storage is None:
                if missing is not None:
                    missing.append(name)
            else:
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # dataset types that we might want to match, but only if their
            # parents didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._managers.datasets:
                # Components are not stored in registry so expand them here
                allDatasetTypes = [registeredDatasetType]
                if components is not False:
                    # Only check for the components if we are being asked
                    # for components or components is None.
                    try:
                        allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
                    except KeyError as err:
                        _LOG.warning(unknownComponentsMessage, err, registeredDatasetType.name)
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

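    # Example (hypothetical sketch): listing dataset types that match an exact
    # name or a compiled regular expression, and collecting names that are not
    # registered. The names and pattern are assumptions for illustration.
    #
    #     import re
    #
    #     missing: List[str] = []
    #     for dt in registry.queryDatasetTypes(["calexp", re.compile("deepCoadd_.*")], missing=missing):
    #         print(dt.name)
    #     print("not registered:", missing)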

    def queryCollections(
        self,
        expression: Any = ...,
        datasetType: Optional[DatasetType] = None,
        collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
        includeChains: Optional[bool] = None,
    ) -> Iterator[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Right now the datasetType argument is completely ignored, but that
        # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
        # ticket will take care of that.
        query = CollectionQuery.fromExpression(expression)
        collectionTypes = ensure_iterable(collectionTypes)
        for record in query.iter(
            self._managers.collections,
            collectionTypes=frozenset(collectionTypes),
            flattenChains=flattenChains,
            includeChains=includeChains,
        ):
            yield record.name

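    # Example (hypothetical sketch): listing all RUN collections, flattening
    # any chained collections that contain them.
    #
    #     for name in registry.queryCollections(
    #         collectionTypes={CollectionType.RUN}, flattenChains=True
    #     ):
    #         print(name)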

    def _makeQueryBuilder(
        self, summary: queries.QuerySummary, doomed_by: Iterable[str] = ()
    ) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.
        doomed_by : `Iterable` of `str`, optional
            A list of diagnostic messages that indicate why the query is going
            to yield no results and should not even be executed. If an empty
            container (default) the query will be executed unless other code
            determines that it is doomed.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced queries.
        """
        return queries.QueryBuilder(
            summary,
            queries.RegistryManagers(
                collections=self._managers.collections,
                dimensions=self._managers.dimensions,
                datasets=self._managers.datasets,
                TimespanReprClass=self._db.getTimespanRepresentation(),
            ),
            doomed_by=doomed_by,
        )

    def queryDatasets(
        self,
        datasetType: Any,
        *,
        collections: Any = None,
        dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
        dataId: Optional[DataId] = None,
        where: Optional[str] = None,
        findFirst: bool = False,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DatasetQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Standardize the collections expression.
        if collections is None:
            if not self.defaults.collections:
                raise TypeError(
                    "No collections provided to queryDatasets, and no defaults from registry construction."
                )
            collections = self.defaults.collections
        elif findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwargs)

        # We can only query directly if given a non-component DatasetType
        # instance. If we were given an expression or str or a component
        # DatasetType instance, we'll populate this dict, recurse, and return.
        # If we already have a non-component DatasetType, it will remain None
        # and we'll run the query directly.
        composition: Optional[
            Dict[
                DatasetType,  # parent dataset type
                List[Optional[str]],  # component name, or None for parent
            ]
        ] = None
        if not isinstance(datasetType, DatasetType):
            # We were given a dataset type expression (which may be as simple
            # as a str). Loop over all matching datasets, delegating handling
            # of the `components` argument to queryDatasetTypes, as we populate
            # the composition dict.
            composition = defaultdict(list)
            for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
                parentName, componentName = trueDatasetType.nameAndComponent()
                if componentName is not None:
                    parentDatasetType = self.getDatasetType(parentName)
                    composition.setdefault(parentDatasetType, []).append(componentName)
                else:
                    composition.setdefault(trueDatasetType, []).append(None)
            if not composition:
                return queries.ChainedDatasetQueryResults(
                    [],
                    doomed_by=[
                        f"No registered dataset type matching {t!r} found."
                        for t in ensure_iterable(datasetType)
                    ],
                )
        elif datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # The composition dict will have exactly one item.
            parentName, componentName = datasetType.nameAndComponent()
            parentDatasetType = self.getDatasetType(parentName)
            composition = {parentDatasetType: [componentName]}
        if composition is not None:
            # We need to recurse. Do that once for each parent dataset type.
            chain = []
            for parentDatasetType, componentNames in composition.items():
                parentResults = self.queryDatasets(
                    parentDatasetType,
                    collections=collections,
                    dimensions=dimensions,
                    dataId=standardizedDataId,
                    where=where,
                    bind=bind,
                    findFirst=findFirst,
                    check=check,
                )
                assert isinstance(
                    parentResults, queries.ParentDatasetQueryResults
                ), "Should always be true if passing in a DatasetType instance, and we are."
                chain.append(parentResults.withComponents(componentNames))
            return queries.ChainedDatasetQueryResults(chain)
        # If we get here, there's no need to recurse (or we are already
        # recursing; there can only ever be one level of recursion).

        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
            datasets=[datasetType],
        )
        builder = self._makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if we
        # need to findFirst. Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst)
        query = builder.finish()
        return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType)

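    # Example (hypothetical sketch): finding the newest "calexp" datasets for
    # one visit across a chained collection. The names and data ID values are
    # assumptions for illustration.
    #
    #     refs = registry.queryDatasets(
    #         "calexp",
    #         collections=["HSC/defaults"],
    #         where="instrument = 'HSC' AND visit = 903334",
    #         findFirst=True,
    #     )
    #     for ref in refs:
    #         print(ref.dataId, ref.run)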

    def queryDataIds(
        self,
        dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DataCoordinateQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        dimensions = ensure_iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwargs)
        standardizedDatasetTypes = set()
        requestedDimensions = self.dimensions.extract(dimensions)
        missing: List[str] = []
        if datasets is not None:
            if not collections:
                if not self.defaults.collections:
                    raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.")
                collections = self.defaults.collections
            else:
                # Preprocess collections expression in case the original
                # included single-pass iterators (we'll want to use it multiple
                # times below).
                collections = CollectionQuery.fromExpression(collections)
            for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing):
                # If any matched dataset type is a component, just operate on
                # its parent instead, because Registry doesn't know anything
                # about what components exist, and here (unlike queryDatasets)
                # we don't care about returning them.
                parentDatasetTypeName, componentName = datasetType.nameAndComponent()
                if componentName is not None:
                    datasetType = self.getDatasetType(parentDatasetTypeName)
                standardizedDatasetTypes.add(datasetType)
        elif collections:
            raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

        def query_factory(
            order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None
        ) -> Query:
            """Construct the Query object that generates query results."""
            summary = queries.QuerySummary(
                requested=requestedDimensions,
                dataId=standardizedDataId,
                expression=where,
                bind=bind,
                defaults=self.defaults.dataId,
                check=check,
                datasets=standardizedDatasetTypes,
                order_by=order_by,
                limit=limit,
            )
            builder = self._makeQueryBuilder(
                summary, doomed_by=[f"Dataset type {name} is not registered." for name in missing]
            )
            for datasetType in standardizedDatasetTypes:
                builder.joinDataset(
                    datasetType,
                    collections,
                    isResult=False,
                )
            return builder.finish()

        return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions)

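    # Example (hypothetical sketch): finding the visit+detector combinations
    # for which a "calexp" exists in a collection. Names and values are
    # assumptions for illustration.
    #
    #     dataIds = registry.queryDataIds(
    #         ["visit", "detector"],
    #         datasets="calexp",
    #         collections=["HSC/runs/w_2022_01"],
    #         where="instrument = 'HSC'",
    #     )
    #     for dataId in dataIds:
    #         print(dataId["visit"], dataId["detector"])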

    def queryDimensionRecords(
        self,
        element: Union[DimensionElement, str],
        *,
        dataId: Optional[DataId] = None,
        datasets: Any = None,
        collections: Any = None,
        where: Optional[str] = None,
        components: Optional[bool] = None,
        bind: Optional[Mapping[str, Any]] = None,
        check: bool = True,
        **kwargs: Any,
    ) -> queries.DimensionRecordQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not isinstance(element, DimensionElement):
            try:
                element = self.dimensions[element]
            except KeyError as e:
                raise KeyError(
                    f"No such dimension '{element}', available dimensions: "
                    + str(self.dimensions.getStaticElements())
                ) from e
        dataIds = self.queryDataIds(
            element.graph,
            dataId=dataId,
            datasets=datasets,
            collections=collections,
            where=where,
            components=components,
            bind=bind,
            check=check,
            **kwargs,
        )
        return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element])

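    # Example (hypothetical sketch): fetching dimension records for all
    # exposures of one instrument taken on a given observing day. The field
    # names used in the where clause and on the records are assumptions for
    # illustration (they depend on the dimension universe configuration).
    #
    #     records = registry.queryDimensionRecords(
    #         "exposure", where="instrument = 'HSC' AND exposure.day_obs = 20210101"
    #     )
    #     for record in records:
    #         print(record.id, record.observation_type)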

    def queryDatasetAssociations(
        self,
        datasetType: Union[str, DatasetType],
        collections: Any = ...,
        *,
        collectionTypes: Iterable[CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
    ) -> Iterator[DatasetAssociation]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if collections is None:
            if not self.defaults.collections:
                raise TypeError(
                    "No collections provided to queryDatasetAssociations, and no defaults from "
                    "registry construction."
                )
            collections = self.defaults.collections
        else:
            collections = CollectionQuery.fromExpression(collections)
        TimespanReprClass = self._db.getTimespanRepresentation()
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        for collectionRecord in collections.iter(
            self._managers.collections,
            collectionTypes=frozenset(collectionTypes),
            flattenChains=flattenChains,
        ):
            query = storage.select(collectionRecord)
            for row in self._db.query(query.combine()).mappings():
                dataId = DataCoordinate.fromRequiredValues(
                    storage.datasetType.dimensions,
                    tuple(row[name] for name in storage.datasetType.dimensions.required.names),
                )
                runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
                ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False)
                if collectionRecord.type is CollectionType.CALIBRATION:
                    timespan = TimespanReprClass.extract(row)
                else:
                    timespan = None
                yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """