
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "SqlRegistry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 TYPE_CHECKING, 

41 Union, 

42) 

43 

44import sqlalchemy 

45from lsst.utils.iteration import ensure_iterable 

46 

47from ..core import ( 

48 ButlerURI, 

49 Config, 

50 DataCoordinate, 

51 DataCoordinateIterable, 

52 DataId, 

53 DatasetAssociation, 

54 DatasetId, 

55 DatasetRef, 

56 DatasetType, 

57 ddl, 

58 Dimension, 

59 DimensionConfig, 

60 DimensionElement, 

61 DimensionGraph, 

62 DimensionRecord, 

63 DimensionUniverse, 

64 NamedKeyMapping, 

65 NameLookupMapping, 

66 Progress, 

67 StorageClassFactory, 

68 Timespan, 

69) 

70from ..core.utils import transactional 

71 

72from ..registry import ( 

73 Registry, 

74 RegistryConfig, 

75 CollectionType, 

76 RegistryDefaults, 

77 ConflictingDefinitionError, 

78 InconsistentDataIdError, 

79 OrphanedRecordError, 

80 CollectionSearch, 

81) 

82from ..registry import queries 

83from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis 

84from ..registry.summaries import CollectionSummary 

85from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances 

86from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord 

87 

88if TYPE_CHECKING: 

89 from .._butlerConfig import ButlerConfig 

90 from ..registry.interfaces import ( 

91 CollectionRecord, 

92 Database, 

93 DatastoreRegistryBridgeManager, 

94 ) 

95 

96 

97_LOG = logging.getLogger(__name__) 

98 

99 

100class SqlRegistry(Registry): 

101 """Registry implementation based on SQLAlchemy. 

102 

103 Parameters 

104 ---------- 

105 database : `Database` 

106 Database instance to store Registry. 

107 defaults : `RegistryDefaults` 

108 Default collection search path and/or output `~CollectionType.RUN` 

109 collection. 

110 managers : `RegistryManagerInstances` 

111 All the managers required for this registry. 

112 """ 

113 

114 defaultConfigFile: Optional[str] = None 

115 """Path to configuration defaults. Accessed within the ``configs`` resource 

116 or relative to a search path. Can be `None` if no defaults are specified. 

117 """ 

118 

119 @classmethod 

120 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

121 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

122 butlerRoot: Optional[str] = None) -> Registry: 

123 """Create registry database and return `SqlRegistry` instance. 

124 

125 This method initializes database contents; the database must be empty 

126 prior to calling this method. 

127 

128 Parameters 

129 ---------- 

130 config : `RegistryConfig` or `str`, optional 

131 Registry configuration; if missing, the default configuration is 

132 loaded from registry.yaml. 

133 dimensionConfig : `DimensionConfig` or `str`, optional 

134 Dimensions configuration; if missing, the default configuration is 

135 loaded from dimensions.yaml. 

136 butlerRoot : `str`, optional 

137 Path to the repository root this `SqlRegistry` will manage. 

138 

139 Returns 

140 ------- 

141 registry : `SqlRegistry` 

142 A new `SqlRegistry` instance. 

143 """ 

144 config = cls.forceRegistryConfig(config) 

145 config.replaceRoot(butlerRoot) 

146 

147 if isinstance(dimensionConfig, str): 

148 dimensionConfig = DimensionConfig(dimensionConfig) 

149 elif dimensionConfig is None: 

150 dimensionConfig = DimensionConfig() 

151 elif not isinstance(dimensionConfig, DimensionConfig): 

152 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

153 

154 DatabaseClass = config.getDatabaseClass() 

155 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

156 namespace=config.get("namespace")) 

157 managerTypes = RegistryManagerTypes.fromConfig(config) 

158 managers = managerTypes.makeRepo(database, dimensionConfig) 

159 return cls(database, RegistryDefaults(), managers) 
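    # Usage sketch (hypothetical names and paths, not part of the original
    # module): creating a brand-new registry database from configuration,
    # relying on the default registry.yaml/dimensions.yaml shipped with
    # daf_butler.
    #
    #     from lsst.daf.butler.registry import RegistryConfig
    #     config = RegistryConfig({"db": "sqlite:///repo/gen3.sqlite3"})
    #     registry = SqlRegistry.createFromConfig(config, butlerRoot="repo")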

160 

161 @classmethod 

162 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

163 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True, 

164 defaults: Optional[RegistryDefaults] = None) -> Registry: 

165 """Create `Registry` subclass instance from `config`. 

166 

167 The registry database must be initialized prior to calling this method. 

168 

169 Parameters 

170 ---------- 

171 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

172 Registry configuration. 

173 butlerRoot : `str` or `ButlerURI`, optional 

174 Path to the repository root this `Registry` will manage. 

175 writeable : `bool`, optional 

176 If `True` (default) create a read-write connection to the database. 

177 defaults : `RegistryDefaults`, optional 

178 Default collection search path and/or output `~CollectionType.RUN` 

179 collection. 

180 

181 Returns 

182 ------- 

183 registry : `SqlRegistry` (subclass) 

184 A new `SqlRegistry` subclass instance. 

185 """ 

186 config = cls.forceRegistryConfig(config) 

187 config.replaceRoot(butlerRoot) 

188 DatabaseClass = config.getDatabaseClass() 

189 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

190 namespace=config.get("namespace"), writeable=writeable) 

191 managerTypes = RegistryManagerTypes.fromConfig(config) 

192 managers = managerTypes.loadRepo(database) 

193 if defaults is None: 

194 defaults = RegistryDefaults() 

195 return cls(database, defaults, managers) 

196 

197 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

198 self._db = database 

199 self._managers = managers 

200 self.storageClasses = StorageClassFactory() 

201 # Intentionally invoke property setter to initialize defaults. This 

202 # can only be done after most of the rest of Registry has already been 

203 # initialized, and must be done before the property getter is used. 

204 self.defaults = defaults 

205 

206 def __str__(self) -> str: 

207 return str(self._db) 

208 

209 def __repr__(self) -> str: 

210 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

211 

212 def isWriteable(self) -> bool: 

213 # Docstring inherited from lsst.daf.butler.registry.Registry 

214 return self._db.isWriteable() 

215 

216 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

217 # Docstring inherited from lsst.daf.butler.registry.Registry 

218 if defaults is None: 

219 # No need to copy, because `RegistryDefaults` is immutable; we 

220 # effectively copy on write. 

221 defaults = self.defaults 

222 return type(self)(self._db, defaults, self._managers) 

223 

224 @property 

225 def dimensions(self) -> DimensionUniverse: 

226 # Docstring inherited from lsst.daf.butler.registry.Registry 

227 return self._managers.dimensions.universe 

228 

229 def refresh(self) -> None: 

230 # Docstring inherited from lsst.daf.butler.registry.Registry 

231 self._managers.refresh() 

232 

233 @contextlib.contextmanager 

234 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

235 # Docstring inherited from lsst.daf.butler.registry.Registry 

236 try: 

237 with self._db.transaction(savepoint=savepoint): 

238 yield 

239 except BaseException: 

240 # TODO: this clears the caches sometimes when we wouldn't actually 

241 # need to. Can we avoid that? 

242 self._managers.dimensions.clearCaches() 

243 raise 
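    # Usage sketch (hypothetical names): grouping several registry operations
    # so that they commit or roll back together.
    #
    #     with registry.transaction():
    #         registry.registerRun("my_run")
    #         registry.insertDimensionData("instrument", {"name": "DummyCam"})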

244 

245 def resetConnectionPool(self) -> None: 

246 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

247 

248 This operation is useful when using the registry with fork-based 

249 multiprocessing. To use the registry across a fork boundary, ensure 

250 that there are no currently active connections (no session or 

251 transaction in progress) and reset the connection pool with this 

252 method, which should be called by the child process immediately 

253 after the fork. 

254 """ 

255 self._db._engine.dispose() 

256 

257 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

258 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

259 other data repository client. 

260 

261 Opaque table records can be added via `insertOpaqueData`, retrieved via 

262 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

263 

264 Parameters 

265 ---------- 

266 tableName : `str` 

267 Logical name of the opaque table. This may differ from the 

268 actual name used in the database by a prefix and/or suffix. 

269 spec : `ddl.TableSpec` 

270 Specification for the table to be added. 

271 """ 

272 self._managers.opaque.register(tableName, spec) 

273 

274 @transactional 

275 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

276 """Insert records into an opaque table. 

277 

278 Parameters 

279 ---------- 

280 tableName : `str` 

281 Logical name of the opaque table. Must match the name used in a 

282 previous call to `registerOpaqueTable`. 

283 data 

284 Each additional positional argument is a dictionary that represents 

285 a single row to be added. 

286 """ 

287 self._managers.opaque[tableName].insert(*data) 

288 

289 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

290 """Retrieve records from an opaque table. 

291 

292 Parameters 

293 ---------- 

294 tableName : `str` 

295 Logical name of the opaque table. Must match the name used in a 

296 previous call to `registerOpaqueTable`. 

297 where 

298 Additional keyword arguments are interpreted as equality 

299 constraints that restrict the returned rows (combined with AND); 

300 keyword arguments are column names and values are the values they 

301 must have. 

302 

303 Yields 

304 ------ 

305 row : `dict` 

306 A dictionary representing a single result row. 

307 """ 

308 yield from self._managers.opaque[tableName].fetch(**where) 

309 

310 @transactional 

311 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

312 """Remove records from an opaque table. 

313 

314 Parameters 

315 ---------- 

316 tableName : `str` 

317 Logical name of the opaque table. Must match the name used in a 

318 previous call to `registerOpaqueTable`. 

319 where 

320 Additional keyword arguments are interpreted as equality 

321 constraints that restrict the deleted rows (combined with AND); 

322 keyword arguments are column names and values are the values they 

323 must have. 

324 """ 

325 self._managers.opaque[tableName].delete(where.keys(), where) 
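    # Usage sketch for the opaque-table API (hypothetical table name and
    # columns; the exact `ddl.FieldSpec` arguments should be checked against
    # the `ddl` module): register a table, insert a row, fetch it back, then
    # delete it.
    #
    #     spec = ddl.TableSpec(fields=[
    #         ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #         ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
    #     ])
    #     registry.registerOpaqueTable("my_datastore_records", spec)
    #     registry.insertOpaqueData("my_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("my_datastore_records", dataset_id=1)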

326 

327 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

328 doc: Optional[str] = None) -> bool: 

329 # Docstring inherited from lsst.daf.butler.registry.Registry 

330 _, registered = self._managers.collections.register(name, type, doc=doc) 

331 return registered 

332 

333 def getCollectionType(self, name: str) -> CollectionType: 

334 # Docstring inherited from lsst.daf.butler.registry.Registry 

335 return self._managers.collections.find(name).type 

336 

337 def _get_collection_record(self, name: str) -> CollectionRecord: 

338 # Docstring inherited from lsst.daf.butler.registry.Registry 

339 return self._managers.collections.find(name) 

340 

341 def registerRun(self, name: str, doc: Optional[str] = None) -> bool: 

342 # Docstring inherited from lsst.daf.butler.registry.Registry 

343 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

344 return registered 

345 

346 @transactional 

347 def removeCollection(self, name: str) -> None: 

348 # Docstring inherited from lsst.daf.butler.registry.Registry 

349 self._managers.collections.remove(name) 

350 

351 def getCollectionChain(self, parent: str) -> CollectionSearch: 

352 # Docstring inherited from lsst.daf.butler.registry.Registry 

353 record = self._managers.collections.find(parent) 

354 if record.type is not CollectionType.CHAINED: 

355 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

356 assert isinstance(record, ChainedCollectionRecord) 

357 return record.children 

358 

359 @transactional 

360 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

361 # Docstring inherited from lsst.daf.butler.registry.Registry 

362 record = self._managers.collections.find(parent) 

363 if record.type is not CollectionType.CHAINED: 

364 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

365 assert isinstance(record, ChainedCollectionRecord) 

366 children = CollectionSearch.fromExpression(children) 

367 if children != record.children or flatten: 

368 record.update(self._managers.collections, children, flatten=flatten) 
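    # Usage sketch (hypothetical collection names): building a CHAINED
    # collection that searches two RUN collections in order;
    # `getCollectionChain` then returns the children in search order.
    #
    #     registry.registerRun("HSC/runs/a")
    #     registry.registerRun("HSC/runs/b")
    #     registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
    #     registry.setCollectionChain("HSC/defaults", ["HSC/runs/a", "HSC/runs/b"])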

369 

370 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

371 # Docstring inherited from lsst.daf.butler.registry.Registry 

372 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

373 

374 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

375 # Docstring inherited from lsst.daf.butler.registry.Registry 

376 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

377 

378 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

379 # Docstring inherited from lsst.daf.butler.registry.Registry 

380 record = self._managers.collections.find(collection) 

381 return self._managers.datasets.getCollectionSummary(record) 

382 

383 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

384 # Docstring inherited from lsst.daf.butler.registry.Registry 

385 _, inserted = self._managers.datasets.register(datasetType) 

386 return inserted 

387 

388 def removeDatasetType(self, name: str) -> None: 

389 # Docstring inherited from lsst.daf.butler.registry.Registry 

390 self._managers.datasets.remove(name) 

391 

392 def getDatasetType(self, name: str) -> DatasetType: 

393 # Docstring inherited from lsst.daf.butler.registry.Registry 

394 return self._managers.datasets[name].datasetType 

395 

396 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: 

397 # Docstring inherited from lsst.daf.butler.registry.Registry 

398 return self._managers.datasets.supportsIdGenerationMode(mode) 

399 

400 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

401 collections: Any = None, timespan: Optional[Timespan] = None, 

402 **kwargs: Any) -> Optional[DatasetRef]: 

403 # Docstring inherited from lsst.daf.butler.registry.Registry 

404 if isinstance(datasetType, DatasetType): 

405 storage = self._managers.datasets[datasetType.name] 

406 else: 

407 storage = self._managers.datasets[datasetType] 

408 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

409 universe=self.dimensions, defaults=self.defaults.dataId, 

410 **kwargs) 

411 if collections is None: 

412 if not self.defaults.collections: 

413 raise TypeError("No collections provided to findDataset, " 

414 "and no defaults from registry construction.") 

415 collections = self.defaults.collections 

416 else: 

417 collections = CollectionSearch.fromExpression(collections) 

418 for collectionRecord in collections.iter(self._managers.collections): 

419 if (collectionRecord.type is CollectionType.CALIBRATION 

420 and (not storage.datasetType.isCalibration() or timespan is None)): 

421 continue 

422 result = storage.find(collectionRecord, dataId, timespan=timespan) 

423 if result is not None: 

424 return result 

425 

426 return None 
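    # Usage sketch (hypothetical dataset type, data ID keys, and collection):
    # looking up a single dataset by dataset type and data ID.
    #
    #     ref = registry.findDataset("flat", instrument="DummyCam", detector=1,
    #                                physical_filter="d-r", collections=["calib/run1"])
    #     if ref is None:
    #         print("no matching dataset in the given collections")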

427 

428 @transactional 

429 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

430 run: Optional[str] = None, expand: bool = True, 

431 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]: 

432 # Docstring inherited from lsst.daf.butler.registry.Registry 

433 if isinstance(datasetType, DatasetType): 

434 storage = self._managers.datasets.find(datasetType.name) 

435 if storage is None: 

436 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

437 else: 

438 storage = self._managers.datasets.find(datasetType) 

439 if storage is None: 

440 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

441 if run is None: 

442 if self.defaults.run is None: 

443 raise TypeError("No run provided to insertDatasets, " 

444 "and no default from registry construction.") 

445 run = self.defaults.run 

446 runRecord = self._managers.collections.find(run) 

447 if runRecord.type is not CollectionType.RUN: 

448 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

449 assert isinstance(runRecord, RunRecord) 

450 progress = Progress("lsst.daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

451 if expand: 

452 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

453 for dataId in progress.wrap(dataIds, 

454 f"Expanding {storage.datasetType.name} data IDs")] 

455 else: 

456 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) 

457 for dataId in dataIds] 

458 try: 

459 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

460 except sqlalchemy.exc.IntegrityError as err: 

461 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

462 f"one or more datasets of type {storage.datasetType} into " 

463 f"collection '{run}'. " 

464 f"This probably means a dataset with the same data ID " 

465 f"and dataset type already exists, but it may also mean a " 

466 f"dimension row is missing.") from err 

467 return refs 
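    # Usage sketch (hypothetical names, dimensions, and storage class; the
    # referenced dimension records must already exist in the repository for
    # the insert to succeed): registering a dataset type and inserting one
    # dataset into a RUN collection.
    #
    #     dt = DatasetType("flat", dimensions=["instrument", "detector", "physical_filter"],
    #                      storageClass="ExposureF", universe=registry.dimensions)
    #     registry.registerDatasetType(dt)
    #     registry.registerRun("calib/run1")
    #     (ref,) = registry.insertDatasets(
    #         dt, [{"instrument": "DummyCam", "detector": 1, "physical_filter": "d-r"}],
    #         run="calib/run1")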

468 

469 @transactional 

470 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True, 

471 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

472 reuseIds: bool = False) -> List[DatasetRef]: 

473 # Docstring inherited from lsst.daf.butler.registry.Registry 

474 datasets = list(datasets) 

475 if not datasets: 

476 # nothing to do 

477 return [] 

478 

479 # find dataset type 

480 datasetTypes = set(dataset.datasetType for dataset in datasets) 

481 if len(datasetTypes) != 1: 

482 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}") 

483 datasetType = datasetTypes.pop() 

484 

485 # get storage handler for this dataset type 

486 storage = self._managers.datasets.find(datasetType.name) 

487 if storage is None: 

488 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

489 

490 # find run name 

491 runs = set(dataset.run for dataset in datasets) 

492 if len(runs) != 1: 

493 raise ValueError(f"Multiple run names in input datasets: {runs}") 

494 run = runs.pop() 

495 if run is None: 

496 if self.defaults.run is None: 

497 raise TypeError("No run provided to _importDatasets, " 

498 "and no default from registry construction.") 

499 run = self.defaults.run 

500 

501 runRecord = self._managers.collections.find(run) 

502 if runRecord.type is not CollectionType.RUN: 

503 raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};" 

504 " RUN collection required.") 

505 assert isinstance(runRecord, RunRecord) 

506 

507 progress = Progress("lsst.daf.butler.Registry._importDatasets", level=logging.DEBUG) 

508 if expand: 

509 expandedDatasets = [ 

510 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

511 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")] 

512 else: 

513 expandedDatasets = [ 

514 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

515 for dataset in datasets 

516 ] 

517 

518 try: 

519 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

520 except sqlalchemy.exc.IntegrityError as err: 

521 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

522 f"one or more datasets of type {storage.datasetType} into " 

523 f"collection '{run}'. " 

524 f"This probably means a dataset with the same data ID " 

525 f"and dataset type already exists, but it may also mean a " 

526 f"dimension row is missing.") from err 

527 return refs 

528 

529 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

530 # Docstring inherited from lsst.daf.butler.registry.Registry 

531 return self._managers.datasets.getDatasetRef(id) 

532 

533 @transactional 

534 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

535 # Docstring inherited from lsst.daf.butler.registry.Registry 

536 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

537 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

538 desc="Removing datasets by type"): 

539 storage = self._managers.datasets[datasetType.name] 

540 try: 

541 storage.delete(refsForType) 

542 except sqlalchemy.exc.IntegrityError as err: 

543 raise OrphanedRecordError("One or more datasets is still " 

544 "present in one or more Datastores.") from err 

545 

546 @transactional 

547 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

548 # Docstring inherited from lsst.daf.butler.registry.Registry 

549 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

550 collectionRecord = self._managers.collections.find(collection) 

551 if collectionRecord.type is not CollectionType.TAGGED: 

552 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

553 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

554 desc="Associating datasets by type"): 

555 storage = self._managers.datasets[datasetType.name] 

556 try: 

557 storage.associate(collectionRecord, refsForType) 

558 except sqlalchemy.exc.IntegrityError as err: 

559 raise ConflictingDefinitionError( 

560 f"Constraint violation while associating dataset of type {datasetType.name} with " 

561 f"collection {collection}. This probably means that one or more datasets with the same " 

562 f"dataset type and data ID already exist in the collection, but it may also indicate " 

563 f"that the datasets do not exist." 

564 ) from err 

565 

566 @transactional 

567 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

568 # Docstring inherited from lsst.daf.butler.registry.Registry 

569 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

570 collectionRecord = self._managers.collections.find(collection) 

571 if collectionRecord.type is not CollectionType.TAGGED: 

572 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

573 "expected TAGGED.") 

574 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

575 desc="Disassociating datasets by type"): 

576 storage = self._managers.datasets[datasetType.name] 

577 storage.disassociate(collectionRecord, refsForType) 

578 

579 @transactional 

580 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

581 # Docstring inherited from lsst.daf.butler.registry.Registry 

582 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

583 collectionRecord = self._managers.collections.find(collection) 

584 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

585 desc="Certifying datasets by type"): 

586 storage = self._managers.datasets[datasetType.name] 

587 storage.certify(collectionRecord, refsForType, timespan) 

588 

589 @transactional 

590 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

591 dataIds: Optional[Iterable[DataId]] = None) -> None: 

592 # Docstring inherited from lsst.daf.butler.registry.Registry 

593 collectionRecord = self._managers.collections.find(collection) 

594 if isinstance(datasetType, str): 

595 storage = self._managers.datasets[datasetType] 

596 else: 

597 storage = self._managers.datasets[datasetType.name] 

598 standardizedDataIds = None 

599 if dataIds is not None: 

600 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

601 for d in dataIds] 

602 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

603 

604 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

605 """Return an object that allows a new `Datastore` instance to 

606 communicate with this `Registry`. 

607 

608 Returns 

609 ------- 

610 manager : `DatastoreRegistryBridgeManager` 

611 Object that mediates communication between this `Registry` and its 

612 associated datastores. 

613 """ 

614 return self._managers.datastores 

615 

616 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

617 # Docstring inherited from lsst.daf.butler.registry.Registry 

618 return self._managers.datastores.findDatastores(ref) 

619 

620 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

621 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

622 withDefaults: bool = True, 

623 **kwargs: Any) -> DataCoordinate: 

624 # Docstring inherited from lsst.daf.butler.registry.Registry 

625 if not withDefaults: 

626 defaults = None 

627 else: 

628 defaults = self.defaults.dataId 

629 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, 

630 defaults=defaults, **kwargs) 

631 if standardized.hasRecords(): 

632 return standardized 

633 if records is None: 

634 records = {} 

635 elif isinstance(records, NamedKeyMapping): 

636 records = records.byName() 

637 else: 

638 records = dict(records) 

639 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

640 records.update(dataId.records.byName()) 

641 keys = standardized.byName() 

642 for element in standardized.graph.primaryKeyTraversalOrder: 

643 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

644 if record is ...: 

645 if isinstance(element, Dimension) and keys.get(element.name) is None: 

646 if element in standardized.graph.required: 

647 raise LookupError( 

648 f"No value or null value for required dimension {element.name}." 

649 ) 

650 keys[element.name] = None 

651 record = None 

652 else: 

653 storage = self._managers.dimensions[element] 

654 dataIdSet = DataCoordinateIterable.fromScalar( 

655 DataCoordinate.standardize(keys, graph=element.graph) 

656 ) 

657 fetched = tuple(storage.fetch(dataIdSet)) 

658 try: 

659 (record,) = fetched 

660 except ValueError: 

661 record = None 

662 records[element.name] = record 

663 if record is not None: 

664 for d in element.implied: 

665 value = getattr(record, d.name) 

666 if keys.setdefault(d.name, value) != value: 

667 raise InconsistentDataIdError( 

668 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

669 f"but {element.name} implies {d.name}={value!r}." 

670 ) 

671 else: 

672 if element in standardized.graph.required: 

673 raise LookupError( 

674 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

675 ) 

676 if element.alwaysJoin: 

677 raise InconsistentDataIdError( 

678 f"Could not fetch record for element {element.name} via keys {keys}, ", 

679 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

680 "related." 

681 ) 

682 for d in element.implied: 

683 keys.setdefault(d.name, None) 

684 records.setdefault(d.name, None) 

685 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 
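    # Usage sketch (hypothetical keys and values): expanding a minimal data ID
    # so that implied dimension values and dimension records are attached.
    #
    #     dataId = registry.expandDataId(instrument="DummyCam", exposure=42)
    #     # dataId.hasRecords() is now True, and e.g. dataId.records["exposure"]
    #     # gives the full exposure record (or None if it is not in the database).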

686 

687 def insertDimensionData(self, element: Union[DimensionElement, str], 

688 *data: Union[Mapping[str, Any], DimensionRecord], 

689 conform: bool = True, 

690 replace: bool = False) -> None: 

691 # Docstring inherited from lsst.daf.butler.registry.Registry 

692 if conform: 

693 if isinstance(element, str): 

694 element = self.dimensions[element] 

695 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

696 for row in data] 

697 else: 

698 # Ignore typing since caller said to trust them with conform=False. 

699 records = data # type: ignore 

700 storage = self._managers.dimensions[element] # type: ignore 

701 storage.insert(*records, replace=replace) 

702 

703 def syncDimensionData(self, element: Union[DimensionElement, str], 

704 row: Union[Mapping[str, Any], DimensionRecord], 

705 conform: bool = True, 

706 update: bool = False) -> Union[bool, Dict[str, Any]]: 

707 # Docstring inherited from lsst.daf.butler.registry.Registry 

708 if conform: 

709 if isinstance(element, str): 

710 element = self.dimensions[element] 

711 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

712 else: 

713 # Ignore typing since caller said to trust them with conform=False. 

714 record = row # type: ignore 

715 storage = self._managers.dimensions[element] # type: ignore 

716 return storage.sync(record, update=update) 
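    # Usage sketch (field names follow the default dimension universe, values
    # are hypothetical, and some additional metadata columns may be required
    # depending on the universe): inserting and syncing dimension records.
    #
    #     registry.insertDimensionData("instrument", {"name": "DummyCam", "detector_max": 2})
    #     registry.insertDimensionData("detector", {"instrument": "DummyCam", "id": 1,
    #                                               "full_name": "det1"})
    #     # syncDimensionData is idempotent: it returns False (or an update
    #     # summary when update=True) if an equivalent record already exists.
    #     inserted = registry.syncDimensionData("instrument", {"name": "DummyCam",
    #                                                          "detector_max": 2})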

717 

718 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

719 ) -> Iterator[DatasetType]: 

720 # Docstring inherited from lsst.daf.butler.registry.Registry 

721 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

722 if wildcard is Ellipsis: 

723 for datasetType in self._managers.datasets: 

724 # The dataset type can no longer be a component 

725 yield datasetType 

726 if components: 

727 # Automatically create the component dataset types 

728 try: 

729 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

730 except KeyError as err: 

731 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

732 "if it has components they will not be included in query results.") 

733 else: 

734 yield from componentsForDatasetType 

735 return 

736 done: Set[str] = set() 

737 for name in wildcard.strings: 

738 storage = self._managers.datasets.find(name) 

739 if storage is not None: 

740 done.add(storage.datasetType.name) 

741 yield storage.datasetType 

742 if wildcard.patterns: 

743 # If components (the argument) is None, we'll save component 

744 # dataset types that we might want to match, but only if their parents 

745 # didn't get included. 

746 componentsForLater = [] 

747 for registeredDatasetType in self._managers.datasets: 

748 # Components are not stored in registry so expand them here 

749 allDatasetTypes = [registeredDatasetType] 

750 try: 

751 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

752 except KeyError as err: 

753 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

754 "if it has components they will not be included in query results.") 

755 for datasetType in allDatasetTypes: 

756 if datasetType.name in done: 

757 continue 

758 parentName, componentName = datasetType.nameAndComponent() 

759 if componentName is not None and not components: 

760 if components is None and parentName not in done: 

761 componentsForLater.append(datasetType) 

762 continue 

763 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

764 done.add(datasetType.name) 

765 yield datasetType 

766 # Go back and try to match saved components. 

767 for datasetType in componentsForLater: 

768 parentName, _ = datasetType.nameAndComponent() 

769 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

770 yield datasetType 

771 

772 def queryCollections(self, expression: Any = ..., 

773 datasetType: Optional[DatasetType] = None, 

774 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

775 flattenChains: bool = False, 

776 includeChains: Optional[bool] = None) -> Iterator[str]: 

777 # Docstring inherited from lsst.daf.butler.registry.Registry 

778 

779 # Right now the datasetType argument is completely ignored, but that 

780 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

781 # ticket will take care of that. 

782 query = CollectionQuery.fromExpression(expression) 

783 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes), 

784 flattenChains=flattenChains, includeChains=includeChains): 

785 yield record.name 
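    # Usage sketch (hypothetical patterns and collection-type filter): listing
    # registered dataset types and collections by regular-expression wildcard.
    #
    #     import re
    #     flats = list(registry.queryDatasetTypes(re.compile(r"flat.*")))
    #     runs = list(registry.queryCollections(re.compile(r"HSC/runs/.*"),
    #                                           collectionTypes={CollectionType.RUN}))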

786 

787 def _makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

788 """Return a `QueryBuilder` instance capable of constructing and 

789 managing more complex queries than those obtainable via `Registry` 

790 interfaces. 

791 

792 This is an advanced interface; downstream code should prefer 

793 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

794 are sufficient. 

795 

796 Parameters 

797 ---------- 

798 summary : `queries.QuerySummary` 

799 Object describing and categorizing the full set of dimensions that 

800 will be included in the query. 

801 

802 Returns 

803 ------- 

804 builder : `queries.QueryBuilder` 

805 Object that can be used to construct and perform advanced queries. 

806 """ 

807 return queries.QueryBuilder( 

808 summary, 

809 queries.RegistryManagers( 

810 collections=self._managers.collections, 

811 dimensions=self._managers.dimensions, 

812 datasets=self._managers.datasets, 

813 TimespanReprClass=self._db.getTimespanRepresentation(), 

814 ), 

815 ) 

816 

817 def queryDatasets(self, datasetType: Any, *, 

818 collections: Any = None, 

819 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

820 dataId: Optional[DataId] = None, 

821 where: Optional[str] = None, 

822 findFirst: bool = False, 

823 components: Optional[bool] = None, 

824 bind: Optional[Mapping[str, Any]] = None, 

825 check: bool = True, 

826 **kwargs: Any) -> queries.DatasetQueryResults: 

827 # Docstring inherited from lsst.daf.butler.registry.Registry 

828 

829 # Standardize the collections expression. 

830 if collections is None: 

831 if not self.defaults.collections: 

832 raise TypeError("No collections provided to queryDatasets, " 

833 "and no defaults from registry construction.") 

834 collections = self.defaults.collections 

835 elif findFirst: 

836 collections = CollectionSearch.fromExpression(collections) 

837 else: 

838 collections = CollectionQuery.fromExpression(collections) 

839 # Standardize and expand the data ID provided as a constraint. 

840 standardizedDataId = self.expandDataId(dataId, **kwargs) 

841 

842 # We can only query directly if given a non-component DatasetType 

843 # instance. If we were given an expression or str or a component 

844 # DatasetType instance, we'll populate this dict, recurse, and return. 

845 # If we already have a non-component DatasetType, it will remain None 

846 # and we'll run the query directly. 

847 composition: Optional[ 

848 Dict[ 

849 DatasetType, # parent dataset type 

850 List[Optional[str]] # component name, or None for parent 

851 ] 

852 ] = None 

853 if not isinstance(datasetType, DatasetType): 

854 # We were given a dataset type expression (which may be as simple 

855 # as a str). Loop over all matching datasets, delegating handling 

856 # of the `components` argument to queryDatasetTypes, as we populate 

857 # the composition dict. 

858 composition = defaultdict(list) 

859 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

860 parentName, componentName = trueDatasetType.nameAndComponent() 

861 if componentName is not None: 

862 parentDatasetType = self.getDatasetType(parentName) 

863 composition.setdefault(parentDatasetType, []).append(componentName) 

864 else: 

865 composition.setdefault(trueDatasetType, []).append(None) 

866 if not composition: 

867 return queries.ChainedDatasetQueryResults( 

868 [], 

869 doomed_by=[f"No registered dataset type matching {t!r} found." 

870 for t in ensure_iterable(datasetType)], 

871 ) 

872 elif datasetType.isComponent(): 

873 # We were given a true DatasetType instance, but it's a component. 

874 # the composition dict will have exactly one item. 

875 parentName, componentName = datasetType.nameAndComponent() 

876 parentDatasetType = self.getDatasetType(parentName) 

877 composition = {parentDatasetType: [componentName]} 

878 if composition is not None: 

879 # We need to recurse. Do that once for each parent dataset type. 

880 chain = [] 

881 for parentDatasetType, componentNames in composition.items(): 

882 parentResults = self.queryDatasets( 

883 parentDatasetType, 

884 collections=collections, 

885 dimensions=dimensions, 

886 dataId=standardizedDataId, 

887 where=where, 

888 bind=bind, 

889 findFirst=findFirst, 

890 check=check, 

891 ) 

892 assert isinstance(parentResults, queries.ParentDatasetQueryResults), \ 

893 "Should always be true if passing in a DatasetType instance, and we are." 

894 chain.append( 

895 parentResults.withComponents(componentNames) 

896 ) 

897 return queries.ChainedDatasetQueryResults(chain) 

898 # If we get here, there's no need to recurse (or we are already 

899 # recursing; there can only ever be one level of recursion). 

900 

901 # The full set of dimensions in the query is the combination of those 

902 # needed for the DatasetType and those explicitly requested, if any. 

903 requestedDimensionNames = set(datasetType.dimensions.names) 

904 if dimensions is not None: 

905 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

906 # Construct the summary structure needed to construct a QueryBuilder. 

907 summary = queries.QuerySummary( 

908 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

909 dataId=standardizedDataId, 

910 expression=where, 

911 bind=bind, 

912 defaults=self.defaults.dataId, 

913 check=check, 

914 ) 

915 builder = self._makeQueryBuilder(summary) 

916 # Add the dataset subquery to the query, telling the QueryBuilder to 

917 # include the rank of the selected collection in the results only if we 

918 # need to findFirst. Note that if any of the collections are 

919 # actually wildcard expressions, and we've asked for deduplication, 

920 # this will raise TypeError for us. 

921 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst) 

922 query = builder.finish() 

923 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType) 
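    # Usage sketch (hypothetical dataset type, collections, and where clause):
    # querying datasets with a data-ID constraint expressed in the user
    # expression language, returning only the first match per data ID.
    #
    #     refs = registry.queryDatasets(
    #         "flat",
    #         collections=["HSC/defaults"],
    #         where="instrument='DummyCam' AND detector=1",
    #         findFirst=True,
    #     )
    #     for ref in refs:
    #         print(ref.dataId, ref.run)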

924 

925 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

926 dataId: Optional[DataId] = None, 

927 datasets: Any = None, 

928 collections: Any = None, 

929 where: Optional[str] = None, 

930 components: Optional[bool] = None, 

931 bind: Optional[Mapping[str, Any]] = None, 

932 check: bool = True, 

933 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

934 # Docstring inherited from lsst.daf.butler.registry.Registry 

935 dimensions = ensure_iterable(dimensions) 

936 standardizedDataId = self.expandDataId(dataId, **kwargs) 

937 standardizedDatasetTypes = set() 

938 requestedDimensions = self.dimensions.extract(dimensions) 

939 queryDimensionNames = set(requestedDimensions.names) 

940 if datasets is not None: 

941 if not collections: 

942 if not self.defaults.collections: 

943 raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.") 

944 collections = self.defaults.collections 

945 else: 

946 # Preprocess collections expression in case the original 

947 # included single-pass iterators (we'll want to use it multiple 

948 # times below). 

949 collections = CollectionQuery.fromExpression(collections) 

950 for datasetType in self.queryDatasetTypes(datasets, components=components): 

951 queryDimensionNames.update(datasetType.dimensions.names) 

952 # If any matched dataset type is a component, just operate on 

953 # its parent instead, because Registry doesn't know anything 

954 # about what components exist, and here (unlike queryDatasets) 

955 # we don't care about returning them. 

956 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

957 if componentName is not None: 

958 datasetType = self.getDatasetType(parentDatasetTypeName) 

959 standardizedDatasetTypes.add(datasetType) 

960 elif collections: 

961 raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.") 

962 

963 summary = queries.QuerySummary( 

964 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

965 dataId=standardizedDataId, 

966 expression=where, 

967 bind=bind, 

968 defaults=self.defaults.dataId, 

969 check=check, 

970 ) 

971 builder = self._makeQueryBuilder(summary) 

972 for datasetType in standardizedDatasetTypes: 

973 builder.joinDataset(datasetType, collections, isResult=False) 

974 query = builder.finish() 

975 return queries.DataCoordinateQueryResults(self._db, query) 
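    # Usage sketch (hypothetical names): querying data IDs constrained by the
    # existence of datasets in some collections, then expanding the results to
    # include dimension records.
    #
    #     dataIds = registry.queryDataIds(
    #         ["exposure", "detector"],
    #         datasets="raw",
    #         collections=["HSC/raw/all"],
    #         where="exposure.observation_type = 'science'",
    #     )
    #     for dataId in dataIds.expanded():
    #         ...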

976 

977 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

978 dataId: Optional[DataId] = None, 

979 datasets: Any = None, 

980 collections: Any = None, 

981 where: Optional[str] = None, 

982 components: Optional[bool] = None, 

983 bind: Optional[Mapping[str, Any]] = None, 

984 check: bool = True, 

985 **kwargs: Any) -> Iterator[DimensionRecord]: 

986 # Docstring inherited from lsst.daf.butler.registry.Registry 

987 if not isinstance(element, DimensionElement): 

988 try: 

989 element = self.dimensions[element] 

990 except KeyError as e: 

991 raise KeyError(f"No such dimension '{element}', available dimensions: " 

992 + str(self.dimensions.getStaticElements())) from e 

993 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

994 where=where, components=components, bind=bind, check=check, **kwargs) 

995 return iter(self._managers.dimensions[element].fetch(dataIds)) 
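    # Usage sketch (hypothetical instrument name): fetching full dimension
    # records rather than bare data IDs.
    #
    #     for record in registry.queryDimensionRecords("detector", instrument="DummyCam"):
    #         print(record.id, record.full_name)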

996 

997 def queryDatasetAssociations( 

998 self, 

999 datasetType: Union[str, DatasetType], 

1000 collections: Any = ..., 

1001 *, 

1002 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1003 flattenChains: bool = False, 

1004 ) -> Iterator[DatasetAssociation]: 

1005 # Docstring inherited from lsst.daf.butler.registry.Registry 

1006 if collections is None: 

1007 if not self.defaults.collections: 

1008 raise TypeError("No collections provided to queryDatasetAssociations, " 

1009 "and no defaults from registry construction.") 

1010 collections = self.defaults.collections 

1011 else: 

1012 collections = CollectionQuery.fromExpression(collections) 

1013 TimespanReprClass = self._db.getTimespanRepresentation() 

1014 if isinstance(datasetType, str): 

1015 storage = self._managers.datasets[datasetType] 

1016 else: 

1017 storage = self._managers.datasets[datasetType.name] 

1018 for collectionRecord in collections.iter(self._managers.collections, 

1019 collectionTypes=frozenset(collectionTypes), 

1020 flattenChains=flattenChains): 

1021 query = storage.select(collectionRecord) 

1022 for row in self._db.query(query.combine()).mappings(): 

1023 dataId = DataCoordinate.fromRequiredValues( 

1024 storage.datasetType.dimensions, 

1025 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1026 ) 

1027 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1028 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1029 conform=False) 

1030 if collectionRecord.type is CollectionType.CALIBRATION: 

1031 timespan = TimespanReprClass.extract(row) 

1032 else: 

1033 timespan = None 

1034 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1035 

1036 storageClasses: StorageClassFactory 

1037 """All storage classes known to the registry (`StorageClassFactory`). 

1038 """