Coverage for python/lsst/daf/butler/registries/sql.py: 13%


466 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "SqlRegistry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 TYPE_CHECKING, 

41 Union, 

42) 

43 

44import sqlalchemy 

45from lsst.utils.iteration import ensure_iterable 

46 

47from ..core import ( 

48 ButlerURI, 

49 Config, 

50 DataCoordinate, 

51 DataCoordinateIterable, 

52 DataId, 

53 DatasetAssociation, 

54 DatasetId, 

55 DatasetRef, 

56 DatasetType, 

57 ddl, 

58 Dimension, 

59 DimensionConfig, 

60 DimensionElement, 

61 DimensionGraph, 

62 DimensionRecord, 

63 DimensionUniverse, 

64 NamedKeyMapping, 

65 NameLookupMapping, 

66 Progress, 

67 StorageClassFactory, 

68 Timespan, 

69) 

70from ..core.utils import transactional 

71 

72from ..registry import ( 

73 Registry, 

74 RegistryConfig, 

75 CollectionType, 

76 RegistryDefaults, 

77 ConflictingDefinitionError, 

78 InconsistentDataIdError, 

79 OrphanedRecordError, 

80 CollectionSearch, 

81) 

82from ..registry import queries 

83 

84from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis 

85from ..registry.summaries import CollectionSummary 

86from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances 

87from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord 

88 

89if TYPE_CHECKING: 

90 from .._butlerConfig import ButlerConfig 

91 from ..registry.interfaces import ( 

92 CollectionRecord, 

93 Database, 

94 DatastoreRegistryBridgeManager, 

95 ) 

96 

97 

98_LOG = logging.getLogger(__name__) 

99 

100 

101class SqlRegistry(Registry): 

102 """Registry implementation based on SQLAlchemy. 

103 

104 Parameters 

105 ---------- 

106 database : `Database` 

107 Database instance used to store registry data. 

108 defaults : `RegistryDefaults` 

109 Default collection search path and/or output `~CollectionType.RUN` 

110 collection. 

111 managers : `RegistryManagerInstances` 

112 All the managers required for this registry. 

113 """ 

114 

115 defaultConfigFile: Optional[str] = None 

116 """Path to configuration defaults. Accessed within the ``configs`` resource 

117 or relative to a search path. Can be `None` if no defaults are specified. 

118 """ 

119 

120 @classmethod 

121 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

122 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

123 butlerRoot: Optional[str] = None) -> Registry: 

124 """Create registry database and return `SqlRegistry` instance. 

125 

126 This method initializes database contents; the database must be empty 

127 prior to calling this method. 

128 

129 Parameters 

130 ---------- 

131 config : `RegistryConfig` or `str`, optional 

132 Registry configuration. If missing, the default configuration will 

133 be loaded from registry.yaml. 

134 dimensionConfig : `DimensionConfig` or `str`, optional 

135 Dimension configuration. If missing, the default configuration 

136 will be loaded from dimensions.yaml. 

137 butlerRoot : `str`, optional 

138 Path to the repository root this `SqlRegistry` will manage. 

139 

140 Returns 

141 ------- 

142 registry : `SqlRegistry` 

143 A new `SqlRegistry` instance. 

144 """ 

145 config = cls.forceRegistryConfig(config) 

146 config.replaceRoot(butlerRoot) 

147 

148 if isinstance(dimensionConfig, str): 

149 dimensionConfig = DimensionConfig(dimensionConfig) 

150 elif dimensionConfig is None: 

151 dimensionConfig = DimensionConfig() 

152 elif not isinstance(dimensionConfig, DimensionConfig): 

153 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

154 

155 DatabaseClass = config.getDatabaseClass() 

156 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

157 namespace=config.get("namespace")) 

158 managerTypes = RegistryManagerTypes.fromConfig(config) 

159 managers = managerTypes.makeRepo(database, dimensionConfig) 

160 return cls(database, RegistryDefaults(), managers) 

161 
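# Example usage (illustrative sketch; the configuration file names and the
# repository root below are hypothetical):
#
#     config = RegistryConfig("registry.yaml")
#     registry = SqlRegistry.createFromConfig(
#         config, dimensionConfig="dimensions.yaml", butlerRoot="/path/to/new/repo")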

162 @classmethod 

163 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

164 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True, 

165 defaults: Optional[RegistryDefaults] = None) -> Registry: 

166 """Create `Registry` subclass instance from `config`. 

167 

168 Registry database must be initialized prior to calling this method. 

169 

170 Parameters 

171 ---------- 

172 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

173 Registry configuration. 

174 butlerRoot : `str` or `ButlerURI`, optional 

175 Path to the repository root this `Registry` will manage. 

176 writeable : `bool`, optional 

177 If `True` (default) create a read-write connection to the database. 

178 defaults : `RegistryDefaults`, optional 

179 Default collection search path and/or output `~CollectionType.RUN` 

180 collection. 

181 

182 Returns 

183 ------- 

184 registry : `SqlRegistry` (subclass) 

185 A new `SqlRegistry` subclass instance. 

186 """ 

187 config = cls.forceRegistryConfig(config) 

188 config.replaceRoot(butlerRoot) 

189 DatabaseClass = config.getDatabaseClass() 

190 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

191 namespace=config.get("namespace"), writeable=writeable) 

192 managerTypes = RegistryManagerTypes.fromConfig(config) 

193 managers = managerTypes.loadRepo(database) 

194 if defaults is None: 

195 defaults = RegistryDefaults() 

196 return cls(database, defaults, managers) 

197 
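# Example usage (illustrative sketch; the collection name is hypothetical, and
# it is assumed here that RegistryDefaults accepts ``collections`` and ``run``
# keyword arguments):
#
#     defaults = RegistryDefaults(collections=["HSC/defaults"], run=None)
#     registry = SqlRegistry.fromConfig("registry.yaml", writeable=False,
#                                       defaults=defaults)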

198 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

199 self._db = database 

200 self._managers = managers 

201 self.storageClasses = StorageClassFactory() 

202 # Intentionally invoke property setter to initialize defaults. This 

203 # can only be done after most of the rest of Registry has already been 

204 # initialized, and must be done before the property getter is used. 

205 self.defaults = defaults 

206 

207 def __str__(self) -> str: 

208 return str(self._db) 

209 

210 def __repr__(self) -> str: 

211 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

212 

213 def isWriteable(self) -> bool: 

214 # Docstring inherited from lsst.daf.butler.registry.Registry 

215 return self._db.isWriteable() 

216 

217 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

218 # Docstring inherited from lsst.daf.butler.registry.Registry 

219 if defaults is None: 

220 # No need to copy, because `RegistryDefaults` is immutable; we 

221 # effectively copy on write. 

222 defaults = self.defaults 

223 return type(self)(self._db, defaults, self._managers) 

224 

225 @property 

226 def dimensions(self) -> DimensionUniverse: 

227 # Docstring inherited from lsst.daf.butler.registry.Registry 

228 return self._managers.dimensions.universe 

229 

230 def refresh(self) -> None: 

231 # Docstring inherited from lsst.daf.butler.registry.Registry 

232 self._managers.refresh() 

233 

234 @contextlib.contextmanager 

235 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

236 # Docstring inherited from lsst.daf.butler.registry.Registry 

237 try: 

238 with self._db.transaction(savepoint=savepoint): 

239 yield 

240 except BaseException: 

241 # TODO: this clears the caches sometimes when we wouldn't actually 

242 # need to. Can we avoid that? 

243 self._managers.dimensions.clearCaches() 

244 raise 

245 

246 def resetConnectionPool(self) -> None: 

247 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

248 

249 This operation is useful when using the registry with fork-based 

250 multiprocessing. To use the registry across a fork boundary, ensure 

251 that there are no currently active connections (no session or 

252 transaction in progress) and reset the connection pool with this 

253 method. The child process should call it immediately 

254 after the fork. 

255 """ 

256 self._db._engine.dispose() 

257 
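# Example usage (illustrative sketch of the fork-based multiprocessing pattern
# described above; the worker function is hypothetical):
#
#     import multiprocessing
#
#     def worker(registry):
#         registry.resetConnectionPool()  # drop connections inherited from the parent
#         ...                             # then use the registry normally
#
#     proc = multiprocessing.Process(target=worker, args=(registry,))
#     proc.start()
#     proc.join()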

258 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

259 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

260 other data repository client. 

261 

262 Opaque table records can be added via `insertOpaqueData`, retrieved via 

263 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

264 

265 Parameters 

266 ---------- 

267 tableName : `str` 

268 Logical name of the opaque table. This may differ from the 

269 actual name used in the database by a prefix and/or suffix. 

270 spec : `ddl.TableSpec` 

271 Specification for the table to be added. 

272 """ 

273 self._managers.opaque.register(tableName, spec) 

274 

275 @transactional 

276 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

277 """Insert records into an opaque table. 

278 

279 Parameters 

280 ---------- 

281 tableName : `str` 

282 Logical name of the opaque table. Must match the name used in a 

283 previous call to `registerOpaqueTable`. 

284 data 

285 Each additional positional argument is a dictionary that represents 

286 a single row to be added. 

287 """ 

288 self._managers.opaque[tableName].insert(*data) 

289 

290 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

291 """Retrieve records from an opaque table. 

292 

293 Parameters 

294 ---------- 

295 tableName : `str` 

296 Logical name of the opaque table. Must match the name used in a 

297 previous call to `registerOpaqueTable`. 

298 where 

299 Additional keyword arguments are interpreted as equality 

300 constraints that restrict the returned rows (combined with AND); 

301 keyword arguments are column names and values are the values they 

302 must have. 

303 

304 Yields 

305 ------ 

306 row : `dict` 

307 A dictionary representing a single result row. 

308 """ 

309 yield from self._managers.opaque[tableName].fetch(**where) 

310 

311 @transactional 

312 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

313 """Remove records from an opaque table. 

314 

315 Parameters 

316 ---------- 

317 tableName : `str` 

318 Logical name of the opaque table. Must match the name used in a 

319 previous call to `registerOpaqueTable`. 

320 where 

321 Additional keyword arguments are interpreted as equality 

322 constraints that restrict the deleted rows (combined with AND); 

323 keyword arguments are column names and values are the values they 

324 must have. 

325 """ 

326 self._managers.opaque[tableName].delete(where.keys(), where) 

327 
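# Example usage (illustrative sketch of the opaque-table round trip; the table
# name, column specification, and row values are hypothetical):
#
#     spec = ddl.TableSpec(fields=[
#         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
#         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
#     ])
#     registry.registerOpaqueTable("example_datastore_records", spec)
#     registry.insertOpaqueData("example_datastore_records",
#                               {"dataset_id": 1, "path": "a/b.fits"})
#     rows = list(registry.fetchOpaqueData("example_datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("example_datastore_records", dataset_id=1)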

328 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

329 doc: Optional[str] = None) -> bool: 

330 # Docstring inherited from lsst.daf.butler.registry.Registry 

331 _, registered = self._managers.collections.register(name, type, doc=doc) 

332 return registered 

333 

334 def getCollectionType(self, name: str) -> CollectionType: 

335 # Docstring inherited from lsst.daf.butler.registry.Registry 

336 return self._managers.collections.find(name).type 

337 

338 def _get_collection_record(self, name: str) -> CollectionRecord: 

339 # Docstring inherited from lsst.daf.butler.registry.Registry 

340 return self._managers.collections.find(name) 

341 

342 def registerRun(self, name: str, doc: Optional[str] = None) -> bool: 

343 # Docstring inherited from lsst.daf.butler.registry.Registry 

344 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

345 return registered 

346 

347 @transactional 

348 def removeCollection(self, name: str) -> None: 

349 # Docstring inherited from lsst.daf.butler.registry.Registry 

350 self._managers.collections.remove(name) 

351 

352 def getCollectionChain(self, parent: str) -> CollectionSearch: 

353 # Docstring inherited from lsst.daf.butler.registry.Registry 

354 record = self._managers.collections.find(parent) 

355 if record.type is not CollectionType.CHAINED: 

356 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

357 assert isinstance(record, ChainedCollectionRecord) 

358 return record.children 

359 

360 @transactional 

361 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

362 # Docstring inherited from lsst.daf.butler.registry.Registry 

363 record = self._managers.collections.find(parent) 

364 if record.type is not CollectionType.CHAINED: 

365 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

366 assert isinstance(record, ChainedCollectionRecord) 

367 children = CollectionSearch.fromExpression(children) 

368 if children != record.children or flatten: 

369 record.update(self._managers.collections, children, flatten=flatten) 

370 
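# Example usage (illustrative sketch; the collection names are hypothetical):
#
#     registry.registerRun("HSC/runs/a")
#     registry.registerRun("HSC/runs/b")
#     registry.registerCollection("HSC/chain", CollectionType.CHAINED)
#     registry.setCollectionChain("HSC/chain", ["HSC/runs/a", "HSC/runs/b"])
#     print(registry.getCollectionChain("HSC/chain"))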

371 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

372 # Docstring inherited from lsst.daf.butler.registry.Registry 

373 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

374 

375 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

376 # Docstring inherited from lsst.daf.butler.registry.Registry 

377 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

378 

379 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

380 # Docstring inherited from lsst.daf.butler.registry.Registry 

381 record = self._managers.collections.find(collection) 

382 return self._managers.datasets.getCollectionSummary(record) 

383 

384 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

385 # Docstring inherited from lsst.daf.butler.registry.Registry 

386 _, inserted = self._managers.datasets.register(datasetType) 

387 return inserted 

388 

389 def removeDatasetType(self, name: str) -> None: 

390 # Docstring inherited from lsst.daf.butler.registry.Registry 

391 self._managers.datasets.remove(name) 

392 

393 def getDatasetType(self, name: str) -> DatasetType: 

394 # Docstring inherited from lsst.daf.butler.registry.Registry 

395 return self._managers.datasets[name].datasetType 

396 

397 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: 

398 # Docstring inherited from lsst.daf.butler.registry.Registry 

399 return self._managers.datasets.supportsIdGenerationMode(mode) 

400 

401 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

402 collections: Any = None, timespan: Optional[Timespan] = None, 

403 **kwargs: Any) -> Optional[DatasetRef]: 

404 # Docstring inherited from lsst.daf.butler.registry.Registry 

405 if isinstance(datasetType, DatasetType): 

406 storage = self._managers.datasets[datasetType.name] 

407 else: 

408 storage = self._managers.datasets[datasetType] 

409 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

410 universe=self.dimensions, defaults=self.defaults.dataId, 

411 **kwargs) 

412 if collections is None: 

413 if not self.defaults.collections: 

414 raise TypeError("No collections provided to findDataset, " 

415 "and no defaults from registry construction.") 

416 collections = self.defaults.collections 

417 else: 

418 collections = CollectionSearch.fromExpression(collections) 

419 for collectionRecord in collections.iter(self._managers.collections): 

420 if (collectionRecord.type is CollectionType.CALIBRATION 

421 and (not storage.datasetType.isCalibration() or timespan is None)): 

422 continue 

423 result = storage.find(collectionRecord, dataId, timespan=timespan) 

424 if result is not None: 

425 return result 

426 

427 return None 

428 
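# Example usage (illustrative sketch; the dataset type, data ID values, and
# collection name are hypothetical):
#
#     ref = registry.findDataset("raw", instrument="HSC", exposure=903334,
#                                detector=10, collections=["HSC/raw/all"])
#     if ref is not None:
#         print(ref.id, ref.run)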

429 @transactional 

430 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

431 run: Optional[str] = None, expand: bool = True, 

432 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]: 

433 # Docstring inherited from lsst.daf.butler.registry.Registry 

434 if isinstance(datasetType, DatasetType): 

435 storage = self._managers.datasets.find(datasetType.name) 

436 if storage is None: 

437 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

438 else: 

439 storage = self._managers.datasets.find(datasetType) 

440 if storage is None: 

441 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

442 if run is None: 

443 if self.defaults.run is None: 

444 raise TypeError("No run provided to insertDatasets, " 

445 "and no default from registry construction.") 

446 run = self.defaults.run 

447 runRecord = self._managers.collections.find(run) 

448 if runRecord.type is not CollectionType.RUN: 

449 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

450 assert isinstance(runRecord, RunRecord) 

451 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

452 if expand: 

453 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

454 for dataId in progress.wrap(dataIds, 

455 f"Expanding {storage.datasetType.name} data IDs")] 

456 else: 

457 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) 

458 for dataId in dataIds] 

459 try: 

460 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

461 except sqlalchemy.exc.IntegrityError as err: 

462 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

463 f"one or more datasets of type {storage.datasetType} into " 

464 f"collection '{run}'. " 

465 f"This probably means a dataset with the same data ID " 

466 f"and dataset type already exists, but it may also mean a " 

467 f"dimension row is missing.") from err 

468 return refs 

469 
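# Example usage (illustrative sketch; the dataset type, storage class, run name,
# and data ID values are hypothetical):
#
#     datasetType = DatasetType("sourceTable", dimensions=["instrument", "visit"],
#                               storageClass="DataFrame", universe=registry.dimensions)
#     registry.registerDatasetType(datasetType)
#     registry.registerRun("u/example/run")
#     (ref,) = registry.insertDatasets(datasetType,
#                                      [{"instrument": "HSC", "visit": 903334}],
#                                      run="u/example/run")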

470 @transactional 

471 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True, 

472 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

473 reuseIds: bool = False) -> List[DatasetRef]: 

474 # Docstring inherited from lsst.daf.butler.registry.Registry 

475 datasets = list(datasets) 

476 if not datasets: 

477 # nothing to do 

478 return [] 

479 

480 # find dataset type 

481 datasetTypes = set(dataset.datasetType for dataset in datasets) 

482 if len(datasetTypes) != 1: 

483 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}") 

484 datasetType = datasetTypes.pop() 

485 

486 # get storage handler for this dataset type 

487 storage = self._managers.datasets.find(datasetType.name) 

488 if storage is None: 

489 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

490 

491 # find run name 

492 runs = set(dataset.run for dataset in datasets) 

493 if len(runs) != 1: 

494 raise ValueError(f"Multiple run names in input datasets: {runs}") 

495 run = runs.pop() 

496 if run is None: 

497 if self.defaults.run is None: 

498 raise TypeError("No run provided to ingestDatasets, " 

499 "and no default from registry construction.") 

500 run = self.defaults.run 

501 

502 runRecord = self._managers.collections.find(run) 

503 if runRecord.type is not CollectionType.RUN: 

504 raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};" 

505 " RUN collection required.") 

506 assert isinstance(runRecord, RunRecord) 

507 

508 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

509 if expand: 

510 expandedDatasets = [ 

511 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

512 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")] 

513 else: 

514 expandedDatasets = [ 

515 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

516 for dataset in datasets 

517 ] 

518 

519 try: 

520 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

521 except sqlalchemy.exc.IntegrityError as err: 

522 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

523 f"one or more datasets of type {storage.datasetType} into " 

524 f"collection '{run}'. " 

525 f"This probably means a dataset with the same data ID " 

526 f"and dataset type already exists, but it may also mean a " 

527 f"dimension row is missing.") from err 

528 return refs 

529 

530 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

531 # Docstring inherited from lsst.daf.butler.registry.Registry 

532 return self._managers.datasets.getDatasetRef(id) 

533 

534 @transactional 

535 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

536 # Docstring inherited from lsst.daf.butler.registry.Registry 

537 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

538 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

539 desc="Removing datasets by type"): 

540 storage = self._managers.datasets[datasetType.name] 

541 try: 

542 storage.delete(refsForType) 

543 except sqlalchemy.exc.IntegrityError as err: 

544 raise OrphanedRecordError("One or more datasets is still " 

545 "present in one or more Datastores.") from err 

546 

547 @transactional 

548 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

549 # Docstring inherited from lsst.daf.butler.registry.Registry 

550 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

551 collectionRecord = self._managers.collections.find(collection) 

552 if collectionRecord.type is not CollectionType.TAGGED: 

553 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

554 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

555 desc="Associating datasets by type"): 

556 storage = self._managers.datasets[datasetType.name] 

557 try: 

558 storage.associate(collectionRecord, refsForType) 

559 except sqlalchemy.exc.IntegrityError as err: 

560 raise ConflictingDefinitionError( 

561 f"Constraint violation while associating dataset of type {datasetType.name} with " 

562 f"collection {collection}. This probably means that one or more datasets with the same " 

563 f"dataset type and data ID already exist in the collection, but it may also indicate " 

564 f"that the datasets do not exist." 

565 ) from err 

566 

567 @transactional 

568 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

569 # Docstring inherited from lsst.daf.butler.registry.Registry 

570 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

571 collectionRecord = self._managers.collections.find(collection) 

572 if collectionRecord.type is not CollectionType.TAGGED: 

573 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

574 "expected TAGGED.") 

575 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

576 desc="Disassociating datasets by type"): 

577 storage = self._managers.datasets[datasetType.name] 

578 storage.disassociate(collectionRecord, refsForType) 

579 

580 @transactional 

581 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

582 # Docstring inherited from lsst.daf.butler.registry.Registry 

583 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

584 collectionRecord = self._managers.collections.find(collection) 

585 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

586 desc="Certifying datasets by type"): 

587 storage = self._managers.datasets[datasetType.name] 

588 storage.certify(collectionRecord, refsForType, timespan) 

589 

590 @transactional 

591 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

592 dataIds: Optional[Iterable[DataId]] = None) -> None: 

593 # Docstring inherited from lsst.daf.butler.registry.Registry 

594 collectionRecord = self._managers.collections.find(collection) 

595 if isinstance(datasetType, str): 

596 storage = self._managers.datasets[datasetType] 

597 else: 

598 storage = self._managers.datasets[datasetType.name] 

599 standardizedDataIds = None 

600 if dataIds is not None: 

601 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

602 for d in dataIds] 

603 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

604 

605 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

606 """Return an object that allows a new `Datastore` instance to 

607 communicate with this `Registry`. 

608 

609 Returns 

610 ------- 

611 manager : `DatastoreRegistryBridgeManager` 

612 Object that mediates communication between this `Registry` and its 

613 associated datastores. 

614 """ 

615 return self._managers.datastores 

616 

617 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

618 # Docstring inherited from lsst.daf.butler.registry.Registry 

619 return self._managers.datastores.findDatastores(ref) 

620 

621 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

622 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

623 withDefaults: bool = True, 

624 **kwargs: Any) -> DataCoordinate: 

625 # Docstring inherited from lsst.daf.butler.registry.Registry 

626 if not withDefaults: 

627 defaults = None 

628 else: 

629 defaults = self.defaults.dataId 

630 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, 

631 defaults=defaults, **kwargs) 

632 if standardized.hasRecords(): 

633 return standardized 

634 if records is None: 

635 records = {} 

636 elif isinstance(records, NamedKeyMapping): 

637 records = records.byName() 

638 else: 

639 records = dict(records) 

640 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

641 records.update(dataId.records.byName()) 

642 keys = standardized.byName() 

643 for element in standardized.graph.primaryKeyTraversalOrder: 

644 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

645 if record is ...: 

646 if isinstance(element, Dimension) and keys.get(element.name) is None: 

647 if element in standardized.graph.required: 

648 raise LookupError( 

649 f"No value or null value for required dimension {element.name}." 

650 ) 

651 keys[element.name] = None 

652 record = None 

653 else: 

654 storage = self._managers.dimensions[element] 

655 dataIdSet = DataCoordinateIterable.fromScalar( 

656 DataCoordinate.standardize(keys, graph=element.graph) 

657 ) 

658 fetched = tuple(storage.fetch(dataIdSet)) 

659 try: 

660 (record,) = fetched 

661 except ValueError: 

662 record = None 

663 records[element.name] = record 

664 if record is not None: 

665 for d in element.implied: 

666 value = getattr(record, d.name) 

667 if keys.setdefault(d.name, value) != value: 

668 raise InconsistentDataIdError( 

669 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

670 f"but {element.name} implies {d.name}={value!r}." 

671 ) 

672 else: 

673 if element in standardized.graph.required: 

674 raise LookupError( 

675 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

676 ) 

677 if element.alwaysJoin: 

678 raise InconsistentDataIdError( 

679 f"Could not fetch record for element {element.name} via keys {keys}, ", 

680 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

681 "related." 

682 ) 

683 for d in element.implied: 

684 keys.setdefault(d.name, None) 

685 records.setdefault(d.name, None) 

686 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

687 
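# Example usage (illustrative sketch; the dimension values are hypothetical):
#
#     dataId = registry.expandDataId(instrument="HSC", exposure=903334)
#     assert dataId.hasRecords()
#     exposure_record = dataId.records["exposure"]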

688 def insertDimensionData(self, element: Union[DimensionElement, str], 

689 *data: Union[Mapping[str, Any], DimensionRecord], 

690 conform: bool = True, 

691 replace: bool = False) -> None: 

692 # Docstring inherited from lsst.daf.butler.registry.Registry 

693 if conform: 

694 if isinstance(element, str): 

695 element = self.dimensions[element] 

696 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

697 for row in data] 

698 else: 

699 # Ignore typing since caller said to trust them with conform=False. 

700 records = data # type: ignore 

701 storage = self._managers.dimensions[element] # type: ignore 

702 storage.insert(*records, replace=replace) 

703 

704 def syncDimensionData(self, element: Union[DimensionElement, str], 

705 row: Union[Mapping[str, Any], DimensionRecord], 

706 conform: bool = True, 

707 update: bool = False) -> Union[bool, Dict[str, Any]]: 

708 # Docstring inherited from lsst.daf.butler.registry.Registry 

709 if conform: 

710 if isinstance(element, str): 

711 element = self.dimensions[element] 

712 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

713 else: 

714 # Ignore typing since caller said to trust them with conform=False. 

715 record = row # type: ignore 

716 storage = self._managers.dimensions[element] # type: ignore 

717 return storage.sync(record, update=update) 

718 
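# Example usage (illustrative sketch; the record values are hypothetical):
#
#     registry.syncDimensionData("instrument",
#                                {"name": "HSC", "detector_max": 200,
#                                 "visit_max": 99999, "exposure_max": 99999})
#     registry.insertDimensionData("physical_filter",
#                                  {"instrument": "HSC", "name": "HSC-R", "band": "r"})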

719 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None, 

720 missing: Optional[List[str]] = None, 

721 ) -> Iterator[DatasetType]: 

722 # Docstring inherited from lsst.daf.butler.registry.Registry 

723 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

724 if wildcard is Ellipsis: 

725 for datasetType in self._managers.datasets: 

726 # The dataset type can no longer be a component 

727 yield datasetType 

728 if components: 

729 # Automatically create the component dataset types 

730 try: 

731 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

732 except KeyError as err: 

733 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

734 "if it has components they will not be included in query results.") 

735 else: 

736 yield from componentsForDatasetType 

737 return 

738 done: Set[str] = set() 

739 for name in wildcard.strings: 

740 storage = self._managers.datasets.find(name) 

741 done.add(name) 

742 if storage is None: 

743 if missing is not None: 

744 missing.append(name) 

745 else: 

746 yield storage.datasetType 

747 if wildcard.patterns: 

748 # If components (the argument) is None, we'll save component 

749 # dataset types that we might want to match, but only if their 

750 # parents didn't get included. 

751 componentsForLater = [] 

752 for registeredDatasetType in self._managers.datasets: 

753 # Components are not stored in registry so expand them here 

754 allDatasetTypes = [registeredDatasetType] 

755 try: 

756 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

757 except KeyError as err: 

758 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

759 "if it has components they will not be included in query results.") 

760 for datasetType in allDatasetTypes: 

761 if datasetType.name in done: 

762 continue 

763 parentName, componentName = datasetType.nameAndComponent() 

764 if componentName is not None and not components: 

765 if components is None and parentName not in done: 

766 componentsForLater.append(datasetType) 

767 continue 

768 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

769 done.add(datasetType.name) 

770 yield datasetType 

771 # Go back and try to match saved components. 

772 for datasetType in componentsForLater: 

773 parentName, _ = datasetType.nameAndComponent() 

774 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

775 yield datasetType 

776 

777 def queryCollections(self, expression: Any = ..., 

778 datasetType: Optional[DatasetType] = None, 

779 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

780 flattenChains: bool = False, 

781 includeChains: Optional[bool] = None) -> Iterator[str]: 

782 # Docstring inherited from lsst.daf.butler.registry.Registry 

783 

784 # Right now the datasetTypes argument is completely ignored, but that 

785 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

786 # ticket will take care of that. 

787 query = CollectionQuery.fromExpression(expression) 

788 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes), 

789 flattenChains=flattenChains, includeChains=includeChains): 

790 yield record.name 

791 

792 def _makeQueryBuilder(self, summary: queries.QuerySummary, 

793 doomed_by: Iterable[str] = ()) -> queries.QueryBuilder: 

794 """Return a `QueryBuilder` instance capable of constructing and 

795 managing more complex queries than those obtainable via `Registry` 

796 interfaces. 

797 

798 This is an advanced interface; downstream code should prefer 

799 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

800 are sufficient. 

801 

802 Parameters 

803 ---------- 

804 summary : `queries.QuerySummary` 

805 Object describing and categorizing the full set of dimensions that 

806 will be included in the query. 

807 doomed_by : `Iterable` of `str`, optional 

808 A list of diagnostic messages that indicate why the query is going 

809 to yield no results and should not even be executed. If an empty 

810 container (the default), the query will be executed unless other code 

811 determines that it is doomed. 

812 

813 Returns 

814 ------- 

815 builder : `queries.QueryBuilder` 

816 Object that can be used to construct and perform advanced queries. 

817 """ 

818 return queries.QueryBuilder( 

819 summary, 

820 queries.RegistryManagers( 

821 collections=self._managers.collections, 

822 dimensions=self._managers.dimensions, 

823 datasets=self._managers.datasets, 

824 TimespanReprClass=self._db.getTimespanRepresentation(), 

825 ), 

826 doomed_by=doomed_by, 

827 ) 

828 

829 def queryDatasets(self, datasetType: Any, *, 

830 collections: Any = None, 

831 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

832 dataId: Optional[DataId] = None, 

833 where: Optional[str] = None, 

834 findFirst: bool = False, 

835 components: Optional[bool] = None, 

836 bind: Optional[Mapping[str, Any]] = None, 

837 check: bool = True, 

838 **kwargs: Any) -> queries.DatasetQueryResults: 

839 # Docstring inherited from lsst.daf.butler.registry.Registry 

840 

841 # Standardize the collections expression. 

842 if collections is None: 

843 if not self.defaults.collections: 

844 raise TypeError("No collections provided to findDataset, " 

845 "and no defaults from registry construction.") 

846 collections = self.defaults.collections 

847 elif findFirst: 

848 collections = CollectionSearch.fromExpression(collections) 

849 else: 

850 collections = CollectionQuery.fromExpression(collections) 

851 # Standardize and expand the data ID provided as a constraint. 

852 standardizedDataId = self.expandDataId(dataId, **kwargs) 

853 

854 # We can only query directly if given a non-component DatasetType 

855 # instance. If we were given an expression or str or a component 

856 # DatasetType instance, we'll populate this dict, recurse, and return. 

857 # If we already have a non-component DatasetType, it will remain None 

858 # and we'll run the query directly. 

859 composition: Optional[ 

860 Dict[ 

861 DatasetType, # parent dataset type 

862 List[Optional[str]] # component name, or None for parent 

863 ] 

864 ] = None 

865 if not isinstance(datasetType, DatasetType): 

866 # We were given a dataset type expression (which may be as simple 

867 # as a str). Loop over all matching datasets, delegating handling 

868 # of the `components` argument to queryDatasetTypes, as we populate 

869 # the composition dict. 

870 composition = defaultdict(list) 

871 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

872 parentName, componentName = trueDatasetType.nameAndComponent() 

873 if componentName is not None: 

874 parentDatasetType = self.getDatasetType(parentName) 

875 composition.setdefault(parentDatasetType, []).append(componentName) 

876 else: 

877 composition.setdefault(trueDatasetType, []).append(None) 

878 if not composition: 

879 return queries.ChainedDatasetQueryResults( 

880 [], 

881 doomed_by=[f"No registered dataset type matching {t!r} found." 

882 for t in ensure_iterable(datasetType)], 

883 ) 

884 elif datasetType.isComponent(): 

885 # We were given a true DatasetType instance, but it's a component. 

886 # The composition dict will have exactly one item. 

887 parentName, componentName = datasetType.nameAndComponent() 

888 parentDatasetType = self.getDatasetType(parentName) 

889 composition = {parentDatasetType: [componentName]} 

890 if composition is not None: 

891 # We need to recurse. Do that once for each parent dataset type. 

892 chain = [] 

893 for parentDatasetType, componentNames in composition.items(): 

894 parentResults = self.queryDatasets( 

895 parentDatasetType, 

896 collections=collections, 

897 dimensions=dimensions, 

898 dataId=standardizedDataId, 

899 where=where, 

900 bind=bind, 

901 findFirst=findFirst, 

902 check=check, 

903 ) 

904 assert isinstance(parentResults, queries.ParentDatasetQueryResults), \ 

905 "Should always be true if passing in a DatasetType instance, and we are." 

906 chain.append( 

907 parentResults.withComponents(componentNames) 

908 ) 

909 return queries.ChainedDatasetQueryResults(chain) 

910 # If we get here, there's no need to recurse (or we are already 

911 # recursing; there can only ever be one level of recursion). 

912 

913 # The full set of dimensions in the query is the combination of those 

914 # needed for the DatasetType and those explicitly requested, if any. 

915 requestedDimensionNames = set(datasetType.dimensions.names) 

916 if dimensions is not None: 

917 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

918 # Construct the summary structure needed to construct a QueryBuilder. 

919 summary = queries.QuerySummary( 

920 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

921 dataId=standardizedDataId, 

922 expression=where, 

923 bind=bind, 

924 defaults=self.defaults.dataId, 

925 check=check, 

926 ) 

927 builder = self._makeQueryBuilder(summary) 

928 # Add the dataset subquery to the query, telling the QueryBuilder to 

929 # include the rank of the selected collection in the results only if we 

930 # need to findFirst. Note that if any of the collections are 

931 # actually wildcard expressions, and we've asked for deduplication, 

932 # this will raise TypeError for us. 

933 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst) 

934 query = builder.finish() 

935 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType) 

936 
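# Example usage (illustrative sketch; the dataset type, collection, and query
# expression are hypothetical):
#
#     refs = registry.queryDatasets("calexp",
#                                   collections=["HSC/runs/example"],
#                                   where="visit > 900000 AND band = 'r'",
#                                   instrument="HSC",
#                                   findFirst=True)
#     for ref in refs:
#         print(ref.dataId, ref.run)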

937 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

938 dataId: Optional[DataId] = None, 

939 datasets: Any = None, 

940 collections: Any = None, 

941 where: Optional[str] = None, 

942 components: Optional[bool] = None, 

943 bind: Optional[Mapping[str, Any]] = None, 

944 check: bool = True, 

945 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

946 # Docstring inherited from lsst.daf.butler.registry.Registry 

947 dimensions = ensure_iterable(dimensions) 

948 standardizedDataId = self.expandDataId(dataId, **kwargs) 

949 standardizedDatasetTypes = set() 

950 requestedDimensions = self.dimensions.extract(dimensions) 

951 queryDimensionNames = set(requestedDimensions.names) 

952 missing: List[str] = [] 

953 if datasets is not None: 

954 if not collections: 

955 if not self.defaults.collections: 

956 raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.") 

957 collections = self.defaults.collections 

958 else: 

959 # Preprocess collections expression in case the original 

960 # included single-pass iterators (we'll want to use it multiple 

961 # times below). 

962 collections = CollectionQuery.fromExpression(collections) 

963 for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing): 

964 queryDimensionNames.update(datasetType.dimensions.names) 

965 # If any matched dataset type is a component, just operate on 

966 # its parent instead, because Registry doesn't know anything 

967 # about what components exist, and here (unlike queryDatasets) 

968 # we don't care about returning them. 

969 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

970 if componentName is not None: 

971 datasetType = self.getDatasetType(parentDatasetTypeName) 

972 standardizedDatasetTypes.add(datasetType) 

973 elif collections: 

974 raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.") 

975 

976 summary = queries.QuerySummary( 

977 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

978 dataId=standardizedDataId, 

979 expression=where, 

980 bind=bind, 

981 defaults=self.defaults.dataId, 

982 check=check, 

983 ) 

984 builder = self._makeQueryBuilder( 

985 summary, 

986 doomed_by=[f"Dataset type {name} is not registered." for name in missing] 

987 ) 

988 for datasetType in standardizedDatasetTypes: 

989 builder.joinDataset(datasetType, collections, isResult=False) 

990 query = builder.finish() 

991 return queries.DataCoordinateQueryResults(self._db, query) 

992 
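# Example usage (illustrative sketch; the dimensions, dataset type, and
# collection are hypothetical):
#
#     dataIds = registry.queryDataIds(["visit", "detector"],
#                                     datasets="raw",
#                                     collections=["HSC/raw/all"],
#                                     instrument="HSC")
#     for dataId in dataIds:
#         print(dataId["visit"], dataId["detector"])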

993 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

994 dataId: Optional[DataId] = None, 

995 datasets: Any = None, 

996 collections: Any = None, 

997 where: Optional[str] = None, 

998 components: Optional[bool] = None, 

999 bind: Optional[Mapping[str, Any]] = None, 

1000 check: bool = True, 

1001 **kwargs: Any) -> Iterator[DimensionRecord]: 

1002 # Docstring inherited from lsst.daf.butler.registry.Registry 

1003 if not isinstance(element, DimensionElement): 

1004 try: 

1005 element = self.dimensions[element] 

1006 except KeyError as e: 

1007 raise KeyError(f"No such dimension '{element}', available dimensions: " 

1008 + str(self.dimensions.getStaticElements())) from e 

1009 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1010 where=where, components=components, bind=bind, check=check, **kwargs) 

1011 return iter(self._managers.dimensions[element].fetch(dataIds)) 

1012 
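# Example usage (illustrative sketch; the dimension element and query
# expression are hypothetical):
#
#     for record in registry.queryDimensionRecords(
#             "exposure", instrument="HSC",
#             where="exposure.observation_type = 'science'"):
#         print(record.id, record.timespan)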

1013 def queryDatasetAssociations( 

1014 self, 

1015 datasetType: Union[str, DatasetType], 

1016 collections: Any = ..., 

1017 *, 

1018 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1019 flattenChains: bool = False, 

1020 ) -> Iterator[DatasetAssociation]: 

1021 # Docstring inherited from lsst.daf.butler.registry.Registry 

1022 if collections is None: 

1023 if not self.defaults.collections: 

1024 raise TypeError("No collections provided to findDataset, " 

1025 "and no defaults from registry construction.") 

1026 collections = self.defaults.collections 

1027 else: 

1028 collections = CollectionQuery.fromExpression(collections) 

1029 TimespanReprClass = self._db.getTimespanRepresentation() 

1030 if isinstance(datasetType, str): 

1031 storage = self._managers.datasets[datasetType] 

1032 else: 

1033 storage = self._managers.datasets[datasetType.name] 

1034 for collectionRecord in collections.iter(self._managers.collections, 

1035 collectionTypes=frozenset(collectionTypes), 

1036 flattenChains=flattenChains): 

1037 query = storage.select(collectionRecord) 

1038 for row in self._db.query(query.combine()).mappings(): 

1039 dataId = DataCoordinate.fromRequiredValues( 

1040 storage.datasetType.dimensions, 

1041 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1042 ) 

1043 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1044 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1045 conform=False) 

1046 if collectionRecord.type is CollectionType.CALIBRATION: 

1047 timespan = TimespanReprClass.extract(row) 

1048 else: 

1049 timespan = None 

1050 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1051 

1052 storageClasses: StorageClassFactory 

1053 """All storage classes known to the registry (`StorageClassFactory`). 

1054 """