Coverage for python/lsst/daf/butler/registries/sql.py: 13%

466 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "SqlRegistry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 Tuple, 

41 TYPE_CHECKING, 

42 Union, 

43) 

44 

45import sqlalchemy 

46from lsst.utils.iteration import ensure_iterable 

47 

48from ..core import ( 

49 ButlerURI, 

50 Config, 

51 DataCoordinate, 

52 DataCoordinateIterable, 

53 DataId, 

54 DatasetAssociation, 

55 DatasetId, 

56 DatasetRef, 

57 DatasetType, 

58 ddl, 

59 Dimension, 

60 DimensionConfig, 

61 DimensionElement, 

62 DimensionGraph, 

63 DimensionRecord, 

64 DimensionUniverse, 

65 NamedKeyMapping, 

66 NameLookupMapping, 

67 Progress, 

68 StorageClassFactory, 

69 Timespan, 

70) 

71from ..core.utils import transactional 

72 

73from ..registry import ( 

74 Registry, 

75 RegistryConfig, 

76 CollectionType, 

77 RegistryDefaults, 

78 ConflictingDefinitionError, 

79 InconsistentDataIdError, 

80 OrphanedRecordError, 

81 CollectionSearch, 

82) 

83from ..registry import queries 

84 

85from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis 

86from ..registry.summaries import CollectionSummary 

87from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances 

88from ..registry.queries import Query 

89from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord 

90 

91if TYPE_CHECKING: 

92 from .._butlerConfig import ButlerConfig 

93 from ..registry.interfaces import ( 

94 CollectionRecord, 

95 Database, 

96 DatastoreRegistryBridgeManager, 

97 ) 

98 

99 

100_LOG = logging.getLogger(__name__) 

101 

102 

103class SqlRegistry(Registry): 

104 """Registry implementation based on SQLAlchemy. 

105 

106 Parameters 

107 ---------- 

108 database : `Database` 

109 Database instance to store Registry. 

110 defaults : `RegistryDefaults` 

111 Default collection search path and/or output `~CollectionType.RUN` 

112 collection. 

113 managers : `RegistryManagerInstances` 

114 All the managers required for this registry. 

115 """ 

116 

117 defaultConfigFile: Optional[str] = None 

118 """Path to configuration defaults. Accessed within the ``configs`` resource 

119 or relative to a search path. Can be None if no defaults specified. 

120 """ 

121 

122 @classmethod 

123 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

124 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

125 butlerRoot: Optional[str] = None) -> Registry: 

126 """Create registry database and return `SqlRegistry` instance. 

127 

128 This method initializes database contents; the database must be empty 

129 prior to calling this method. 

130 

131 Parameters 

132 ---------- 

133 config : `RegistryConfig` or `str`, optional 

134 Registry configuration, if missing then default configuration will 

135 be loaded from registry.yaml. 

136 dimensionConfig : `DimensionConfig` or `str`, optional 

137 Dimensions configuration, if missing then default configuration 

138 will be loaded from dimensions.yaml. 

139 butlerRoot : `str`, optional 

140 Path to the repository root this `SqlRegistry` will manage. 

141 

142 Returns 

143 ------- 

144 registry : `SqlRegistry` 

145 A new `SqlRegistry` instance. 

146 """ 

147 config = cls.forceRegistryConfig(config) 

148 config.replaceRoot(butlerRoot) 

149 

150 if isinstance(dimensionConfig, str): 

151 dimensionConfig = DimensionConfig(dimensionConfig) 

152 elif dimensionConfig is None: 

153 dimensionConfig = DimensionConfig() 

154 elif not isinstance(dimensionConfig, DimensionConfig): 

155 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

156 

157 DatabaseClass = config.getDatabaseClass() 

158 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

159 namespace=config.get("namespace")) 

160 managerTypes = RegistryManagerTypes.fromConfig(config) 

161 managers = managerTypes.makeRepo(database, dimensionConfig) 

162 return cls(database, RegistryDefaults(), managers) 
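# A minimal usage sketch (not from the original module): bootstrapping a new,
# empty repository with createFromConfig(). The SQLite URL and file name are
# assumptions for illustration only; real deployments supply their own
# RegistryConfig and dimension configuration.
#
#     from lsst.daf.butler.registry import RegistryConfig
#
#     config = RegistryConfig()
#     config["db"] = "sqlite:///new_repo.sqlite3"        # hypothetical database
#     registry = SqlRegistry.createFromConfig(config)    # database must be empty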

163 

164 @classmethod 

165 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

166 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True, 

167 defaults: Optional[RegistryDefaults] = None) -> Registry: 

168 """Create `Registry` subclass instance from `config`. 

169 

170 Registry database must be initialized prior to calling this method. 

171 

172 Parameters 

173 ---------- 

174 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

175 Registry configuration. 

176 butlerRoot : `str` or `ButlerURI`, optional 

177 Path to the repository root this `Registry` will manage. 

178 writeable : `bool`, optional 

179 If `True` (default) create a read-write connection to the database. 

180 defaults : `RegistryDefaults`, optional 

181 Default collection search path and/or output `~CollectionType.RUN` 

182 collection. 

183 

184 Returns 

185 ------- 

186 registry : `SqlRegistry` (subclass) 

187 A new `SqlRegistry` subclass instance. 

188 """ 

189 config = cls.forceRegistryConfig(config) 

190 config.replaceRoot(butlerRoot) 

191 DatabaseClass = config.getDatabaseClass() 

192 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

193 namespace=config.get("namespace"), writeable=writeable) 

194 managerTypes = RegistryManagerTypes.fromConfig(config) 

195 managers = managerTypes.loadRepo(database) 

196 if defaults is None: 

197 defaults = RegistryDefaults() 

198 return cls(database, defaults, managers) 
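# A minimal usage sketch (not from the original module): connecting to an
# already-initialized repository read-only, with a default collection search
# path. The connection string and collection name are illustrative.
#
#     from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults
#
#     config = RegistryConfig()
#     config["db"] = "sqlite:///existing_repo.sqlite3"   # hypothetical database
#     registry = SqlRegistry.fromConfig(
#         config,
#         writeable=False,
#         defaults=RegistryDefaults(collections=["my/run"]),  # illustrative name
#     )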

199 

200 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

201 self._db = database 

202 self._managers = managers 

203 self.storageClasses = StorageClassFactory() 

204 # Intentionally invoke property setter to initialize defaults. This 

205 # can only be done after most of the rest of Registry has already been 

206 # initialized, and must be done before the property getter is used. 

207 self.defaults = defaults 

208 

209 def __str__(self) -> str: 

210 return str(self._db) 

211 

212 def __repr__(self) -> str: 

213 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

214 

215 def isWriteable(self) -> bool: 

216 # Docstring inherited from lsst.daf.butler.registry.Registry 

217 return self._db.isWriteable() 

218 

219 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

220 # Docstring inherited from lsst.daf.butler.registry.Registry 

221 if defaults is None: 

222 # No need to copy, because `RegistryDefaults` is immutable; we 

223 # effectively copy on write. 

224 defaults = self.defaults 

225 return type(self)(self._db, defaults, self._managers) 

226 

227 @property 

228 def dimensions(self) -> DimensionUniverse: 

229 # Docstring inherited from lsst.daf.butler.registry.Registry 

230 return self._managers.dimensions.universe 

231 

232 def refresh(self) -> None: 

233 # Docstring inherited from lsst.daf.butler.registry.Registry 

234 self._managers.refresh() 

235 

236 @contextlib.contextmanager 

237 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

238 # Docstring inherited from lsst.daf.butler.registry.Registry 

239 try: 

240 with self._db.transaction(savepoint=savepoint): 

241 yield 

242 except BaseException: 

243 # TODO: this clears the caches sometimes when we wouldn't actually 

244 # need to. Can we avoid that? 

245 self._managers.dimensions.clearCaches() 

246 raise 

247 

248 def resetConnectionPool(self) -> None: 

249 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

250 

251 This operation is useful when using the registry with fork-based 

252 multiprocessing. To use the registry across a fork boundary, make sure 

253 that there are no currently active connections (no session or 

254 transaction in progress) and that the connection pool is reset using 

255 this method. The child process should call this method immediately 

256 after the fork. 

257 """ 

258 self._db._engine.dispose() 
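# A minimal sketch (not from the original module) of how a fork-based worker
# might use resetConnectionPool(); the fork/worker structure and the
# do_child_work() helper are hypothetical.
#
#     import os
#
#     pid = os.fork()
#     if pid == 0:
#         # Child process: discard inherited SQLAlchemy connections before
#         # issuing any queries through the registry.
#         registry.resetConnectionPool()
#         do_child_work(registry)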

259 

260 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

261 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

262 other data repository client. 

263 

264 Opaque table records can be added via `insertOpaqueData`, retrieved via 

265 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

266 

267 Parameters 

268 ---------- 

269 tableName : `str` 

270 Logical name of the opaque table. This may differ from the 

271 actual name used in the database by a prefix and/or suffix. 

272 spec : `ddl.TableSpec` 

273 Specification for the table to be added. 

274 """ 

275 self._managers.opaque.register(tableName, spec) 

276 

277 @transactional 

278 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

279 """Insert records into an opaque table. 

280 

281 Parameters 

282 ---------- 

283 tableName : `str` 

284 Logical name of the opaque table. Must match the name used in a 

285 previous call to `registerOpaqueTable`. 

286 data 

287 Each additional positional argument is a dictionary that represents 

288 a single row to be added. 

289 """ 

290 self._managers.opaque[tableName].insert(*data) 

291 

292 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

293 """Retrieve records from an opaque table. 

294 

295 Parameters 

296 ---------- 

297 tableName : `str` 

298 Logical name of the opaque table. Must match the name used in a 

299 previous call to `registerOpaqueTable`. 

300 where 

301 Additional keyword arguments are interpreted as equality 

302 constraints that restrict the returned rows (combined with AND); 

303 keyword arguments are column names and values are the values they 

304 must have. 

305 

306 Yields 

307 ------ 

308 row : `dict` 

309 A dictionary representing a single result row. 

310 """ 

311 yield from self._managers.opaque[tableName].fetch(**where) 

312 

313 @transactional 

314 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

315 """Remove records from an opaque table. 

316 

317 Parameters 

318 ---------- 

319 tableName : `str` 

320 Logical name of the opaque table. Must match the name used in a 

321 previous call to `registerOpaqueTable`. 

322 where 

323 Additional keyword arguments are interpreted as equality 

324 constraints that restrict the deleted rows (combined with AND); 

325 keyword arguments are column names and values are the values they 

326 must have. 

327 """ 

328 self._managers.opaque[tableName].delete(where.keys(), where) 
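# A minimal sketch (not from the original module) of a round trip through the
# opaque-table API above. The table name, column layout, and row contents are
# made up for illustration.
#
#     import sqlalchemy
#     from lsst.daf.butler.core import ddl
#
#     spec = ddl.TableSpec(fields=[
#         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
#         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
#     ])
#     registry.registerOpaqueTable("datastore_records", spec)
#     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
#     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("datastore_records", dataset_id=1)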

329 

330 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

331 doc: Optional[str] = None) -> bool: 

332 # Docstring inherited from lsst.daf.butler.registry.Registry 

333 _, registered = self._managers.collections.register(name, type, doc=doc) 

334 return registered 

335 

336 def getCollectionType(self, name: str) -> CollectionType: 

337 # Docstring inherited from lsst.daf.butler.registry.Registry 

338 return self._managers.collections.find(name).type 

339 

340 def _get_collection_record(self, name: str) -> CollectionRecord: 

341 # Docstring inherited from lsst.daf.butler.registry.Registry 

342 return self._managers.collections.find(name) 

343 

344 def registerRun(self, name: str, doc: Optional[str] = None) -> bool: 

345 # Docstring inherited from lsst.daf.butler.registry.Registry 

346 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

347 return registered 

348 

349 @transactional 

350 def removeCollection(self, name: str) -> None: 

351 # Docstring inherited from lsst.daf.butler.registry.Registry 

352 self._managers.collections.remove(name) 

353 

354 def getCollectionChain(self, parent: str) -> CollectionSearch: 

355 # Docstring inherited from lsst.daf.butler.registry.Registry 

356 record = self._managers.collections.find(parent) 

357 if record.type is not CollectionType.CHAINED: 

358 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

359 assert isinstance(record, ChainedCollectionRecord) 

360 return record.children 

361 

362 @transactional 

363 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

364 # Docstring inherited from lsst.daf.butler.registry.Registry 

365 record = self._managers.collections.find(parent) 

366 if record.type is not CollectionType.CHAINED: 

367 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

368 assert isinstance(record, ChainedCollectionRecord) 

369 children = CollectionSearch.fromExpression(children) 

370 if children != record.children or flatten: 

371 record.update(self._managers.collections, children, flatten=flatten) 

372 

373 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

374 # Docstring inherited from lsst.daf.butler.registry.Registry 

375 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

376 

377 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

378 # Docstring inherited from lsst.daf.butler.registry.Registry 

379 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

380 

381 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

382 # Docstring inherited from lsst.daf.butler.registry.Registry 

383 record = self._managers.collections.find(collection) 

384 return self._managers.datasets.getCollectionSummary(record) 

385 

386 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

387 # Docstring inherited from lsst.daf.butler.registry.Registry 

388 _, inserted = self._managers.datasets.register(datasetType) 

389 return inserted 

390 

391 def removeDatasetType(self, name: str) -> None: 

392 # Docstring inherited from lsst.daf.butler.registry.Registry 

393 self._managers.datasets.remove(name) 

394 

395 def getDatasetType(self, name: str) -> DatasetType: 

396 # Docstring inherited from lsst.daf.butler.registry.Registry 

397 return self._managers.datasets[name].datasetType 

398 

399 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: 

400 # Docstring inherited from lsst.daf.butler.registry.Registry 

401 return self._managers.datasets.supportsIdGenerationMode(mode) 

402 

403 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

404 collections: Any = None, timespan: Optional[Timespan] = None, 

405 **kwargs: Any) -> Optional[DatasetRef]: 

406 # Docstring inherited from lsst.daf.butler.registry.Registry 

407 if isinstance(datasetType, DatasetType): 

408 storage = self._managers.datasets[datasetType.name] 

409 else: 

410 storage = self._managers.datasets[datasetType] 

411 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

412 universe=self.dimensions, defaults=self.defaults.dataId, 

413 **kwargs) 

414 if collections is None: 

415 if not self.defaults.collections: 

416 raise TypeError("No collections provided to findDataset, " 

417 "and no defaults from registry construction.") 

418 collections = self.defaults.collections 

419 else: 

420 collections = CollectionSearch.fromExpression(collections) 

421 for collectionRecord in collections.iter(self._managers.collections): 

422 if (collectionRecord.type is CollectionType.CALIBRATION 

423 and (not storage.datasetType.isCalibration() or timespan is None)): 

424 continue 

425 result = storage.find(collectionRecord, dataId, timespan=timespan) 

426 if result is not None: 

427 return result 

428 

429 return None 

430 

431 @transactional 

432 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

433 run: Optional[str] = None, expand: bool = True, 

434 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]: 

435 # Docstring inherited from lsst.daf.butler.registry.Registry 

436 if isinstance(datasetType, DatasetType): 

437 storage = self._managers.datasets.find(datasetType.name) 

438 if storage is None: 

439 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

440 else: 

441 storage = self._managers.datasets.find(datasetType) 

442 if storage is None: 

443 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

444 if run is None: 

445 if self.defaults.run is None: 

446 raise TypeError("No run provided to insertDatasets, " 

447 "and no default from registry construction.") 

448 run = self.defaults.run 

449 runRecord = self._managers.collections.find(run) 

450 if runRecord.type is not CollectionType.RUN: 

451 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

452 assert isinstance(runRecord, RunRecord) 

453 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

454 if expand: 

455 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

456 for dataId in progress.wrap(dataIds, 

457 f"Expanding {storage.datasetType.name} data IDs")] 

458 else: 

459 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) 

460 for dataId in dataIds] 

461 try: 

462 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

463 except sqlalchemy.exc.IntegrityError as err: 

464 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

465 f"one or more datasets of type {storage.datasetType} into " 

466 f"collection '{run}'. " 

467 f"This probably means a dataset with the same data ID " 

468 f"and dataset type already exists, but it may also mean a " 

469 f"dimension row is missing.") from err 

470 return refs 

471 

472 @transactional 

473 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True, 

474 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

475 reuseIds: bool = False) -> List[DatasetRef]: 

476 # Docstring inherited from lsst.daf.butler.registry.Registry 

477 datasets = list(datasets) 

478 if not datasets: 

479 # nothing to do 

480 return [] 

481 

482 # find dataset type 

483 datasetTypes = set(dataset.datasetType for dataset in datasets) 

484 if len(datasetTypes) != 1: 

485 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}") 

486 datasetType = datasetTypes.pop() 

487 

488 # get storage handler for this dataset type 

489 storage = self._managers.datasets.find(datasetType.name) 

490 if storage is None: 

491 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

492 

493 # find run name 

494 runs = set(dataset.run for dataset in datasets) 

495 if len(runs) != 1: 

496 raise ValueError(f"Multiple run names in input datasets: {runs}") 

497 run = runs.pop() 

498 if run is None: 

499 if self.defaults.run is None: 

500 raise TypeError("No run provided to ingestDatasets, " 

501 "and no default from registry construction.") 

502 run = self.defaults.run 

503 

504 runRecord = self._managers.collections.find(run) 

505 if runRecord.type is not CollectionType.RUN: 

506 raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};" 

507 " RUN collection required.") 

508 assert isinstance(runRecord, RunRecord) 

509 

510 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

511 if expand: 

512 expandedDatasets = [ 

513 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

514 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")] 

515 else: 

516 expandedDatasets = [ 

517 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

518 for dataset in datasets 

519 ] 

520 

521 try: 

522 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

523 except sqlalchemy.exc.IntegrityError as err: 

524 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

525 f"one or more datasets of type {storage.datasetType} into " 

526 f"collection '{run}'. " 

527 f"This probably means a dataset with the same data ID " 

528 f"and dataset type already exists, but it may also mean a " 

529 f"dimension row is missing.") from err 

530 return refs 

531 

532 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

533 # Docstring inherited from lsst.daf.butler.registry.Registry 

534 return self._managers.datasets.getDatasetRef(id) 

535 

536 @transactional 

537 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

538 # Docstring inherited from lsst.daf.butler.registry.Registry 

539 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

540 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

541 desc="Removing datasets by type"): 

542 storage = self._managers.datasets[datasetType.name] 

543 try: 

544 storage.delete(refsForType) 

545 except sqlalchemy.exc.IntegrityError as err: 

546 raise OrphanedRecordError("One or more datasets is still " 

547 "present in one or more Datastores.") from err 

548 

549 @transactional 

550 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

551 # Docstring inherited from lsst.daf.butler.registry.Registry 

552 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

553 collectionRecord = self._managers.collections.find(collection) 

554 if collectionRecord.type is not CollectionType.TAGGED: 

555 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

556 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

557 desc="Associating datasets by type"): 

558 storage = self._managers.datasets[datasetType.name] 

559 try: 

560 storage.associate(collectionRecord, refsForType) 

561 except sqlalchemy.exc.IntegrityError as err: 

562 raise ConflictingDefinitionError( 

563 f"Constraint violation while associating dataset of type {datasetType.name} with " 

564 f"collection {collection}. This probably means that one or more datasets with the same " 

565 f"dataset type and data ID already exist in the collection, but it may also indicate " 

566 f"that the datasets do not exist." 

567 ) from err 

568 

569 @transactional 

570 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

571 # Docstring inherited from lsst.daf.butler.registry.Registry 

572 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

573 collectionRecord = self._managers.collections.find(collection) 

574 if collectionRecord.type is not CollectionType.TAGGED: 

575 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

576 "expected TAGGED.") 

577 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

578 desc="Disassociating datasets by type"): 

579 storage = self._managers.datasets[datasetType.name] 

580 storage.disassociate(collectionRecord, refsForType) 

581 

582 @transactional 

583 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

584 # Docstring inherited from lsst.daf.butler.registry.Registry 

585 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

586 collectionRecord = self._managers.collections.find(collection) 

587 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

588 desc="Certifying datasets by type"): 

589 storage = self._managers.datasets[datasetType.name] 

590 storage.certify(collectionRecord, refsForType, timespan) 

591 

592 @transactional 

593 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

594 dataIds: Optional[Iterable[DataId]] = None) -> None: 

595 # Docstring inherited from lsst.daf.butler.registry.Registry 

596 collectionRecord = self._managers.collections.find(collection) 

597 if isinstance(datasetType, str): 

598 storage = self._managers.datasets[datasetType] 

599 else: 

600 storage = self._managers.datasets[datasetType.name] 

601 standardizedDataIds = None 

602 if dataIds is not None: 

603 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

604 for d in dataIds] 

605 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

606 

607 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

608 """Return an object that allows a new `Datastore` instance to 

609 communicate with this `Registry`. 

610 

611 Returns 

612 ------- 

613 manager : `DatastoreRegistryBridgeManager` 

614 Object that mediates communication between this `Registry` and its 

615 associated datastores. 

616 """ 

617 return self._managers.datastores 

618 

619 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

620 # Docstring inherited from lsst.daf.butler.registry.Registry 

621 return self._managers.datastores.findDatastores(ref) 

622 

623 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

624 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

625 withDefaults: bool = True, 

626 **kwargs: Any) -> DataCoordinate: 

627 # Docstring inherited from lsst.daf.butler.registry.Registry 

628 if not withDefaults: 

629 defaults = None 

630 else: 

631 defaults = self.defaults.dataId 

632 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, 

633 defaults=defaults, **kwargs) 

634 if standardized.hasRecords(): 

635 return standardized 

636 if records is None: 

637 records = {} 

638 elif isinstance(records, NamedKeyMapping): 

639 records = records.byName() 

640 else: 

641 records = dict(records) 

642 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

643 records.update(dataId.records.byName()) 

644 keys = standardized.byName() 

645 for element in standardized.graph.primaryKeyTraversalOrder: 

646 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

647 if record is ...: 

648 if isinstance(element, Dimension) and keys.get(element.name) is None: 

649 if element in standardized.graph.required: 

650 raise LookupError( 

651 f"No value or null value for required dimension {element.name}." 

652 ) 

653 keys[element.name] = None 

654 record = None 

655 else: 

656 storage = self._managers.dimensions[element] 

657 dataIdSet = DataCoordinateIterable.fromScalar( 

658 DataCoordinate.standardize(keys, graph=element.graph) 

659 ) 

660 fetched = tuple(storage.fetch(dataIdSet)) 

661 try: 

662 (record,) = fetched 

663 except ValueError: 

664 record = None 

665 records[element.name] = record 

666 if record is not None: 

667 for d in element.implied: 

668 value = getattr(record, d.name) 

669 if keys.setdefault(d.name, value) != value: 

670 raise InconsistentDataIdError( 

671 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

672 f"but {element.name} implies {d.name}={value!r}." 

673 ) 

674 else: 

675 if element in standardized.graph.required: 

676 raise LookupError( 

677 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

678 ) 

679 if element.alwaysJoin: 

680 raise InconsistentDataIdError( 

681 f"Could not fetch record for element {element.name} via keys {keys}, ", 

682 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

683 "related." 

684 ) 

685 for d in element.implied: 

686 keys.setdefault(d.name, None) 

687 records.setdefault(d.name, None) 

688 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

689 

690 def insertDimensionData(self, element: Union[DimensionElement, str], 

691 *data: Union[Mapping[str, Any], DimensionRecord], 

692 conform: bool = True, 

693 replace: bool = False) -> None: 

694 # Docstring inherited from lsst.daf.butler.registry.Registry 

695 if conform: 

696 if isinstance(element, str): 

697 element = self.dimensions[element] 

698 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

699 for row in data] 

700 else: 

701 # Ignore typing since caller said to trust them with conform=False. 

702 records = data # type: ignore 

703 storage = self._managers.dimensions[element] # type: ignore 

704 storage.insert(*records, replace=replace) 

705 

706 def syncDimensionData(self, element: Union[DimensionElement, str], 

707 row: Union[Mapping[str, Any], DimensionRecord], 

708 conform: bool = True, 

709 update: bool = False) -> Union[bool, Dict[str, Any]]: 

710 # Docstring inherited from lsst.daf.butler.registry.Registry 

711 if conform: 

712 if isinstance(element, str): 

713 element = self.dimensions[element] 

714 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

715 else: 

716 # Ignore typing since caller said to trust them with conform=False. 

717 record = row # type: ignore 

718 storage = self._managers.dimensions[element] # type: ignore 

719 return storage.sync(record, update=update) 

720 

721 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None, 

722 missing: Optional[List[str]] = None, 

723 ) -> Iterator[DatasetType]: 

724 # Docstring inherited from lsst.daf.butler.registry.Registry 

725 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

726 if wildcard is Ellipsis: 

727 for datasetType in self._managers.datasets: 

728 # The dataset type can no longer be a component 

729 yield datasetType 

730 if components: 

731 # Automatically create the component dataset types 

732 try: 

733 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

734 except KeyError as err: 

735 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

736 "if it has components they will not be included in query results.") 

737 else: 

738 yield from componentsForDatasetType 

739 return 

740 done: Set[str] = set() 

741 for name in wildcard.strings: 

742 storage = self._managers.datasets.find(name) 

743 done.add(name) 

744 if storage is None: 

745 if missing is not None: 

746 missing.append(name) 

747 else: 

748 yield storage.datasetType 

749 if wildcard.patterns: 

750 # If components (the argument) is None, we'll save component 

751 # datasets that we might want to match, but only if their parents 

752 # didn't get included. 

753 componentsForLater = [] 

754 for registeredDatasetType in self._managers.datasets: 

755 # Components are not stored in registry so expand them here 

756 allDatasetTypes = [registeredDatasetType] 

757 try: 

758 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

759 except KeyError as err: 

760 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

761 "if it has components they will not be included in query results.") 

762 for datasetType in allDatasetTypes: 

763 if datasetType.name in done: 

764 continue 

765 parentName, componentName = datasetType.nameAndComponent() 

766 if componentName is not None and not components: 

767 if components is None and parentName not in done: 

768 componentsForLater.append(datasetType) 

769 continue 

770 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

771 done.add(datasetType.name) 

772 yield datasetType 

773 # Go back and try to match saved components. 

774 for datasetType in componentsForLater: 

775 parentName, _ = datasetType.nameAndComponent() 

776 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

777 yield datasetType 
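# A minimal sketch (not from the original module) of calling
# queryDatasetTypes() with a regular-expression wildcard; the pattern name is
# illustrative.
#
#     import re
#
#     matching = list(registry.queryDatasetTypes(re.compile(r"calexp.*"),
#                                                components=False))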

778 

779 def queryCollections(self, expression: Any = ..., 

780 datasetType: Optional[DatasetType] = None, 

781 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

782 flattenChains: bool = False, 

783 includeChains: Optional[bool] = None) -> Iterator[str]: 

784 # Docstring inherited from lsst.daf.butler.registry.Registry 

785 

786 # Right now the datasetTypes argument is completely ignored, but that 

787 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

788 # ticket will take care of that. 

789 query = CollectionQuery.fromExpression(expression) 

790 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes), 

791 flattenChains=flattenChains, includeChains=includeChains): 

792 yield record.name 

793 

794 def _makeQueryBuilder(self, summary: queries.QuerySummary, 

795 doomed_by: Iterable[str] = ()) -> queries.QueryBuilder: 

796 """Return a `QueryBuilder` instance capable of constructing and 

797 managing more complex queries than those obtainable via `Registry` 

798 interfaces. 

799 

800 This is an advanced interface; downstream code should prefer 

801 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

802 are sufficient. 

803 

804 Parameters 

805 ---------- 

806 summary : `queries.QuerySummary` 

807 Object describing and categorizing the full set of dimensions that 

808 will be included in the query. 

809 doomed_by : `Iterable` of `str`, optional 

810 A list of diagnostic messages that indicate why the query is going 

811 to yield no results and should not even be executed. If an empty 

812 container (default) the query will be executed unless other code 

813 determines that it is doomed. 

814 

815 Returns 

816 ------- 

817 builder : `queries.QueryBuilder` 

818 Object that can be used to construct and perform advanced queries. 

819 """ 

820 return queries.QueryBuilder( 

821 summary, 

822 queries.RegistryManagers( 

823 collections=self._managers.collections, 

824 dimensions=self._managers.dimensions, 

825 datasets=self._managers.datasets, 

826 TimespanReprClass=self._db.getTimespanRepresentation(), 

827 ), 

828 doomed_by=doomed_by, 

829 ) 
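# A minimal sketch (not from the original module) of driving the advanced
# query-builder interface directly, mirroring how queryDatasets() below uses
# it. Dimension names are illustrative, and omitted QuerySummary arguments are
# assumed to have defaults.
#
#     summary = queries.QuerySummary(
#         requested=registry.dimensions.extract(["exposure", "detector"]),
#         dataId=registry.expandDataId(),        # empty constraint data ID
#         defaults=registry.defaults.dataId,
#         check=True,
#     )
#     builder = registry._makeQueryBuilder(summary)
#     query = builder.finish()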

830 

831 def queryDatasets(self, datasetType: Any, *, 

832 collections: Any = None, 

833 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

834 dataId: Optional[DataId] = None, 

835 where: Optional[str] = None, 

836 findFirst: bool = False, 

837 components: Optional[bool] = None, 

838 bind: Optional[Mapping[str, Any]] = None, 

839 check: bool = True, 

840 **kwargs: Any) -> queries.DatasetQueryResults: 

841 # Docstring inherited from lsst.daf.butler.registry.Registry 

842 

843 # Standardize the collections expression. 

844 if collections is None: 

845 if not self.defaults.collections: 

846 raise TypeError("No collections provided to findDataset, " 

847 "and no defaults from registry construction.") 

848 collections = self.defaults.collections 

849 elif findFirst: 

850 collections = CollectionSearch.fromExpression(collections) 

851 else: 

852 collections = CollectionQuery.fromExpression(collections) 

853 # Standardize and expand the data ID provided as a constraint. 

854 standardizedDataId = self.expandDataId(dataId, **kwargs) 

855 

856 # We can only query directly if given a non-component DatasetType 

857 # instance. If we were given an expression or str or a component 

858 # DatasetType instance, we'll populate this dict, recurse, and return. 

859 # If we already have a non-component DatasetType, it will remain None 

860 # and we'll run the query directly. 

861 composition: Optional[ 

862 Dict[ 

863 DatasetType, # parent dataset type 

864 List[Optional[str]] # component name, or None for parent 

865 ] 

866 ] = None 

867 if not isinstance(datasetType, DatasetType): 

868 # We were given a dataset type expression (which may be as simple 

869 # as a str). Loop over all matching datasets, delegating handling 

870 # of the `components` argument to queryDatasetTypes, as we populate 

871 # the composition dict. 

872 composition = defaultdict(list) 

873 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

874 parentName, componentName = trueDatasetType.nameAndComponent() 

875 if componentName is not None: 

876 parentDatasetType = self.getDatasetType(parentName) 

877 composition.setdefault(parentDatasetType, []).append(componentName) 

878 else: 

879 composition.setdefault(trueDatasetType, []).append(None) 

880 if not composition: 

881 return queries.ChainedDatasetQueryResults( 

882 [], 

883 doomed_by=[f"No registered dataset type matching {t!r} found." 

884 for t in ensure_iterable(datasetType)], 

885 ) 

886 elif datasetType.isComponent(): 

887 # We were given a true DatasetType instance, but it's a component. 

888 # The composition dict will have exactly one item. 

889 parentName, componentName = datasetType.nameAndComponent() 

890 parentDatasetType = self.getDatasetType(parentName) 

891 composition = {parentDatasetType: [componentName]} 

892 if composition is not None: 

893 # We need to recurse. Do that once for each parent dataset type. 

894 chain = [] 

895 for parentDatasetType, componentNames in composition.items(): 

896 parentResults = self.queryDatasets( 

897 parentDatasetType, 

898 collections=collections, 

899 dimensions=dimensions, 

900 dataId=standardizedDataId, 

901 where=where, 

902 bind=bind, 

903 findFirst=findFirst, 

904 check=check, 

905 ) 

906 assert isinstance(parentResults, queries.ParentDatasetQueryResults), \ 

907 "Should always be true if passing in a DatasetType instance, and we are." 

908 chain.append( 

909 parentResults.withComponents(componentNames) 

910 ) 

911 return queries.ChainedDatasetQueryResults(chain) 

912 # If we get here, there's no need to recurse (or we are already 

913 # recursing; there can only ever be one level of recursion). 

914 

915 # The full set of dimensions in the query is the combination of those 

916 # needed for the DatasetType and those explicitly requested, if any. 

917 requestedDimensionNames = set(datasetType.dimensions.names) 

918 if dimensions is not None: 

919 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

920 # Construct the summary structure needed to construct a QueryBuilder. 

921 summary = queries.QuerySummary( 

922 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

923 dataId=standardizedDataId, 

924 expression=where, 

925 bind=bind, 

926 defaults=self.defaults.dataId, 

927 check=check, 

928 datasets=[datasetType], 

929 ) 

930 builder = self._makeQueryBuilder(summary) 

931 # Add the dataset subquery to the query, telling the QueryBuilder to 

932 # include the rank of the selected collection in the results only if we 

933 # need to findFirst. Note that if any of the collections are 

934 # actually wildcard expressions, and we've asked for deduplication, 

935 # this will raise TypeError for us. 

936 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst) 

937 query = builder.finish() 

938 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType) 
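# A minimal sketch (not from the original module) of a typical queryDatasets()
# call; the dataset type name, collection, and where clause are illustrative.
#
#     refs = registry.queryDatasets(
#         "calexp",
#         collections=["HSC/runs/example"],
#         where="instrument = 'HSC' AND detector = 10",
#         findFirst=True,
#     )
#     for ref in refs:
#         print(ref.dataId, ref.run)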

939 

940 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

941 dataId: Optional[DataId] = None, 

942 datasets: Any = None, 

943 collections: Any = None, 

944 where: Optional[str] = None, 

945 components: Optional[bool] = None, 

946 bind: Optional[Mapping[str, Any]] = None, 

947 check: bool = True, 

948 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

949 # Docstring inherited from lsst.daf.butler.registry.Registry 

950 dimensions = ensure_iterable(dimensions) 

951 standardizedDataId = self.expandDataId(dataId, **kwargs) 

952 standardizedDatasetTypes = set() 

953 requestedDimensions = self.dimensions.extract(dimensions) 

954 missing: List[str] = [] 

955 if datasets is not None: 

956 if not collections: 

957 if not self.defaults.collections: 

958 raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.") 

959 collections = self.defaults.collections 

960 else: 

961 # Preprocess collections expression in case the original 

962 # included single-pass iterators (we'll want to use it multiple 

963 # times below). 

964 collections = CollectionQuery.fromExpression(collections) 

965 for datasetType in self.queryDatasetTypes(datasets, components=components, missing=missing): 

966 # If any matched dataset type is a component, just operate on 

967 # its parent instead, because Registry doesn't know anything 

968 # about what components exist, and here (unlike queryDatasets) 

969 # we don't care about returning them. 

970 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

971 if componentName is not None: 

972 datasetType = self.getDatasetType(parentDatasetTypeName) 

973 standardizedDatasetTypes.add(datasetType) 

974 elif collections: 

975 raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.") 

976 

977 def query_factory(order_by: Optional[Iterable[str]] = None, 

978 limit: Optional[Tuple[int, Optional[int]]] = None) -> Query: 

979 """Construct the Query object that generates query results. 

980 """ 

981 summary = queries.QuerySummary( 

982 requested=requestedDimensions, 

983 dataId=standardizedDataId, 

984 expression=where, 

985 bind=bind, 

986 defaults=self.defaults.dataId, 

987 check=check, 

988 datasets=standardizedDatasetTypes, 

989 order_by=order_by, 

990 limit=limit 

991 ) 

992 builder = self._makeQueryBuilder( 

993 summary, 

994 doomed_by=[f"Dataset type {name} is not registered." for name in missing] 

995 ) 

996 for datasetType in standardizedDatasetTypes: 

997 builder.joinDataset(datasetType, collections, isResult=False,) 

998 return builder.finish() 

999 

1000 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions) 
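# A minimal sketch (not from the original module) of a typical queryDataIds()
# call; the dimension names, dataset type, and collection are illustrative.
#
#     data_ids = registry.queryDataIds(
#         ["exposure", "detector"],
#         datasets="raw",                      # only data IDs with a matching raw
#         collections=["HSC/raw/all"],
#     )
#     for data_id in data_ids:
#         print(data_id["exposure"], data_id["detector"])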

1001 

1002 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

1003 dataId: Optional[DataId] = None, 

1004 datasets: Any = None, 

1005 collections: Any = None, 

1006 where: Optional[str] = None, 

1007 components: Optional[bool] = None, 

1008 bind: Optional[Mapping[str, Any]] = None, 

1009 check: bool = True, 

1010 **kwargs: Any) -> queries.DimensionRecordQueryResults: 

1011 # Docstring inherited from lsst.daf.butler.registry.Registry 

1012 if not isinstance(element, DimensionElement): 

1013 try: 

1014 element = self.dimensions[element] 

1015 except KeyError as e: 

1016 raise KeyError(f"No such dimension '{element}', available dimensions: " 

1017 + str(self.dimensions.getStaticElements())) from e 

1018 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1019 where=where, components=components, bind=bind, check=check, **kwargs) 

1020 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element]) 

1021 

1022 def queryDatasetAssociations( 

1023 self, 

1024 datasetType: Union[str, DatasetType], 

1025 collections: Any = ..., 

1026 *, 

1027 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1028 flattenChains: bool = False, 

1029 ) -> Iterator[DatasetAssociation]: 

1030 # Docstring inherited from lsst.daf.butler.registry.Registry 

1031 if collections is None: 

1032 if not self.defaults.collections: 

1033 raise TypeError("No collections provided to findDataset, " 

1034 "and no defaults from registry construction.") 

1035 collections = self.defaults.collections 

1036 else: 

1037 collections = CollectionQuery.fromExpression(collections) 

1038 TimespanReprClass = self._db.getTimespanRepresentation() 

1039 if isinstance(datasetType, str): 

1040 storage = self._managers.datasets[datasetType] 

1041 else: 

1042 storage = self._managers.datasets[datasetType.name] 

1043 for collectionRecord in collections.iter(self._managers.collections, 

1044 collectionTypes=frozenset(collectionTypes), 

1045 flattenChains=flattenChains): 

1046 query = storage.select(collectionRecord) 

1047 for row in self._db.query(query.combine()).mappings(): 

1048 dataId = DataCoordinate.fromRequiredValues( 

1049 storage.datasetType.dimensions, 

1050 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1051 ) 

1052 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1053 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1054 conform=False) 

1055 if collectionRecord.type is CollectionType.CALIBRATION: 

1056 timespan = TimespanReprClass.extract(row) 

1057 else: 

1058 timespan = None 

1059 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1060 

1061 storageClasses: StorageClassFactory 

1062 """All storage classes known to the registry (`StorageClassFactory`). 

1063 """