Coverage for python/lsst/daf/butler/registries/sql.py: 13%

462 statements  

coverage.py v6.5.0, created at 2022-12-01 19:55 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "SqlRegistry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 TYPE_CHECKING, 

41 Union, 

42) 

43 

44import sqlalchemy 

45 

46from ..core import ( 

47 ButlerURI, 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetAssociation, 

53 DatasetId, 

54 DatasetRef, 

55 DatasetType, 

56 ddl, 

57 Dimension, 

58 DimensionConfig, 

59 DimensionElement, 

60 DimensionGraph, 

61 DimensionRecord, 

62 DimensionUniverse, 

63 NamedKeyMapping, 

64 NameLookupMapping, 

65 Progress, 

66 StorageClassFactory, 

67 Timespan, 

68) 

69from ..core.utils import iterable, transactional 

70 

71from ..registry import ( 

72 Registry, 

73 RegistryConfig, 

74 CollectionType, 

75 RegistryDefaults, 

76 ConflictingDefinitionError, 

77 InconsistentDataIdError, 

78 OrphanedRecordError, 

79 CollectionSearch, 

80) 

81from ..registry import queries 

82from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis 

83from ..registry.summaries import CollectionSummary 

84from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances 

85from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord 

86 

87if TYPE_CHECKING: 

88 from .._butlerConfig import ButlerConfig 

89 from ..registry.interfaces import ( 

90 CollectionRecord, 

91 Database, 

92 DatastoreRegistryBridgeManager, 

93 ) 

94 

95 

96_LOG = logging.getLogger(__name__) 

97 

98 

99class SqlRegistry(Registry): 

100 """Registry implementation based on SQLAlchemy. 

101 

102 Parameters 

103 ---------- 

104 database : `Database` 

105 Database instance used to store the Registry data. 

106 defaults : `RegistryDefaults` 

107 Default collection search path and/or output `~CollectionType.RUN` 

108 collection. 

109 managers : `RegistryManagerInstances` 

110 All the managers required for this registry. 

111 """ 

112 

113 defaultConfigFile: Optional[str] = None 

114 """Path to configuration defaults. Accessed within the ``configs`` resource 

115 or relative to a search path. Can be None if no defaults specified. 

116 """ 

117 

118 @classmethod 

119 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

120 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

121 butlerRoot: Optional[str] = None) -> Registry: 

122 """Create registry database and return `SqlRegistry` instance. 

123 

124 This method initializes database contents; the database must be empty 

125 prior to calling this method. 

126 

127 Parameters 

128 ---------- 

129 config : `RegistryConfig` or `str`, optional 

130 Registry configuration; if missing, the default configuration will 

131 be loaded from ``registry.yaml``. 

132 dimensionConfig : `DimensionConfig` or `str`, optional 

133 Dimensions configuration; if missing, the default configuration 

134 will be loaded from ``dimensions.yaml``. 

135 butlerRoot : `str`, optional 

136 Path to the repository root this `SqlRegistry` will manage. 

137 

138 Returns 

139 ------- 

140 registry : `SqlRegistry` 

141 A new `SqlRegistry` instance. 

142 """ 

143 config = cls.forceRegistryConfig(config) 

144 config.replaceRoot(butlerRoot) 

145 

146 if isinstance(dimensionConfig, str): 

147 dimensionConfig = DimensionConfig(dimensionConfig) 

148 elif dimensionConfig is None: 

149 dimensionConfig = DimensionConfig() 

150 elif not isinstance(dimensionConfig, DimensionConfig): 

151 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

152 

153 DatabaseClass = config.getDatabaseClass() 

154 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

155 namespace=config.get("namespace")) 

156 managerTypes = RegistryManagerTypes.fromConfig(config) 

157 managers = managerTypes.makeRepo(database, dimensionConfig) 

158 return cls(database, RegistryDefaults(), managers) 
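
# [Editor's example] Illustrative sketch, not part of the original module: bootstrapping a
# brand-new, empty registry database from a configuration file. The config path and the
# repository root below are hypothetical placeholders.
from lsst.daf.butler.registries.sql import SqlRegistry

new_registry = SqlRegistry.createFromConfig("registry.yaml", butlerRoot="/path/to/repo")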

159 

160 @classmethod 

161 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

162 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True, 

163 defaults: Optional[RegistryDefaults] = None) -> Registry: 

164 """Create `Registry` subclass instance from `config`. 

165 

166 Registry database must be initialized prior to calling this method. 

167 

168 Parameters 

169 ---------- 

170 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

171 Registry configuration. 

172 butlerRoot : `str` or `ButlerURI`, optional 

173 Path to the repository root this `Registry` will manage. 

174 writeable : `bool`, optional 

175 If `True` (default) create a read-write connection to the database. 

176 defaults : `RegistryDefaults`, optional 

177 Default collection search path and/or output `~CollectionType.RUN` 

178 collection. 

179 

180 Returns 

181 ------- 

182 registry : `SqlRegistry` (subclass) 

183 A new `SqlRegistry` subclass instance. 

184 """ 

185 config = cls.forceRegistryConfig(config) 

186 config.replaceRoot(butlerRoot) 

187 DatabaseClass = config.getDatabaseClass() 

188 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

189 namespace=config.get("namespace"), writeable=writeable) 

190 managerTypes = RegistryManagerTypes.fromConfig(config) 

191 managers = managerTypes.loadRepo(database) 

192 if defaults is None: 

193 defaults = RegistryDefaults() 

194 return cls(database, defaults, managers) 
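
# [Editor's example] Illustrative sketch, not part of the original module: connecting to an
# already-initialized repository in read-only mode. The config path and root are hypothetical.
from lsst.daf.butler.registries.sql import SqlRegistry

registry = SqlRegistry.fromConfig("registry.yaml", butlerRoot="/path/to/repo", writeable=False)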

195 

196 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

197 self._db = database 

198 self._managers = managers 

199 self.storageClasses = StorageClassFactory() 

200 # Intentionally invoke property setter to initialize defaults. This 

201 # can only be done after most of the rest of Registry has already been 

202 # initialized, and must be done before the property getter is used. 

203 self.defaults = defaults 

204 

205 def __str__(self) -> str: 

206 return str(self._db) 

207 

208 def __repr__(self) -> str: 

209 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

210 

211 def isWriteable(self) -> bool: 

212 # Docstring inherited from lsst.daf.butler.registry.Registry 

213 return self._db.isWriteable() 

214 

215 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

216 # Docstring inherited from lsst.daf.butler.registry.Registry 

217 if defaults is None: 

218 # No need to copy, because `RegistryDefaults` is immutable; we 

219 # effectively copy on write. 

220 defaults = self.defaults 

221 return type(self)(self._db, defaults, self._managers) 

222 

223 @property 

224 def dimensions(self) -> DimensionUniverse: 

225 # Docstring inherited from lsst.daf.butler.registry.Registry 

226 return self._managers.dimensions.universe 

227 

228 def refresh(self) -> None: 

229 # Docstring inherited from lsst.daf.butler.registry.Registry 

230 self._managers.refresh() 

231 

232 @contextlib.contextmanager 

233 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

234 # Docstring inherited from lsst.daf.butler.registry.Registry 

235 try: 

236 with self._db.transaction(savepoint=savepoint): 

237 yield 

238 except BaseException: 

239 # TODO: this clears the caches sometimes when we wouldn't actually 

240 # need to. Can we avoid that? 

241 self._managers.dimensions.clearCaches() 

242 raise 
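
# [Editor's example] Hypothetical use of the transaction() context manager, given a writeable
# SqlRegistry instance ``registry``: everything inside the block is committed together, or
# rolled back if any exception escapes. The collection names are placeholders.
with registry.transaction(savepoint=True):
    registry.registerRun("u/someone/example-run")
    registry.registerCollection("example/tagged")  # defaults to CollectionType.TAGGED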

243 

244 def resetConnectionPool(self) -> None: 

245 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

246 

247 This operation is useful when using the registry with fork-based 

248 multiprocessing. To use the registry across a fork boundary one has to 

249 make sure that there are no currently active connections (no session or 

250 transaction is in progress) and that the connection pool is reset using 

251 this method. This method should be called by the child process 

252 immediately after the fork. 

253 """ 

254 self._db._engine.dispose() 
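
# [Editor's example] Sketch of the fork pattern described in the docstring above, assuming no
# session or transaction is open at fork time; the child-side work is hypothetical.
import os

pid = os.fork()
if pid == 0:
    registry.resetConnectionPool()  # child must reset the pool before touching the registry
    ...  # perform child-side registry work here
    os._exit(0)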

255 

256 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

257 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

258 other data repository client. 

259 

260 Opaque table records can be added via `insertOpaqueData`, retrieved via 

261 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

262 

263 Parameters 

264 ---------- 

265 tableName : `str` 

266 Logical name of the opaque table. This may differ from the 

267 actual name used in the database by a prefix and/or suffix. 

268 spec : `ddl.TableSpec` 

269 Specification for the table to be added. 

270 """ 

271 self._managers.opaque.register(tableName, spec) 

272 

273 @transactional 

274 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

275 """Insert records into an opaque table. 

276 

277 Parameters 

278 ---------- 

279 tableName : `str` 

280 Logical name of the opaque table. Must match the name used in a 

281 previous call to `registerOpaqueTable`. 

282 data 

283 Each additional positional argument is a dictionary that represents 

284 a single row to be added. 

285 """ 

286 self._managers.opaque[tableName].insert(*data) 

287 

288 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

289 """Retrieve records from an opaque table. 

290 

291 Parameters 

292 ---------- 

293 tableName : `str` 

294 Logical name of the opaque table. Must match the name used in a 

295 previous call to `registerOpaqueTable`. 

296 where 

297 Additional keyword arguments are interpreted as equality 

298 constraints that restrict the returned rows (combined with AND); 

299 keyword arguments are column names and values are the values they 

300 must have. 

301 

302 Yields 

303 ------ 

304 row : `dict` 

305 A dictionary representing a single result row. 

306 """ 

307 yield from self._managers.opaque[tableName].fetch(**where) 

308 

309 @transactional 

310 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

311 """Remove records from an opaque table. 

312 

313 Parameters 

314 ---------- 

315 tableName : `str` 

316 Logical name of the opaque table. Must match the name used in a 

317 previous call to `registerOpaqueTable`. 

318 where 

319 Additional keyword arguments are interpreted as equality 

320 constraints that restrict the deleted rows (combined with AND); 

321 keyword arguments are column names and values are the values they 

322 must have. 

323 """ 

324 self._managers.opaque[tableName].delete(where.keys(), where) 
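
# [Editor's example] Hypothetical round trip through the opaque-table API, given a writeable
# SqlRegistry instance ``registry``. The table name, columns, and the exact ddl.FieldSpec
# signature used here are assumptions for illustration only.
spec = ddl.TableSpec(fields=[
    ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256, primaryKey=True),
    ddl.FieldSpec(name="checksum", dtype=sqlalchemy.String, length=64),
])
registry.registerOpaqueTable("example_datastore_records", spec)
registry.insertOpaqueData("example_datastore_records", {"path": "a/b.fits", "checksum": "abc123"})
rows = list(registry.fetchOpaqueData("example_datastore_records", path="a/b.fits"))
registry.deleteOpaqueData("example_datastore_records", path="a/b.fits")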

325 

326 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

327 doc: Optional[str] = None) -> bool: 

328 # Docstring inherited from lsst.daf.butler.registry.Registry 

329 _, registered = self._managers.collections.register(name, type, doc=doc) 

330 return registered 

331 

332 def getCollectionType(self, name: str) -> CollectionType: 

333 # Docstring inherited from lsst.daf.butler.registry.Registry 

334 return self._managers.collections.find(name).type 

335 

336 def _get_collection_record(self, name: str) -> CollectionRecord: 

337 # Docstring inherited from lsst.daf.butler.registry.Registry 

338 return self._managers.collections.find(name) 

339 

340 def registerRun(self, name: str, doc: Optional[str] = None) -> bool: 

341 # Docstring inherited from lsst.daf.butler.registry.Registry 

342 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

343 return registered 

344 

345 @transactional 

346 def removeCollection(self, name: str) -> None: 

347 # Docstring inherited from lsst.daf.butler.registry.Registry 

348 self._managers.collections.remove(name) 

349 

350 def getCollectionChain(self, parent: str) -> CollectionSearch: 

351 # Docstring inherited from lsst.daf.butler.registry.Registry 

352 record = self._managers.collections.find(parent) 

353 if record.type is not CollectionType.CHAINED: 

354 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

355 assert isinstance(record, ChainedCollectionRecord) 

356 return record.children 

357 

358 @transactional 

359 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

360 # Docstring inherited from lsst.daf.butler.registry.Registry 

361 record = self._managers.collections.find(parent) 

362 if record.type is not CollectionType.CHAINED: 

363 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

364 assert isinstance(record, ChainedCollectionRecord) 

365 children = CollectionSearch.fromExpression(children) 

366 if children != record.children or flatten: 

367 record.update(self._managers.collections, children, flatten=flatten) 
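
# [Editor's example] Hypothetical sketch of building a CHAINED collection and reading its
# search path back; all collection names are placeholders.
registry.registerCollection("example/chain", CollectionType.CHAINED)
registry.registerRun("example/run1")
registry.registerRun("example/run2")
registry.setCollectionChain("example/chain", ["example/run2", "example/run1"])
children = registry.getCollectionChain("example/chain")  # CollectionSearch over the two runs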

368 

369 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

370 # Docstring inherited from lsst.daf.butler.registry.Registry 

371 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

372 

373 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

374 # Docstring inherited from lsst.daf.butler.registry.Registry 

375 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

376 

377 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

378 # Docstring inherited from lsst.daf.butler.registry.Registry 

379 record = self._managers.collections.find(collection) 

380 return self._managers.datasets.getCollectionSummary(record) 

381 

382 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

383 # Docstring inherited from lsst.daf.butler.registry.Registry 

384 _, inserted = self._managers.datasets.register(datasetType) 

385 return inserted 

386 

387 def removeDatasetType(self, name: str) -> None: 

388 # Docstring inherited from lsst.daf.butler.registry.Registry 

389 self._managers.datasets.remove(name) 

390 

391 def getDatasetType(self, name: str) -> DatasetType: 

392 # Docstring inherited from lsst.daf.butler.registry.Registry 

393 return self._managers.datasets[name].datasetType 

394 

395 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: 

396 # Docstring inherited from lsst.daf.butler.registry.Registry 

397 return self._managers.datasets.supportsIdGenerationMode(mode) 

398 

399 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

400 collections: Any = None, timespan: Optional[Timespan] = None, 

401 **kwargs: Any) -> Optional[DatasetRef]: 

402 # Docstring inherited from lsst.daf.butler.registry.Registry 

403 if isinstance(datasetType, DatasetType): 

404 storage = self._managers.datasets[datasetType.name] 

405 else: 

406 storage = self._managers.datasets[datasetType] 

407 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

408 universe=self.dimensions, defaults=self.defaults.dataId, 

409 **kwargs) 

410 if collections is None: 

411 if not self.defaults.collections: 

412 raise TypeError("No collections provided to findDataset, " 

413 "and no defaults from registry construction.") 

414 collections = self.defaults.collections 

415 else: 

416 collections = CollectionSearch.fromExpression(collections) 

417 for collectionRecord in collections.iter(self._managers.collections): 

418 if (collectionRecord.type is CollectionType.CALIBRATION 

419 and (not storage.datasetType.isCalibration() or timespan is None)): 

420 continue 

421 result = storage.find(collectionRecord, dataId, timespan=timespan) 

422 if result is not None: 

423 return result 

424 

425 return None 
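
# [Editor's example] Hypothetical single-dataset lookup; the dataset type name, collection,
# and data ID values are placeholders.
ref = registry.findDataset("calexp", instrument="HSC", visit=903334, detector=16,
                           collections=["HSC/runs/example"])
if ref is not None:
    print(ref.id, ref.run)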

426 

427 @transactional 

428 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

429 run: Optional[str] = None, expand: bool = True, 

430 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]: 

431 # Docstring inherited from lsst.daf.butler.registry.Registry 

432 if isinstance(datasetType, DatasetType): 

433 storage = self._managers.datasets.find(datasetType.name) 

434 if storage is None: 

435 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

436 else: 

437 storage = self._managers.datasets.find(datasetType) 

438 if storage is None: 

439 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

440 if run is None: 

441 if self.defaults.run is None: 

442 raise TypeError("No run provided to insertDatasets, " 

443 "and no default from registry construction.") 

444 run = self.defaults.run 

445 runRecord = self._managers.collections.find(run) 

446 if runRecord.type is not CollectionType.RUN: 

447 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

448 assert isinstance(runRecord, RunRecord) 

449 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

450 if expand: 

451 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

452 for dataId in progress.wrap(dataIds, 

453 f"Expanding {storage.datasetType.name} data IDs")] 

454 else: 

455 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) 

456 for dataId in dataIds] 

457 try: 

458 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

459 except sqlalchemy.exc.IntegrityError as err: 

460 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

461 f"one or more datasets of type {storage.datasetType} into " 

462 f"collection '{run}'. " 

463 f"This probably means a dataset with the same data ID " 

464 f"and dataset type already exists, but it may also mean a " 

465 f"dimension row is missing.") from err 

466 return refs 
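
# [Editor's example] Hypothetical end-to-end insert: register a dataset type and a RUN
# collection, then insert one dataset. Names are placeholders, and the instrument/detector
# dimension rows for the data ID must already exist (see the insertDimensionData sketch below).
flat = DatasetType("example_flat", dimensions=["instrument", "detector"],
                   storageClass="ExposureF", universe=registry.dimensions)
registry.registerDatasetType(flat)
registry.registerRun("example/calib-run")
(ref,) = registry.insertDatasets(flat, [{"instrument": "DummyCam", "detector": 0}],
                                 run="example/calib-run")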

467 

468 @transactional 

469 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True, 

470 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

471 reuseIds: bool = False) -> List[DatasetRef]: 

472 # Docstring inherited from lsst.daf.butler.registry.Registry 

473 datasets = list(datasets) 

474 if not datasets: 

475 # nothing to do 

476 return [] 

477 

478 # find dataset type 

479 datasetTypes = set(dataset.datasetType for dataset in datasets) 

480 if len(datasetTypes) != 1: 

481 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}") 

482 datasetType = datasetTypes.pop() 

483 

484 # get storage handler for this dataset type 

485 storage = self._managers.datasets.find(datasetType.name) 

486 if storage is None: 

487 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

488 

489 # find run name 

490 runs = set(dataset.run for dataset in datasets) 

491 if len(runs) != 1: 

492 raise ValueError(f"Multiple run names in input datasets: {runs}") 

493 run = runs.pop() 

494 if run is None: 

495 if self.defaults.run is None: 

496 raise TypeError("No run provided to ingestDatasets, " 

497 "and no default from registry construction.") 

498 run = self.defaults.run 

499 

500 runRecord = self._managers.collections.find(run) 

501 if runRecord.type is not CollectionType.RUN: 

502 raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};" 

503 " RUN collection required.") 

504 assert isinstance(runRecord, RunRecord) 

505 

506 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

507 if expand: 

508 expandedDatasets = [ 

509 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

510 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")] 

511 else: 

512 expandedDatasets = [ 

513 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

514 for dataset in datasets 

515 ] 

516 

517 try: 

518 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

519 except sqlalchemy.exc.IntegrityError as err: 

520 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

521 f"one or more datasets of type {storage.datasetType} into " 

522 f"collection '{run}'. " 

523 f"This probably means a dataset with the same data ID " 

524 f"and dataset type already exists, but it may also mean a " 

525 f"dimension row is missing.") from err 

526 return refs 

527 

528 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

529 # Docstring inherited from lsst.daf.butler.registry.Registry 

530 return self._managers.datasets.getDatasetRef(id) 

531 

532 @transactional 

533 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

534 # Docstring inherited from lsst.daf.butler.registry.Registry 

535 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

536 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

537 desc="Removing datasets by type"): 

538 storage = self._managers.datasets[datasetType.name] 

539 try: 

540 storage.delete(refsForType) 

541 except sqlalchemy.exc.IntegrityError as err: 

542 raise OrphanedRecordError("One or more datasets is still " 

543 "present in one or more Datastores.") from err 

544 

545 @transactional 

546 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

547 # Docstring inherited from lsst.daf.butler.registry.Registry 

548 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

549 collectionRecord = self._managers.collections.find(collection) 

550 if collectionRecord.type is not CollectionType.TAGGED: 

551 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

552 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

553 desc="Associating datasets by type"): 

554 storage = self._managers.datasets[datasetType.name] 

555 try: 

556 storage.associate(collectionRecord, refsForType) 

557 except sqlalchemy.exc.IntegrityError as err: 

558 raise ConflictingDefinitionError( 

559 f"Constraint violation while associating dataset of type {datasetType.name} with " 

560 f"collection {collection}. This probably means that one or more datasets with the same " 

561 f"dataset type and data ID already exist in the collection, but it may also indicate " 

562 f"that the datasets do not exist." 

563 ) from err 

564 

565 @transactional 

566 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

567 # Docstring inherited from lsst.daf.butler.registry.Registry 

568 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

569 collectionRecord = self._managers.collections.find(collection) 

570 if collectionRecord.type is not CollectionType.TAGGED: 

571 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

572 "expected TAGGED.") 

573 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

574 desc="Disassociating datasets by type"): 

575 storage = self._managers.datasets[datasetType.name] 

576 storage.disassociate(collectionRecord, refsForType) 

577 

578 @transactional 

579 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

580 # Docstring inherited from lsst.daf.butler.registry.Registry 

581 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

582 collectionRecord = self._managers.collections.find(collection) 

583 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

584 desc="Certifying datasets by type"): 

585 storage = self._managers.datasets[datasetType.name] 

586 storage.certify(collectionRecord, refsForType, timespan) 

587 

588 @transactional 

589 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

590 dataIds: Optional[Iterable[DataId]] = None) -> None: 

591 # Docstring inherited from lsst.daf.butler.registry.Registry 

592 collectionRecord = self._managers.collections.find(collection) 

593 if isinstance(datasetType, str): 

594 storage = self._managers.datasets[datasetType] 

595 else: 

596 storage = self._managers.datasets[datasetType.name] 

597 standardizedDataIds = None 

598 if dataIds is not None: 

599 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

600 for d in dataIds] 

601 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 
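
# [Editor's example] Hypothetical calibration-collection workflow: certify refs as valid over
# an unbounded timespan, then decertify that dataset type again. ``refs`` would come from
# insertDatasets or a query; the collection and dataset type names are placeholders.
registry.registerCollection("example/calib", CollectionType.CALIBRATION)
registry.certify("example/calib", refs, Timespan(None, None))
registry.decertify("example/calib", "example_flat", Timespan(None, None))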

602 

603 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

604 """Return an object that allows a new `Datastore` instance to 

605 communicate with this `Registry`. 

606 

607 Returns 

608 ------- 

609 manager : `DatastoreRegistryBridgeManager` 

610 Object that mediates communication between this `Registry` and its 

611 associated datastores. 

612 """ 

613 return self._managers.datastores 

614 

615 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

616 # Docstring inherited from lsst.daf.butler.registry.Registry 

617 return self._managers.datastores.findDatastores(ref) 

618 

619 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

620 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

621 withDefaults: bool = True, 

622 **kwargs: Any) -> DataCoordinate: 

623 # Docstring inherited from lsst.daf.butler.registry.Registry 

624 if not withDefaults: 

625 defaults = None 

626 else: 

627 defaults = self.defaults.dataId 

628 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, 

629 defaults=defaults, **kwargs) 

630 if standardized.hasRecords(): 

631 return standardized 

632 if records is None: 

633 records = {} 

634 elif isinstance(records, NamedKeyMapping): 

635 records = records.byName() 

636 else: 

637 records = dict(records) 

638 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

639 records.update(dataId.records.byName()) 

640 keys = standardized.byName() 

641 for element in standardized.graph.primaryKeyTraversalOrder: 

642 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

643 if record is ...: 

644 if isinstance(element, Dimension) and keys.get(element.name) is None: 

645 if element in standardized.graph.required: 

646 raise LookupError( 

647 f"No value or null value for required dimension {element.name}." 

648 ) 

649 keys[element.name] = None 

650 record = None 

651 else: 

652 storage = self._managers.dimensions[element] 

653 dataIdSet = DataCoordinateIterable.fromScalar( 

654 DataCoordinate.standardize(keys, graph=element.graph) 

655 ) 

656 fetched = tuple(storage.fetch(dataIdSet)) 

657 try: 

658 (record,) = fetched 

659 except ValueError: 

660 record = None 

661 records[element.name] = record 

662 if record is not None: 

663 for d in element.implied: 

664 value = getattr(record, d.name) 

665 if keys.setdefault(d.name, value) != value: 

666 raise InconsistentDataIdError( 

667 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

668 f"but {element.name} implies {d.name}={value!r}." 

669 ) 

670 else: 

671 if element in standardized.graph.required: 

672 raise LookupError( 

673 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

674 ) 

675 if element.alwaysJoin: 

676 raise InconsistentDataIdError( 

677 f"Could not fetch record for element {element.name} via keys {keys}, ", 

678 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

679 "related." 

680 ) 

681 for d in element.implied: 

682 keys.setdefault(d.name, None) 

683 records.setdefault(d.name, None) 

684 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 
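
# [Editor's example] Hypothetical expansion of a minimal data ID into one carrying full
# dimension records; the dimension key values are placeholders.
data_id = registry.expandDataId(instrument="HSC", detector=16)
assert data_id.hasRecords()
detector_record = data_id.records["detector"]  # DimensionRecord with full metadata, or None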

685 

686 def insertDimensionData(self, element: Union[DimensionElement, str], 

687 *data: Union[Mapping[str, Any], DimensionRecord], 

688 conform: bool = True, 

689 replace: bool = False) -> None: 

690 # Docstring inherited from lsst.daf.butler.registry.Registry 

691 if conform: 

692 if isinstance(element, str): 

693 element = self.dimensions[element] 

694 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

695 for row in data] 

696 else: 

697 # Ignore typing since caller said to trust them with conform=False. 

698 records = data # type: ignore 

699 storage = self._managers.dimensions[element] # type: ignore 

700 storage.insert(*records, replace=replace) 

701 

702 def syncDimensionData(self, element: Union[DimensionElement, str], 

703 row: Union[Mapping[str, Any], DimensionRecord], 

704 conform: bool = True, 

705 update: bool = False) -> Union[bool, Dict[str, Any]]: 

706 # Docstring inherited from lsst.daf.butler.registry.Registry 

707 if conform: 

708 if isinstance(element, str): 

709 element = self.dimensions[element] 

710 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

711 else: 

712 # Ignore typing since caller said to trust them with conform=False. 

713 record = row # type: ignore 

714 storage = self._managers.dimensions[element] # type: ignore 

715 return storage.sync(record, update=update) 
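
# [Editor's example] Hypothetical dimension-record insertion and sync; the record values are
# placeholders, and real instrument/detector records carry additional optional fields that
# are omitted here.
registry.insertDimensionData("instrument", {"name": "DummyCam", "class_name": "example.DummyCam"})
registry.syncDimensionData("detector", {"instrument": "DummyCam", "id": 0, "full_name": "S0"},
                           update=True)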

716 

717 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

718 ) -> Iterator[DatasetType]: 

719 # Docstring inherited from lsst.daf.butler.registry.Registry 

720 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

721 if wildcard is Ellipsis: 

722 for datasetType in self._managers.datasets: 

723 # The dataset type can no longer be a component 

724 yield datasetType 

725 if components: 

726 # Automatically create the component dataset types 

727 try: 

728 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

729 except KeyError as err: 

730 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

731 "if it has components they will not be included in query results.") 

732 else: 

733 yield from componentsForDatasetType 

734 return 

735 done: Set[str] = set() 

736 for name in wildcard.strings: 

737 storage = self._managers.datasets.find(name) 

738 if storage is not None: 

739 done.add(storage.datasetType.name) 

740 yield storage.datasetType 

741 if wildcard.patterns: 

742 # If components (the argument) is None, we'll save component 

743 # dataset types that we might want to match, but only if their parents 

744 # didn't get included. 

745 componentsForLater = [] 

746 for registeredDatasetType in self._managers.datasets: 

747 # Components are not stored in registry so expand them here 

748 allDatasetTypes = [registeredDatasetType] 

749 try: 

750 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

751 except KeyError as err: 

752 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

753 "if it has components they will not be included in query results.") 

754 for datasetType in allDatasetTypes: 

755 if datasetType.name in done: 

756 continue 

757 parentName, componentName = datasetType.nameAndComponent() 

758 if componentName is not None and not components: 

759 if components is None and parentName not in done: 

760 componentsForLater.append(datasetType) 

761 continue 

762 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

763 done.add(datasetType.name) 

764 yield datasetType 

765 # Go back and try to match saved components. 

766 for datasetType in componentsForLater: 

767 parentName, _ = datasetType.nameAndComponent() 

768 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

769 yield datasetType 

770 

771 def queryCollections(self, expression: Any = ..., 

772 datasetType: Optional[DatasetType] = None, 

773 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

774 flattenChains: bool = False, 

775 includeChains: Optional[bool] = None) -> Iterator[str]: 

776 # Docstring inherited from lsst.daf.butler.registry.Registry 

777 

778 # Right now the datasetType argument is completely ignored, but that 

779 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

780 # ticket will take care of that. 

781 query = CollectionQuery.fromExpression(expression) 

782 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes), 

783 flattenChains=flattenChains, includeChains=includeChains): 

784 yield record.name 

785 

786 def _makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

787 """Return a `QueryBuilder` instance capable of constructing and 

788 managing more complex queries than those obtainable via `Registry` 

789 interfaces. 

790 

791 This is an advanced interface; downstream code should prefer 

792 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

793 are sufficient. 

794 

795 Parameters 

796 ---------- 

797 summary : `queries.QuerySummary` 

798 Object describing and categorizing the full set of dimensions that 

799 will be included in the query. 

800 

801 Returns 

802 ------- 

803 builder : `queries.QueryBuilder` 

804 Object that can be used to construct and perform advanced queries. 

805 """ 

806 return queries.QueryBuilder( 

807 summary, 

808 queries.RegistryManagers( 

809 collections=self._managers.collections, 

810 dimensions=self._managers.dimensions, 

811 datasets=self._managers.datasets, 

812 TimespanReprClass=self._db.getTimespanRepresentation(), 

813 ), 

814 ) 

815 

816 def queryDatasets(self, datasetType: Any, *, 

817 collections: Any = None, 

818 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

819 dataId: Optional[DataId] = None, 

820 where: Optional[str] = None, 

821 findFirst: bool = False, 

822 components: Optional[bool] = None, 

823 bind: Optional[Mapping[str, Any]] = None, 

824 check: bool = True, 

825 **kwargs: Any) -> queries.DatasetQueryResults: 

826 # Docstring inherited from lsst.daf.butler.registry.Registry 

827 

828 # Standardize the collections expression. 

829 if collections is None: 

830 if not self.defaults.collections: 

831 raise TypeError("No collections provided to findDataset, " 

832 "and no defaults from registry construction.") 

833 collections = self.defaults.collections 

834 elif findFirst: 

835 collections = CollectionSearch.fromExpression(collections) 

836 else: 

837 collections = CollectionQuery.fromExpression(collections) 

838 # Standardize and expand the data ID provided as a constraint. 

839 standardizedDataId = self.expandDataId(dataId, **kwargs) 

840 

841 # We can only query directly if given a non-component DatasetType 

842 # instance. If we were given an expression or str or a component 

843 # DatasetType instance, we'll populate this dict, recurse, and return. 

844 # If we already have a non-component DatasetType, it will remain None 

845 # and we'll run the query directly. 

846 composition: Optional[ 

847 Dict[ 

848 DatasetType, # parent dataset type 

849 List[Optional[str]] # component name, or None for parent 

850 ] 

851 ] = None 

852 if not isinstance(datasetType, DatasetType): 

853 # We were given a dataset type expression (which may be as simple 

854 # as a str). Loop over all matching datasets, delegating handling 

855 # of the `components` argument to queryDatasetTypes, as we populate 

856 # the composition dict. 

857 composition = defaultdict(list) 

858 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

859 parentName, componentName = trueDatasetType.nameAndComponent() 

860 if componentName is not None: 

861 parentDatasetType = self.getDatasetType(parentName) 

862 composition.setdefault(parentDatasetType, []).append(componentName) 

863 else: 

864 composition.setdefault(trueDatasetType, []).append(None) 

865 if not composition: 

866 return queries.ChainedDatasetQueryResults( 

867 [], 

868 doomed_by=[f"No registered dataset type matching {t!r} found." 

869 for t in iterable(datasetType)], 

870 ) 

871 elif datasetType.isComponent(): 

872 # We were given a true DatasetType instance, but it's a component. 

873 # The composition dict will have exactly one item. 

874 parentName, componentName = datasetType.nameAndComponent() 

875 parentDatasetType = self.getDatasetType(parentName) 

876 composition = {parentDatasetType: [componentName]} 

877 if composition is not None: 

878 # We need to recurse. Do that once for each parent dataset type. 

879 chain = [] 

880 for parentDatasetType, componentNames in composition.items(): 

881 parentResults = self.queryDatasets( 

882 parentDatasetType, 

883 collections=collections, 

884 dimensions=dimensions, 

885 dataId=standardizedDataId, 

886 where=where, 

887 bind=bind, 

888 findFirst=findFirst, 

889 check=check, 

890 ) 

891 assert isinstance(parentResults, queries.ParentDatasetQueryResults), \ 

892 "Should always be true if passing in a DatasetType instance, and we are." 

893 chain.append( 

894 parentResults.withComponents(componentNames) 

895 ) 

896 return queries.ChainedDatasetQueryResults(chain) 

897 # If we get here, there's no need to recurse (or we are already 

898 # recursing; there can only ever be one level of recursion). 

899 

900 # The full set of dimensions in the query is the combination of those 

901 # needed for the DatasetType and those explicitly requested, if any. 

902 requestedDimensionNames = set(datasetType.dimensions.names) 

903 if dimensions is not None: 

904 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

905 # Construct the summary structure needed to construct a QueryBuilder. 

906 summary = queries.QuerySummary( 

907 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

908 dataId=standardizedDataId, 

909 expression=where, 

910 bind=bind, 

911 defaults=self.defaults.dataId, 

912 check=check, 

913 ) 

914 builder = self._makeQueryBuilder(summary) 

915 # Add the dataset subquery to the query, telling the QueryBuilder to 

916 # include the rank of the selected collection in the results only if we 

917 # need to findFirst. Note that if any of the collections are 

918 # actually wildcard expressions, and we've asked for deduplication, 

919 # this will raise TypeError for us. 

920 builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst) 

921 query = builder.finish() 

922 return queries.ParentDatasetQueryResults(self._db, query, components=[None], datasetType=datasetType) 
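
# [Editor's example] Hypothetical dataset query; the dataset type name, collections, and the
# where expression are placeholders.
refs = registry.queryDatasets("calexp",
                              collections=["HSC/runs/example"],
                              where="instrument='HSC' AND visit=903334",
                              findFirst=True)
for ref in refs:
    print(ref.dataId, ref.run)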

923 

924 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

925 dataId: Optional[DataId] = None, 

926 datasets: Any = None, 

927 collections: Any = None, 

928 where: Optional[str] = None, 

929 components: Optional[bool] = None, 

930 bind: Optional[Mapping[str, Any]] = None, 

931 check: bool = True, 

932 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

933 # Docstring inherited from lsst.daf.butler.registry.Registry 

934 dimensions = iterable(dimensions) 

935 standardizedDataId = self.expandDataId(dataId, **kwargs) 

936 standardizedDatasetTypes = set() 

937 requestedDimensions = self.dimensions.extract(dimensions) 

938 queryDimensionNames = set(requestedDimensions.names) 

939 if datasets is not None: 

940 if not collections: 

941 if not self.defaults.collections: 

942 raise TypeError(f"Cannot pass 'datasets' (='{datasets}') without 'collections'.") 

943 collections = self.defaults.collections 

944 else: 

945 # Preprocess collections expression in case the original 

946 # included single-pass iterators (we'll want to use it multiple 

947 # times below). 

948 collections = CollectionQuery.fromExpression(collections) 

949 for datasetType in self.queryDatasetTypes(datasets, components=components): 

950 queryDimensionNames.update(datasetType.dimensions.names) 

951 # If any matched dataset type is a component, just operate on 

952 # its parent instead, because Registry doesn't know anything 

953 # about what components exist, and here (unlike queryDatasets) 

954 # we don't care about returning them. 

955 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

956 if componentName is not None: 

957 datasetType = self.getDatasetType(parentDatasetTypeName) 

958 standardizedDatasetTypes.add(datasetType) 

959 elif collections: 

960 raise TypeError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.") 

961 

962 summary = queries.QuerySummary( 

963 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

964 dataId=standardizedDataId, 

965 expression=where, 

966 bind=bind, 

967 defaults=self.defaults.dataId, 

968 check=check, 

969 ) 

970 builder = self._makeQueryBuilder(summary) 

971 for datasetType in standardizedDatasetTypes: 

972 builder.joinDataset(datasetType, collections, isResult=False) 

973 query = builder.finish() 

974 return queries.DataCoordinateQueryResults(self._db, query) 
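
# [Editor's example] Hypothetical data ID query constrained by the existence of a dataset in
# the given collections; all names and the where expression are placeholders.
data_ids = registry.queryDataIds(["visit", "detector"],
                                 datasets="raw",
                                 collections=["HSC/raw/all"],
                                 where="instrument='HSC' AND visit > 900000")
for data_id in data_ids:
    print(data_id)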

975 

976 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

977 dataId: Optional[DataId] = None, 

978 datasets: Any = None, 

979 collections: Any = None, 

980 where: Optional[str] = None, 

981 components: Optional[bool] = None, 

982 bind: Optional[Mapping[str, Any]] = None, 

983 check: bool = True, 

984 **kwargs: Any) -> Iterator[DimensionRecord]: 

985 # Docstring inherited from lsst.daf.butler.registry.Registry 

986 if not isinstance(element, DimensionElement): 

987 try: 

988 element = self.dimensions[element] 

989 except KeyError as e: 

990 raise KeyError(f"No such dimension '{element}', available dimensions: " 

991 + str(self.dimensions.getStaticElements())) from e 

992 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

993 where=where, components=components, bind=bind, check=check, **kwargs) 

994 return iter(self._managers.dimensions[element].fetch(dataIds)) 

995 

996 def queryDatasetAssociations( 

997 self, 

998 datasetType: Union[str, DatasetType], 

999 collections: Any = ..., 

1000 *, 

1001 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1002 flattenChains: bool = False, 

1003 ) -> Iterator[DatasetAssociation]: 

1004 # Docstring inherited from lsst.daf.butler.registry.Registry 

1005 if collections is None: 

1006 if not self.defaults.collections: 

1007 raise TypeError("No collections provided to findDataset, " 

1008 "and no defaults from registry construction.") 

1009 collections = self.defaults.collections 

1010 else: 

1011 collections = CollectionQuery.fromExpression(collections) 

1012 TimespanReprClass = self._db.getTimespanRepresentation() 

1013 if isinstance(datasetType, str): 

1014 storage = self._managers.datasets[datasetType] 

1015 else: 

1016 storage = self._managers.datasets[datasetType.name] 

1017 for collectionRecord in collections.iter(self._managers.collections, 

1018 collectionTypes=frozenset(collectionTypes), 

1019 flattenChains=flattenChains): 

1020 query = storage.select(collectionRecord) 

1021 for row in self._db.query(query.combine()).mappings(): 

1022 dataId = DataCoordinate.fromRequiredValues( 

1023 storage.datasetType.dimensions, 

1024 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1025 ) 

1026 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1027 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1028 conform=False) 

1029 if collectionRecord.type is CollectionType.CALIBRATION: 

1030 timespan = TimespanReprClass.extract(row) 

1031 else: 

1032 timespan = None 

1033 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 
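
# [Editor's example] Hypothetical walk over dataset-collection membership, including validity
# ranges for CALIBRATION collections; the dataset type name is a placeholder and ``...``
# (the default) matches all collections.
for assoc in registry.queryDatasetAssociations("example_flat", collections=...):
    print(assoc.collection, assoc.ref.dataId, assoc.timespan)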

1034 

1035 storageClasses: StorageClassFactory 

1036 """All storage classes known to the registry (`StorageClassFactory`). 

1037 """