# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "SqlRegistry",
)

from collections import defaultdict
import contextlib
import logging
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    TYPE_CHECKING,
    Union,
)

import sqlalchemy

from ..core import (
    ButlerURI,
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    ddl,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    Progress,
    StorageClassFactory,
    Timespan,
)
from ..core.utils import iterable, transactional

from ..registry import (
    Registry,
    RegistryConfig,
    CollectionType,
    RegistryDefaults,
    ConflictingDefinitionError,
    InconsistentDataIdError,
    OrphanedRecordError,
    CollectionSearch,
)
from ..registry import queries
from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis
from ..registry.summaries import CollectionSummary
from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord

if TYPE_CHECKING:
    from .._butlerConfig import ButlerConfig
    from ..registry.interfaces import (
        CollectionRecord,
        Database,
        DatastoreRegistryBridgeManager,
    )


_LOG = logging.getLogger(__name__)


class SqlRegistry(Registry):
    """Registry implementation based on SQLAlchemy.

    Parameters
    ----------
    database : `Database`
        Database instance to store Registry.
    defaults : `RegistryDefaults`
        Default collection search path and/or output `~CollectionType.RUN`
        collection.
    managers : `RegistryManagerInstances`
        All the managers required for this registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``configs`` resource
    or relative to a search path. Can be None if no defaults specified.
    """

    @classmethod
    def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None,
                         dimensionConfig: Optional[Union[DimensionConfig, str]] = None,
                         butlerRoot: Optional[str] = None) -> Registry:
        """Create registry database and return `SqlRegistry` instance.

        This method initializes the database contents; the database must be
        empty prior to calling this method.

        Parameters
        ----------
        config : `RegistryConfig` or `str`, optional
            Registry configuration, if missing then default configuration will
            be loaded from registry.yaml.
        dimensionConfig : `DimensionConfig` or `str`, optional
            Dimensions configuration, if missing then default configuration
            will be loaded from dimensions.yaml.
        butlerRoot : `str`, optional
            Path to the repository root this `SqlRegistry` will manage.

        Returns
        -------
        registry : `SqlRegistry`
            A new `SqlRegistry` instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)

        if isinstance(dimensionConfig, str):
            dimensionConfig = DimensionConfig(dimensionConfig)
        elif dimensionConfig is None:
            dimensionConfig = DimensionConfig()
        elif not isinstance(dimensionConfig, DimensionConfig):
            raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}")

        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"))
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.makeRepo(database, dimensionConfig)
        return cls(database, RegistryDefaults(), managers)

    @classmethod
    def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str],
                   butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True,
                   defaults: Optional[RegistryDefaults] = None) -> Registry:
        """Create `Registry` subclass instance from `config`.

        The registry database must be initialized prior to calling this
        method.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        butlerRoot : `str` or `ButlerURI`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the database.
        defaults : `RegistryDefaults`, optional
            Default collection search path and/or output `~CollectionType.RUN`
            collection.

        Returns
        -------
        registry : `SqlRegistry` (subclass)
            A new `SqlRegistry` subclass instance.
        """
        config = cls.forceRegistryConfig(config)
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"), writeable=writeable)
        managerTypes = RegistryManagerTypes.fromConfig(config)
        managers = managerTypes.loadRepo(database)
        if defaults is None:
            defaults = RegistryDefaults()
        return cls(database, defaults, managers)

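    # A minimal usage sketch of the two constructors above (illustrative
    # only; the config path and repository root are hypothetical):
    #
    #     # Create a brand-new, empty registry database, then reopen it
    #     # read-only later.
    #     registry = SqlRegistry.createFromConfig("registry.yaml",
    #                                             butlerRoot="/path/to/repo")
    #     registry = SqlRegistry.fromConfig("registry.yaml",
    #                                       butlerRoot="/path/to/repo",
    #                                       writeable=False)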

    def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances):
        self._db = database
        self._managers = managers
        self.storageClasses = StorageClassFactory()
        # Intentionally invoke property setter to initialize defaults. This
        # can only be done after most of the rest of Registry has already been
        # initialized, and must be done before the property getter is used.
        self.defaults = defaults

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"SqlRegistry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._db.isWriteable()

    def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if defaults is None:
            # No need to copy, because `RegistryDefaults` is immutable; we
            # effectively copy on write.
            defaults = self.defaults
        return type(self)(self._db, defaults, self._managers)

    @property
    def dimensions(self) -> DimensionUniverse:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.dimensions.universe

    def refresh(self) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.refresh()

    @contextlib.contextmanager
    def transaction(self, *, savepoint: bool = False) -> Iterator[None]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        try:
            with self._db.transaction(savepoint=savepoint):
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._managers.dimensions.clearCaches()
            raise

    def resetConnectionPool(self) -> None:
        """Reset SQLAlchemy connection pool for `SqlRegistry` database.

        This operation is useful when using registry with fork-based
        multiprocessing. To use registry across a fork boundary one has to
        make sure that there are no currently active connections (no session
        or transaction is in progress) and that the connection pool is reset
        using this method. This method should be called by the child process
        immediately after the fork.
        """
        self._db._engine.dispose()

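    # Sketch of the fork-based multiprocessing pattern described above
    # (illustrative only; assumes no session or transaction is active at
    # fork time):
    #
    #     import multiprocessing
    #
    #     def worker(registry):
    #         registry.resetConnectionPool()  # drop inherited connections
    #         ...                             # safe to query from here on
    #
    #     ctx = multiprocessing.get_context("fork")
    #     ctx.Process(target=worker, args=(registry,)).start()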

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved
        via `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
        """
        self._managers.opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that
            represents a single row to be added.
        """
        self._managers.opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
        """
        yield from self._managers.opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._managers.opaque[tableName].delete(where.keys(), where)

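    # Opaque-table round trip, as a hedged sketch (the table name, column,
    # and values below are invented for illustration):
    #
    #     registry.registerOpaqueTable(
    #         "datastore_records",
    #         ddl.TableSpec(fields=[
    #             ddl.FieldSpec(name="path", dtype=sqlalchemy.String,
    #                           length=256, primaryKey=True),
    #         ]),
    #     )
    #     registry.insertOpaqueData("datastore_records", {"path": "a/b.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records",
    #                                          path="a/b.fits"))
    #     registry.deleteOpaqueData("datastore_records", path="a/b.fits")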

    def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
                           doc: Optional[str] = None) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.register(name, type, doc=doc)

    def getCollectionType(self, name: str) -> CollectionType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name).type

    def _get_collection_record(self, name: str) -> CollectionRecord:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.find(name)

    def registerRun(self, name: str, doc: Optional[str] = None) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.register(name, CollectionType.RUN, doc=doc)

    @transactional
    def removeCollection(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children or flatten:
            record.update(self._managers.collections, children, flatten=flatten)

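    # Collection bookkeeping sketch (collection names are illustrative):
    #
    #     registry.registerRun("u/someone/run1")
    #     registry.registerCollection("u/someone/tagged",
    #                                 CollectionType.TAGGED)
    #     registry.registerCollection("u/someone/chain",
    #                                 CollectionType.CHAINED)
    #     registry.setCollectionChain("u/someone/chain",
    #                                 ["u/someone/tagged", "u/someone/run1"])
    #     children = registry.getCollectionChain("u/someone/chain")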

    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key)

    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc)

    def getCollectionSummary(self, collection: str) -> CollectionSummary:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        record = self._managers.collections.find(collection)
        return self._managers.datasets.getCollectionSummary(record)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        _, inserted = self._managers.datasets.register(datasetType)
        return inserted

    def removeDatasetType(self, name: str) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        self._managers.datasets.remove(name)

    def getDatasetType(self, name: str) -> DatasetType:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets[name].datasetType

    def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
                    collections: Any = None, timespan: Optional[Timespan] = None,
                    **kwargs: Any) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets[datasetType.name]
        else:
            storage = self._managers.datasets[datasetType]
        dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
                                            universe=self.dimensions, defaults=self.defaults.dataId,
                                            **kwargs)
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to findDataset, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        else:
            collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._managers.collections):
            if (collectionRecord.type is CollectionType.CALIBRATION
                    and (not storage.datasetType.isCalibration() or timespan is None)):
                continue
            result = storage.find(collectionRecord, dataId, timespan=timespan)
            if result is not None:
                return result

        return None

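    # findDataset sketch (the dataset type, data ID values, and collection
    # name are illustrative):
    #
    #     ref = registry.findDataset("flat",
    #                                instrument="HSC",
    #                                detector=42,
    #                                physical_filter="HSC-I",
    #                                collections=["HSC/calib"])
    #     if ref is None:
    #         ...  # no matching dataset in the searched collections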

    @transactional
    def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
                       run: Optional[str] = None, expand: bool = True,
                       idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if isinstance(datasetType, DatasetType):
            storage = self._managers.datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._managers.datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        if run is None:
            if self.defaults.run is None:
                raise TypeError("No run provided to insertDatasets, "
                                "and no default from registry construction.")
            run = self.defaults.run
        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
        assert isinstance(runRecord, RunRecord)
        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                               for dataId in progress.wrap(dataIds,
                                                           f"Expanding {storage.datasetType.name} data IDs")]
        else:
            expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions)
                               for dataId in dataIds]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
                                             f"one or more datasets of type {storage.datasetType} into "
                                             f"collection '{run}'. "
                                             f"This probably means a dataset with the same data ID "
                                             f"and dataset type already exists, but it may also mean a "
                                             f"dimension row is missing.") from err
        return refs

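    # Dataset-type registration plus insertion, as a hedged sketch (the
    # dataset type, dimensions, storage class, and data ID below are invented
    # for illustration):
    #
    #     datasetType = DatasetType("raw",
    #                               dimensions=["instrument", "detector",
    #                                           "exposure"],
    #                               storageClass="Exposure",
    #                               universe=registry.dimensions)
    #     registry.registerDatasetType(datasetType)
    #     registry.registerRun("u/someone/ingest")
    #     refs = registry.insertDatasets(datasetType,
    #                                    [{"instrument": "HSC",
    #                                      "detector": 42,
    #                                      "exposure": 903334}],
    #                                    run="u/someone/ingest")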

    @transactional
    def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True,
                        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
                        reuseIds: bool = False) -> List[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        datasets = list(datasets)
        if not datasets:
            # nothing to do
            return []

        # find dataset type
        datasetTypes = set(dataset.datasetType for dataset in datasets)
        if len(datasetTypes) != 1:
            raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}")
        datasetType = datasetTypes.pop()

        # get storage handler for this dataset type
        storage = self._managers.datasets.find(datasetType.name)
        if storage is None:
            raise LookupError(f"DatasetType '{datasetType}' has not been registered.")

        # find run name
        runs = set(dataset.run for dataset in datasets)
        if len(runs) != 1:
            raise ValueError(f"Multiple run names in input datasets: {runs}")
        run = runs.pop()
        if run is None:
            if self.defaults.run is None:
                raise TypeError("No run provided to _importDatasets, "
                                "and no default from registry construction.")
            run = self.defaults.run

        runRecord = self._managers.collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};"
                            " RUN collection required.")
        assert isinstance(runRecord, RunRecord)

        progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG)
        if expand:
            expandedDatasets = [
                dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions))
                for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")]
        else:
            expandedDatasets = [
                DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True)
                for dataset in datasets
            ]

        try:
            refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
                                             f"one or more datasets of type {storage.datasetType} into "
                                             f"collection '{run}'. "
                                             f"This probably means a dataset with the same data ID "
                                             f"and dataset type already exists, but it may also mean a "
                                             f"dimension row is missing.") from err
        return refs

    def getDataset(self, id: DatasetId) -> Optional[DatasetRef]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datasets.getDatasetRef(id)

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Removing datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError("One or more datasets is still "
                                          "present in one or more Datastores.") from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Associating datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
                            "expected TAGGED.")
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Disassociating datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            storage.disassociate(collectionRecord, refsForType)

    @transactional
    def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG)
        collectionRecord = self._managers.collections.find(collection)
        for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(),
                                                                  desc="Certifying datasets by type"):
            storage = self._managers.datasets[datasetType.name]
            storage.certify(collectionRecord, refsForType, timespan)

    @transactional
    def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *,
                  dataIds: Optional[Iterable[DataId]] = None) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        collectionRecord = self._managers.collections.find(collection)
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        standardizedDataIds = None
        if dataIds is not None:
            standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions)
                                   for d in dataIds]
        storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._managers.datastores

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        return self._managers.datastores.findDatastores(ref)

    def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                     records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
                     withDefaults: bool = True,
                     **kwargs: Any) -> DataCoordinate:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not withDefaults:
            defaults = None
        else:
            defaults = self.defaults.dataId
        standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
                                                  defaults=defaults, **kwargs)
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise LookupError(
                            f"No value or null value for required dimension {element.name}."
                        )
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._managers.dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)

    def insertDimensionData(self, element: Union[DimensionElement, str],
                            *data: Union[Mapping[str, Any], DimensionRecord],
                            conform: bool = True,
                            replace: bool = False) -> None:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
                       for row in data]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        storage.insert(*records, replace=replace)

    def syncDimensionData(self, element: Union[DimensionElement, str],
                          row: Union[Mapping[str, Any], DimensionRecord],
                          conform: bool = True,
                          update: bool = False) -> Union[bool, Dict[str, Any]]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._managers.dimensions[element]  # type: ignore
        return storage.sync(record, update=update)

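    # Dimension-record bookkeeping sketch (the instrument record below is
    # invented; real records must match the dimension's RecordClass schema):
    #
    #     registry.insertDimensionData("instrument",
    #                                  {"name": "HSC", "detector_max": 200})
    #     # syncDimensionData can be called repeatedly; it only writes when
    #     # the row is new or (with update=True) has changed.
    #     registry.syncDimensionData("instrument",
    #                                {"name": "HSC", "detector_max": 200})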

    def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
                          ) -> Iterator[DatasetType]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if wildcard is Ellipsis:
            for datasetType in self._managers.datasets:
                # The dataset type can no longer be a component
                yield datasetType
                if components:
                    # Automatically create the component dataset types
                    try:
                        componentsForDatasetType = datasetType.makeAllComponentDatasetTypes()
                    except KeyError as err:
                        _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; "
                                     "if it has components they will not be included in query results.")
                    else:
                        yield from componentsForDatasetType
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._managers.datasets.find(name)
            if storage is not None:
                done.add(storage.datasetType.name)
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # dataset types that we might want to match, but only if their
            # parents didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._managers.datasets:
                # Components are not stored in registry so expand them here
                allDatasetTypes = [registeredDatasetType]
                try:
                    allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes())
                except KeyError as err:
                    _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; "
                                 "if it has components they will not be included in query results.")
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

    def queryCollections(self, expression: Any = ...,
                         datasetType: Optional[DatasetType] = None,
                         collectionTypes: Iterable[CollectionType] = CollectionType.all(),
                         flattenChains: bool = False,
                         includeChains: Optional[bool] = None) -> Iterator[str]:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Right now the datasetType argument is completely ignored, but that
        # is consistent with its [lack of] guarantees. DM-24939 or a follow-up
        # ticket will take care of that.
        query = CollectionQuery.fromExpression(expression)
        for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes),
                                 flattenChains=flattenChains, includeChains=includeChains):
            yield record.name

    def _makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDataIds` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `queries.QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.

        Returns
        -------
        builder : `queries.QueryBuilder`
            Object that can be used to construct and perform advanced queries.
        """
        return queries.QueryBuilder(
            summary,
            queries.RegistryManagers(
                collections=self._managers.collections,
                dimensions=self._managers.dimensions,
                datasets=self._managers.datasets,
                TimespanReprClass=self._db.getTimespanRepresentation(),
            ),
        )

    def queryDatasets(self, datasetType: Any, *,
                      collections: Any = None,
                      dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
                      dataId: Optional[DataId] = None,
                      where: Optional[str] = None,
                      findFirst: bool = False,
                      components: Optional[bool] = None,
                      bind: Optional[Mapping[str, Any]] = None,
                      check: bool = True,
                      **kwargs: Any) -> queries.DatasetQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry

        # Standardize the collections expression.
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to queryDatasets, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        elif findFirst:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwargs)

        # We can only query directly if given a non-component DatasetType
        # instance. If we were given an expression or str or a component
        # DatasetType instance, we'll populate this dict, recurse, and return.
        # If we already have a non-component DatasetType, it will remain None
        # and we'll run the query directly.
        composition: Optional[
            Dict[
                DatasetType,  # parent dataset type
                List[Optional[str]]  # component name, or None for parent
            ]
        ] = None
        if not isinstance(datasetType, DatasetType):
            # We were given a dataset type expression (which may be as simple
            # as a str). Loop over all matching datasets, delegating handling
            # of the `components` argument to queryDatasetTypes, as we
            # populate the composition dict.
            composition = defaultdict(list)
            for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
                parentName, componentName = trueDatasetType.nameAndComponent()
                if componentName is not None:
                    parentDatasetType = self.getDatasetType(parentName)
                    composition.setdefault(parentDatasetType, []).append(componentName)
                else:
                    composition.setdefault(trueDatasetType, []).append(None)
        elif datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # The composition dict will have exactly one item.
            parentName, componentName = datasetType.nameAndComponent()
            parentDatasetType = self.getDatasetType(parentName)
            composition = {parentDatasetType: [componentName]}
        if composition is not None:
            # We need to recurse. Do that once for each parent dataset type.
            chain = []
            for parentDatasetType, componentNames in composition.items():
                parentResults = self.queryDatasets(
                    parentDatasetType,
                    collections=collections,
                    dimensions=dimensions,
                    dataId=standardizedDataId,
                    where=where,
                    bind=bind,
                    findFirst=findFirst,
                    check=check,
                )
                if isinstance(parentResults, queries.ParentDatasetQueryResults):
                    chain.append(
                        parentResults.withComponents(componentNames)
                    )
                else:
                    # Should only happen if we know there would be no results.
                    assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \
                        and not parentResults._chain
            return queries.ChainedDatasetQueryResults(chain)
        # If we get here, there's no need to recurse (or we are already
        # recursing; there can only ever be one level of recursion).

        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
        )
        builder = self._makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if
        # we need to findFirst. Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst):
            return queries.ChainedDatasetQueryResults(())
        query = builder.finish()
        return queries.ParentDatasetQueryResults(self._db, query, components=[None])

    def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
                     dataId: Optional[DataId] = None,
                     datasets: Any = None,
                     collections: Any = None,
                     where: Optional[str] = None,
                     components: Optional[bool] = None,
                     bind: Optional[Mapping[str, Any]] = None,
                     check: bool = True,
                     **kwargs: Any) -> queries.DataCoordinateQueryResults:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        dimensions = iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwargs)
        standardizedDatasetTypes = set()
        requestedDimensions = self.dimensions.extract(dimensions)
        queryDimensionNames = set(requestedDimensions.names)
        if datasets is not None:
            if collections is None:
                if not self.defaults.collections:
                    raise TypeError("Cannot pass 'datasets' without 'collections'.")
                collections = self.defaults.collections
            else:
                # Preprocess collections expression in case the original
                # included single-pass iterators (we'll want to use it
                # multiple times below).
                collections = CollectionQuery.fromExpression(collections)
            for datasetType in self.queryDatasetTypes(datasets, components=components):
                queryDimensionNames.update(datasetType.dimensions.names)
                # If any matched dataset type is a component, just operate on
                # its parent instead, because Registry doesn't know anything
                # about what components exist, and here (unlike queryDatasets)
                # we don't care about returning them.
                parentDatasetTypeName, componentName = datasetType.nameAndComponent()
                if componentName is not None:
                    datasetType = self.getDatasetType(parentDatasetTypeName)
                standardizedDatasetTypes.add(datasetType)

        summary = queries.QuerySummary(
            requested=DimensionGraph(self.dimensions, names=queryDimensionNames),
            dataId=standardizedDataId,
            expression=where,
            bind=bind,
            defaults=self.defaults.dataId,
            check=check,
        )
        builder = self._makeQueryBuilder(summary)
        for datasetType in standardizedDatasetTypes:
            builder.joinDataset(datasetType, collections, isResult=False)
        query = builder.finish()
        return queries.DataCoordinateQueryResults(self._db, query)

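    # Query sketch tying the two entry points together (dataset type, where
    # expression, dimensions, and collection names are illustrative):
    #
    #     refs = registry.queryDatasets("calexp",
    #                                   collections=["HSC/runs/*"],
    #                                   where="instrument = 'HSC' AND detector = 42",
    #                                   findFirst=False)
    #     dataIds = registry.queryDataIds(["visit", "detector"],
    #                                     datasets="calexp",
    #                                     collections=["HSC/runs/*"])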

    def queryDimensionRecords(self, element: Union[DimensionElement, str], *,
                              dataId: Optional[DataId] = None,
                              datasets: Any = None,
                              collections: Any = None,
                              where: Optional[str] = None,
                              components: Optional[bool] = None,
                              bind: Optional[Mapping[str, Any]] = None,
                              check: bool = True,
                              **kwargs: Any) -> Iterator[DimensionRecord]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if not isinstance(element, DimensionElement):
            try:
                element = self.dimensions[element]
            except KeyError as e:
                raise KeyError(f"No such dimension '{element}', available dimensions: "
                               + str(self.dimensions.getStaticElements())) from e
        dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections,
                                    where=where, components=components, bind=bind, check=check, **kwargs)
        return iter(self._managers.dimensions[element].fetch(dataIds))

    def queryDatasetAssociations(
        self,
        datasetType: Union[str, DatasetType],
        collections: Any = ...,
        *,
        collectionTypes: Iterable[CollectionType] = CollectionType.all(),
        flattenChains: bool = False,
    ) -> Iterator[DatasetAssociation]:
        # Docstring inherited from lsst.daf.butler.registry.Registry
        if collections is None:
            if not self.defaults.collections:
                raise TypeError("No collections provided to queryDatasetAssociations, "
                                "and no defaults from registry construction.")
            collections = self.defaults.collections
        else:
            collections = CollectionQuery.fromExpression(collections)
        TimespanReprClass = self._db.getTimespanRepresentation()
        if isinstance(datasetType, str):
            storage = self._managers.datasets[datasetType]
        else:
            storage = self._managers.datasets[datasetType.name]
        for collectionRecord in collections.iter(self._managers.collections,
                                                 collectionTypes=frozenset(collectionTypes),
                                                 flattenChains=flattenChains):
            query = storage.select(collectionRecord)
            if query is None:
                continue
            for row in self._db.query(query.combine()):
                dataId = DataCoordinate.fromRequiredValues(
                    storage.datasetType.dimensions,
                    tuple(row[name] for name in storage.datasetType.dimensions.required.names)
                )
                runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]]
                ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name,
                                 conform=False)
                if collectionRecord.type is CollectionType.CALIBRATION:
                    timespan = TimespanReprClass.extract(row)
                else:
                    timespan = None
                yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan)

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """