1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "SqlRegistry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 TYPE_CHECKING, 

41 Union, 

42) 

43 

44import sqlalchemy 

45 

46from ..core import ( 

47 ButlerURI, 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetAssociation, 

53 DatasetId, 

54 DatasetRef, 

55 DatasetType, 

56 ddl, 

57 Dimension, 

58 DimensionConfig, 

59 DimensionElement, 

60 DimensionGraph, 

61 DimensionRecord, 

62 DimensionUniverse, 

63 NamedKeyMapping, 

64 NameLookupMapping, 

65 Progress, 

66 StorageClassFactory, 

67 Timespan, 

68) 

69from ..core.utils import iterable, transactional 

70 

71from ..registry import ( 

72 Registry, 

73 RegistryConfig, 

74 CollectionType, 

75 RegistryDefaults, 

76 ConflictingDefinitionError, 

77 InconsistentDataIdError, 

78 OrphanedRecordError, 

79 CollectionSearch, 

80) 

81from ..registry import queries 

82from ..registry.wildcards import CategorizedWildcard, CollectionQuery, Ellipsis 

83from ..registry.summaries import CollectionSummary 

84from ..registry.managers import RegistryManagerTypes, RegistryManagerInstances 

85from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord 

86 

87if TYPE_CHECKING: 

88 from .._butlerConfig import ButlerConfig 

89 from ..registry.interfaces import ( 

90 CollectionRecord, 

91 Database, 

92 DatastoreRegistryBridgeManager, 

93 ) 

94 

95 

96_LOG = logging.getLogger(__name__) 

97 

98 

99class SqlRegistry(Registry): 

100 """Registry implementation based on SQLAlchemy. 

101 

102 Parameters 

103 ---------- 

104 database : `Database` 

105 Database instance to store Registry. 

106 defaults : `RegistryDefaults` 

107 Default collection search path and/or output `~CollectionType.RUN` 

108 collection. 

109 managers : `RegistryManagerInstances` 

110 All the managers required for this registry. 

111 """ 

112 

113 defaultConfigFile: Optional[str] = None 

114 """Path to configuration defaults. Accessed within the ``configs`` resource 

115 or relative to a search path. Can be None if no defaults specified. 

116 """ 

117 

118 @classmethod 

119 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

120 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

121 butlerRoot: Optional[str] = None) -> Registry: 

122 """Create registry database and return `SqlRegistry` instance. 

123 

124 This method initializes database contents; the database must be empty 

125 prior to calling this method. 

126 

127 Parameters 

128 ---------- 

129 config : `RegistryConfig` or `str`, optional 

130 Registry configuration. If missing, the default configuration will 

131 be loaded from ``registry.yaml``. 

132 dimensionConfig : `DimensionConfig` or `str`, optional 

133 Dimensions configuration. If missing, the default configuration 

134 will be loaded from ``dimensions.yaml``. 

135 butlerRoot : `str`, optional 

136 Path to the repository root this `SqlRegistry` will manage. 

137 

138 Returns 

139 ------- 

140 registry : `SqlRegistry` 

141 A new `SqlRegistry` instance. 

142 """ 

143 config = cls.forceRegistryConfig(config) 

144 config.replaceRoot(butlerRoot) 

145 

146 if isinstance(dimensionConfig, str): 

147 dimensionConfig = DimensionConfig(dimensionConfig) 

148 elif dimensionConfig is None: 

149 dimensionConfig = DimensionConfig() 

150 elif not isinstance(dimensionConfig, DimensionConfig): 

151 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

152 

153 DatabaseClass = config.getDatabaseClass() 

154 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

155 namespace=config.get("namespace")) 

156 managerTypes = RegistryManagerTypes.fromConfig(config) 

157 managers = managerTypes.makeRepo(database, dimensionConfig) 

158 return cls(database, RegistryDefaults(), managers) 

159 
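
# A minimal usage sketch for the method above: bootstrap a brand-new registry
# database backed by in-memory SQLite. The "db" key and connection string are
# assumptions about RegistryConfig, not something defined in this file.
def _example_create_from_config() -> Registry:
    config = RegistryConfig({"db": "sqlite:///:memory:"})
    return SqlRegistry.createFromConfig(config)
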

160 @classmethod 

161 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

162 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True, 

163 defaults: Optional[RegistryDefaults] = None) -> Registry: 

164 """Create `Registry` subclass instance from `config`. 

165 

166 Registry database must be initialized prior to calling this method. 

167 

168 Parameters 

169 ---------- 

170 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

171 Registry configuration 

172 butlerRoot : `str` or `ButlerURI`, optional 

173 Path to the repository root this `Registry` will manage. 

174 writeable : `bool`, optional 

175 If `True` (default) create a read-write connection to the database. 

176 defaults : `RegistryDefaults`, optional 

177 Default collection search path and/or output `~CollectionType.RUN` 

178 collection. 

179 

180 Returns 

181 ------- 

182 registry : `SqlRegistry` (subclass) 

183 A new `SqlRegistry` subclass instance. 

184 """ 

185 config = cls.forceRegistryConfig(config) 

186 config.replaceRoot(butlerRoot) 

187 DatabaseClass = config.getDatabaseClass() 

188 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

189 namespace=config.get("namespace"), writeable=writeable) 

190 managerTypes = RegistryManagerTypes.fromConfig(config) 

191 managers = managerTypes.loadRepo(database) 

192 if defaults is None: 

193 defaults = RegistryDefaults() 

194 return cls(database, defaults, managers) 

195 
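
# A sketch of connecting to an already-initialized repository read-only. The
# SQLite path is hypothetical; a RegistryDefaults instance could also be
# passed to set a default run or collection search path.
def _example_from_config() -> Registry:
    config = RegistryConfig({"db": "sqlite:////tmp/repo/gen3.sqlite3"})
    return SqlRegistry.fromConfig(config, writeable=False)
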

196 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

197 self._db = database 

198 self._managers = managers 

199 self.storageClasses = StorageClassFactory() 

200 # Intentionally invoke property setter to initialize defaults. This 

201 # can only be done after most of the rest of Registry has already been 

202 # initialized, and must be done before the property getter is used. 

203 self.defaults = defaults 

204 

205 def __str__(self) -> str: 

206 return str(self._db) 

207 

208 def __repr__(self) -> str: 

209 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

210 

211 def isWriteable(self) -> bool: 

212 # Docstring inherited from lsst.daf.butler.registry.Registry 

213 return self._db.isWriteable() 

214 

215 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

216 # Docstring inherited from lsst.daf.butler.registry.Registry 

217 if defaults is None: 

218 # No need to copy, because `RegistryDefaults` is immutable; we 

219 # effectively copy on write. 

220 defaults = self.defaults 

221 return type(self)(self._db, defaults, self._managers) 

222 

223 @property 

224 def dimensions(self) -> DimensionUniverse: 

225 # Docstring inherited from lsst.daf.butler.registry.Registry 

226 return self._managers.dimensions.universe 

227 

228 def refresh(self) -> None: 

229 # Docstring inherited from lsst.daf.butler.registry.Registry 

230 self._managers.refresh() 

231 

232 @contextlib.contextmanager 

233 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

234 # Docstring inherited from lsst.daf.butler.registry.Registry 

235 try: 

236 with self._db.transaction(savepoint=savepoint): 

237 yield 

238 except BaseException: 

239 # TODO: this clears the caches sometimes when we wouldn't actually 

240 # need to. Can we avoid that? 

241 self._managers.dimensions.clearCaches() 

242 raise 

243 
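
# A sketch of grouping registry operations atomically with the context manager
# above: if anything inside the block raises, both registrations (hypothetical
# collection names) are rolled back together.
def _example_transaction(registry: SqlRegistry) -> None:
    with registry.transaction(savepoint=True):
        registry.registerRun("example/run")
        registry.registerCollection("example/tagged", CollectionType.TAGGED)
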

244 def resetConnectionPool(self) -> None: 

245 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

246 

247 This operation is useful when using registry with fork-based 

248 multiprocessing. To use the registry across a fork boundary, one has 

249 to make sure that there are no currently active connections (no 

250 session or transaction is in progress) and that the connection pool 

251 is reset using this method. This method should be called by the child 

252 process immediately after the fork. 

253 """ 

254 self._db._engine.dispose() 

255 
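
# A sketch of the fork-based multiprocessing pattern described in the
# docstring above (POSIX only): the child process resets the inherited
# connection pool before issuing any queries.
def _example_fork_worker(registry: SqlRegistry) -> None:
    import os

    pid = os.fork()
    if pid == 0:
        # Child process: discard connections inherited from the parent.
        registry.resetConnectionPool()
        registry.refresh()
        os._exit(0)
    os.waitpid(pid, 0)
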

256 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

257 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

258 other data repository client. 

259 

260 Opaque table records can be added via `insertOpaqueData`, retrieved via 

261 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

262 

263 Parameters 

264 ---------- 

265 tableName : `str` 

266 Logical name of the opaque table. This may differ from the 

267 actual name used in the database by a prefix and/or suffix. 

268 spec : `ddl.TableSpec` 

269 Specification for the table to be added. 

270 """ 

271 self._managers.opaque.register(tableName, spec) 

272 

273 @transactional 

274 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

275 """Insert records into an opaque table. 

276 

277 Parameters 

278 ---------- 

279 tableName : `str` 

280 Logical name of the opaque table. Must match the name used in a 

281 previous call to `registerOpaqueTable`. 

282 data 

283 Each additional positional argument is a dictionary that represents 

284 a single row to be added. 

285 """ 

286 self._managers.opaque[tableName].insert(*data) 

287 

288 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

289 """Retrieve records from an opaque table. 

290 

291 Parameters 

292 ---------- 

293 tableName : `str` 

294 Logical name of the opaque table. Must match the name used in a 

295 previous call to `registerOpaqueTable`. 

296 where 

297 Additional keyword arguments are interpreted as equality 

298 constraints that restrict the returned rows (combined with AND); 

299 keyword arguments are column names and values are the values they 

300 must have. 

301 

302 Yields 

303 ------ 

304 row : `dict` 

305 A dictionary representing a single result row. 

306 """ 

307 yield from self._managers.opaque[tableName].fetch(**where) 

308 

309 @transactional 

310 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

311 """Remove records from an opaque table. 

312 

313 Parameters 

314 ---------- 

315 tableName : `str` 

316 Logical name of the opaque table. Must match the name used in a 

317 previous call to `registerOpaqueTable`. 

318 where 

319 Additional keyword arguments are interpreted as equality 

320 constraints that restrict the deleted rows (combined with AND); 

321 keyword arguments are column names and values are the values they 

322 must have. 

323 """ 

324 self._managers.opaque[tableName].delete(where.keys(), where) 

325 
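
# A sketch tying together the three opaque-table methods above. The table
# name and field definitions are hypothetical, and the ddl.FieldSpec keyword
# arguments are assumptions about that helper rather than something defined
# in this file.
def _example_opaque_table(registry: SqlRegistry) -> None:
    spec = ddl.TableSpec(
        fields=[
            ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger,
                          primaryKey=True),
            ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256,
                          nullable=False),
        ]
    )
    registry.registerOpaqueTable("example_datastore_records", spec)
    registry.insertOpaqueData("example_datastore_records",
                              {"dataset_id": 1, "path": "a/b.fits"})
    rows = list(registry.fetchOpaqueData("example_datastore_records",
                                         dataset_id=1))
    registry.deleteOpaqueData("example_datastore_records", dataset_id=1)
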

326 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

327 doc: Optional[str] = None) -> None: 

328 # Docstring inherited from lsst.daf.butler.registry.Registry 

329 self._managers.collections.register(name, type, doc=doc) 

330 

331 def getCollectionType(self, name: str) -> CollectionType: 

332 # Docstring inherited from lsst.daf.butler.registry.Registry 

333 return self._managers.collections.find(name).type 

334 

335 def _get_collection_record(self, name: str) -> CollectionRecord: 

336 # Docstring inherited from lsst.daf.butler.registry.Registry 

337 return self._managers.collections.find(name) 

338 

339 def registerRun(self, name: str, doc: Optional[str] = None) -> None: 

340 # Docstring inherited from lsst.daf.butler.registry.Registry 

341 self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

342 

343 @transactional 

344 def removeCollection(self, name: str) -> None: 

345 # Docstring inherited from lsst.daf.butler.registry.Registry 

346 self._managers.collections.remove(name) 

347 

348 def getCollectionChain(self, parent: str) -> CollectionSearch: 

349 # Docstring inherited from lsst.daf.butler.registry.Registry 

350 record = self._managers.collections.find(parent) 

351 if record.type is not CollectionType.CHAINED: 

352 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

353 assert isinstance(record, ChainedCollectionRecord) 

354 return record.children 

355 

356 @transactional 

357 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

358 # Docstring inherited from lsst.daf.butler.registry.Registry 

359 record = self._managers.collections.find(parent) 

360 if record.type is not CollectionType.CHAINED: 

361 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

362 assert isinstance(record, ChainedCollectionRecord) 

363 children = CollectionSearch.fromExpression(children) 

364 if children != record.children or flatten: 

365 record.update(self._managers.collections, children, flatten=flatten) 

366 
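
# A sketch of building a CHAINED collection from two hypothetical RUN
# collections and reading the chain back with the methods above.
def _example_collection_chain(registry: SqlRegistry) -> None:
    registry.registerRun("example/run/a")
    registry.registerRun("example/run/b")
    registry.registerCollection("example/chain", CollectionType.CHAINED)
    registry.setCollectionChain("example/chain",
                                ["example/run/a", "example/run/b"])
    children = registry.getCollectionChain("example/chain")
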

367 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

368 # Docstring inherited from lsst.daf.butler.registry.Registry 

369 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

370 

371 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

372 # Docstring inherited from lsst.daf.butler.registry.Registry 

373 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

374 

375 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

376 # Docstring inherited from lsst.daf.butler.registry.Registry 

377 record = self._managers.collections.find(collection) 

378 return self._managers.datasets.getCollectionSummary(record) 

379 

380 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

381 # Docstring inherited from lsst.daf.butler.registry.Registry 

382 _, inserted = self._managers.datasets.register(datasetType) 

383 return inserted 

384 

385 def removeDatasetType(self, name: str) -> None: 

386 # Docstring inherited from lsst.daf.butler.registry.Registry 

387 self._managers.datasets.remove(name) 

388 

389 def getDatasetType(self, name: str) -> DatasetType: 

390 # Docstring inherited from lsst.daf.butler.registry.Registry 

391 return self._managers.datasets[name].datasetType 

392 

393 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: 

394 # Docstring inherited from lsst.daf.butler.registry.Registry 

395 return self._managers.datasets.supportsIdGenerationMode(mode) 

396 

397 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

398 collections: Any = None, timespan: Optional[Timespan] = None, 

399 **kwargs: Any) -> Optional[DatasetRef]: 

400 # Docstring inherited from lsst.daf.butler.registry.Registry 

401 if isinstance(datasetType, DatasetType): 

402 storage = self._managers.datasets[datasetType.name] 

403 else: 

404 storage = self._managers.datasets[datasetType] 

405 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

406 universe=self.dimensions, defaults=self.defaults.dataId, 

407 **kwargs) 

408 if collections is None: 

409 if not self.defaults.collections: 

410 raise TypeError("No collections provided to findDataset, " 

411 "and no defaults from registry construction.") 

412 collections = self.defaults.collections 

413 else: 

414 collections = CollectionSearch.fromExpression(collections) 

415 for collectionRecord in collections.iter(self._managers.collections): 

416 if (collectionRecord.type is CollectionType.CALIBRATION 

417 and (not storage.datasetType.isCalibration() or timespan is None)): 

418 continue 

419 result = storage.find(collectionRecord, dataId, timespan=timespan) 

420 if result is not None: 

421 return result 

422 

423 return None 

424 
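
# A sketch of a single-dataset lookup with the method above. The dataset type
# name, data ID values, and collection are all hypothetical.
def _example_find_dataset(registry: SqlRegistry) -> Optional[DatasetRef]:
    return registry.findDataset(
        "calexp",
        instrument="HSC",
        visit=903334,
        detector=16,
        collections=["HSC/runs/example"],
    )
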

425 @transactional 

426 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

427 run: Optional[str] = None, expand: bool = True, 

428 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]: 

429 # Docstring inherited from lsst.daf.butler.registry.Registry 

430 if isinstance(datasetType, DatasetType): 

431 storage = self._managers.datasets.find(datasetType.name) 

432 if storage is None: 

433 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

434 else: 

435 storage = self._managers.datasets.find(datasetType) 

436 if storage is None: 

437 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

438 if run is None: 

439 if self.defaults.run is None: 

440 raise TypeError("No run provided to insertDatasets, " 

441 "and no default from registry construction.") 

442 run = self.defaults.run 

443 runRecord = self._managers.collections.find(run) 

444 if runRecord.type is not CollectionType.RUN: 

445 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

446 assert isinstance(runRecord, RunRecord) 

447 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

448 if expand: 

449 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

450 for dataId in progress.wrap(dataIds, 

451 f"Expanding {storage.datasetType.name} data IDs")] 

452 else: 

453 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) 

454 for dataId in dataIds] 

455 try: 

456 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

457 except sqlalchemy.exc.IntegrityError as err: 

458 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

459 f"one or more datasets of type {storage.datasetType} into " 

460 f"collection '{run}'. " 

461 f"This probably means a dataset with the same data ID " 

462 f"and dataset type already exists, but it may also mean a " 

463 f"dimension row is missing.") from err 

464 return refs 

465 
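
# A sketch of registering a dataset type and inserting one dataset into a RUN
# collection with the method above. The dimension names, storage class, and
# data ID values are hypothetical and must already be known to the repository.
def _example_insert_datasets(registry: SqlRegistry) -> List[DatasetRef]:
    datasetType = DatasetType(
        "example_raw",
        dimensions=["instrument", "exposure", "detector"],
        storageClass="Exposure",
        universe=registry.dimensions,
    )
    registry.registerDatasetType(datasetType)
    registry.registerRun("example/run")
    return registry.insertDatasets(
        datasetType,
        [{"instrument": "HSC", "exposure": 903334, "detector": 16}],
        run="example/run",
    )
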

466 @transactional 

467 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True, 

468 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

469 reuseIds: bool = False) -> List[DatasetRef]: 

470 # Docstring inherited from lsst.daf.butler.registry.Registry 

471 datasets = list(datasets) 

472 if not datasets: 

473 # nothing to do 

474 return [] 

475 

476 # find dataset type 

477 datasetTypes = set(dataset.datasetType for dataset in datasets) 

478 if len(datasetTypes) != 1: 

479 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}") 

480 datasetType = datasetTypes.pop() 

481 

482 # get storage handler for this dataset type 

483 storage = self._managers.datasets.find(datasetType.name) 

484 if storage is None: 

485 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

486 

487 # find run name 

488 runs = set(dataset.run for dataset in datasets) 

489 if len(runs) != 1: 

490 raise ValueError(f"Multiple run names in input datasets: {runs}") 

491 run = runs.pop() 

492 if run is None: 

493 if self.defaults.run is None: 

494 raise TypeError("No run provided to ingestDatasets, " 

495 "and no default from registry construction.") 

496 run = self.defaults.run 

497 

498 runRecord = self._managers.collections.find(run) 

499 if runRecord.type is not CollectionType.RUN: 

500 raise TypeError(f"Given collection '{runRecord.name}' is of type {runRecord.type.name};" 

501 " RUN collection required.") 

502 assert isinstance(runRecord, RunRecord) 

503 

504 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

505 if expand: 

506 expandedDatasets = [ 

507 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

508 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")] 

509 else: 

510 expandedDatasets = [ 

511 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

512 for dataset in datasets 

513 ] 

514 

515 try: 

516 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

517 except sqlalchemy.exc.IntegrityError as err: 

518 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

519 f"one or more datasets of type {storage.datasetType} into " 

520 f"collection '{run}'. " 

521 f"This probably means a dataset with the same data ID " 

522 f"and dataset type already exists, but it may also mean a " 

523 f"dimension row is missing.") from err 

524 return refs 

525 

526 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

527 # Docstring inherited from lsst.daf.butler.registry.Registry 

528 return self._managers.datasets.getDatasetRef(id) 

529 

530 @transactional 

531 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

532 # Docstring inherited from lsst.daf.butler.registry.Registry 

533 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

534 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

535 desc="Removing datasets by type"): 

536 storage = self._managers.datasets[datasetType.name] 

537 try: 

538 storage.delete(refsForType) 

539 except sqlalchemy.exc.IntegrityError as err: 

540 raise OrphanedRecordError("One or more datasets is still " 

541 "present in one or more Datastores.") from err 

542 

543 @transactional 

544 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

545 # Docstring inherited from lsst.daf.butler.registry.Registry 

546 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

547 collectionRecord = self._managers.collections.find(collection) 

548 if collectionRecord.type is not CollectionType.TAGGED: 

549 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

550 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

551 desc="Associating datasets by type"): 

552 storage = self._managers.datasets[datasetType.name] 

553 try: 

554 storage.associate(collectionRecord, refsForType) 

555 except sqlalchemy.exc.IntegrityError as err: 

556 raise ConflictingDefinitionError( 

557 f"Constraint violation while associating dataset of type {datasetType.name} with " 

558 f"collection {collection}. This probably means that one or more datasets with the same " 

559 f"dataset type and data ID already exist in the collection, but it may also indicate " 

560 f"that the datasets do not exist." 

561 ) from err 

562 

563 @transactional 

564 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

565 # Docstring inherited from lsst.daf.butler.registry.Registry 

566 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

567 collectionRecord = self._managers.collections.find(collection) 

568 if collectionRecord.type is not CollectionType.TAGGED: 

569 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

570 "expected TAGGED.") 

571 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

572 desc="Disassociating datasets by type"): 

573 storage = self._managers.datasets[datasetType.name] 

574 storage.disassociate(collectionRecord, refsForType) 

575 

576 @transactional 

577 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

578 # Docstring inherited from lsst.daf.butler.registry.Registry 

579 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

580 collectionRecord = self._managers.collections.find(collection) 

581 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

582 desc="Certifying datasets by type"): 

583 storage = self._managers.datasets[datasetType.name] 

584 storage.certify(collectionRecord, refsForType, timespan) 

585 

586 @transactional 

587 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

588 dataIds: Optional[Iterable[DataId]] = None) -> None: 

589 # Docstring inherited from lsst.daf.butler.registry.Registry 

590 collectionRecord = self._managers.collections.find(collection) 

591 if isinstance(datasetType, str): 

592 storage = self._managers.datasets[datasetType] 

593 else: 

594 storage = self._managers.datasets[datasetType.name] 

595 standardizedDataIds = None 

596 if dataIds is not None: 

597 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

598 for d in dataIds] 

599 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

600 
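
# A sketch of certifying calibration datasets into a CALIBRATION collection
# for an unbounded validity range with the method above; the collection name
# is hypothetical and ``refs`` must already exist in a RUN collection.
def _example_certify(registry: SqlRegistry, refs: Iterable[DatasetRef]) -> None:
    registry.registerCollection("example/calib", CollectionType.CALIBRATION)
    registry.certify("example/calib", refs, Timespan(begin=None, end=None))
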

601 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

602 """Return an object that allows a new `Datastore` instance to 

603 communicate with this `Registry`. 

604 

605 Returns 

606 ------- 

607 manager : `DatastoreRegistryBridgeManager` 

608 Object that mediates communication between this `Registry` and its 

609 associated datastores. 

610 """ 

611 return self._managers.datastores 

612 

613 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

614 # Docstring inherited from lsst.daf.butler.registry.Registry 

615 return self._managers.datastores.findDatastores(ref) 

616 

617 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

618 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

619 withDefaults: bool = True, 

620 **kwargs: Any) -> DataCoordinate: 

621 # Docstring inherited from lsst.daf.butler.registry.Registry 

622 if not withDefaults: 

623 defaults = None 

624 else: 

625 defaults = self.defaults.dataId 

626 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, 

627 defaults=defaults, **kwargs) 

628 if standardized.hasRecords(): 

629 return standardized 

630 if records is None: 

631 records = {} 

632 elif isinstance(records, NamedKeyMapping): 

633 records = records.byName() 

634 else: 

635 records = dict(records) 

636 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

637 records.update(dataId.records.byName()) 

638 keys = standardized.byName() 

639 for element in standardized.graph.primaryKeyTraversalOrder: 

640 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

641 if record is ...: 

642 if isinstance(element, Dimension) and keys.get(element.name) is None: 

643 if element in standardized.graph.required: 

644 raise LookupError( 

645 f"No value or null value for required dimension {element.name}." 

646 ) 

647 keys[element.name] = None 

648 record = None 

649 else: 

650 storage = self._managers.dimensions[element] 

651 dataIdSet = DataCoordinateIterable.fromScalar( 

652 DataCoordinate.standardize(keys, graph=element.graph) 

653 ) 

654 fetched = tuple(storage.fetch(dataIdSet)) 

655 try: 

656 (record,) = fetched 

657 except ValueError: 

658 record = None 

659 records[element.name] = record 

660 if record is not None: 

661 for d in element.implied: 

662 value = getattr(record, d.name) 

663 if keys.setdefault(d.name, value) != value: 

664 raise InconsistentDataIdError( 

665 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

666 f"but {element.name} implies {d.name}={value!r}." 

667 ) 

668 else: 

669 if element in standardized.graph.required: 

670 raise LookupError( 

671 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

672 ) 

673 if element.alwaysJoin: 

674 raise InconsistentDataIdError( 

675 f"Could not fetch record for element {element.name} via keys {keys}, ", 

676 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

677 "related." 

678 ) 

679 for d in element.implied: 

680 keys.setdefault(d.name, None) 

681 records.setdefault(d.name, None) 

682 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

683 
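
# A sketch of expanding a minimal data ID with the method above so that
# dimension records and implied dimension values become available; the data
# ID values are hypothetical.
def _example_expand_data_id(registry: SqlRegistry) -> DataCoordinate:
    dataId = registry.expandDataId(instrument="HSC", exposure=903334,
                                   detector=16)
    assert dataId.hasRecords() and dataId.hasFull()
    return dataId
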

684 def insertDimensionData(self, element: Union[DimensionElement, str], 

685 *data: Union[Mapping[str, Any], DimensionRecord], 

686 conform: bool = True, 

687 replace: bool = False) -> None: 

688 # Docstring inherited from lsst.daf.butler.registry.Registry 

689 if conform: 

690 if isinstance(element, str): 

691 element = self.dimensions[element] 

692 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

693 for row in data] 

694 else: 

695 # Ignore typing since caller said to trust them with conform=False. 

696 records = data # type: ignore 

697 storage = self._managers.dimensions[element] # type: ignore 

698 storage.insert(*records, replace=replace) 

699 

700 def syncDimensionData(self, element: Union[DimensionElement, str], 

701 row: Union[Mapping[str, Any], DimensionRecord], 

702 conform: bool = True, 

703 update: bool = False) -> Union[bool, Dict[str, Any]]: 

704 # Docstring inherited from lsst.daf.butler.registry.Registry 

705 if conform: 

706 if isinstance(element, str): 

707 element = self.dimensions[element] 

708 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

709 else: 

710 # Ignore typing since caller said to trust them with conform=False. 

711 record = row # type: ignore 

712 storage = self._managers.dimensions[element] # type: ignore 

713 return storage.sync(record, update=update) 

714 
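
# A sketch of adding dimension records with the two methods above; the
# instrument values are hypothetical, and syncDimensionData is the idempotent
# variant of insertDimensionData.
def _example_dimension_data(registry: SqlRegistry) -> None:
    registry.syncDimensionData(
        "instrument",
        {
            "name": "ExampleCam",
            "class_name": "example.ExampleCam",
            "detector_max": 4,
            "visit_max": 999999,
            "exposure_max": 999999,
        },
    )
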

715 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

716 ) -> Iterator[DatasetType]: 

717 # Docstring inherited from lsst.daf.butler.registry.Registry 

718 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

719 if wildcard is Ellipsis: 

720 for datasetType in self._managers.datasets: 

721 # The dataset type can no longer be a component 

722 yield datasetType 

723 if components: 

724 # Automatically create the component dataset types 

725 try: 

726 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

727 except KeyError as err: 

728 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

729 "if it has components they will not be included in query results.") 

730 else: 

731 yield from componentsForDatasetType 

732 return 

733 done: Set[str] = set() 

734 for name in wildcard.strings: 

735 storage = self._managers.datasets.find(name) 

736 if storage is not None: 

737 done.add(storage.datasetType.name) 

738 yield storage.datasetType 

739 if wildcard.patterns: 

740 # If components (the argument) is None, we'll save component 

741 # datasets that we might want to match, but only if their parents 

742 # didn't get included. 

743 componentsForLater = [] 

744 for registeredDatasetType in self._managers.datasets: 

745 # Components are not stored in registry so expand them here 

746 allDatasetTypes = [registeredDatasetType] 

747 try: 

748 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

749 except KeyError as err: 

750 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

751 "if it has components they will not be included in query results.") 

752 for datasetType in allDatasetTypes: 

753 if datasetType.name in done: 

754 continue 

755 parentName, componentName = datasetType.nameAndComponent() 

756 if componentName is not None and not components: 

757 if components is None and parentName not in done: 

758 componentsForLater.append(datasetType) 

759 continue 

760 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

761 done.add(datasetType.name) 

762 yield datasetType 

763 # Go back and try to match saved components. 

764 for datasetType in componentsForLater: 

765 parentName, _ = datasetType.nameAndComponent() 

766 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

767 yield datasetType 

768 

769 def queryCollections(self, expression: Any = ..., 

770 datasetType: Optional[DatasetType] = None, 

771 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

772 flattenChains: bool = False, 

773 includeChains: Optional[bool] = None) -> Iterator[str]: 

774 # Docstring inherited from lsst.daf.butler.registry.Registry 

775 

776 # Right now the datasetType argument is completely ignored, but that 

777 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

778 # ticket will take care of that. 

779 query = CollectionQuery.fromExpression(expression) 

780 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes), 

781 flattenChains=flattenChains, includeChains=includeChains): 

782 yield record.name 

783 

784 def _makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

785 """Return a `QueryBuilder` instance capable of constructing and 

786 managing more complex queries than those obtainable via `Registry` 

787 interfaces. 

788 

789 This is an advanced interface; downstream code should prefer 

790 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

791 are sufficient. 

792 

793 Parameters 

794 ---------- 

795 summary : `queries.QuerySummary` 

796 Object describing and categorizing the full set of dimensions that 

797 will be included in the query. 

798 

799 Returns 

800 ------- 

801 builder : `queries.QueryBuilder` 

802 Object that can be used to construct and perform advanced queries. 

803 """ 

804 return queries.QueryBuilder( 

805 summary, 

806 queries.RegistryManagers( 

807 collections=self._managers.collections, 

808 dimensions=self._managers.dimensions, 

809 datasets=self._managers.datasets, 

810 TimespanReprClass=self._db.getTimespanRepresentation(), 

811 ), 

812 ) 

813 

814 def queryDatasets(self, datasetType: Any, *, 

815 collections: Any = None, 

816 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

817 dataId: Optional[DataId] = None, 

818 where: Optional[str] = None, 

819 findFirst: bool = False, 

820 components: Optional[bool] = None, 

821 bind: Optional[Mapping[str, Any]] = None, 

822 check: bool = True, 

823 **kwargs: Any) -> queries.DatasetQueryResults: 

824 # Docstring inherited from lsst.daf.butler.registry.Registry 

825 

826 # Standardize the collections expression. 

827 if collections is None: 

828 if not self.defaults.collections: 

829 raise TypeError("No collections provided to findDataset, " 

830 "and no defaults from registry construction.") 

831 collections = self.defaults.collections 

832 elif findFirst: 

833 collections = CollectionSearch.fromExpression(collections) 

834 else: 

835 collections = CollectionQuery.fromExpression(collections) 

836 # Standardize and expand the data ID provided as a constraint. 

837 standardizedDataId = self.expandDataId(dataId, **kwargs) 

838 

839 # We can only query directly if given a non-component DatasetType 

840 # instance. If we were given an expression or str or a component 

841 # DatasetType instance, we'll populate this dict, recurse, and return. 

842 # If we already have a non-component DatasetType, it will remain None 

843 # and we'll run the query directly. 

844 composition: Optional[ 

845 Dict[ 

846 DatasetType, # parent dataset type 

847 List[Optional[str]] # component name, or None for parent 

848 ] 

849 ] = None 

850 if not isinstance(datasetType, DatasetType): 

851 # We were given a dataset type expression (which may be as simple 

852 # as a str). Loop over all matching datasets, delegating handling 

853 # of the `components` argument to queryDatasetTypes, as we populate 

854 # the composition dict. 

855 composition = defaultdict(list) 

856 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

857 parentName, componentName = trueDatasetType.nameAndComponent() 

858 if componentName is not None: 

859 parentDatasetType = self.getDatasetType(parentName) 

860 composition.setdefault(parentDatasetType, []).append(componentName) 

861 else: 

862 composition.setdefault(trueDatasetType, []).append(None) 

863 elif datasetType.isComponent(): 

864 # We were given a true DatasetType instance, but it's a component. 

865 # The composition dict will have exactly one item. 

866 parentName, componentName = datasetType.nameAndComponent() 

867 parentDatasetType = self.getDatasetType(parentName) 

868 composition = {parentDatasetType: [componentName]} 

869 if composition is not None: 

870 # We need to recurse. Do that once for each parent dataset type. 

871 chain = [] 

872 for parentDatasetType, componentNames in composition.items(): 

873 parentResults = self.queryDatasets( 

874 parentDatasetType, 

875 collections=collections, 

876 dimensions=dimensions, 

877 dataId=standardizedDataId, 

878 where=where, 

879 bind=bind, 

880 findFirst=findFirst, 

881 check=check, 

882 ) 

883 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

884 chain.append( 

885 parentResults.withComponents(componentNames) 

886 ) 

887 else: 

888 # Should only happen if we know there would be no results. 

889 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

890 and not parentResults._chain 

891 return queries.ChainedDatasetQueryResults(chain) 

892 # If we get here, there's no need to recurse (or we are already 

893 # recursing; there can only ever be one level of recursion). 

894 

895 # The full set of dimensions in the query is the combination of those 

896 # needed for the DatasetType and those explicitly requested, if any. 

897 requestedDimensionNames = set(datasetType.dimensions.names) 

898 if dimensions is not None: 

899 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

900 # Construct the summary structure needed to construct a QueryBuilder. 

901 summary = queries.QuerySummary( 

902 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

903 dataId=standardizedDataId, 

904 expression=where, 

905 bind=bind, 

906 defaults=self.defaults.dataId, 

907 check=check, 

908 ) 

909 builder = self._makeQueryBuilder(summary) 

910 # Add the dataset subquery to the query, telling the QueryBuilder to 

911 # include the rank of the selected collection in the results only if we 

912 # need to findFirst. Note that if any of the collections are 

913 # actually wildcard expressions, and we've asked for deduplication, 

914 # this will raise TypeError for us. 

915 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst): 

916 return queries.ChainedDatasetQueryResults(()) 

917 query = builder.finish() 

918 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

919 
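
# A sketch of a constrained dataset query with the method above, using a
# ``where`` expression with a bound parameter. The dataset type, collection,
# and dimension values are hypothetical.
def _example_query_datasets(registry: SqlRegistry) -> List[DatasetRef]:
    return list(
        registry.queryDatasets(
            "calexp",
            collections=["HSC/runs/example"],
            where="instrument = 'HSC' AND visit > vmin",
            bind={"vmin": 900000},
            findFirst=True,
        )
    )
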

920 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

921 dataId: Optional[DataId] = None, 

922 datasets: Any = None, 

923 collections: Any = None, 

924 where: Optional[str] = None, 

925 components: Optional[bool] = None, 

926 bind: Optional[Mapping[str, Any]] = None, 

927 check: bool = True, 

928 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

929 # Docstring inherited from lsst.daf.butler.registry.Registry 

930 dimensions = iterable(dimensions) 

931 standardizedDataId = self.expandDataId(dataId, **kwargs) 

932 standardizedDatasetTypes = set() 

933 requestedDimensions = self.dimensions.extract(dimensions) 

934 queryDimensionNames = set(requestedDimensions.names) 

935 if datasets is not None: 

936 if collections is None: 

937 if not self.defaults.collections: 

938 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

939 collections = self.defaults.collections 

940 else: 

941 # Preprocess collections expression in case the original 

942 # included single-pass iterators (we'll want to use it multiple 

943 # times below). 

944 collections = CollectionQuery.fromExpression(collections) 

945 for datasetType in self.queryDatasetTypes(datasets, components=components): 

946 queryDimensionNames.update(datasetType.dimensions.names) 

947 # If any matched dataset type is a component, just operate on 

948 # its parent instead, because Registry doesn't know anything 

949 # about what components exist, and here (unlike queryDatasets) 

950 # we don't care about returning them. 

951 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

952 if componentName is not None: 

953 datasetType = self.getDatasetType(parentDatasetTypeName) 

954 standardizedDatasetTypes.add(datasetType) 

955 

956 summary = queries.QuerySummary( 

957 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

958 dataId=standardizedDataId, 

959 expression=where, 

960 bind=bind, 

961 defaults=self.defaults.dataId, 

962 check=check, 

963 ) 

964 builder = self._makeQueryBuilder(summary) 

965 for datasetType in standardizedDatasetTypes: 

966 builder.joinDataset(datasetType, collections, isResult=False) 

967 query = builder.finish() 

968 return queries.DataCoordinateQueryResults(self._db, query) 

969 
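
# A sketch of querying data IDs with the method above, constrained to those
# for which a (hypothetical) dataset type exists in a (hypothetical)
# collection.
def _example_query_data_ids(registry: SqlRegistry) -> List[DataCoordinate]:
    return list(
        registry.queryDataIds(
            ["visit", "detector"],
            datasets="raw",
            collections=["HSC/raw/all"],
            where="instrument = 'HSC'",
        )
    )
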

970 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

971 dataId: Optional[DataId] = None, 

972 datasets: Any = None, 

973 collections: Any = None, 

974 where: Optional[str] = None, 

975 components: Optional[bool] = None, 

976 bind: Optional[Mapping[str, Any]] = None, 

977 check: bool = True, 

978 **kwargs: Any) -> Iterator[DimensionRecord]: 

979 # Docstring inherited from lsst.daf.butler.registry.Registry 

980 if not isinstance(element, DimensionElement): 

981 try: 

982 element = self.dimensions[element] 

983 except KeyError as e: 

984 raise KeyError(f"No such dimension '{element}', available dimensions: " 

985 + str(self.dimensions.getStaticElements())) from e 

986 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

987 where=where, components=components, bind=bind, check=check, **kwargs) 

988 return iter(self._managers.dimensions[element].fetch(dataIds)) 

989 
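
# A sketch of fetching dimension records rather than plain data IDs with the
# method above; the instrument name is hypothetical.
def _example_query_dimension_records(registry: SqlRegistry) -> List[DimensionRecord]:
    return list(registry.queryDimensionRecords("detector", instrument="HSC"))
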

990 def queryDatasetAssociations( 

991 self, 

992 datasetType: Union[str, DatasetType], 

993 collections: Any = ..., 

994 *, 

995 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

996 flattenChains: bool = False, 

997 ) -> Iterator[DatasetAssociation]: 

998 # Docstring inherited from lsst.daf.butler.registry.Registry 

999 if collections is None: 

1000 if not self.defaults.collections: 

1001 raise TypeError("No collections provided to findDataset, " 

1002 "and no defaults from registry construction.") 

1003 collections = self.defaults.collections 

1004 else: 

1005 collections = CollectionQuery.fromExpression(collections) 

1006 TimespanReprClass = self._db.getTimespanRepresentation() 

1007 if isinstance(datasetType, str): 

1008 storage = self._managers.datasets[datasetType] 

1009 else: 

1010 storage = self._managers.datasets[datasetType.name] 

1011 for collectionRecord in collections.iter(self._managers.collections, 

1012 collectionTypes=frozenset(collectionTypes), 

1013 flattenChains=flattenChains): 

1014 query = storage.select(collectionRecord) 

1015 if query is None: 

1016 continue 

1017 for row in self._db.query(query.combine()): 

1018 dataId = DataCoordinate.fromRequiredValues( 

1019 storage.datasetType.dimensions, 

1020 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1021 ) 

1022 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1023 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1024 conform=False) 

1025 if collectionRecord.type is CollectionType.CALIBRATION: 

1026 timespan = TimespanReprClass.extract(row) 

1027 else: 

1028 timespan = None 

1029 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1030 

1031 storageClasses: StorageClassFactory 

1032 """All storage classes known to the registry (`StorageClassFactory`). 

1033 """