
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "SqlRegistry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 TYPE_CHECKING, 

41 Union, 

42) 

43 

44import sqlalchemy 

45 

46from ..core import ( 

47 ButlerURI, 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetAssociation, 

53 DatasetId, 

54 DatasetRef, 

55 DatasetType, 

56 ddl, 

57 Dimension, 

58 DimensionConfig, 

59 DimensionElement, 

60 DimensionGraph, 

61 DimensionRecord, 

62 DimensionUniverse, 

63 NamedKeyMapping, 

64 NameLookupMapping, 

65 Progress, 

66 StorageClassFactory, 

67 Timespan, 

68) 

69from . import queries 

70from ..core.utils import iterable, transactional 

71from ._config import RegistryConfig 

72from ._collectionType import CollectionType 

73from ._defaults import RegistryDefaults 

74from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

75from .managers import RegistryManagerTypes, RegistryManagerInstances 

76from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

77from .summaries import CollectionSummary 

78from .interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord 

79from ._registry import Registry 

80 

81if TYPE_CHECKING: 

82 from .._butlerConfig import ButlerConfig 

83 from .interfaces import ( 

84 CollectionRecord, 

85 Database, 

86 DatastoreRegistryBridgeManager, 

87 ) 

88 

89 

90_LOG = logging.getLogger(__name__) 

91 

92 

93class SqlRegistry(Registry): 

94 """Registry implementation based on SQLAlchemy. 

95 

96 Parameters 

97 ---------- 

98 database : `Database` 

99 Database instance to store Registry. 

100 defaults : `RegistryDefaults` 

101 Default collection search path and/or output `~CollectionType.RUN` 

102 collection. 

103 managers : `RegistryManagerInstances` 

104 All the managers required for this registry. 

105 """ 

106 

107 defaultConfigFile: Optional[str] = None 

108 """Path to configuration defaults. Accessed within the ``configs`` resource 

109 or relative to a search path. Can be None if no defaults are specified. 

110 """ 

111 

112 @classmethod 

113 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

114 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

115 butlerRoot: Optional[str] = None) -> Registry: 

116 """Create registry database and return `SqlRegistry` instance. 

117 

118 This method initializes the database contents; the database must be empty 

119 prior to calling this method. 

120 

121 Parameters 

122 ---------- 

123 config : `RegistryConfig` or `str`, optional 

124 Registry configuration; if missing, the default configuration will 

125 be loaded from registry.yaml. 

126 dimensionConfig : `DimensionConfig` or `str`, optional 

127 Dimensions configuration; if missing, the default configuration 

128 will be loaded from dimensions.yaml. 

129 butlerRoot : `str`, optional 

130 Path to the repository root this `SqlRegistry` will manage. 

131 

132 Returns 

133 ------- 

134 registry : `SqlRegistry` 

135 A new `SqlRegistry` instance. 

136 """ 

137 config = cls.forceRegistryConfig(config) 

138 config.replaceRoot(butlerRoot) 

139 

140 if isinstance(dimensionConfig, str): 

141 dimensionConfig = DimensionConfig(dimensionConfig) 

142 elif dimensionConfig is None: 

143 dimensionConfig = DimensionConfig() 

144 elif not isinstance(dimensionConfig, DimensionConfig): 

145 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

146 

147 DatabaseClass = config.getDatabaseClass() 

148 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

149 namespace=config.get("namespace")) 

150 managerTypes = RegistryManagerTypes.fromConfig(config) 

151 managers = managerTypes.makeRepo(database, dimensionConfig) 

152 return cls(database, RegistryDefaults(), managers) 

153 
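
A minimal sketch of how `createFromConfig` might be called; this is not part of the file, and the ``db`` key and in-memory SQLite URL are assumptions about how `RegistryConfig` carries the connection string:

    from lsst.daf.butler.registry import RegistryConfig

    config = RegistryConfig()
    config["db"] = "sqlite:///:memory:"              # connection-string key is assumed
    registry = SqlRegistry.createFromConfig(config)  # database must be empty beforehand
    assert registry.isWriteable()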

154 @classmethod 

155 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

156 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True, 

157 defaults: Optional[RegistryDefaults] = None) -> Registry: 

158 """Create `Registry` subclass instance from `config`. 

159 

160 The registry database must be initialized prior to calling this method. 

161 

162 Parameters 

163 ---------- 

164 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

165 Registry configuration. 

166 butlerRoot : `str` or `ButlerURI`, optional 

167 Path to the repository root this `Registry` will manage. 

168 writeable : `bool`, optional 

169 If `True` (default) create a read-write connection to the database. 

170 defaults : `RegistryDefaults`, optional 

171 Default collection search path and/or output `~CollectionType.RUN` 

172 collection. 

173 

174 Returns 

175 ------- 

176 registry : `SqlRegistry` (subclass) 

177 A new `SqlRegistry` subclass instance. 

178 """ 

179 config = cls.forceRegistryConfig(config) 

180 config.replaceRoot(butlerRoot) 

181 DatabaseClass = config.getDatabaseClass() 

182 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

183 namespace=config.get("namespace"), writeable=writeable) 

184 managerTypes = RegistryManagerTypes.fromConfig(config) 

185 managers = managerTypes.loadRepo(database) 

186 if defaults is None: 

187 defaults = RegistryDefaults() 

188 return cls(database, defaults, managers) 

189 
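
A sketch of connecting to an existing repository read-only with defaults; the config path, collection names, and the keyword names passed to `RegistryDefaults` are illustrative assumptions:

    from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults

    config = RegistryConfig("registry.yaml")                       # path is illustrative
    defaults = RegistryDefaults(collections=["refcats", "u/someone/run"])
    registry = SqlRegistry.fromConfig(config, writeable=False, defaults=defaults)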

190 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

191 self._db = database 

192 self._managers = managers 

193 self.storageClasses = StorageClassFactory() 

194 # Intentionally invoke property setter to initialize defaults. This 

195 # can only be done after most of the rest of Registry has already been 

196 # initialized, and must be done before the property getter is used. 

197 self.defaults = defaults 

198 

199 def __str__(self) -> str: 

200 return str(self._db) 

201 

202 def __repr__(self) -> str: 

203 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

204 

205 def isWriteable(self) -> bool: 

206 # Docstring inherited from lsst.daf.butler.registry.Registry 

207 return self._db.isWriteable() 

208 

209 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

210 # Docstring inherited from lsst.daf.butler.registry.Registry 

211 if defaults is None: 

212 # No need to copy, because `RegistryDefaults` is immutable; we 

213 # effectively copy on write. 

214 defaults = self.defaults 

215 return type(self)(self._db, defaults, self._managers) 

216 

217 @property 

218 def dimensions(self) -> DimensionUniverse: 

219 # Docstring inherited from lsst.daf.butler.registry.Registry 

220 return self._managers.dimensions.universe 

221 

222 def refresh(self) -> None: 

223 # Docstring inherited from lsst.daf.butler.registry.Registry 

224 self._managers.refresh() 

225 

226 @contextlib.contextmanager 

227 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

228 # Docstring inherited from lsst.daf.butler.registry.Registry 

229 try: 

230 with self._db.transaction(savepoint=savepoint): 

231 yield 

232 except BaseException: 

233 # TODO: this clears the caches sometimes when we wouldn't actually 

234 # need to. Can we avoid that? 

235 self._managers.dimensions.clearCaches() 

236 raise 

237 
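
A small sketch of grouping writes atomically with `transaction`; collection names are made up and `registry` is assumed to be an existing writeable `SqlRegistry`:

    from lsst.daf.butler.registry import CollectionType

    with registry.transaction(savepoint=True):
        registry.registerRun("u/someone/run-1")
        registry.registerCollection("u/someone/tagged", CollectionType.TAGGED)
    # If either call had raised, both registrations would be rolled back and the
    # dimension caches cleared by the except clause above.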

238 def resetConnectionPool(self) -> None: 

239 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

240 

241 This operation is useful when using registry with fork-based 

242 multiprocessing. To use the registry across a fork boundary, ensure 

243 that there are no currently active connections (no session or 

244 transaction is in progress) and that the connection pool is reset 

245 using this method. It should be called by the child process 

246 immediately after the fork. 

247 """ 

248 self._db._engine.dispose() 

249 
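
A hedged sketch of the fork-based multiprocessing pattern described above; the worker logic is illustrative and assumes no transaction is open in the parent at fork time:

    import multiprocessing
    from lsst.daf.butler.registry import CollectionType

    REGISTRY = ...  # an existing SqlRegistry created in the parent process

    def _child_init():
        # Immediately after the fork, before the child issues any queries.
        REGISTRY.resetConnectionPool()

    def _list_runs(_):
        return list(REGISTRY.queryCollections(collectionTypes={CollectionType.RUN}))

    if __name__ == "__main__":
        with multiprocessing.get_context("fork").Pool(2, initializer=_child_init) as pool:
            print(pool.map(_list_runs, range(2)))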

250 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

251 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

252 other data repository client. 

253 

254 Opaque table records can be added via `insertOpaqueData`, retrieved via 

255 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

256 

257 Parameters 

258 ---------- 

259 tableName : `str` 

260 Logical name of the opaque table. This may differ from the 

261 actual name used in the database by a prefix and/or suffix. 

262 spec : `ddl.TableSpec` 

263 Specification for the table to be added. 

264 """ 

265 self._managers.opaque.register(tableName, spec) 

266 

267 @transactional 

268 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

269 """Insert records into an opaque table. 

270 

271 Parameters 

272 ---------- 

273 tableName : `str` 

274 Logical name of the opaque table. Must match the name used in a 

275 previous call to `registerOpaqueTable`. 

276 data 

277 Each additional positional argument is a dictionary that represents 

278 a single row to be added. 

279 """ 

280 self._managers.opaque[tableName].insert(*data) 

281 

282 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

283 """Retrieve records from an opaque table. 

284 

285 Parameters 

286 ---------- 

287 tableName : `str` 

288 Logical name of the opaque table. Must match the name used in a 

289 previous call to `registerOpaqueTable`. 

290 where 

291 Additional keyword arguments are interpreted as equality 

292 constraints that restrict the returned rows (combined with AND); 

293 keyword arguments are column names and values are the values they 

294 must have. 

295 

296 Yields 

297 ------ 

298 row : `dict` 

299 A dictionary representing a single result row. 

300 """ 

301 yield from self._managers.opaque[tableName].fetch(**where) 

302 

303 @transactional 

304 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

305 """Remove records from an opaque table. 

306 

307 Parameters 

308 ---------- 

309 tableName : `str` 

310 Logical name of the opaque table. Must match the name used in a 

311 previous call to `registerOpaqueTable`. 

312 where 

313 Additional keyword arguments are interpreted as equality 

314 constraints that restrict the deleted rows (combined with AND); 

315 keyword arguments are column names and values are the values they 

316 must have. 

317 """ 

318 self._managers.opaque[tableName].delete(where.keys(), where) 

319 
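
An illustrative round trip through the opaque-table API; the table name, column names, and `ddl` field arguments are assumptions rather than any real datastore schema:

    import sqlalchemy
    from lsst.daf.butler import ddl

    spec = ddl.TableSpec(fields=[
        ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
        ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    ])
    registry.registerOpaqueTable("example_datastore_records", spec)
    registry.insertOpaqueData("example_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
    rows = list(registry.fetchOpaqueData("example_datastore_records", dataset_id=1))
    registry.deleteOpaqueData("example_datastore_records", dataset_id=1)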

320 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

321 doc: Optional[str] = None) -> None: 

322 # Docstring inherited from lsst.daf.butler.registry.Registry 

323 self._managers.collections.register(name, type, doc=doc) 

324 

325 def getCollectionType(self, name: str) -> CollectionType: 

326 # Docstring inherited from lsst.daf.butler.registry.Registry 

327 return self._managers.collections.find(name).type 

328 

329 def _get_collection_record(self, name: str) -> CollectionRecord: 

330 # Docstring inherited from lsst.daf.butler.registry.Registry 

331 return self._managers.collections.find(name) 

332 

333 def registerRun(self, name: str, doc: Optional[str] = None) -> None: 

334 # Docstring inherited from lsst.daf.butler.registry.Registry 

335 self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

336 

337 @transactional 

338 def removeCollection(self, name: str) -> None: 

339 # Docstring inherited from lsst.daf.butler.registry.Registry 

340 self._managers.collections.remove(name) 

341 

342 def getCollectionChain(self, parent: str) -> CollectionSearch: 

343 # Docstring inherited from lsst.daf.butler.registry.Registry 

344 record = self._managers.collections.find(parent) 

345 if record.type is not CollectionType.CHAINED: 

346 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

347 assert isinstance(record, ChainedCollectionRecord) 

348 return record.children 

349 

350 @transactional 

351 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

352 # Docstring inherited from lsst.daf.butler.registry.Registry 

353 record = self._managers.collections.find(parent) 

354 if record.type is not CollectionType.CHAINED: 

355 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

356 assert isinstance(record, ChainedCollectionRecord) 

357 children = CollectionSearch.fromExpression(children) 

358 if children != record.children or flatten: 

359 record.update(self._managers.collections, children, flatten=flatten) 

360 
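
A short sketch of building a CHAINED collection; all collection names are illustrative:

    from lsst.daf.butler.registry import CollectionType

    registry.registerRun("u/someone/run-a")
    registry.registerRun("u/someone/run-b")
    registry.registerCollection("u/someone/chain", CollectionType.CHAINED)
    registry.setCollectionChain("u/someone/chain", ["u/someone/run-b", "u/someone/run-a"])
    print(registry.getCollectionChain("u/someone/chain"))  # searches run-b before run-a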

361 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

362 # Docstring inherited from lsst.daf.butler.registry.Registry 

363 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

364 

365 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

366 # Docstring inherited from lsst.daf.butler.registry.Registry 

367 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

368 

369 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

370 # Docstring inherited from lsst.daf.butler.registry.Registry 

371 record = self._managers.collections.find(collection) 

372 return self._managers.datasets.getCollectionSummary(record) 

373 

374 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

375 # Docstring inherited from lsst.daf.butler.registry.Registry 

376 _, inserted = self._managers.datasets.register(datasetType) 

377 return inserted 

378 

379 def removeDatasetType(self, name: str) -> None: 

380 # Docstring inherited from lsst.daf.butler.registry.Registry 

381 self._managers.datasets.remove(name) 

382 

383 def getDatasetType(self, name: str) -> DatasetType: 

384 # Docstring inherited from lsst.daf.butler.registry.Registry 

385 return self._managers.datasets[name].datasetType 

386 

387 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

388 collections: Any = None, timespan: Optional[Timespan] = None, 

389 **kwargs: Any) -> Optional[DatasetRef]: 

390 # Docstring inherited from lsst.daf.butler.registry.Registry 

391 if isinstance(datasetType, DatasetType): 

392 storage = self._managers.datasets[datasetType.name] 

393 else: 

394 storage = self._managers.datasets[datasetType] 

395 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

396 universe=self.dimensions, defaults=self.defaults.dataId, 

397 **kwargs) 

398 if collections is None: 

399 if not self.defaults.collections: 

400 raise TypeError("No collections provided to findDataset, " 

401 "and no defaults from registry construction.") 

402 collections = self.defaults.collections 

403 else: 

404 collections = CollectionSearch.fromExpression(collections) 

405 for collectionRecord in collections.iter(self._managers.collections): 

406 if (collectionRecord.type is CollectionType.CALIBRATION 

407 and (not storage.datasetType.isCalibration() or timespan is None)): 

408 continue 

409 result = storage.find(collectionRecord, dataId, timespan=timespan) 

410 if result is not None: 

411 return result 

412 

413 return None 

414 
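
A hedged example of a single-dataset lookup; the dataset type, data ID keys, and collection names are assumptions about the repository contents:

    from lsst.daf.butler import Timespan

    ref = registry.findDataset(
        "flat",
        instrument="DummyCam", detector=1, physical_filter="d-r",
        collections=["DummyCam/calib"],
        timespan=Timespan(None, None),   # unbounded: accept any validity range
    )
    if ref is not None:
        print(ref.dataId, ref.run)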

415 @transactional 

416 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

417 run: Optional[str] = None, expand: bool = True, 

418 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE) -> List[DatasetRef]: 

419 # Docstring inherited from lsst.daf.butler.registry.Registry 

420 if isinstance(datasetType, DatasetType): 

421 storage = self._managers.datasets.find(datasetType.name) 

422 if storage is None: 

423 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

424 else: 

425 storage = self._managers.datasets.find(datasetType) 

426 if storage is None: 

427 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

428 if run is None: 

429 if self.defaults.run is None: 

430 raise TypeError("No run provided to insertDatasets, " 

431 "and no default from registry construction.") 

432 run = self.defaults.run 

433 runRecord = self._managers.collections.find(run) 

434 if runRecord.type is not CollectionType.RUN: 

435 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

436 assert isinstance(runRecord, RunRecord) 

437 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

438 if expand: 

439 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

440 for dataId in progress.wrap(dataIds, 

441 f"Expanding {storage.datasetType.name} data IDs")] 

442 else: 

443 expandedDataIds = [DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) 

444 for dataId in dataIds] 

445 try: 

446 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

447 except sqlalchemy.exc.IntegrityError as err: 

448 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

449 f"one or more datasets of type {storage.datasetType} into " 

450 f"collection '{run}'. " 

451 f"This probably means a dataset with the same data ID " 

452 f"and dataset type already exists, but it may also mean a " 

453 f"dimension row is missing.") from err 

454 return refs 

455 
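
A sketch of registering a dataset type and inserting one dataset into a RUN collection; the names, dimensions, and storage class are illustrative:

    from lsst.daf.butler import DatasetType

    datasetType = DatasetType("toy_raw", dimensions=["instrument", "detector", "exposure"],
                              storageClass="Exposure", universe=registry.dimensions)
    registry.registerDatasetType(datasetType)
    registry.registerRun("u/someone/ingest")
    (ref,) = registry.insertDatasets(
        datasetType,
        dataIds=[{"instrument": "DummyCam", "detector": 1, "exposure": 42}],
        run="u/someone/ingest",
    )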

456 @transactional 

457 def _importDatasets(self, datasets: Iterable[DatasetRef], expand: bool = True, 

458 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

459 reuseIds: bool = False) -> List[DatasetRef]: 

460 # Docstring inherited from lsst.daf.butler.registry.Registry 

461 datasets = list(datasets) 

462 if not datasets: 

463 # nothing to do 

464 return [] 

465 

466 # find dataset type 

467 datasetTypes = set(dataset.datasetType for dataset in datasets) 

468 if len(datasetTypes) != 1: 

469 raise ValueError(f"Multiple dataset types in input datasets: {datasetTypes}") 

470 datasetType = datasetTypes.pop() 

471 

472 # get storage handler for this dataset type 

473 storage = self._managers.datasets.find(datasetType.name) 

474 if storage is None: 

475 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

476 

477 # find run name 

478 runs = set(dataset.run for dataset in datasets) 

479 if len(runs) != 1: 

480 raise ValueError(f"Multiple run names in input datasets: {runs}") 

481 run = runs.pop() 

482 if run is None: 

483 if self.defaults.run is None: 

484 raise TypeError("No run provided to ingestDatasets, " 

485 "and no default from registry construction.") 

486 run = self.defaults.run 

487 

488 runRecord = self._managers.collections.find(run) 

489 if runRecord.type is not CollectionType.RUN: 

490 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

491 assert isinstance(runRecord, RunRecord) 

492 

493 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

494 if expand: 

495 expandedDatasets = [ 

496 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

497 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs")] 

498 else: 

499 expandedDatasets = [ 

500 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

501 for dataset in datasets 

502 ] 

503 

504 try: 

505 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

506 except sqlalchemy.exc.IntegrityError as err: 

507 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

508 f"one or more datasets of type {storage.datasetType} into " 

509 f"collection '{run}'. " 

510 f"This probably means a dataset with the same data ID " 

511 f"and dataset type already exists, but it may also mean a " 

512 f"dimension row is missing.") from err 

513 return refs 

514 

515 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

516 # Docstring inherited from lsst.daf.butler.registry.Registry 

517 return self._managers.datasets.getDatasetRef(id) 

518 

519 @transactional 

520 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

521 # Docstring inherited from lsst.daf.butler.registry.Registry 

522 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

523 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

524 desc="Removing datasets by type"): 

525 storage = self._managers.datasets[datasetType.name] 

526 try: 

527 storage.delete(refsForType) 

528 except sqlalchemy.exc.IntegrityError as err: 

529 raise OrphanedRecordError("One or more datasets is still " 

530 "present in one or more Datastores.") from err 

531 

532 @transactional 

533 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

534 # Docstring inherited from lsst.daf.butler.registry.Registry 

535 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

536 collectionRecord = self._managers.collections.find(collection) 

537 if collectionRecord.type is not CollectionType.TAGGED: 

538 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

539 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

540 desc="Associating datasets by type"): 

541 storage = self._managers.datasets[datasetType.name] 

542 try: 

543 storage.associate(collectionRecord, refsForType) 

544 except sqlalchemy.exc.IntegrityError as err: 

545 raise ConflictingDefinitionError( 

546 f"Constraint violation while associating dataset of type {datasetType.name} with " 

547 f"collection {collection}. This probably means that one or more datasets with the same " 

548 f"dataset type and data ID already exist in the collection, but it may also indicate " 

549 f"that the datasets do not exist." 

550 ) from err 

551 

552 @transactional 

553 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

554 # Docstring inherited from lsst.daf.butler.registry.Registry 

555 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

556 collectionRecord = self._managers.collections.find(collection) 

557 if collectionRecord.type is not CollectionType.TAGGED: 

558 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

559 "expected TAGGED.") 

560 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

561 desc="Disassociating datasets by type"): 

562 storage = self._managers.datasets[datasetType.name] 

563 storage.disassociate(collectionRecord, refsForType) 

564 

565 @transactional 

566 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

567 # Docstring inherited from lsst.daf.butler.registry.Registry 

568 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

569 collectionRecord = self._managers.collections.find(collection) 

570 for datasetType, refsForType in progress.iter_item_chunks(DatasetRef.groupByType(refs).items(), 

571 desc="Certifying datasets by type"): 

572 storage = self._managers.datasets[datasetType.name] 

573 storage.certify(collectionRecord, refsForType, timespan) 

574 

575 @transactional 

576 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

577 dataIds: Optional[Iterable[DataId]] = None) -> None: 

578 # Docstring inherited from lsst.daf.butler.registry.Registry 

579 collectionRecord = self._managers.collections.find(collection) 

580 if isinstance(datasetType, str): 

581 storage = self._managers.datasets[datasetType] 

582 else: 

583 storage = self._managers.datasets[datasetType.name] 

584 standardizedDataIds = None 

585 if dataIds is not None: 

586 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

587 for d in dataIds] 

588 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

589 

590 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

591 """Return an object that allows a new `Datastore` instance to 

592 communicate with this `Registry`. 

593 

594 Returns 

595 ------- 

596 manager : `DatastoreRegistryBridgeManager` 

597 Object that mediates communication between this `Registry` and its 

598 associated datastores. 

599 """ 

600 return self._managers.datastores 

601 

602 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

603 # Docstring inherited from lsst.daf.butler.registry.Registry 

604 return self._managers.datastores.findDatastores(ref) 

605 

606 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

607 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

608 withDefaults: bool = True, 

609 **kwargs: Any) -> DataCoordinate: 

610 # Docstring inherited from lsst.daf.butler.registry.Registry 

611 if not withDefaults: 

612 defaults = None 

613 else: 

614 defaults = self.defaults.dataId 

615 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, 

616 defaults=defaults, **kwargs) 

617 if standardized.hasRecords(): 

618 return standardized 

619 if records is None: 

620 records = {} 

621 elif isinstance(records, NamedKeyMapping): 

622 records = records.byName() 

623 else: 

624 records = dict(records) 

625 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

626 records.update(dataId.records.byName()) 

627 keys = standardized.byName() 

628 for element in standardized.graph.primaryKeyTraversalOrder: 

629 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

630 if record is ...: 

631 if isinstance(element, Dimension) and keys.get(element.name) is None: 

632 if element in standardized.graph.required: 

633 raise LookupError( 

634 f"No value or null value for required dimension {element.name}." 

635 ) 

636 keys[element.name] = None 

637 record = None 

638 else: 

639 storage = self._managers.dimensions[element] 

640 dataIdSet = DataCoordinateIterable.fromScalar( 

641 DataCoordinate.standardize(keys, graph=element.graph) 

642 ) 

643 fetched = tuple(storage.fetch(dataIdSet)) 

644 try: 

645 (record,) = fetched 

646 except ValueError: 

647 record = None 

648 records[element.name] = record 

649 if record is not None: 

650 for d in element.implied: 

651 value = getattr(record, d.name) 

652 if keys.setdefault(d.name, value) != value: 

653 raise InconsistentDataIdError( 

654 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

655 f"but {element.name} implies {d.name}={value!r}." 

656 ) 

657 else: 

658 if element in standardized.graph.required: 

659 raise LookupError( 

660 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

661 ) 

662 if element.alwaysJoin: 

663 raise InconsistentDataIdError( 

664 f"Could not fetch record for element {element.name} via keys {keys}, ", 

665 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

666 "related." 

667 ) 

668 for d in element.implied: 

669 keys.setdefault(d.name, None) 

670 records.setdefault(d.name, None) 

671 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

672 
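
A short example of expanding a minimal data ID; the dimension values are invented and would need to exist as dimension records in the repository:

    dataId = registry.expandDataId(instrument="DummyCam", exposure=42)
    print(dataId.hasRecords())         # True: DimensionRecords are now attached
    print(dataId["physical_filter"])   # implied value filled in from the exposure record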

673 def insertDimensionData(self, element: Union[DimensionElement, str], 

674 *data: Union[Mapping[str, Any], DimensionRecord], 

675 conform: bool = True) -> None: 

676 # Docstring inherited from lsst.daf.butler.registry.Registry 

677 if conform: 

678 if isinstance(element, str): 

679 element = self.dimensions[element] 

680 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

681 for row in data] 

682 else: 

683 # Ignore typing since caller said to trust them with conform=False. 

684 records = data # type: ignore 

685 storage = self._managers.dimensions[element] # type: ignore 

686 storage.insert(*records) 

687 

688 def syncDimensionData(self, element: Union[DimensionElement, str], 

689 row: Union[Mapping[str, Any], DimensionRecord], 

690 conform: bool = True) -> bool: 

691 # Docstring inherited from lsst.daf.butler.registry.Registry 

692 if conform: 

693 if isinstance(element, str): 

694 element = self.dimensions[element] 

695 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

696 else: 

697 # Ignore typing since caller said to trust them with conform=False. 

698 record = row # type: ignore 

699 storage = self._managers.dimensions[element] # type: ignore 

700 return storage.sync(record) 

701 
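
A sketch of populating dimension records; the field names follow the default dimension universe, but all values are invented:

    registry.insertDimensionData(
        "instrument",
        {"name": "DummyCam", "visit_max": 1_000_000, "exposure_max": 1_000_000,
         "detector_max": 4, "class_name": "lsst.obs.dummy.DummyCam"},
    )
    # syncDimensionData returns True only if it actually inserted or updated the row.
    changed = registry.syncDimensionData(
        "physical_filter",
        {"instrument": "DummyCam", "name": "d-r", "band": "r"},
    )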

702 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

703 ) -> Iterator[DatasetType]: 

704 # Docstring inherited from lsst.daf.butler.registry.Registry 

705 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

706 if wildcard is Ellipsis: 

707 for datasetType in self._managers.datasets: 

708 # The dataset type can no longer be a component 

709 yield datasetType 

710 if components: 

711 # Automatically create the component dataset types 

712 try: 

713 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

714 except KeyError as err: 

715 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

716 "if it has components they will not be included in query results.") 

717 else: 

718 yield from componentsForDatasetType 

719 return 

720 done: Set[str] = set() 

721 for name in wildcard.strings: 

722 storage = self._managers.datasets.find(name) 

723 if storage is not None: 

724 done.add(storage.datasetType.name) 

725 yield storage.datasetType 

726 if wildcard.patterns: 

727 # If components (the argument) is None, we'll save component 

728 # datasets that we might want to match, but only if their parents 

729 # didn't get included. 

730 componentsForLater = [] 

731 for registeredDatasetType in self._managers.datasets: 

732 # Components are not stored in registry so expand them here 

733 allDatasetTypes = [registeredDatasetType] 

734 try: 

735 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

736 except KeyError as err: 

737 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

738 "if it has components they will not be included in query results.") 

739 for datasetType in allDatasetTypes: 

740 if datasetType.name in done: 

741 continue 

742 parentName, componentName = datasetType.nameAndComponent() 

743 if componentName is not None and not components: 

744 if components is None and parentName not in done: 

745 componentsForLater.append(datasetType) 

746 continue 

747 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

748 done.add(datasetType.name) 

749 yield datasetType 

750 # Go back and try to match saved components. 

751 for datasetType in componentsForLater: 

752 parentName, _ = datasetType.nameAndComponent() 

753 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

754 yield datasetType 

755 
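
An illustrative wildcard query for dataset types; the pattern is an assumption about what is registered:

    import re

    for datasetType in registry.queryDatasetTypes(re.compile(r"deepCoadd.*"), components=True):
        print(datasetType.name, list(datasetType.dimensions.names))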

756 def queryCollections(self, expression: Any = ..., 

757 datasetType: Optional[DatasetType] = None, 

758 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

759 flattenChains: bool = False, 

760 includeChains: Optional[bool] = None) -> Iterator[str]: 

761 # Docstring inherited from lsst.daf.butler.registry.Registry 

762 

763 # Right now the datasetTypes argument is completely ignored, but that 

764 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

765 # ticket will take care of that. 

766 query = CollectionQuery.fromExpression(expression) 

767 for record in query.iter(self._managers.collections, collectionTypes=frozenset(collectionTypes), 

768 flattenChains=flattenChains, includeChains=includeChains): 

769 yield record.name 

770 

771 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

772 """Return a `QueryBuilder` instance capable of constructing and 

773 managing more complex queries than those obtainable via `Registry` 

774 interfaces. 

775 

776 This is an advanced interface; downstream code should prefer 

777 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

778 are sufficient. 

779 

780 Parameters 

781 ---------- 

782 summary : `queries.QuerySummary` 

783 Object describing and categorizing the full set of dimensions that 

784 will be included in the query. 

785 

786 Returns 

787 ------- 

788 builder : `queries.QueryBuilder` 

789 Object that can be used to construct and perform advanced queries. 

790 """ 

791 return queries.QueryBuilder( 

792 summary, 

793 queries.RegistryManagers( 

794 collections=self._managers.collections, 

795 dimensions=self._managers.dimensions, 

796 datasets=self._managers.datasets, 

797 TimespanReprClass=self._db.getTimespanRepresentation(), 

798 ), 

799 ) 

800 

801 def queryDatasets(self, datasetType: Any, *, 

802 collections: Any = None, 

803 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

804 dataId: Optional[DataId] = None, 

805 where: Optional[str] = None, 

806 findFirst: bool = False, 

807 components: Optional[bool] = None, 

808 bind: Optional[Mapping[str, Any]] = None, 

809 check: bool = True, 

810 **kwargs: Any) -> queries.DatasetQueryResults: 

811 # Docstring inherited from lsst.daf.butler.registry.Registry 

812 

813 # Standardize the collections expression. 

814 if collections is None: 

815 if not self.defaults.collections: 

816 raise TypeError("No collections provided to findDataset, " 

817 "and no defaults from registry construction.") 

818 collections = self.defaults.collections 

819 elif findFirst: 

820 collections = CollectionSearch.fromExpression(collections) 

821 else: 

822 collections = CollectionQuery.fromExpression(collections) 

823 # Standardize and expand the data ID provided as a constraint. 

824 standardizedDataId = self.expandDataId(dataId, **kwargs) 

825 

826 # We can only query directly if given a non-component DatasetType 

827 # instance. If we were given an expression or str or a component 

828 # DatasetType instance, we'll populate this dict, recurse, and return. 

829 # If we already have a non-component DatasetType, it will remain None 

830 # and we'll run the query directly. 

831 composition: Optional[ 

832 Dict[ 

833 DatasetType, # parent dataset type 

834 List[Optional[str]] # component name, or None for parent 

835 ] 

836 ] = None 

837 if not isinstance(datasetType, DatasetType): 

838 # We were given a dataset type expression (which may be as simple 

839 # as a str). Loop over all matching datasets, delegating handling 

840 # of the `components` argument to queryDatasetTypes, as we populate 

841 # the composition dict. 

842 composition = defaultdict(list) 

843 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

844 parentName, componentName = trueDatasetType.nameAndComponent() 

845 if componentName is not None: 

846 parentDatasetType = self.getDatasetType(parentName) 

847 composition.setdefault(parentDatasetType, []).append(componentName) 

848 else: 

849 composition.setdefault(trueDatasetType, []).append(None) 

850 elif datasetType.isComponent(): 

851 # We were given a true DatasetType instance, but it's a component. 

852 # the composition dict will have exactly one item. 

853 parentName, componentName = datasetType.nameAndComponent() 

854 parentDatasetType = self.getDatasetType(parentName) 

855 composition = {parentDatasetType: [componentName]} 

856 if composition is not None: 

857 # We need to recurse. Do that once for each parent dataset type. 

858 chain = [] 

859 for parentDatasetType, componentNames in composition.items(): 

860 parentResults = self.queryDatasets( 

861 parentDatasetType, 

862 collections=collections, 

863 dimensions=dimensions, 

864 dataId=standardizedDataId, 

865 where=where, 

866 findFirst=findFirst, 

867 check=check, 

868 ) 

869 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

870 chain.append( 

871 parentResults.withComponents(componentNames) 

872 ) 

873 else: 

874 # Should only happen if we know there would be no results. 

875 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

876 and not parentResults._chain 

877 return queries.ChainedDatasetQueryResults(chain) 

878 # If we get here, there's no need to recurse (or we are already 

879 # recursing; there can only ever be one level of recursion). 

880 

881 # The full set of dimensions in the query is the combination of those 

882 # needed for the DatasetType and those explicitly requested, if any. 

883 requestedDimensionNames = set(datasetType.dimensions.names) 

884 if dimensions is not None: 

885 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

886 # Construct the summary structure needed to construct a QueryBuilder. 

887 summary = queries.QuerySummary( 

888 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

889 dataId=standardizedDataId, 

890 expression=where, 

891 bind=bind, 

892 defaults=self.defaults.dataId, 

893 check=check, 

894 ) 

895 builder = self.makeQueryBuilder(summary) 

896 # Add the dataset subquery to the query, telling the QueryBuilder to 

897 # include the rank of the selected collection in the results only if we 

898 # need to findFirst. Note that if any of the collections are 

899 # actually wildcard expressions, and we've asked for deduplication, 

900 # this will raise TypeError for us. 

901 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst): 

902 return queries.ChainedDatasetQueryResults(()) 

903 query = builder.finish() 

904 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

905 
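
A hedged query example using a user expression with bound parameters; the dataset type, collection, and dimension names are illustrative:

    refs = registry.queryDatasets(
        "calexp",
        collections=["u/someone/chain"],
        where="instrument = 'DummyCam' AND visit > cutoff",
        bind={"cutoff": 100},
        findFirst=True,   # return only the first match per data ID in search order
    )
    for ref in refs:
        print(ref.dataId, ref.run)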

906 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

907 dataId: Optional[DataId] = None, 

908 datasets: Any = None, 

909 collections: Any = None, 

910 where: Optional[str] = None, 

911 components: Optional[bool] = None, 

912 bind: Optional[Mapping[str, Any]] = None, 

913 check: bool = True, 

914 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

915 # Docstring inherited from lsst.daf.butler.registry.Registry 

916 dimensions = iterable(dimensions) 

917 standardizedDataId = self.expandDataId(dataId, **kwargs) 

918 standardizedDatasetTypes = set() 

919 requestedDimensions = self.dimensions.extract(dimensions) 

920 queryDimensionNames = set(requestedDimensions.names) 

921 if datasets is not None: 

922 if collections is None: 

923 if not self.defaults.collections: 

924 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

925 collections = self.defaults.collections 

926 else: 

927 # Preprocess collections expression in case the original 

928 # included single-pass iterators (we'll want to use it multiple 

929 # times below). 

930 collections = CollectionQuery.fromExpression(collections) 

931 for datasetType in self.queryDatasetTypes(datasets, components=components): 

932 queryDimensionNames.update(datasetType.dimensions.names) 

933 # If any matched dataset type is a component, just operate on 

934 # its parent instead, because Registry doesn't know anything 

935 # about what components exist, and here (unlike queryDatasets) 

936 # we don't care about returning them. 

937 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

938 if componentName is not None: 

939 datasetType = self.getDatasetType(parentDatasetTypeName) 

940 standardizedDatasetTypes.add(datasetType) 

941 

942 summary = queries.QuerySummary( 

943 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

944 dataId=standardizedDataId, 

945 expression=where, 

946 bind=bind, 

947 defaults=self.defaults.dataId, 

948 check=check, 

949 ) 

950 builder = self.makeQueryBuilder(summary) 

951 for datasetType in standardizedDatasetTypes: 

952 builder.joinDataset(datasetType, collections, isResult=False) 

953 query = builder.finish() 

954 return queries.DataCoordinateQueryResults(self._db, query) 

955 
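
A sketch of a data-ID query constrained by dataset existence; all names are illustrative:

    dataIds = registry.queryDataIds(
        ["visit", "detector"],
        datasets=["raw", "calexp"],
        collections=["DummyCam/raw", "u/someone/run"],
        where="band = 'r'",
    )
    for dataId in dataIds:
        print(dataId)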

956 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

957 dataId: Optional[DataId] = None, 

958 datasets: Any = None, 

959 collections: Any = None, 

960 where: Optional[str] = None, 

961 components: Optional[bool] = None, 

962 bind: Optional[Mapping[str, Any]] = None, 

963 check: bool = True, 

964 **kwargs: Any) -> Iterator[DimensionRecord]: 

965 # Docstring inherited from lsst.daf.butler.registry.Registry 

966 if not isinstance(element, DimensionElement): 

967 try: 

968 element = self.dimensions[element] 

969 except KeyError as e: 

970 raise KeyError(f"No such dimension '{element}', available dimensions: " 

971 + str(self.dimensions.getStaticElements())) from e 

972 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

973 where=where, components=components, bind=bind, check=check, **kwargs) 

974 return iter(self._managers.dimensions[element].fetch(dataIds)) 

975 

976 def queryDatasetAssociations( 

977 self, 

978 datasetType: Union[str, DatasetType], 

979 collections: Any = ..., 

980 *, 

981 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

982 flattenChains: bool = False, 

983 ) -> Iterator[DatasetAssociation]: 

984 # Docstring inherited from lsst.daf.butler.registry.Registry 

985 if collections is None: 

986 if not self.defaults.collections: 

987 raise TypeError("No collections provided to findDataset, " 

988 "and no defaults from registry construction.") 

989 collections = self.defaults.collections 

990 else: 

991 collections = CollectionQuery.fromExpression(collections) 

992 TimespanReprClass = self._db.getTimespanRepresentation() 

993 if isinstance(datasetType, str): 

994 storage = self._managers.datasets[datasetType] 

995 else: 

996 storage = self._managers.datasets[datasetType.name] 

997 for collectionRecord in collections.iter(self._managers.collections, 

998 collectionTypes=frozenset(collectionTypes), 

999 flattenChains=flattenChains): 

1000 query = storage.select(collectionRecord) 

1001 if query is None: 

1002 continue 

1003 for row in self._db.query(query.combine()): 

1004 dataId = DataCoordinate.fromRequiredValues( 

1005 storage.datasetType.dimensions, 

1006 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1007 ) 

1008 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1009 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1010 conform=False) 

1011 if collectionRecord.type is CollectionType.CALIBRATION: 

1012 timespan = TimespanReprClass.extract(row) 

1013 else: 

1014 timespan = None 

1015 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1016 

1017 storageClasses: StorageClassFactory 

1018 """All storage classes known to the registry (`StorageClassFactory`). 

1019 """