Coverage for python/lsst/daf/butler/registries/sql.py: 14%

433 statements  

coverage.py v6.5.0, created at 2022-10-04 02:19 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("SqlRegistry",) 

25 

26import contextlib 

27import logging 

28import warnings 

29from typing import ( 

30 TYPE_CHECKING, 

31 Any, 

32 Dict, 

33 Iterable, 

34 Iterator, 

35 List, 

36 Literal, 

37 Mapping, 

38 Optional, 

39 Sequence, 

40 Set, 

41 Tuple, 

42 Union, 

43) 

44 

45import sqlalchemy 

46from lsst.resources import ResourcePathExpression 

47from lsst.utils.iteration import ensure_iterable 

48 

49from ..core import ( 

50 Config, 

51 DataCoordinate, 

52 DataCoordinateIterable, 

53 DataId, 

54 DatasetAssociation, 

55 DatasetId, 

56 DatasetRef, 

57 DatasetType, 

58 Dimension, 

59 DimensionConfig, 

60 DimensionElement, 

61 DimensionGraph, 

62 DimensionRecord, 

63 DimensionUniverse, 

64 NamedKeyMapping, 

65 NameLookupMapping, 

66 Progress, 

67 StorageClassFactory, 

68 Timespan, 

69 ddl, 

70) 

71from ..core.utils import transactional 

72from ..registry import ( 

73 ArgumentError, 

74 CollectionExpressionError, 

75 CollectionSummary, 

76 CollectionType, 

77 CollectionTypeError, 

78 ConflictingDefinitionError, 

79 DataIdValueError, 

80 DatasetTypeError, 

81 DimensionNameError, 

82 InconsistentDataIdError, 

83 NoDefaultCollectionError, 

84 OrphanedRecordError, 

85 Registry, 

86 RegistryConfig, 

87 RegistryDefaults, 

88 queries, 

89) 

90from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord 

91from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes 

92from ..registry.wildcards import CollectionWildcard, DatasetTypeWildcard 

93 

94if TYPE_CHECKING:    (94 ↛ 95: line 94 didn't jump to line 95, because the condition on line 94 was never true)

95 from .._butlerConfig import ButlerConfig 

96 from ..registry.interfaces import CollectionRecord, Database, DatastoreRegistryBridgeManager 

97 

98 

99_LOG = logging.getLogger(__name__) 

100 

101 

102class SqlRegistry(Registry): 

103 """Registry implementation based on SQLAlchemy. 

104 

105 Parameters 

106 ---------- 

107 database : `Database` 

108 Database instance to store Registry. 

109 defaults : `RegistryDefaults` 

110 Default collection search path and/or output `~CollectionType.RUN` 

111 collection. 

112 managers : `RegistryManagerInstances` 

113 All the managers required for this registry. 

114 """ 

115 

116 defaultConfigFile: Optional[str] = None 

117 """Path to configuration defaults. Accessed within the ``configs`` resource 

118 or relative to a search path. Can be `None` if no defaults are specified. 

119 """ 

120 

121 @classmethod 

122 def createFromConfig( 

123 cls, 

124 config: Optional[Union[RegistryConfig, str]] = None, 

125 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

126 butlerRoot: Optional[ResourcePathExpression] = None, 

127 ) -> Registry: 

128 """Create registry database and return `SqlRegistry` instance. 

129 

130 This method initializes database contents; the database must be empty 

131 prior to calling this method. 

132 

133 Parameters 

134 ---------- 

135 config : `RegistryConfig` or `str`, optional 

136 Registry configuration; if missing, the default configuration will 

137 be loaded from registry.yaml. 

138 dimensionConfig : `DimensionConfig` or `str`, optional 

139 Dimensions configuration; if missing, the default configuration 

140 will be loaded from dimensions.yaml. 

141 butlerRoot : convertible to `lsst.resources.ResourcePath`, optional 

142 Path to the repository root this `SqlRegistry` will manage. 

143 

144 Returns 

145 ------- 

146 registry : `SqlRegistry` 

147 A new `SqlRegistry` instance. 

148 """ 

149 config = cls.forceRegistryConfig(config) 

150 config.replaceRoot(butlerRoot) 

151 

152 if isinstance(dimensionConfig, str): 

153 dimensionConfig = DimensionConfig(dimensionConfig) 

154 elif dimensionConfig is None: 

155 dimensionConfig = DimensionConfig() 

156 elif not isinstance(dimensionConfig, DimensionConfig): 

157 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

158 

159 DatabaseClass = config.getDatabaseClass() 

160 database = DatabaseClass.fromUri( 

161 str(config.connectionString), origin=config.get("origin", 0), namespace=config.get("namespace") 

162 ) 

163 managerTypes = RegistryManagerTypes.fromConfig(config) 

164 managers = managerTypes.makeRepo(database, dimensionConfig) 

165 return cls(database, RegistryDefaults(), managers) 

166 

167 @classmethod 

168 def fromConfig( 

169 cls, 

170 config: Union[ButlerConfig, RegistryConfig, Config, str], 

171 butlerRoot: Optional[ResourcePathExpression] = None, 

172 writeable: bool = True, 

173 defaults: Optional[RegistryDefaults] = None, 

174 ) -> Registry: 

175 """Create `Registry` subclass instance from `config`. 

176 

177 Registry database must be initialized prior to calling this method. 

178 

179 Parameters 

180 ---------- 

181 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

182 Registry configuration. 

183 butlerRoot : `lsst.resources.ResourcePathExpression`, optional 

184 Path to the repository root this `Registry` will manage. 

185 writeable : `bool`, optional 

186 If `True` (default) create a read-write connection to the database. 

187 defaults : `RegistryDefaults`, optional 

188 Default collection search path and/or output `~CollectionType.RUN` 

189 collection. 

190 

191 Returns 

192 ------- 

193 registry : `SqlRegistry` (subclass) 

194 A new `SqlRegistry` subclass instance. 

195 """ 

196 config = cls.forceRegistryConfig(config) 

197 config.replaceRoot(butlerRoot) 

198 DatabaseClass = config.getDatabaseClass() 

199 database = DatabaseClass.fromUri( 

200 str(config.connectionString), 

201 origin=config.get("origin", 0), 

202 namespace=config.get("namespace"), 

203 writeable=writeable, 

204 ) 

205 managerTypes = RegistryManagerTypes.fromConfig(config) 

206 managers = managerTypes.loadRepo(database) 

207 if defaults is None: 

208 defaults = RegistryDefaults() 

209 return cls(database, defaults, managers) 

210 

211 def __init__(self, database: Database, defaults: RegistryDefaults, managers: RegistryManagerInstances): 

212 self._db = database 

213 self._managers = managers 

214 self.storageClasses = StorageClassFactory() 

215 # Intentionally invoke property setter to initialize defaults. This 

216 # can only be done after most of the rest of Registry has already been 

217 # initialized, and must be done before the property getter is used. 

218 self.defaults = defaults 

219 # In the future DatasetIdFactory may become configurable and this 

220 # instance will need to be shared with datasets manager. 

221 self.datasetIdFactory = DatasetIdFactory() 

222 

223 def __str__(self) -> str: 

224 return str(self._db) 

225 

226 def __repr__(self) -> str: 

227 return f"SqlRegistry({self._db!r}, {self.dimensions!r})" 

228 

229 def isWriteable(self) -> bool: 

230 # Docstring inherited from lsst.daf.butler.registry.Registry 

231 return self._db.isWriteable() 

232 

233 def copy(self, defaults: Optional[RegistryDefaults] = None) -> Registry: 

234 # Docstring inherited from lsst.daf.butler.registry.Registry 

235 if defaults is None: 

236 # No need to copy, because `RegistryDefaults` is immutable; we 

237 # effectively copy on write. 

238 defaults = self.defaults 

239 return type(self)(self._db, defaults, self._managers) 

240 

241 @property 

242 def dimensions(self) -> DimensionUniverse: 

243 # Docstring inherited from lsst.daf.butler.registry.Registry 

244 return self._managers.dimensions.universe 

245 

246 def refresh(self) -> None: 

247 # Docstring inherited from lsst.daf.butler.registry.Registry 

248 self._managers.refresh() 

249 

250 @contextlib.contextmanager 

251 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

252 # Docstring inherited from lsst.daf.butler.registry.Registry 

253 try: 

254 with self._db.transaction(savepoint=savepoint): 

255 yield 

256 except BaseException: 

257 # TODO: this clears the caches sometimes when we wouldn't actually 

258 # need to. Can we avoid that? 

259 self._managers.dimensions.clearCaches() 

260 raise 

261 

262 def resetConnectionPool(self) -> None: 

263 """Reset SQLAlchemy connection pool for `SqlRegistry` database. 

264 

265 This operation is useful when using the registry with fork-based 

266 multiprocessing. To use the registry across a fork boundary, one must 

267 ensure that there are no currently active connections (no session or 

268 transaction in progress) and that the connection pool is reset using 

269 this method. It should be called by the child process immediately 

270 after the fork. 

271 """ 

272 self._db._engine.dispose() 

273 

274 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

275 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

276 other data repository client. 

277 

278 Opaque table records can be added via `insertOpaqueData`, retrieved via 

279 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

280 

281 Parameters 

282 ---------- 

283 tableName : `str` 

284 Logical name of the opaque table. This may differ from the 

285 actual name used in the database by a prefix and/or suffix. 

286 spec : `ddl.TableSpec` 

287 Specification for the table to be added. 

288 """ 

289 self._managers.opaque.register(tableName, spec) 

290 

291 @transactional 

292 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

293 """Insert records into an opaque table. 

294 

295 Parameters 

296 ---------- 

297 tableName : `str` 

298 Logical name of the opaque table. Must match the name used in a 

299 previous call to `registerOpaqueTable`. 

300 data 

301 Each additional positional argument is a dictionary that represents 

302 a single row to be added. 

303 """ 

304 self._managers.opaque[tableName].insert(*data) 

305 

306 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

307 """Retrieve records from an opaque table. 

308 

309 Parameters 

310 ---------- 

311 tableName : `str` 

312 Logical name of the opaque table. Must match the name used in a 

313 previous call to `registerOpaqueTable`. 

314 where 

315 Additional keyword arguments are interpreted as equality 

316 constraints that restrict the returned rows (combined with AND); 

317 keyword arguments are column names and values are the values they 

318 must have. 

319 

320 Yields 

321 ------ 

322 row : `dict` 

323 A dictionary representing a single result row. 

324 """ 

325 yield from self._managers.opaque[tableName].fetch(**where) 

326 

327 @transactional 

328 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

329 """Remove records from an opaque table. 

330 

331 Parameters 

332 ---------- 

333 tableName : `str` 

334 Logical name of the opaque table. Must match the name used in a 

335 previous call to `registerOpaqueTable`. 

336 where 

337 Additional keyword arguments are interpreted as equality 

338 constraints that restrict the deleted rows (combined with AND); 

339 keyword arguments are column names and values are the values they 

340 must have. 

341 """ 

342 self._managers.opaque[tableName].delete(where.keys(), where) 

343 

344 def registerCollection( 

345 self, name: str, type: CollectionType = CollectionType.TAGGED, doc: Optional[str] = None 

346 ) -> bool: 

347 # Docstring inherited from lsst.daf.butler.registry.Registry 

348 _, registered = self._managers.collections.register(name, type, doc=doc) 

349 return registered 

350 

351 def getCollectionType(self, name: str) -> CollectionType: 

352 # Docstring inherited from lsst.daf.butler.registry.Registry 

353 return self._managers.collections.find(name).type 

354 

355 def _get_collection_record(self, name: str) -> CollectionRecord: 

356 # Docstring inherited from lsst.daf.butler.registry.Registry 

357 return self._managers.collections.find(name) 

358 

359 def registerRun(self, name: str, doc: Optional[str] = None) -> bool: 

360 # Docstring inherited from lsst.daf.butler.registry.Registry 

361 _, registered = self._managers.collections.register(name, CollectionType.RUN, doc=doc) 

362 return registered 

363 

364 @transactional 

365 def removeCollection(self, name: str) -> None: 

366 # Docstring inherited from lsst.daf.butler.registry.Registry 

367 self._managers.collections.remove(name) 

368 

369 def getCollectionChain(self, parent: str) -> tuple[str, ...]: 

370 # Docstring inherited from lsst.daf.butler.registry.Registry 

371 record = self._managers.collections.find(parent) 

372 if record.type is not CollectionType.CHAINED: 

373 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

374 assert isinstance(record, ChainedCollectionRecord) 

375 return record.children 

376 

377 @transactional 

378 def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = False) -> None: 

379 # Docstring inherited from lsst.daf.butler.registry.Registry 

380 record = self._managers.collections.find(parent) 

381 if record.type is not CollectionType.CHAINED: 

382 raise CollectionTypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

383 assert isinstance(record, ChainedCollectionRecord) 

384 children = CollectionWildcard.from_expression(children).require_ordered() 

385 if children != record.children or flatten: 

386 record.update(self._managers.collections, children, flatten=flatten) 

387 

388 def getCollectionParentChains(self, collection: str) -> Set[str]: 

389 # Docstring inherited from lsst.daf.butler.registry.Registry 

390 return { 

391 record.name 

392 for record in self._managers.collections.getParentChains( 

393 self._managers.collections.find(collection).key 

394 ) 

395 } 

396 

397 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

398 # Docstring inherited from lsst.daf.butler.registry.Registry 

399 return self._managers.collections.getDocumentation(self._managers.collections.find(collection).key) 

400 

401 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

402 # Docstring inherited from lsst.daf.butler.registry.Registry 

403 self._managers.collections.setDocumentation(self._managers.collections.find(collection).key, doc) 

404 

405 def getCollectionSummary(self, collection: str) -> CollectionSummary: 

406 # Docstring inherited from lsst.daf.butler.registry.Registry 

407 record = self._managers.collections.find(collection) 

408 return self._managers.datasets.getCollectionSummary(record) 

409 

410 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

411 # Docstring inherited from lsst.daf.butler.registry.Registry 

412 _, inserted = self._managers.datasets.register(datasetType) 

413 return inserted 

414 

415 def removeDatasetType(self, name: str) -> None: 

416 # Docstring inherited from lsst.daf.butler.registry.Registry 

417 self._managers.datasets.remove(name) 

418 

419 def getDatasetType(self, name: str) -> DatasetType: 

420 # Docstring inherited from lsst.daf.butler.registry.Registry 

421 return self._managers.datasets[name].datasetType 

422 

423 def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool: 

424 # Docstring inherited from lsst.daf.butler.registry.Registry 

425 return self._managers.datasets.supportsIdGenerationMode(mode) 

426 

427 def findDataset( 

428 self, 

429 datasetType: Union[DatasetType, str], 

430 dataId: Optional[DataId] = None, 

431 *, 

432 collections: Any = None, 

433 timespan: Optional[Timespan] = None, 

434 **kwargs: Any, 

435 ) -> Optional[DatasetRef]: 

436 # Docstring inherited from lsst.daf.butler.registry.Registry 

437 if isinstance(datasetType, DatasetType): 

438 storage = self._managers.datasets[datasetType.name] 

439 else: 

440 storage = self._managers.datasets[datasetType] 

441 dataId = DataCoordinate.standardize( 

442 dataId, 

443 graph=storage.datasetType.dimensions, 

444 universe=self.dimensions, 

445 defaults=self.defaults.dataId, 

446 **kwargs, 

447 ) 

448 if collections is None: 

449 if not self.defaults.collections: 

450 raise NoDefaultCollectionError( 

451 "No collections provided to findDataset, and no defaults from registry construction." 

452 ) 

453 collections = self.defaults.collections 

454 collections = CollectionWildcard.from_expression(collections) 

455 collections.require_ordered() 

456 for collectionRecord in self._managers.collections.resolve_wildcard(collections): 

457 if collectionRecord.type is CollectionType.CALIBRATION and ( 

458 not storage.datasetType.isCalibration() or timespan is None 

459 ): 

460 continue 

461 result = storage.find(collectionRecord, dataId, timespan=timespan) 

462 if result is not None: 

463 return result 

464 

465 return None 

466 

467 @transactional 

468 def insertDatasets( 

469 self, 

470 datasetType: Union[DatasetType, str], 

471 dataIds: Iterable[DataId], 

472 run: Optional[str] = None, 

473 expand: bool = True, 

474 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

475 ) -> List[DatasetRef]: 

476 # Docstring inherited from lsst.daf.butler.registry.Registry 

477 if isinstance(datasetType, DatasetType): 

478 storage = self._managers.datasets.find(datasetType.name) 

479 if storage is None: 

480 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") 

481 else: 

482 storage = self._managers.datasets.find(datasetType) 

483 if storage is None: 

484 raise DatasetTypeError(f"DatasetType with name '{datasetType}' has not been registered.") 

485 if run is None: 

486 if self.defaults.run is None: 

487 raise NoDefaultCollectionError( 

488 "No run provided to insertDatasets, and no default from registry construction." 

489 ) 

490 run = self.defaults.run 

491 runRecord = self._managers.collections.find(run) 

492 if runRecord.type is not CollectionType.RUN: 

493 raise CollectionTypeError( 

494 f"Given collection is of type {runRecord.type.name}; RUN collection required." 

495 ) 

496 assert isinstance(runRecord, RunRecord) 

497 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

498 if expand: 

499 expandedDataIds = [ 

500 self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

501 for dataId in progress.wrap(dataIds, f"Expanding {storage.datasetType.name} data IDs") 

502 ] 

503 else: 

504 expandedDataIds = [ 

505 DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds 

506 ] 

507 try: 

508 refs = list(storage.insert(runRecord, expandedDataIds, idGenerationMode)) 

509 if self._managers.obscore: 

510 self._managers.obscore.add_datasets(refs) 

511 except sqlalchemy.exc.IntegrityError as err: 

512 raise ConflictingDefinitionError( 

513 f"A database constraint failure was triggered by inserting " 

514 f"one or more datasets of type {storage.datasetType} into " 

515 f"collection '{run}'. " 

516 f"This probably means a dataset with the same data ID " 

517 f"and dataset type already exists, but it may also mean a " 

518 f"dimension row is missing." 

519 ) from err 

520 return refs 

521 

522 @transactional 

523 def _importDatasets( 

524 self, 

525 datasets: Iterable[DatasetRef], 

526 expand: bool = True, 

527 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

528 reuseIds: bool = False, 

529 ) -> List[DatasetRef]: 

530 # Docstring inherited from lsst.daf.butler.registry.Registry 

531 datasets = list(datasets) 

532 if not datasets: 

533 # nothing to do 

534 return [] 

535 

536 # find dataset type 

537 datasetTypes = set(dataset.datasetType for dataset in datasets) 

538 if len(datasetTypes) != 1: 

539 raise DatasetTypeError(f"Multiple dataset types in input datasets: {datasetTypes}") 

540 datasetType = datasetTypes.pop() 

541 

542 # get storage handler for this dataset type 

543 storage = self._managers.datasets.find(datasetType.name) 

544 if storage is None: 

545 raise DatasetTypeError(f"DatasetType '{datasetType}' has not been registered.") 

546 

547 # find run name 

548 runs = set(dataset.run for dataset in datasets) 

549 if len(runs) != 1: 

550 raise ValueError(f"Multiple run names in input datasets: {runs}") 

551 run = runs.pop() 

552 if run is None: 

553 if self.defaults.run is None: 

554 raise NoDefaultCollectionError( 

555 "No run provided to ingestDatasets, and no default from registry construction." 

556 ) 

557 run = self.defaults.run 

558 

559 runRecord = self._managers.collections.find(run) 

560 if runRecord.type is not CollectionType.RUN: 

561 raise CollectionTypeError( 

562 f"Given collection '{runRecord.name}' is of type {runRecord.type.name};" 

563 " RUN collection required." 

564 ) 

565 assert isinstance(runRecord, RunRecord) 

566 

567 progress = Progress("daf.butler.Registry.insertDatasets", level=logging.DEBUG) 

568 if expand: 

569 expandedDatasets = [ 

570 dataset.expanded(self.expandDataId(dataset.dataId, graph=storage.datasetType.dimensions)) 

571 for dataset in progress.wrap(datasets, f"Expanding {storage.datasetType.name} data IDs") 

572 ] 

573 else: 

574 expandedDatasets = [ 

575 DatasetRef(datasetType, dataset.dataId, id=dataset.id, run=dataset.run, conform=True) 

576 for dataset in datasets 

577 ] 

578 

579 try: 

580 refs = list(storage.import_(runRecord, expandedDatasets, idGenerationMode, reuseIds)) 

581 if self._managers.obscore: 

582 self._managers.obscore.add_datasets(refs) 

583 except sqlalchemy.exc.IntegrityError as err: 

584 raise ConflictingDefinitionError( 

585 f"A database constraint failure was triggered by inserting " 

586 f"one or more datasets of type {storage.datasetType} into " 

587 f"collection '{run}'. " 

588 f"This probably means a dataset with the same data ID " 

589 f"and dataset type already exists, but it may also mean a " 

590 f"dimension row is missing." 

591 ) from err 

592 return refs 

593 

594 def getDataset(self, id: DatasetId) -> Optional[DatasetRef]: 

595 # Docstring inherited from lsst.daf.butler.registry.Registry 

596 return self._managers.datasets.getDatasetRef(id) 

597 

598 @transactional 

599 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

600 # Docstring inherited from lsst.daf.butler.registry.Registry 

601 progress = Progress("lsst.daf.butler.Registry.removeDatasets", level=logging.DEBUG) 

602 for datasetType, refsForType in progress.iter_item_chunks( 

603 DatasetRef.groupByType(refs).items(), desc="Removing datasets by type" 

604 ): 

605 storage = self._managers.datasets[datasetType.name] 

606 try: 

607 storage.delete(refsForType) 

608 except sqlalchemy.exc.IntegrityError as err: 

609 raise OrphanedRecordError( 

610 "One or more datasets is still present in one or more Datastores." 

611 ) from err 

612 

613 @transactional 

614 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

615 # Docstring inherited from lsst.daf.butler.registry.Registry 

616 progress = Progress("lsst.daf.butler.Registry.associate", level=logging.DEBUG) 

617 collectionRecord = self._managers.collections.find(collection) 

618 if collectionRecord.type is not CollectionType.TAGGED: 

619 raise CollectionTypeError( 

620 f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED." 

621 ) 

622 for datasetType, refsForType in progress.iter_item_chunks( 

623 DatasetRef.groupByType(refs).items(), desc="Associating datasets by type" 

624 ): 

625 storage = self._managers.datasets[datasetType.name] 

626 try: 

627 storage.associate(collectionRecord, refsForType) 

628 if self._managers.obscore: 

629 # If a TAGGED collection is being monitored by ObsCore 

630 # manager then we may need to save the dataset. 

631 self._managers.obscore.associate(refsForType, collectionRecord) 

632 except sqlalchemy.exc.IntegrityError as err: 

633 raise ConflictingDefinitionError( 

634 f"Constraint violation while associating dataset of type {datasetType.name} with " 

635 f"collection {collection}. This probably means that one or more datasets with the same " 

636 f"dataset type and data ID already exist in the collection, but it may also indicate " 

637 f"that the datasets do not exist." 

638 ) from err 

639 

640 @transactional 

641 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

642 # Docstring inherited from lsst.daf.butler.registry.Registry 

643 progress = Progress("lsst.daf.butler.Registry.disassociate", level=logging.DEBUG) 

644 collectionRecord = self._managers.collections.find(collection) 

645 if collectionRecord.type is not CollectionType.TAGGED: 

646 raise CollectionTypeError( 

647 f"Collection '{collection}' has type {collectionRecord.type.name}; expected TAGGED." 

648 ) 

649 for datasetType, refsForType in progress.iter_item_chunks( 

650 DatasetRef.groupByType(refs).items(), desc="Disassociating datasets by type" 

651 ): 

652 storage = self._managers.datasets[datasetType.name] 

653 storage.disassociate(collectionRecord, refsForType) 

654 if self._managers.obscore: 

655 self._managers.obscore.disassociate(refsForType, collectionRecord) 

656 

657 @transactional 

658 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

659 # Docstring inherited from lsst.daf.butler.registry.Registry 

660 progress = Progress("lsst.daf.butler.Registry.certify", level=logging.DEBUG) 

661 collectionRecord = self._managers.collections.find(collection) 

662 for datasetType, refsForType in progress.iter_item_chunks( 

663 DatasetRef.groupByType(refs).items(), desc="Certifying datasets by type" 

664 ): 

665 storage = self._managers.datasets[datasetType.name] 

666 storage.certify(collectionRecord, refsForType, timespan) 

667 

668 @transactional 

669 def decertify( 

670 self, 

671 collection: str, 

672 datasetType: Union[str, DatasetType], 

673 timespan: Timespan, 

674 *, 

675 dataIds: Optional[Iterable[DataId]] = None, 

676 ) -> None: 

677 # Docstring inherited from lsst.daf.butler.registry.Registry 

678 collectionRecord = self._managers.collections.find(collection) 

679 if isinstance(datasetType, str): 

680 storage = self._managers.datasets[datasetType] 

681 else: 

682 storage = self._managers.datasets[datasetType.name] 

683 standardizedDataIds = None 

684 if dataIds is not None: 

685 standardizedDataIds = [ 

686 DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) for d in dataIds 

687 ] 

688 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

689 

690 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

691 """Return an object that allows a new `Datastore` instance to 

692 communicate with this `Registry`. 

693 

694 Returns 

695 ------- 

696 manager : `DatastoreRegistryBridgeManager` 

697 Object that mediates communication between this `Registry` and its 

698 associated datastores. 

699 """ 

700 return self._managers.datastores 

701 

702 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

703 # Docstring inherited from lsst.daf.butler.registry.Registry 

704 return self._managers.datastores.findDatastores(ref) 

705 

706 def expandDataId( 

707 self, 

708 dataId: Optional[DataId] = None, 

709 *, 

710 graph: Optional[DimensionGraph] = None, 

711 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

712 withDefaults: bool = True, 

713 **kwargs: Any, 

714 ) -> DataCoordinate: 

715 # Docstring inherited from lsst.daf.butler.registry.Registry 

716 if not withDefaults: 

717 defaults = None 

718 else: 

719 defaults = self.defaults.dataId 

720 try: 

721 standardized = DataCoordinate.standardize( 

722 dataId, graph=graph, universe=self.dimensions, defaults=defaults, **kwargs 

723 ) 

724 except KeyError as exc: 

725 # This means either that kwargs contain an unrecognized name or that 

726 # a required dimension is missing. 

727 raise DimensionNameError(str(exc)) from exc 

728 if standardized.hasRecords(): 

729 return standardized 

730 if records is None: 

731 records = {} 

732 elif isinstance(records, NamedKeyMapping): 

733 records = records.byName() 

734 else: 

735 records = dict(records) 

736 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

737 records.update(dataId.records.byName()) 

738 keys = standardized.byName() 

739 for element in standardized.graph.primaryKeyTraversalOrder: 

740 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

741 if record is ...: 

742 if isinstance(element, Dimension) and keys.get(element.name) is None: 

743 if element in standardized.graph.required: 

744 raise DimensionNameError( 

745 f"No value or null value for required dimension {element.name}." 

746 ) 

747 keys[element.name] = None 

748 record = None 

749 else: 

750 storage = self._managers.dimensions[element] 

751 dataIdSet = DataCoordinateIterable.fromScalar( 

752 DataCoordinate.standardize(keys, graph=element.graph) 

753 ) 

754 fetched = tuple(storage.fetch(dataIdSet)) 

755 try: 

756 (record,) = fetched 

757 except ValueError: 

758 record = None 

759 records[element.name] = record 

760 if record is not None: 

761 for d in element.implied: 

762 value = getattr(record, d.name) 

763 if keys.setdefault(d.name, value) != value: 

764 raise InconsistentDataIdError( 

765 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

766 f"but {element.name} implies {d.name}={value!r}." 

767 ) 

768 else: 

769 if element in standardized.graph.required: 

770 raise DataIdValueError( 

771 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

772 ) 

773 if element.alwaysJoin: 

774 raise InconsistentDataIdError( 

775 f"Could not fetch record for element {element.name} via keys {keys}, ", 

776 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

777 "related.", 

778 ) 

779 for d in element.implied: 

780 keys.setdefault(d.name, None) 

781 records.setdefault(d.name, None) 

782 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

783 

784 def insertDimensionData( 

785 self, 

786 element: Union[DimensionElement, str], 

787 *data: Union[Mapping[str, Any], DimensionRecord], 

788 conform: bool = True, 

789 replace: bool = False, 

790 skip_existing: bool = False, 

791 ) -> None: 

792 # Docstring inherited from lsst.daf.butler.registry.Registry 

793 if conform: 

794 if isinstance(element, str): 

795 element = self.dimensions[element] 

796 records = [ 

797 row if isinstance(row, DimensionRecord) else element.RecordClass(**row) for row in data 

798 ] 

799 else: 

800 # Ignore typing since caller said to trust them with conform=False. 

801 records = data # type: ignore 

802 storage = self._managers.dimensions[element] # type: ignore 

803 storage.insert(*records, replace=replace, skip_existing=skip_existing) 

804 

805 def syncDimensionData( 

806 self, 

807 element: Union[DimensionElement, str], 

808 row: Union[Mapping[str, Any], DimensionRecord], 

809 conform: bool = True, 

810 update: bool = False, 

811 ) -> Union[bool, Dict[str, Any]]: 

812 # Docstring inherited from lsst.daf.butler.registry.Registry 

813 if conform: 

814 if isinstance(element, str): 

815 element = self.dimensions[element] 

816 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

817 else: 

818 # Ignore typing since caller said to trust them with conform=False. 

819 record = row # type: ignore 

820 storage = self._managers.dimensions[element] # type: ignore 

821 return storage.sync(record, update=update) 

822 

823 def queryDatasetTypes( 

824 self, 

825 expression: Any = ..., 

826 *, 

827 components: Optional[bool] = None, 

828 missing: Optional[List[str]] = None, 

829 ) -> Iterable[DatasetType]: 

830 # Docstring inherited from lsst.daf.butler.registry.Registry 

831 wildcard = DatasetTypeWildcard.from_expression(expression) 

832 composition_dict = self._managers.datasets.resolve_wildcard( 

833 wildcard, 

834 components=components, 

835 missing=missing, 

836 ) 

837 result: list[DatasetType] = [] 

838 for parent_dataset_type, components_for_parent in composition_dict.items(): 

839 result.extend( 

840 parent_dataset_type.makeComponentDatasetType(c) if c is not None else parent_dataset_type 

841 for c in components_for_parent 

842 ) 

843 return result 

844 

845 def queryCollections( 

846 self, 

847 expression: Any = ..., 

848 datasetType: Optional[DatasetType] = None, 

849 collectionTypes: Union[Iterable[CollectionType], CollectionType] = CollectionType.all(), 

850 flattenChains: bool = False, 

851 includeChains: Optional[bool] = None, 

852 ) -> Sequence[str]: 

853 # Docstring inherited from lsst.daf.butler.registry.Registry 

854 

855 # Right now the datasetType argument is completely ignored, but that 

856 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

857 # ticket will take care of that. 

858 try: 

859 wildcard = CollectionWildcard.from_expression(expression) 

860 except TypeError as exc: 

861 raise CollectionExpressionError(f"Invalid collection expression '{expression}'") from exc 

862 collectionTypes = ensure_iterable(collectionTypes) 

863 return [ 

864 record.name 

865 for record in self._managers.collections.resolve_wildcard( 

866 wildcard, 

867 collection_types=frozenset(collectionTypes), 

868 flatten_chains=flattenChains, 

869 include_chains=includeChains, 

870 ) 

871 ] 

872 

873 def _makeQueryBuilder( 

874 self, summary: queries.QuerySummary, doomed_by: Iterable[str] = () 

875 ) -> queries.QueryBuilder: 

876 """Return a `QueryBuilder` instance capable of constructing and 

877 managing more complex queries than those obtainable via `Registry` 

878 interfaces. 

879 

880 This is an advanced interface; downstream code should prefer 

881 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

882 are sufficient. 

883 

884 Parameters 

885 ---------- 

886 summary : `queries.QuerySummary` 

887 Object describing and categorizing the full set of dimensions that 

888 will be included in the query. 

889 doomed_by : `Iterable` of `str`, optional 

890 A list of diagnostic messages that indicate why the query is going 

891 to yield no results and should not even be executed. If an empty 

892 container (default) the query will be executed unless other code 

893 determines that it is doomed. 

894 

895 Returns 

896 ------- 

897 builder : `queries.QueryBuilder` 

898 Object that can be used to construct and perform advanced queries. 

899 """ 

900 return queries.QueryBuilder( 

901 summary, 

902 backend=queries.SqlQueryBackend(self._db, self._managers), 

903 doomed_by=doomed_by, 

904 ) 

905 

906 def _standardize_query_dataset_args( 

907 self, 

908 datasets: Any, 

909 collections: Any, 

910 components: bool | None, 

911 mode: Literal["find_first"] | Literal["find_all"] | Literal["constrain"] = "constrain", 

912 *, 

913 doomed_by: list[str], 

914 ) -> tuple[dict[DatasetType, list[str | None]], CollectionWildcard | None]: 

915 """Preprocess dataset arguments passed to query* methods. 

916 

917 Parameters 

918 ---------- 

919 datasets : `DatasetType`, `str`, `re.Pattern`, or iterable of these 

920 Expression identifying dataset types. See `queryDatasetTypes` for 

921 details. 

922 collections : `str`, `re.Pattern`, or iterable of these 

923 Expression identifying collections to be searched. See 

924 `queryCollections` for details. 

925 components : `bool`, optional 

926 If `True`, apply all expression patterns to component dataset type 

927 names as well. If `False`, never apply patterns to components. 

928 If `None` (default), apply patterns to components only if their 

929 parent datasets were not matched by the expression. 

930 Fully-specified component datasets (`str` or `DatasetType` 

931 instances) are always included. 

932 mode : `str`, optional 

933 The way in which datasets are being used in this query; one of: 

934 

935 - "find_first": this is a query for the first dataset in an 

936 ordered list of collections. Prohibits collection wildcards, 

937 but permits dataset type wildcards. 

938 

939 - "find_all": this is a query for all datasets in all matched 

940 collections. Permits collection and dataset type wildcards. 

941 

942 - "constrain": this is a query for something other than datasets, 

943 with results constrained by dataset existence. Permits 

944 collection wildcards and prohibits ``...`` as a dataset type 

945 wildcard. 

946 doomed_by : `list` [ `str` ] 

947 List to append messages indicating why the query is doomed to 

948 yield no results. 

949 

950 Returns 

951 ------- 

952 composition : `dict` [ `DatasetType`, `list` [ `str` | `None` ] ] 

953 Dictionary mapping parent dataset type to `list` of components 

954 matched for that dataset type (or `None` for the parent itself). 

955 collections : `CollectionWildcard` 

956 Processed collection expression. 

957 """ 

958 composition: dict[DatasetType, list[str | None]] = {} 

959 if datasets is not None: 

960 if not collections: 

961 if not self.defaults.collections: 

962 raise NoDefaultCollectionError("No collections, and no registry default collections.") 

963 collections = self.defaults.collections 

964 else: 

965 collections = CollectionWildcard.from_expression(collections) 

966 if mode == "find_first" and collections.patterns: 

967 raise TypeError( 

968 f"Collection pattern(s) {collections.patterns} not allowed in this context." 

969 ) 

970 missing: list[str] = [] 

971 composition = self._managers.datasets.resolve_wildcard( 

972 datasets, components=components, missing=missing, explicit_only=(mode == "constrain") 

973 ) 

974 if missing and mode == "constrain": 

975 # After v26 this should raise MissingDatasetTypeError, to be 

976 # implemented on DM-36303. 

977 warnings.warn( 

978 f"Dataset type(s) {missing} are not registered; this will be an error after v26.", 

979 FutureWarning, 

980 ) 

981 doomed_by.extend(f"Dataset type {name} is not registered." for name in missing) 

982 elif collections: 

983 raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.") 

984 return composition, collections 

985 

986 def queryDatasets( 

987 self, 

988 datasetType: Any, 

989 *, 

990 collections: Any = None, 

991 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

992 dataId: Optional[DataId] = None, 

993 where: Optional[str] = None, 

994 findFirst: bool = False, 

995 components: Optional[bool] = None, 

996 bind: Optional[Mapping[str, Any]] = None, 

997 check: bool = True, 

998 **kwargs: Any, 

999 ) -> queries.DatasetQueryResults: 

1000 # Docstring inherited from lsst.daf.butler.registry.Registry 

1001 doomed_by: list[str] = [] 

1002 data_id = self.expandDataId(dataId, **kwargs) 

1003 dataset_composition, collections = self._standardize_query_dataset_args( 

1004 datasetType, 

1005 collections, 

1006 components, 

1007 mode="find_first" if findFirst else "find_all", 

1008 doomed_by=doomed_by, 

1009 ) 

1010 parent_results: list[queries.ParentDatasetQueryResults] = [] 

1011 for parent_dataset_type, components_for_parent in dataset_composition.items(): 

1012 # The full set of dimensions in the query is the combination of 

1013 # those needed for the DatasetType and those explicitly requested, 

1014 # if any. 

1015 dimension_names = set(parent_dataset_type.dimensions.names) 

1016 if dimensions is not None: 

1017 dimension_names.update(self.dimensions.extract(dimensions).names) 

1018 # Construct the summary structure needed to construct a 

1019 # QueryBuilder. 

1020 summary = queries.QuerySummary( 

1021 requested=DimensionGraph(self.dimensions, names=dimension_names), 

1022 dataId=data_id, 

1023 expression=where, 

1024 bind=bind, 

1025 defaults=self.defaults.dataId, 

1026 check=check, 

1027 datasets=[parent_dataset_type], 

1028 ) 

1029 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by) 

1030 # Add the dataset subquery to the query, telling the QueryBuilder 

1031 # to include the rank of the selected collection in the results 

1032 # only if we need to findFirst. Note that if any of the 

1033 # collections are actually wildcard expressions, and 

1034 # findFirst=True, this will raise TypeError for us. 

1035 builder.joinDataset(parent_dataset_type, collections, isResult=True, findFirst=findFirst) 

1036 query = builder.finish() 

1037 parent_results.append( 

1038 queries.ParentDatasetQueryResults( 

1039 self._db, query, datasetType=parent_dataset_type, components=components_for_parent 

1040 ) 

1041 ) 

1042 if not parent_results: 

1043 doomed_by.extend( 

1044 f"No registered dataset type matching {t!r} found, so no matching datasets can " 

1045 "exist in any collection." 

1046 for t in ensure_iterable(datasetType) 

1047 ) 

1048 return queries.ChainedDatasetQueryResults([], doomed_by=doomed_by) 

1049 elif len(parent_results) == 1: 

1050 return parent_results[0] 

1051 else: 

1052 return queries.ChainedDatasetQueryResults(parent_results) 

1053 

1054 def queryDataIds( 

1055 self, 

1056 dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], 

1057 *, 

1058 dataId: Optional[DataId] = None, 

1059 datasets: Any = None, 

1060 collections: Any = None, 

1061 where: Optional[str] = None, 

1062 components: Optional[bool] = None, 

1063 bind: Optional[Mapping[str, Any]] = None, 

1064 check: bool = True, 

1065 **kwargs: Any, 

1066 ) -> queries.DataCoordinateQueryResults: 

1067 # Docstring inherited from lsst.daf.butler.registry.Registry 

1068 dimensions = ensure_iterable(dimensions) 

1069 requestedDimensions = self.dimensions.extract(dimensions) 

1070 doomed_by: list[str] = [] 

1071 data_id = self.expandDataId(dataId, **kwargs) 

1072 dataset_composition, collections = self._standardize_query_dataset_args( 

1073 datasets, collections, components, doomed_by=doomed_by 

1074 ) 

1075 

1076 def query_factory( 

1077 order_by: Optional[Iterable[str]] = None, limit: Optional[Tuple[int, Optional[int]]] = None 

1078 ) -> queries.Query: 

1079 """Construct the Query object that generates query results.""" 

1080 summary = queries.QuerySummary( 

1081 requested=requestedDimensions, 

1082 dataId=data_id, 

1083 expression=where, 

1084 bind=bind, 

1085 defaults=self.defaults.dataId, 

1086 check=check, 

1087 datasets=dataset_composition.keys(), 

1088 order_by=order_by, 

1089 limit=limit, 

1090 ) 

1091 builder = self._makeQueryBuilder(summary, doomed_by=doomed_by) 

1092 for datasetType in dataset_composition: 

1093 builder.joinDataset(datasetType, collections, isResult=False) 

1094 return builder.finish() 

1095 

1096 return queries.DataCoordinateQueryResults(self._db, query_factory, requestedDimensions) 

1097 

1098 def queryDimensionRecords( 

1099 self, 

1100 element: Union[DimensionElement, str], 

1101 *, 

1102 dataId: Optional[DataId] = None, 

1103 datasets: Any = None, 

1104 collections: Any = None, 

1105 where: Optional[str] = None, 

1106 components: Optional[bool] = None, 

1107 bind: Optional[Mapping[str, Any]] = None, 

1108 check: bool = True, 

1109 **kwargs: Any, 

1110 ) -> queries.DimensionRecordQueryResults: 

1111 # Docstring inherited from lsst.daf.butler.registry.Registry 

1112 if not isinstance(element, DimensionElement): 

1113 try: 

1114 element = self.dimensions[element] 

1115 except KeyError as e: 

1116 raise DimensionNameError( 

1117 f"No such dimension '{element}', available dimensions: " 

1118 + str(self.dimensions.getStaticElements()) 

1119 ) from e 

1120 dataIds = self.queryDataIds( 

1121 element.graph, 

1122 dataId=dataId, 

1123 datasets=datasets, 

1124 collections=collections, 

1125 where=where, 

1126 components=components, 

1127 bind=bind, 

1128 check=check, 

1129 **kwargs, 

1130 ) 

1131 return queries.DatabaseDimensionRecordQueryResults(dataIds, self._managers.dimensions[element]) 

1132 

1133 def queryDatasetAssociations( 

1134 self, 

1135 datasetType: Union[str, DatasetType], 

1136 collections: Any = ..., 

1137 *, 

1138 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1139 flattenChains: bool = False, 

1140 ) -> Iterator[DatasetAssociation]: 

1141 # Docstring inherited from lsst.daf.butler.registry.Registry 

1142 if collections is None: 

1143 if not self.defaults.collections: 

1144 raise NoDefaultCollectionError( 

1145 "No collections provided to findDataset, and no defaults from registry construction." 

1146 ) 

1147 collections = self.defaults.collections 

1148 collections = CollectionWildcard.from_expression(collections) 

1149 TimespanReprClass = self._db.getTimespanRepresentation() 

1150 if isinstance(datasetType, str): 

1151 storage = self._managers.datasets[datasetType] 

1152 else: 

1153 storage = self._managers.datasets[datasetType.name] 

1154 for collectionRecord in self._managers.collections.resolve_wildcard( 

1155 collections, 

1156 collection_types=frozenset(collectionTypes), 

1157 flatten_chains=flattenChains, 

1158 ): 

1159 query = storage.select(collectionRecord) 

1160 for row in self._db.query(query).mappings(): 

1161 dataId = DataCoordinate.fromRequiredValues( 

1162 storage.datasetType.dimensions, 

1163 tuple(row[name] for name in storage.datasetType.dimensions.required.names), 

1164 ) 

1165 runRecord = self._managers.collections[row[self._managers.collections.getRunForeignKeyName()]] 

1166 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, conform=False) 

1167 if collectionRecord.type is CollectionType.CALIBRATION: 

1168 timespan = TimespanReprClass.extract(row) 

1169 else: 

1170 timespan = None 

1171 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1172 

1173 storageClasses: StorageClassFactory 

1174 """All storage classes known to the registry (`StorageClassFactory`). 

1175 """