Coverage for python/lsst/daf/butler/registry/managers.py: 33%

173 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 02:46 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30from .. import ddl 

31 

# Names exported by ``from <module> import *``.
__all__ = ("RegistryManagerInstances", "RegistryManagerTypes")

36 

37import dataclasses 

38import logging 

39from collections.abc import Iterator, Mapping 

40from contextlib import contextmanager 

41from typing import Any, Generic, TypeVar 

42 

43import sqlalchemy 

44from lsst.utils import doImportType 

45 

46from .._column_type_info import ColumnTypeInfo 

47from .._config import Config 

48from ..dimensions import DimensionConfig, DimensionUniverse 

49from ._caching_context import CachingContext 

50from ._config import RegistryConfig 

51from .interfaces import ( 

52 ButlerAttributeManager, 

53 CollectionManager, 

54 Database, 

55 DatasetRecordStorageManager, 

56 DatastoreRegistryBridgeManager, 

57 DimensionRecordStorageManager, 

58 ObsCoreTableManager, 

59 OpaqueTableStorageManager, 

60 StaticTablesContext, 

61 VersionedExtension, 

62 VersionTuple, 

63) 

64from .versions import ButlerVersionsManager 

65 

# Type parameters of the generic manager struct, one per manager slot.
_Attributes = TypeVar("_Attributes")
_Dimensions = TypeVar("_Dimensions")
_Collections = TypeVar("_Collections")
_Datasets = TypeVar("_Datasets")
_Opaque = TypeVar("_Opaque")
_Datastores = TypeVar("_Datastores")
_ObsCore = TypeVar("_ObsCore")


# Module-level logger.
_LOG = logging.getLogger(__name__)

# Attributes-table key under which the dimensions configuration is stored.
_DIMENSIONS_ATTR = "config:dimensions.json"

# Attributes-table key under which the obscore configuration is stored.
_OBSCORE_ATTR = "config:obscore.json"

82 

83 

@dataclasses.dataclass(frozen=True, eq=False)
class _GenericRegistryManagers(
    Generic[_Attributes, _Dimensions, _Collections, _Datasets, _Opaque, _Datastores, _ObsCore]
):
    """Generic base struct bundling the managers that back a `Registry`.

    The type parameters allow the same set of fields to hold either manager
    types or manager instances.  This class is only meant to be used through
    its non-generic subclasses, `RegistryManagerInstances` and
    `RegistryManagerTypes`.
    """

    attributes: _Attributes
    """Manager for flat key-value pairs, including versions."""

    dimensions: _Dimensions
    """Manager for dimensions."""

    collections: _Collections
    """Manager for collections."""

    datasets: _Datasets
    """Manager for datasets, dataset types, and collection summaries."""

    opaque: _Opaque
    """Manager for tables that are opaque to the `Registry`."""

    datastores: _Datastores
    """Manager for the interface between `Registry` and `Datastore`."""

    obscore: _ObsCore | None
    """Manager for `ObsCore` table(s); `None` when there is no such manager."""

122 

123 

@dataclasses.dataclass(frozen=True, eq=False)
class RegistryManagerTypes(
    _GenericRegistryManagers[
        type[ButlerAttributeManager],
        type[DimensionRecordStorageManager],
        type[CollectionManager],
        type[DatasetRecordStorageManager],
        type[OpaqueTableStorageManager],
        type[DatastoreRegistryBridgeManager],
        type[ObsCoreTableManager],
    ]
):
    """Struct holding the *types* of the manager objects that back a
    `Registry`.
    """

    manager_configs: dict[str, Mapping] = dataclasses.field(default_factory=dict)
    """Per-manager configuration options, keyed by manager name, that are
    handed to the managers' ``initialize`` methods.
    """

    schema_versions: dict[str, VersionTuple] = dataclasses.field(default_factory=dict)
    """Optional per-manager schema versions, keyed by manager name."""

    @classmethod
    def fromConfig(cls, config: RegistryConfig) -> RegistryManagerTypes:
        """Build the struct by reading manager class names from configuration
        and importing the named classes.

        Parameters
        ----------
        config : `RegistryConfig`
            Configuration with a "managers" section.  Each entry is either a
            fully-qualified class name, or a sub-configuration with a
            mandatory "cls" key and optional "config" and "schema_version"
            keys.

        Returns
        -------
        types : `RegistryManagerTypes`
            A new struct containing type objects.

        Raises
        ------
        KeyError
            Raised if a manager sub-configuration has no "cls" key, or if a
            manager entry is neither a string nor a `Config`.
        ValueError
            Raised if a manager sub-configuration has unexpected keys.
        """
        # Only names declared as fields of this class are looked up; the two
        # bookkeeping fields are not managers.
        # TODO: Maybe we need to check keys for unknown names/typos?
        bookkeeping = {"manager_configs", "schema_versions"}
        manager_names = {field.name for field in dataclasses.fields(cls)} - bookkeeping

        # Values of the "config" sub-key, if any, indexed by manager name.
        configs: dict[str, Mapping] = {}
        schema_versions: dict[str, VersionTuple] = {}
        manager_types: dict[str, type] = {}
        for manager in manager_names:
            manager_config = config["managers"].get(manager)
            if manager_config is None:
                # Some managers may be optional.
                continue
            if isinstance(manager_config, Config):
                # Expect "cls" and optional "config" and "schema_version"
                # sub-keys; anything left over after popping them is an error.
                manager_config_dict = manager_config.toDict()
                try:
                    class_name = manager_config_dict.pop("cls")
                except KeyError:
                    raise KeyError(f"'cls' key is not defined in {manager!r} manager configuration") from None
                extra_config = manager_config_dict.pop("config", None)
                if extra_config is not None:
                    configs[manager] = extra_config
                version_string = manager_config_dict.pop("schema_version", None)
                if version_string is not None:
                    # Versions that come from config are not checked for
                    # compatibility here; they may be overridden later by
                    # versions from the registry.
                    schema_versions[manager] = VersionTuple.fromString(version_string)
                if manager_config_dict:
                    raise ValueError(
                        f"{manager!r} manager configuration has unexpected keys: {set(manager_config_dict)}"
                    )
            elif isinstance(manager_config, str):
                class_name = manager_config
            else:
                raise KeyError(f"Unexpected type of {manager!r} manager configuration: {manager_config!r}")
            manager_types[manager] = doImportType(class_name)

        # obscore is the only manager that may be None and its field has no
        # default, so supply None explicitly unless a class was configured.
        init_kwargs: dict[str, Any] = {"obscore": None, **manager_types}
        return cls(**init_kwargs, manager_configs=configs, schema_versions=schema_versions)

    def makeRepo(self, database: Database, dimensionConfig: DimensionConfig) -> RegistryManagerInstances:
        """Create all persistent `Registry` state for a new, empty data
        repository and return the corresponding manager instances.

        Parameters
        ----------
        database : `Database`
            Object that represents a connection to the SQL database that will
            back the data repository.  Must point to an empty namespace, or at
            least one with no tables or other entities whose names might clash
            with those used by butler.
        dimensionConfig : `DimensionConfig`
            Configuration that defines a `DimensionUniverse`, to be written
            into the data repository and used to define aspects of the schema.

        Returns
        -------
        instances : `RegistryManagerInstances`
            Struct containing instances of the types contained by ``self``,
            pointing to the new repository and backed by ``database``.

        Raises
        ------
        RuntimeError
            Raised if the datasets manager uses integer dataset IDs, or if the
            dimension configuration cannot be serialized to JSON.
        """
        # Schema versions given in the config must be acceptable to the
        # managers they belong to.
        manager_classes = self.as_dict()
        for manager_type, schema_version in self.schema_versions.items():
            manager_classes[manager_type].checkNewSchemaVersion(schema_version)

        universe = DimensionUniverse(dimensionConfig)
        with database.declareStaticTables(create=True) as context:
            if self.datasets.getIdColumnType() is sqlalchemy.BigInteger:
                raise RuntimeError(
                    "New data repositories should be created with UUID dataset IDs instead of autoincrement "
                    "integer dataset IDs.",
                )
            instances = RegistryManagerInstances.initialize(database, context, types=self, universe=universe)

        # Record managers and their versions in the attributes table.
        ButlerVersionsManager(instances.attributes).storeManagersConfig(instances.as_dict())

        # Store the universe config in the attributes table as JSON (faster
        # than YAML).
        dimensions_json = dimensionConfig.dump(format="json")
        if dimensions_json is None:
            raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON")
        instances.attributes.set(_DIMENSIONS_ATTR, dimensions_json)
        if instances.obscore is not None:
            obscore_json = instances.obscore.config_json()
            instances.attributes.set(_OBSCORE_ATTR, obscore_json)
        return instances

    def loadRepo(self, database: Database) -> RegistryManagerInstances:
        """Construct manager instances that point to an existing data
        repository.

        Parameters
        ----------
        database : `Database`
            Object that represents a connection to the SQL database that backs
            the data repository.  Must point to a namespace that already holds
            all tables and other persistent entities used by butler.

        Returns
        -------
        instances : `RegistryManagerInstances`
            Struct containing instances of the types contained by ``self``,
            pointing to the existing repository and backed by ``database``.

        Raises
        ------
        LookupError
            Raised if the dimensions configuration is missing from the
            attributes table.

        Notes
        -----
        This method updates ``self.schema_versions`` with the versions read
        from the registry and may set ``self.manager_configs["obscore"]``.
        """
        # Only the attributes manager is created at first, so it can be used
        # to load the embedded dimensions configuration.  Its version is not
        # checked before initializing it; it is supposed to be completely
        # backward- and forward-compatible.
        with database.declareStaticTables(create=False) as context:
            attributes = self.attributes.initialize(database, context)

        # Configured classes must be compatible with the ones stored in the
        # registry.
        versions = ButlerVersionsManager(attributes)
        versions.checkManagersConfig(self.as_dict())

        # Schema versions stored in the registry take precedence; validate
        # every manager for which a version is known.
        self.schema_versions.update(versions.managerVersions())
        for manager_type, manager_class in self.as_dict().items():
            if (schema_version := self.schema_versions.get(manager_type)) is not None:
                manager_class.checkCompatibility(schema_version, database.isWriteable())

        # The dimensions configuration is stored in the database as a string.
        dimensions_string = attributes.get(_DIMENSIONS_ATTR)
        if dimensions_string is None:
            raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database")
        universe = DimensionUniverse(DimensionConfig(Config.fromString(dimensions_string, format="json")))

        if self.obscore is not None:
            # ObsCore configuration from the attributes table silently
            # overrides whatever may come from the config file.  The idea is
            # that we do not want to carry around the whole thing, and butler
            # config will have empty obscore configuration after
            # initialization.  When the configuration is missing from the
            # attributes table, the obscore table does not exist and the
            # obscore manager is not instantiated.
            obscore_string = attributes.get(_OBSCORE_ATTR)
            if obscore_string is not None:
                self.manager_configs["obscore"] = Config.fromString(obscore_string, format="json")

        with database.declareStaticTables(create=False) as context:
            instances = RegistryManagerInstances.initialize(database, context, types=self, universe=universe)

        # Load content from the database that we try to keep in memory.
        instances.refresh()
        return instances

    def as_dict(self) -> Mapping[str, type[VersionedExtension]]:
        """Return the contained manager classes keyed by manager type name.

        Returns
        -------
        extensions : `~collections.abc.Mapping` [`str`, `type`]
            Maps manager type name (e.g. "datasets") to its corresponding
            manager class.  Managers that are `None` are left out.
        """
        non_manager_fields = {"manager_configs", "schema_versions"}
        result: dict[str, type[VersionedExtension]] = {}
        for field in dataclasses.fields(self):
            if field.name in non_manager_fields:
                continue
            manager_class = getattr(self, field.name)
            if manager_class is not None:
                result[field.name] = manager_class
        return result

335 

336 

@dataclasses.dataclass(frozen=True, eq=False)
class RegistryManagerInstances(
    _GenericRegistryManagers[
        ButlerAttributeManager,
        DimensionRecordStorageManager,
        CollectionManager,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        DatastoreRegistryBridgeManager,
        ObsCoreTableManager,
    ]
):
    """Struct holding the manager *instances* that back a `Registry`."""

    column_types: ColumnTypeInfo
    """Information about column types that can differ between data repositories
    and registry instances, including the dimension universe.
    """

    caching_context: CachingContext
    """Object containing caches for various information generated by
    managers.
    """

    @contextmanager
    def caching_context_manager(self) -> Iterator[None]:
        """Context manager that enables caching.

        Calls to this method may be nested and the returned context managers
        may even be closed out of order, with only the first context manager
        entered and the last context manager exited having any effect.
        """
        context = self.caching_context
        context._enable()
        try:
            yield
        finally:
            context._disable()

    @classmethod
    def initialize(
        cls,
        database: Database,
        context: StaticTablesContext,
        *,
        types: RegistryManagerTypes,
        universe: DimensionUniverse,
        caching_context: CachingContext | None = None,
    ) -> RegistryManagerInstances:
        """Construct manager instances from their types and an existing
        database connection.

        Parameters
        ----------
        database : `Database`
            Object that represents a connection to the SQL database that backs
            the data repository.
        context : `StaticTablesContext`
            Object used to create tables in ``database``.
        types : `RegistryManagerTypes`
            Struct containing type objects for the manager instances to
            construct.
        universe : `DimensionUniverse`
            Object that describes all dimensions in this data repository.
        caching_context : `CachingContext` or `None`, optional
            Caching context to use; a new one is created when `None`.

        Returns
        -------
        instances : `RegistryManagerInstances`
            Struct containing manager instances.
        """
        if caching_context is None:
            caching_context = CachingContext()
        # Throw-away table spec, used only to obtain the foreign key field
        # specs needed by `ColumnTypeInfo` below.
        dummy_table = ddl.TableSpec(fields=())
        versions = types.schema_versions

        # Managers are initialized in a fixed order because later ones take
        # earlier ones as arguments.
        attributes = types.attributes.initialize(
            database, context, registry_schema_version=versions.get("attributes")
        )
        dimensions = types.dimensions.initialize(
            database, context, universe=universe, registry_schema_version=versions.get("dimensions")
        )
        collections = types.collections.initialize(
            database,
            context,
            caching_context=caching_context,
            registry_schema_version=versions.get("collections"),
        )
        datasets = types.datasets.initialize(
            database,
            context,
            collections=collections,
            dimensions=dimensions,
            registry_schema_version=versions.get("datasets"),
            caching_context=caching_context,
        )
        opaque = types.opaque.initialize(database, context, registry_schema_version=versions.get("opaque"))
        datastores = types.datastores.initialize(
            database,
            context,
            opaque=opaque,
            datasets=types.datasets,
            universe=universe,
            registry_schema_version=versions.get("datastores"),
        )

        # The obscore manager is only instantiated when both its type and its
        # configuration are available.
        obscore: ObsCoreTableManager | None = None
        if types.obscore is not None and "obscore" in types.manager_configs:
            obscore = types.obscore.initialize(
                database,
                context,
                universe=universe,
                config=types.manager_configs["obscore"],
                datasets=types.datasets,
                dimensions=dimensions,
                registry_schema_version=versions.get("obscore"),
            )

        column_types = ColumnTypeInfo(
            database.getTimespanRepresentation(),
            universe,
            dataset_id_spec=types.datasets.addDatasetForeignKey(
                dummy_table,
                primaryKey=False,
                nullable=False,
            ),
            run_key_spec=types.collections.addRunForeignKey(dummy_table, primaryKey=False, nullable=False),
            ingest_date_dtype=datasets.ingest_date_dtype(),
        )
        return cls(
            attributes=attributes,
            dimensions=dimensions,
            collections=collections,
            datasets=datasets,
            opaque=opaque,
            datastores=datastores,
            obscore=obscore,
            column_types=column_types,
            caching_context=caching_context,
        )

    def clone(
        self,
        db: Database,
    ) -> RegistryManagerInstances:
        """Make an independent copy of the manager instances with a new
        `Database` instance.

        Parameters
        ----------
        db : `Database`
            New `Database` object to use when instantiating managers.

        Returns
        -------
        instances : `RegistryManagerInstances`
            New manager instances with the same configuration as this
            instance, but bound to a new `Database` object and a fresh
            `CachingContext`.  ``column_types`` is reused as-is.
        """
        new_context = CachingContext()
        dimensions = self.dimensions.clone(db)
        collections = self.collections.clone(db, new_context)
        opaque = self.opaque.clone(db)
        datasets = self.datasets.clone(
            db=db, collections=collections, dimensions=dimensions, caching_context=new_context
        )
        if self.obscore is None:
            obscore = None
        else:
            obscore = self.obscore.clone(db=db, dimensions=dimensions)
        attributes = self.attributes.clone(db)
        datastores = self.datastores.clone(db=db, opaque=opaque)
        return RegistryManagerInstances(
            attributes=attributes,
            dimensions=dimensions,
            collections=collections,
            datasets=datasets,
            opaque=opaque,
            datastores=datastores,
            obscore=obscore,
            column_types=self.column_types,
            caching_context=new_context,
        )

    def as_dict(self) -> Mapping[str, VersionedExtension]:
        """Return the contained manager instances keyed by manager type name.

        Returns
        -------
        extensions : `~collections.abc.Mapping` [`str`, `VersionedExtension`]
            Maps manager type name (e.g. "datasets") to its corresponding
            manager instance.  Managers that are `None` are left out.
        """
        non_manager_fields = ("column_types", "caching_context")
        result: dict[str, VersionedExtension] = {}
        for field in dataclasses.fields(self):
            if field.name in non_manager_fields:
                continue
            manager = getattr(self, field.name)
            if manager is not None:
                result[field.name] = manager
        return result

    def refresh(self) -> None:
        """Refresh all in-memory state by querying the database or clearing
        caches.

        This delegates to the ``refresh`` methods of the collections and
        datasets managers, in that order.
        """
        self.collections.refresh()
        self.datasets.refresh()