Coverage for python/lsst/daf/butler/registry/managers.py: 33%

157 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-01 11:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28from __future__ import annotations 

29 

30from .. import ddl 

31 

32__all__ = ( 

33 "RegistryManagerInstances", 

34 "RegistryManagerTypes", 

35) 

36 

37import dataclasses 

38import logging 

39from collections.abc import Mapping 

40from typing import Any, Generic, TypeVar 

41 

42import sqlalchemy 

43from lsst.utils import doImportType 

44 

45from .._column_type_info import ColumnTypeInfo 

46from .._config import Config 

47from ..dimensions import DimensionConfig, DimensionUniverse 

48from ._caching_context import CachingContext 

49from ._config import RegistryConfig 

50from .interfaces import ( 

51 ButlerAttributeManager, 

52 CollectionManager, 

53 Database, 

54 DatasetRecordStorageManager, 

55 DatastoreRegistryBridgeManager, 

56 DimensionRecordStorageManager, 

57 ObsCoreTableManager, 

58 OpaqueTableStorageManager, 

59 StaticTablesContext, 

60 VersionedExtension, 

61 VersionTuple, 

62) 

63from .versions import ButlerVersionsManager 

64 

# Type variables parameterizing `_GenericRegistryManagers`, one per manager
# slot.  The concrete subclasses bind them either to manager classes or to
# manager instances.
_Attributes = TypeVar("_Attributes")
_Dimensions = TypeVar("_Dimensions")
_Collections = TypeVar("_Collections")
_Datasets = TypeVar("_Datasets")
_Opaque = TypeVar("_Opaque")
_Datastores = TypeVar("_Datastores")
_ObsCore = TypeVar("_ObsCore")


# Module-level logger.
_LOG = logging.getLogger(__name__)

# Attributes-table key under which the dimensions configuration is stored.
_DIMENSIONS_ATTR = "config:dimensions.json"

# Attributes-table key under which the obscore configuration is stored.
_OBSCORE_ATTR = "config:obscore.json"

81 

82 

@dataclasses.dataclass(frozen=True, eq=False)
class _GenericRegistryManagers(
    Generic[_Attributes, _Dimensions, _Collections, _Datasets, _Opaque, _Datastores, _ObsCore]
):
    """Generic base for the structs that bundle the managers backing a
    `Registry`.

    Each field holds whatever the corresponding type parameter is bound to:
    a manager class or a manager instance.  Do not use this class directly;
    use its non-generic subclasses `RegistryManagerInstances` and
    `RegistryManagerTypes` instead.
    """

    attributes: _Attributes
    """Manager for flat key-value pairs, including versions."""

    dimensions: _Dimensions
    """Manager for dimensions."""

    collections: _Collections
    """Manager for collections."""

    datasets: _Datasets
    """Manager for datasets, dataset types, and collection summaries."""

    opaque: _Opaque
    """Manager for opaque (to the Registry) tables."""

    datastores: _Datastores
    """Manager for the interface between `Registry` and `Datastore`."""

    obscore: _ObsCore | None
    """Manager for `ObsCore` table(s); may be `None`."""

121 

122 

@dataclasses.dataclass(frozen=True, eq=False)
class RegistryManagerTypes(
    _GenericRegistryManagers[
        type[ButlerAttributeManager],
        type[DimensionRecordStorageManager],
        type[CollectionManager],
        type[DatasetRecordStorageManager],
        type[OpaqueTableStorageManager],
        type[DatastoreRegistryBridgeManager],
        type[ObsCoreTableManager],
    ]
):
    """A struct used to pass around the types of the manager objects that back
    a `Registry`.
    """

    manager_configs: dict[str, Mapping] = dataclasses.field(default_factory=dict)
    """Per-manager configuration options passed to their initialize methods.
    """

    schema_versions: dict[str, VersionTuple] = dataclasses.field(default_factory=dict)
    """Per-manager schema versions defined by configuration, optional."""

    @classmethod
    def fromConfig(cls, config: RegistryConfig) -> RegistryManagerTypes:
        """Construct by extracting class names from configuration and importing
        them.

        Parameters
        ----------
        config : `RegistryConfig`
            Configuration object with a "managers" section that contains all
            fully-qualified class names for all manager types.

        Returns
        -------
        types : `RegistryManagerTypes`
            A new struct containing type objects.

        Raises
        ------
        KeyError
            Raised if a manager configuration section has no "cls" key, or if
            a manager entry is neither a string, a `Config`, nor missing.
        ValueError
            Raised if a manager configuration section has keys other than
            "cls", "config" and "schema_version".
        """
        # We only check for manager names defined in class attributes.
        # TODO: Maybe we need to check keys for unknown names/typos?
        # Walk the fields in declaration order (rather than via a set) so that
        # the order of imports and of error reporting is deterministic.
        extras = {"manager_configs", "schema_versions"}
        managers = [field.name for field in dataclasses.fields(cls) if field.name not in extras]
        # Values of "config" sub-key, if any, indexed by manager name.
        configs: dict[str, Mapping] = {}
        schema_versions: dict[str, VersionTuple] = {}
        manager_types: dict[str, type] = {}
        for manager in managers:
            manager_config = config["managers"].get(manager)
            if manager_config is None:
                # Some managers may be optional.
                continue
            if isinstance(manager_config, Config):
                # Expect "cls" and optional "config" and "schema_version"
                # sub-keys.
                manager_config_dict = manager_config.toDict()
                try:
                    class_name = manager_config_dict.pop("cls")
                except KeyError:
                    raise KeyError(f"'cls' key is not defined in {manager!r} manager configuration") from None
                if (mgr_config := manager_config_dict.pop("config", None)) is not None:
                    configs[manager] = mgr_config
                if (mgr_version := manager_config_dict.pop("schema_version", None)) is not None:
                    # Note that we do not check versions that come from config
                    # for compatibility, they may be overridden later by
                    # versions from registry.
                    schema_versions[manager] = VersionTuple.fromString(mgr_version)
                if manager_config_dict:
                    # Anything still left after popping the known keys is
                    # unexpected.
                    raise ValueError(
                        f"{manager!r} manager configuration has unexpected keys: {set(manager_config_dict)}"
                    )
            elif isinstance(manager_config, str):
                class_name = manager_config
            else:
                raise KeyError(f"Unexpected type of {manager!r} manager configuration: {manager_config!r}")
            manager_types[manager] = doImportType(class_name)

        # obscore needs special care because it's the only manager which can
        # be None, and we cannot define default value for it.
        obscore = manager_types.pop("obscore", None)
        return cls(**manager_types, obscore=obscore, manager_configs=configs, schema_versions=schema_versions)

    def makeRepo(self, database: Database, dimensionConfig: DimensionConfig) -> RegistryManagerInstances:
        """Create all persistent `Registry` state for a new, empty data
        repository, and return a new struct containing manager instances.

        Parameters
        ----------
        database : `Database`
            Object that represents a connection to the SQL database that will
            back the data repository.  Must point to an empty namespace, or at
            least one with no tables or other entities whose names might clash
            with those used by butler.
        dimensionConfig : `DimensionConfig`
            Configuration that defines a `DimensionUniverse`, to be written
            into the data repository and used to define aspects of the schema.

        Returns
        -------
        instances : `RegistryManagerInstances`
            Struct containing instances of the types contained by ``self``,
            pointing to the new repository and backed by ``database``.

        Raises
        ------
        RuntimeError
            Raised if the datasets manager uses `sqlalchemy.BigInteger` as
            its dataset ID column type, or if the dimension configuration
            could not be serialized to JSON.
        """
        # If schema versions were specified in the config, check that they are
        # compatible with their managers.
        managers = self.as_dict()
        for manager_type, schema_version in self.schema_versions.items():
            manager_class = managers[manager_type]
            manager_class.checkNewSchemaVersion(schema_version)

        universe = DimensionUniverse(dimensionConfig)
        with database.declareStaticTables(create=True) as context:
            if self.datasets.getIdColumnType() is sqlalchemy.BigInteger:
                raise RuntimeError(
                    "New data repositories should be created with UUID dataset IDs instead of autoincrement "
                    "integer dataset IDs.",
                )
            instances = RegistryManagerInstances.initialize(database, context, types=self, universe=universe)

        # store managers and their versions in attributes table
        versions = ButlerVersionsManager(instances.attributes)
        versions.storeManagersConfig(instances.as_dict())

        # dump universe config as json into attributes (faster than YAML)
        dimensions_json = dimensionConfig.dump(format="json")
        if dimensions_json is None:
            raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON")
        instances.attributes.set(_DIMENSIONS_ATTR, dimensions_json)
        if instances.obscore is not None:
            obscore_json = instances.obscore.config_json()
            instances.attributes.set(_OBSCORE_ATTR, obscore_json)
        return instances

    def loadRepo(self, database: Database) -> RegistryManagerInstances:
        """Construct manager instances that point to an existing data
        repository.

        Parameters
        ----------
        database : `Database`
            Object that represents a connection to the SQL database that backs
            the data repository.  Must point to a namespace that already holds
            all tables and other persistent entities used by butler.

        Returns
        -------
        instances : `RegistryManagerInstances`
            Struct containing instances of the types contained by ``self``,
            pointing to the new repository and backed by ``database``.

        Raises
        ------
        LookupError
            Raised if the dimensions configuration is missing from the
            attributes table.

        Notes
        -----
        Although this dataclass is frozen, this method updates the
        `schema_versions` and `manager_configs` dictionaries of ``self`` in
        place with values read from the database.
        """
        # Create attributes manager only first, so we can use it to load the
        # embedded dimensions configuration.  Note that we do not check this
        # manager version before initializing it, it is supposed to be
        # completely backward- and forward-compatible.
        with database.declareStaticTables(create=False) as context:
            attributes = self.attributes.initialize(database, context)

        # Verify that configured classes are compatible with the ones stored
        # in registry.
        versions = ButlerVersionsManager(attributes)
        versions.checkManagersConfig(self.as_dict())

        # Read schema versions from registry and validate them.
        self.schema_versions.update(versions.managerVersions())
        for manager_type, manager_class in self.as_dict().items():
            schema_version = self.schema_versions.get(manager_type)
            if schema_version is not None:
                manager_class.checkCompatibility(schema_version, database.isWriteable())

        # get serialized as a string from database
        dimensionsString = attributes.get(_DIMENSIONS_ATTR)
        if dimensionsString is None:
            raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database")
        dimensionConfig = DimensionConfig(Config.fromString(dimensionsString, format="json"))
        universe = DimensionUniverse(dimensionConfig)
        if self.obscore is not None:
            # Get ObsCore configuration from attributes table, this silently
            # overrides whatever may come from config file.  Idea is that we
            # do not want to carry around the whole thing, and butler config
            # will have empty obscore configuration after initialization.
            # When configuration is missing from attributes table, the obscore
            # table does not exist, and we do not instantiate obscore manager.
            obscoreString = attributes.get(_OBSCORE_ATTR)
            if obscoreString is not None:
                self.manager_configs["obscore"] = Config.fromString(obscoreString, format="json")

        with database.declareStaticTables(create=False) as context:
            instances = RegistryManagerInstances.initialize(database, context, types=self, universe=universe)

        # Load content from database that we try to keep in-memory.
        instances.refresh()
        return instances

    def as_dict(self) -> Mapping[str, type[VersionedExtension]]:
        """Return contained managers as a dictionary with manager type name as
        a key.

        Returns
        -------
        extensions : `~collections.abc.Mapping` [`str`, `VersionedExtension`]
            Maps manager type name (e.g. "datasets") to its corresponding
            manager class.  Only existing managers are returned.
        """
        # These two fields hold per-manager settings, not managers.
        extras = {"manager_configs", "schema_versions"}
        managers = {f.name: getattr(self, f.name) for f in dataclasses.fields(self) if f.name not in extras}
        return {key: value for key, value in managers.items() if value is not None}

334 

335 

@dataclasses.dataclass(frozen=True, eq=False)
class RegistryManagerInstances(
    _GenericRegistryManagers[
        ButlerAttributeManager,
        DimensionRecordStorageManager,
        CollectionManager,
        DatasetRecordStorageManager,
        OpaqueTableStorageManager,
        DatastoreRegistryBridgeManager,
        ObsCoreTableManager,
    ]
):
    """Struct bundling the manager instances that back a `Registry`."""

    column_types: ColumnTypeInfo
    """Information about column types that can differ between data repositories
    and registry instances, including the dimension universe.
    """

    caching_context: CachingContext
    """Object containing caches for various information generated by
    managers.
    """

    @classmethod
    def initialize(
        cls,
        database: Database,
        context: StaticTablesContext,
        *,
        types: RegistryManagerTypes,
        universe: DimensionUniverse,
        caching_context: CachingContext | None = None,
    ) -> RegistryManagerInstances:
        """Build manager instances from their types and an existing database
        connection.

        Parameters
        ----------
        database : `Database`
            Object that represents a connection to the SQL database that backs
            the data repository.
        context : `StaticTablesContext`
            Object used to create tables in ``database``.
        types : `RegistryManagerTypes`
            Struct containing type objects for the manager instances to
            construct.
        universe : `DimensionUniverse`
            Object that describes all dimensions in this data repository.
        caching_context : `CachingContext` or `None`, optional
            Caches handed to the collections and datasets managers.  A new
            `CachingContext` is created when `None`.

        Returns
        -------
        instances : `RegistryManagerInstances`
            Struct containing manager instances.
        """
        if caching_context is None:
            caching_context = CachingContext()
        # Empty table spec; it is only passed to the foreign-key helpers
        # below when building the column type information.
        dummy_table = ddl.TableSpec(fields=())
        versions = types.schema_versions

        attributes = types.attributes.initialize(
            database, context, registry_schema_version=versions.get("attributes")
        )
        dimensions = types.dimensions.initialize(
            database, context, universe=universe, registry_schema_version=versions.get("dimensions")
        )
        collections = types.collections.initialize(
            database,
            context,
            dimensions=dimensions,
            caching_context=caching_context,
            registry_schema_version=versions.get("collections"),
        )
        datasets = types.datasets.initialize(
            database,
            context,
            collections=collections,
            dimensions=dimensions,
            registry_schema_version=versions.get("datasets"),
            caching_context=caching_context,
        )
        opaque = types.opaque.initialize(database, context, registry_schema_version=versions.get("opaque"))
        datastores = types.datastores.initialize(
            database,
            context,
            opaque=opaque,
            datasets=types.datasets,
            universe=universe,
            registry_schema_version=versions.get("datastores"),
        )

        # The obscore manager is only instantiated when both its type and its
        # configuration are available.
        obscore: ObsCoreTableManager | None = None
        if types.obscore is not None and "obscore" in types.manager_configs:
            obscore = types.obscore.initialize(
                database,
                context,
                universe=universe,
                config=types.manager_configs["obscore"],
                datasets=types.datasets,
                dimensions=dimensions,
                registry_schema_version=versions.get("obscore"),
            )

        column_types = ColumnTypeInfo(
            database.getTimespanRepresentation(),
            universe,
            dataset_id_spec=types.datasets.addDatasetForeignKey(
                dummy_table,
                primaryKey=False,
                nullable=False,
            ),
            run_key_spec=types.collections.addRunForeignKey(dummy_table, primaryKey=False, nullable=False),
            ingest_date_dtype=datasets.ingest_date_dtype(),
        )
        return cls(
            attributes=attributes,
            dimensions=dimensions,
            collections=collections,
            datasets=datasets,
            opaque=opaque,
            datastores=datastores,
            obscore=obscore,
            column_types=column_types,
            caching_context=caching_context,
        )

    def as_dict(self) -> Mapping[str, VersionedExtension]:
        """Return contained managers as a dictionary keyed by manager type
        name.

        Returns
        -------
        extensions : `~collections.abc.Mapping` [`str`, `VersionedExtension`]
            Maps manager type name (e.g. "datasets") to its corresponding
            manager instance.  Only existing managers are returned.
        """
        # These fields are not managers and are never reported.
        not_managers = ("column_types", "caching_context")
        found: dict[str, VersionedExtension] = {}
        for field in dataclasses.fields(self):
            if field.name in not_managers:
                continue
            manager = getattr(self, field.name)
            if manager is not None:
                found[field.name] = manager
        return found

    def refresh(self) -> None:
        """Refresh all in-memory state by querying the database or clearing
        caches.
        """
        self.dimensions.clearCaches()
        self.collections.refresh()
        self.datasets.refresh()