Coverage for python/lsst/daf/butler/tests/_testRepo.py: 14%

144 statements  

coverage.py v7.2.7, created at 2023-08-12 09:20 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId


def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors) are
        linked arbitrarily, with values created for implied dimensions only
        when needed. This parameter is provided for compatibility with old
        code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values,
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided only
        for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit tests
    that don't depend on complex data relationships. It is ill-suited for tests
    where the structure of the data matters. If you need such a dataset, create
    it directly or use a saved test dataset.
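
    Examples
    --------
    A minimal sketch of typical use (not a doctest, because it creates files
    on disk; the directory name and data ID values are arbitrary):

    .. code-block:: py

       >>> butler = makeTestRepo(
               "testdir", {"instrument": ["notACam"], "detector": [1]})
       >>> testButler = makeTestCollection(butler, uniqueId="test1")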

97 """ 

98 defaults = Config() 

99 defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore" 

100 defaults["datastore", "checksum"] = False # In case of future changes 

101 defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3" 

102 

103 if config: 

104 defaults.update(config) 

105 

106 if not dataIds: 

107 dataIds = {} 

108 

109 # Disable config root by default so that our registry override will 

110 # not be ignored. 

111 # newConfig guards against location-related keywords like outfile 

112 newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs) 

113 butler = Butler(newConfig, writeable=True) 

114 dimensionRecords = _makeRecords(dataIds, butler.dimensions) 

115 for dimension, records in dimensionRecords.items(): 

116 if butler.dimensions[dimension].viewOf is None: 

117 butler.registry.insertDimensionData(dimension, *records) 

118 return butler 

119 

120 

121def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler: 

122 """Create a read/write Butler to a fresh collection. 

123 

124 Parameters 

125 ---------- 

126 repo : `lsst.daf.butler.Butler` 

127 A previously existing Butler to a repository, such as that returned by 

128 `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`. 

129 uniqueId : `str`, optional 

130 A collection ID guaranteed by external code to be unique across all 

131 calls to ``makeTestCollection`` for the same repository. 

132 

133 Returns 

134 ------- 

135 butler : `lsst.daf.butler.Butler` 

136 A Butler referring to a new collection in the repository at ``root``. 

137 The collection is (almost) guaranteed to be new. 

138 

139 Notes 

140 ----- 

141 This function creates a single run collection that does not necessarily 

142 conform to any repository conventions. It is only suitable for creating an 

143 isolated test area, and not for repositories intended for real data 

144 processing or analysis. 
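
    Examples
    --------
    A sketch of the intended pattern (not a doctest, because it performs I/O):
    share one repository across a test class and give each test its own
    collection. The identifiers are arbitrary:

    .. code-block:: py

       >>> repo = makeTestRepo("testdir")
       >>> butler = makeTestCollection(repo, uniqueId="test_method_1")
       >>> butler.run
       'test_test_method_1'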

145 """ 

146 if not uniqueId: 

147 # Create a "random" collection name 

148 # Speed matters more than cryptographic guarantees 

149 uniqueId = str(random.randrange(1_000_000_000)) 

150 collection = "test_" + uniqueId 

151 return Butler(butler=repo, run=collection) 

152 

153 

154def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]: 

155 """Create cross-linked dimension records from a collection of 

156 data ID values. 

157 

158 Parameters 

159 ---------- 

160 dataIds : `~collections.abc.Mapping` [`str`, `iterable`] 

161 A mapping keyed by the dimensions of interest. Each value is an 

162 iterable of names for that dimension (e.g., detector IDs for 

163 `"detector"`). 

164 universe : lsst.daf.butler.DimensionUniverse 

165 Set of all known dimensions and their relationships. 

166 

167 Returns 

168 ------- 

169 dataIds : `~collections.abc.Mapping` [`str`, `iterable`] 

170 A mapping keyed by the dimensions of interest, giving one 

171 `~lsst.daf.butler.DimensionRecord` for each input name. Related 

172 dimensions (e.g., instruments and detectors) are linked arbitrarily. 

173 """ 

174 # Create values for all dimensions that are (recursive) required or implied 

175 # dependencies of the given ones. 

176 complete_data_id_values = {} 

177 for dimension in universe.extract(dataIds.keys()): 

178 if dimension.name in dataIds: 

179 complete_data_id_values[dimension.name] = list(dataIds[dimension.name]) 

180 if dimension.name not in complete_data_id_values: 

181 complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)] 

182 

183 # Start populating dicts that will become DimensionRecords by providing 

184 # alternate keys like detector names 

185 record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {} 

186 for name, values in complete_data_id_values.items(): 

187 record_dicts_by_dimension_name[name] = [] 

188 dimension_el = universe[name] 

189 for value in values: 

190 # _fillAllKeys wants Dimension and not DimensionElement. 

191 # universe.__getitem__ says it returns DimensionElement but this 

192 # really does also seem to be a Dimension here. 

193 record_dicts_by_dimension_name[name].append( 

194 _fillAllKeys(dimension_el, value) # type: ignore[arg-type] 

195 ) 

196 

197 # Pick cross-relationships arbitrarily 

198 for name, record_dicts in record_dicts_by_dimension_name.items(): 

199 dimension_el = universe[name] 

200 for record_dict in record_dicts: 

201 for other in dimension_el.dimensions: 

202 if other != dimension_el: 

203 relation = record_dicts_by_dimension_name[other.name][0] 

204 record_dict[other.name] = relation[other.primaryKey.name] 

205 

206 return { 

207 dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts] 

208 for dimension, record_dicts in record_dicts_by_dimension_name.items() 

209 } 

210 

211 

212def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]: 

213 """Create an arbitrary mapping of all required keys for a given dimension 

214 that do not refer to other dimensions. 

215 

216 Parameters 

217 ---------- 

218 dimension : `lsst.daf.butler.Dimension` 

219 The dimension for which to generate a set of keys (e.g., detector). 

220 value 

221 The value assigned to ``dimension`` (e.g., detector ID). 

222 

223 Returns 

224 ------- 

225 expandedValue : `dict` [`str`] 

226 A mapping of dimension keys to values. ``dimension's`` primary key 

227 maps to ``value``, but all other mappings (e.g., detector name) 

228 are arbitrary. 

229 """ 

230 expandedValue: dict[str, str | int | bytes] = {} 

231 for key in dimension.uniqueKeys: 

232 if key.nbytes: 

233 # For `bytes` fields, we want something that casts at least `str` 

234 # and `int` values to bytes and yields b'' when called with no 

235 # arguments (as in the except block below). Unfortunately, the 

236 # `bytes` type itself fails for both `str` and `int`, but this 

237 # lambda does what we need. This particularly important for the 

238 # skymap dimensions' bytes 'hash' field, which has a unique 

239 # constraint; without this, all skymaps would get a hash of b'' 

240 # and end up conflicting. 

241 castType = lambda *args: str(*args).encode() # noqa: E731 

242 else: 

243 castType = key.dtype().python_type 

244 try: 

245 castValue = castType(value) 

246 except TypeError: 

247 castValue = castType() 

248 expandedValue[key.name] = castValue 

249 for key in dimension.metadata: 

250 if not key.nullable: 

251 expandedValue[key.name] = key.dtype().python_type(value) 

252 return expandedValue 

253 

254 

255def _makeRandomDataIdValue(dimension: Dimension) -> int | str: 

256 """Generate a random value of the appropriate type for a data ID key. 

257 

258 Parameters 

259 ---------- 

260 dimension : `Dimension` 

261 Dimension the value corresponds to. 

262 

263 Returns 

264 ------- 

265 value : `int` or `str` 

266 Random value. 

267 """ 

268 if dimension.primaryKey.getPythonType() is str: 

269 return str(random.randrange(1000)) 

270 else: 

271 return random.randrange(1000) 

272 

273 

274def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate: 

275 """Return a complete data ID matching some criterion. 

276 

277 Parameters 

278 ---------- 

279 butler : `lsst.daf.butler.Butler` 

280 The repository to query. 

281 partialId : `~collections.abc.Mapping` [`str`] 

282 A mapping of known dimensions and values. 

283 

284 Returns 

285 ------- 

286 dataId : `lsst.daf.butler.DataCoordinate` 

287 The unique data ID that matches ``partialId``. 

288 

289 Raises 

290 ------ 

291 ValueError 

292 Raised if ``partialId`` does not uniquely identify a data ID. 

293 

294 Notes 

295 ----- 

296 This method will only work correctly if all dimensions attached to the 

297 target dimension (eg., "physical_filter" for "visit") are known to the 

298 repository, even if they're not needed to identify a dataset. This function 

299 is only suitable for certain kinds of test repositories, and not for 

300 repositories intended for real data processing or analysis. 

301 

302 Examples 

303 -------- 

304 .. code-block:: py 

305 

306 >>> butler = makeTestRepo( 

307 "testdir", {"instrument": ["notACam"], "detector": [1]}) 

308 >>> expandUniqueId(butler, {"detector": 1}) 

309 DataCoordinate({instrument, detector}, ('notACam', 1)) 

310 """ 

311 # The example is *not* a doctest because it requires dangerous I/O 

312 registry = butler.registry 

313 dimensions = registry.dimensions.extract(partialId.keys()).required 

314 

315 query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items()) 

316 

317 # Much of the purpose of this function is to do something we explicitly 

318 # reject most of the time: query for a governor dimension (e.g. instrument) 

319 # given something that depends on it (e.g. visit), hence check=False. 

320 dataId = list(registry.queryDataIds(dimensions, where=query, check=False)) 

321 if len(dataId) == 1: 

322 return dataId[0] 

323 else: 

324 raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.") 

325 

326 

327def _findOrInventDataIdValue( 

328 butler: Butler, data_id: dict[str, str | int], dimension: Dimension 

329) -> tuple[str | int, bool]: 

330 """Look up an arbitrary value for a dimension that is consistent with a 

331 partial data ID that does not specify that dimension, or invent one if no 

332 such value exists. 

333 

334 Parameters 

335 ---------- 

336 butler : `Butler` 

337 Butler to use to look up data ID values. 

338 data_id : `dict` [ `str`, `str` or `int` ] 

339 Dictionary of possibly-related data ID values. 

340 dimension : `Dimension` 

341 Dimension to obtain a value for. 

342 

343 Returns 

344 ------- 

345 value : `int` or `str` 

346 Value for this dimension. 

347 invented : `bool` 

348 `True` if the value had to be invented, `False` if a compatible value 

349 already existed. 

350 """ 

351 # No values given by caller for this dimension. See if any exist 

352 # in the registry that are consistent with the values of dimensions 

353 # we do have: 

354 match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names} 

355 matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1)) 

356 if not matches: 

357 # Nothing in the registry matches: invent a data ID value 

358 # with the right type (actual value does not matter). 

359 # We may or may not actually make a record with this; that's 

360 # easier to check later. 

361 dimension_value = _makeRandomDataIdValue(dimension) 

362 return dimension_value, True 

363 else: 

364 # A record does exist in the registry. Use its data ID value. 

365 dim_value = matches[0].dataId[dimension.name] 

366 assert dim_value is not None 

367 return dim_value, False 

368 

369 

370def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]: 

371 """Create a dictionary that can be used to build a `DimensionRecord` that 

372 is consistent with the given data ID. 

373 

374 Parameters 

375 ---------- 

376 data_id : `dict` [ `str`, `str` or `int` ] 

377 Dictionary that contains values for at least all of 

378 ``dimension.dimensions.names`` (the main dimension, its recursive 

379 required dependencies, and its non-recursive implied dependencies). 

380 dimension : `Dimension` 

381 Dimension to build a record dictionary for. 

382 

383 Returns 

384 ------- 

385 record_dict : `dict` [ `str`, `object` ] 

386 Dictionary that can be passed as ``**kwargs`` to this dimensions 

387 record class constructor. 

388 """ 

389 # Add the primary key field for this dimension. 

390 record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]} 

391 # Define secondary keys (e.g., detector name given detector id) 

392 record_dict.update(_fillAllKeys(dimension, data_id[dimension.name])) 

393 # Set the foreign key values for any related dimensions that should 

394 # appear in the record. 

395 for related_dimension in dimension.dimensions: 

396 if related_dimension.name != dimension.name: 

397 record_dict[related_dimension.name] = data_id[related_dimension.name] 

398 return record_dict 

399 

400 

401def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None: 

402 """Add the records that back a new data ID to a repository. 

403 

404 Parameters 

405 ---------- 

406 butler : `lsst.daf.butler.Butler` 

407 The repository to update. 

408 dimension : `str` 

409 The name of the dimension to gain a new value. 

410 value 

411 The value to register for the dimension. 

412 **related 

413 Any existing dimensions to be linked to ``value``. 

414 

415 Notes 

416 ----- 

417 Related dimensions (e.g., the instrument associated with a detector) may be 

418 specified using ``related``, which requires a value for those dimensions to 

419 have been added to the repository already (generally with a previous call 

420 to `addDataIdValue`. Any dependencies of the given dimension that are not 

421 included in ``related`` will be linked to existing values arbitrarily, and 

422 (for implied dependencies only) created and also inserted into the registry 

423 if they do not exist. Values for required dimensions and those given in 

424 ``related`` are never created. 

425 

426 Because this function creates filler data, it is only suitable for test 

427 repositories. It should not be used for repositories intended for real data 

428 processing or analysis, which have known dimension values. 

429 

430 Examples 

431 -------- 

432 See the guide on :ref:`using-butler-in-tests-make-repo` for usage examples. 
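
    For illustration, a minimal sketch (not a doctest, because it writes to
    the repository; the dimension values are arbitrary):

    .. code-block:: py

       >>> addDataIdValue(butler, "instrument", "notACam")
       >>> addDataIdValue(butler, "detector", 1, instrument="notACam")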

433 """ 

434 # Example is not doctest, because it's probably unsafe to create even an 

435 # in-memory butler in that environment. 

436 try: 

437 fullDimension = butler.dimensions[dimension] 

438 except KeyError as e: 

439 raise ValueError from e 

440 # Bad keys ignored by registry code 

441 extraKeys = related.keys() - fullDimension.graph.dimensions.names 

442 if extraKeys: 

443 raise ValueError( 

444 f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}" 

445 ) 

446 

447 # Assemble a dictionary data ID holding the given primary dimension value 

448 # and all of the related ones. 

449 data_id: dict[str, int | str] = {dimension: value} 

450 data_id.update(related) 

451 

452 # Compute the set of all dimensions that these recursively depend on. 

453 all_dimensions = butler.dimensions.extract(data_id.keys()) 

454 

455 # Create dicts that will become DimensionRecords for all of these data IDs. 

456 # This iteration is guaranteed to be in topological order, so we can count 

457 # on new data ID values being invented before they are needed. 

458 record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {} 

459 for dimension_obj in all_dimensions: 

460 dimension_value = data_id.get(dimension_obj.name) 

461 if dimension_value is None: 

462 data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj) 

463 if not invented: 

464 # No need to make a new record; one already exists. 

465 continue 

466 if dimension_obj.name in related: 

467 # Caller passed in a value of this dimension explicitly, but it 

468 # isn't the primary dimension they asked to have a record created 

469 # for. That means they expect this record to already exist. 

470 continue 

471 if dimension_obj != fullDimension and dimension_obj in all_dimensions.required: 

472 # We also don't want to automatically create new dimension records 

473 # for required dimensions (except for the main dimension the caller 

474 # asked for); those are also asserted by the caller to already 

475 # exist. 

476 continue 

477 if dimension_obj.viewOf is not None: 

478 # Don't need to bother generating full records for dimensions whose 

479 # records are just a view into some other's records anyway. 

480 continue 

481 record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj) 

482 

483 # Sync those dimension record dictionaries with the database. 

484 for dimension_obj, record_dict in record_dicts_by_dimension.items(): 

485 record = dimension_obj.RecordClass(**record_dict) 

486 try: 

487 butler.registry.syncDimensionData(dimension_obj, record) 

488 except sqlalchemy.exc.IntegrityError as e: 

489 raise RuntimeError( 

490 "Could not create data ID value. Automatic relationship generation " 

491 "may have failed; try adding keywords to assign a specific instrument, " 

492 "physical_filter, etc. based on the nested exception message." 

493 ) from e 

494 

495 

496def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType: 

497 """Add a new dataset type to a repository. 

498 

499 Parameters 

500 ---------- 

501 butler : `lsst.daf.butler.Butler` 

502 The repository to update. 

503 name : `str` 

504 The name of the dataset type. 

505 dimensions : `set` [`str`] 

506 The dimensions of the new dataset type. 

507 storageClass : `str` 

508 The storage class the dataset will use. 

509 

510 Returns 

511 ------- 

512 datasetType : `lsst.daf.butler.DatasetType` 

513 The new type. 

514 

515 Raises 

516 ------ 

517 ValueError 

518 Raised if the dimensions or storage class is invalid. 

519 

520 Notes 

521 ----- 

522 Dataset types are shared across all collections in a repository, so this 

523 function does not need to be run for each collection. 
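
    Examples
    --------
    A sketch (not a doctest, because it modifies the repository). The dataset
    type name is arbitrary, and the storage class is assumed to already be
    known to the repository:

    .. code-block:: py

       >>> datasetType = addDatasetType(
               butler, "datasetTypeName", {"instrument", "detector"}, "NumpyArray")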

524 """ 

525 try: 

526 datasetType = DatasetType(name, dimensions, storageClass, universe=butler.dimensions) 

527 butler.registry.registerDatasetType(datasetType) 

528 return datasetType 

529 except KeyError as e: 

530 raise ValueError from e 

531 

532 

533class DatastoreMock: 

534 """Mocks a butler datastore. 

535 

536 Has functions that mock the datastore in a butler. Provides an `apply` 

537 function to replace the relevent butler datastore functions with the mock 

538 functions. 
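
    Typical use is a one-liner in test setup (a sketch; assumes an existing
    writeable test butler, e.g. one from `makeTestCollection`)::

        DatastoreMock.apply(butler)
        # Reads now go through `_mock_get` below, which returns the dataset
        # ID and parameters instead of real data.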

539 """ 

540 

541 @staticmethod 

542 def apply(butler: Butler) -> None: 

543 """Apply datastore mocks to a butler.""" 

544 butler._datastore.export = DatastoreMock._mock_export # type: ignore 

545 butler._datastore.get = DatastoreMock._mock_get # type: ignore 

546 butler._datastore.ingest = MagicMock() # type: ignore 

547 

548 @staticmethod 

549 def _mock_export( 

550 refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None 

551 ) -> Iterable[FileDataset]: 

552 """Mock of `Datastore.export` that satisfies the requirement that 

553 the refs passed in are included in the `FileDataset` objects 

554 returned. 

555 

556 This can be used to construct a `Datastore` mock that can be used 

557 in repository export via:: 

558 

559 datastore = unittest.mock.Mock(spec=Datastore) 

560 datastore.export = DatastoreMock._mock_export 

561 

562 """ 

563 for ref in refs: 

564 yield FileDataset( 

565 refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter" 

566 ) 

567 

568 @staticmethod 

569 def _mock_get( 

570 ref: DatasetRef, 

571 parameters: Mapping[str, Any] | None = None, 

572 storageClass: StorageClass | str | None = None, 

573 ) -> tuple[DatasetId, Mapping[str, Any] | None]: 

574 """Mock of `Datastore.get` that just returns the integer dataset ID 

575 value and parameters it was given. 

576 """ 

577 return (ref.id, parameters)