Coverage for python/lsst/daf/butler/tests/_testRepo.py: 11%

141 statements  

coverage.py v6.5.0, created at 2022-10-07 09:47 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from typing import Any, Iterable, Mapping, Optional, Set, Tuple, Union
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

def makeTestRepo(
    root: str, dataIds: Optional[Mapping[str, Iterable]] = None, *, config: Optional[Config] = None, **kwargs
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors) are
        linked arbitrarily, with values created for implied dimensions only
        when needed. This parameter is provided for compatibility with old
        code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided only
        for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit tests
    that don't depend on complex data relationships. It is ill-suited for tests
    where the structure of the data matters. If you need such a dataset, create
    it directly or use a saved test dataset.
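
    Examples
    --------
    A sketch of typical use (not a doctest; the repository path and dimension
    values are arbitrary choices for illustration):

    .. code-block:: py

        repo = makeTestRepo("testdir")
        addDataIdValue(repo, "instrument", "notACam")
        addDataIdValue(repo, "detector", 1, instrument="notACam")
        butler = makeTestCollection(repo)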

    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.registry.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.registry.dimensions[dimension].viewOf is None:
            butler.registry.insertDimensionData(dimension, *records)
    return butler


def makeTestCollection(repo: Butler, uniqueId: Optional[str] = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned by
        `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in ``repo``.
        The collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating an
    isolated test area, and not for repositories intended for real data
    processing or analysis.
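
    Examples
    --------
    A sketch of per-test use (not a doctest; ``repo`` is assumed to come from
    `makeTestRepo`, and the caller is responsible for keeping ``uniqueId``
    unique across tests):

    .. code-block:: py

        butler = makeTestCollection(repo, uniqueId="my_test_1")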

    """
    if not uniqueId:
        # Create a "random" collection name
        # Speed matters more than cryptographic guarantees
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler(butler=repo, run=collection)


def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """

    # Create values for all dimensions that are (recursive) required or implied
    # dependencies of the given ones.
    complete_data_id_values = {}
    for dimension in universe.extract(dataIds.keys()):
        if dimension.name in dataIds:
            complete_data_id_values[dimension.name] = list(dataIds[dimension.name])
        if dimension.name not in complete_data_id_values:
            complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names
    record_dicts_by_dimension_name = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension = universe[name]
        for value in values:
            record_dicts_by_dimension_name[name].append(_fillAllKeys(dimension, value))

    # Pick cross-relationships arbitrarily
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension = universe[name]
        for record_dict in record_dicts:
            for other in dimension.dimensions:
                if other != dimension:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }


def _fillAllKeys(dimension: Dimension, value: Union[str, int]) -> Mapping[str, Union[str, int]]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. The primary key of
        ``dimension`` maps to ``value``, but all other mappings (e.g.,
        detector name) are arbitrary.
    """
    expandedValue = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for the
            # skymap dimensions' bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue


def _makeRandomDataIdValue(dimension: Dimension) -> Union[int, str]:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This method will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This function
    is only suitable for certain kinds of test repositories, and not for
    repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

        >>> butler = makeTestRepo(
                "testdir", {"instrument": ["notACam"], "detector": [1]})
        >>> expandUniqueId(butler, {"detector": 1})
        DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.extract(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, Union[str, int]], dimension: Dimension
) -> tuple[Union[str, int], bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        return matches[0].dataId[dimension.name], False


def _makeDimensionRecordDict(data_id: dict[str, Union[str, int]], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord` that
    is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict


def addDataIdValue(butler: Butler, dimension: str, value: Union[str, int], **related: Union[str, int]):
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value
        The value to register for the dimension.
    **related
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may be
    specified using ``related``, which requires a value for those dimensions to
    have been added to the repository already (generally with a previous call
    to `addDataIdValue`). Any dependencies of the given dimension that are not
    included in ``related`` will be linked to existing values arbitrarily, and
    (for implied dependencies only) created and also inserted into the registry
    if they do not exist. Values for required dimensions and those given in
    ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real data
    processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage examples.
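
    A minimal illustration (not a doctest; the dimension values are
    arbitrary):

    .. code-block:: py

        addDataIdValue(butler, "instrument", "notACam")
        addDataIdValue(butler, "detector", 42, instrument="notACam")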

    """
    # Example is not doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        fullDimension = butler.registry.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys ignored by registry code
    extraKeys = related.keys() - fullDimension.graph.dimensions.names
    if extraKeys:
        raise ValueError(
            f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, Union[int, str]] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.registry.dimensions.extract(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data IDs.
    # This iteration is guaranteed to be in topological order, so we can count
    # on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_obj in all_dimensions:
        dimension_value = data_id.get(dimension_obj.name)
        if dimension_value is None:
            data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_obj.name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_obj != fullDimension and dimension_obj in all_dimensions.required:
            # We also don't want to automatically create new dimension records
            # for required dimensions (except for the main dimension the caller
            # asked for); those are also asserted by the caller to already
            # exist.
            continue
        if dimension_obj.viewOf is not None:
            # Don't need to bother generating full records for dimensions whose
            # records are just a view into some other's records anyway.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e


def addDatasetType(butler: Butler, name: str, dimensions: Set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
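
    Examples
    --------
    A sketch of typical use (not a doctest; the dataset type name, dimensions,
    and storage class are illustrative):

    .. code-block:: py

        datasetType = addDatasetType(
            butler, "DataType1", {"instrument", "detector"}, "NumpyArray")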

    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.registry.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e


class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions.
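
    Typical use is to call `apply` on a test butler so that datastore reads
    return the dataset ID and parameters instead of real datasets (a sketch,
    not a doctest):

    .. code-block:: py

        butler = makeTestCollection(repo)
        DatastoreMock.apply(butler)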

    """

    @staticmethod
    def apply(butler):
        """Apply datastore mocks to a butler."""
        butler.datastore.export = DatastoreMock._mock_export
        butler.datastore.get = DatastoreMock._mock_get
        butler.datastore.ingest = MagicMock()

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: Optional[str] = None, transfer: Optional[str] = None
    ) -> Iterable[FileDataset]:
        """A mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Optional[Mapping[str, Any]] = None,
        storageClass: Optional[Union[StorageClass, str]] = None,
    ) -> Tuple[int, Optional[Mapping[str, Any]]]:
        """A mock of `Datastore.get` that just returns the integer dataset ID
        value and parameters it was given.
        """
        return (ref.id, parameters)