Coverage for python/lsst/daf/butler/tests/_testRepo.py: 11%

141 statements  

coverage.py v6.5.0, created at 2022-12-08 14:18 -0800

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from typing import Any, Iterable, Mapping, Optional, Set, Tuple, Union
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
)

def makeTestRepo(
    root: str, dataIds: Optional[Mapping[str, Iterable]] = None, *, config: Optional[Config] = None, **kwargs
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions
        only when needed. This parameter is provided for compatibility with
        old code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values,
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided
        only for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit
    tests that don't depend on complex data relationships. It is ill-suited
    for tests where the structure of the data matters. If you need such a
    dataset, create it directly or use a saved test dataset.
    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile.
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.registry.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.registry.dimensions[dimension].viewOf is None:
            butler.registry.insertDimensionData(dimension, *records)
    return butler
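
# Illustrative sketch (not part of the original module): a typical test suite
# creates one repository up front and fills in the dimension values and
# dataset types its tests rely on. The directory, instrument, detector, and
# dataset type names below are hypothetical.
def _example_makeTestRepo_usage(root: str) -> Butler:
    """Sketch of one-time repository setup for a test suite."""
    butler = makeTestRepo(root)
    addDataIdValue(butler, "instrument", "notACam")
    addDataIdValue(butler, "detector", 101, instrument="notACam")
    addDatasetType(butler, "testMetrics", {"instrument", "detector"}, "StructuredDataDict")
    return butler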

def makeTestCollection(repo: Butler, uniqueId: Optional[str] = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned
        by `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in ``repo``. The collection is
        (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating
    an isolated test area, and not for repositories intended for real data
    processing or analysis.
    """
    if not uniqueId:
        # Create a "random" collection name.
        # Speed matters more than cryptographic guarantees.
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler(butler=repo, run=collection)
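
# Illustrative sketch (not part of the original module): a shared repository
# can be created once (e.g., in setUpClass) and each test can then work in
# its own throwaway run collection so datasets never leak between tests.
def _example_makeTestCollection_usage(repo: Butler) -> tuple[Butler, Butler]:
    """Sketch of giving two tests independent collections in one repository."""
    # Each call returns a new writeable Butler pointed at a fresh "test_*" run.
    butler_for_test_a = makeTestCollection(repo)
    butler_for_test_b = makeTestCollection(repo)
    return butler_for_test_a, butler_for_test_b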

def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """

    # Create values for all dimensions that are (recursive) required or
    # implied dependencies of the given ones.
    complete_data_id_values = {}
    for dimension in universe.extract(dataIds.keys()):
        if dimension.name in dataIds:
            complete_data_id_values[dimension.name] = list(dataIds[dimension.name])
        if dimension.name not in complete_data_id_values:
            complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names.
    record_dicts_by_dimension_name = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension = universe[name]
        for value in values:
            record_dicts_by_dimension_name[name].append(_fillAllKeys(dimension, value))

    # Pick cross-relationships arbitrarily.
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension = universe[name]
        for record_dict in record_dicts:
            for other in dimension.dimensions:
                if other != dimension:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }

def _fillAllKeys(dimension: Dimension, value: Union[str, int]) -> Mapping[str, Union[str, int]]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. The primary key of
        ``dimension`` maps to ``value``, but all other mappings (e.g.,
        detector name) are arbitrary.
    """
    expandedValue = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for
            # the skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue

def _makeRandomDataIdValue(dimension: Dimension) -> Union[int, str]:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)

def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This function will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. It is only
    suitable for certain kinds of test repositories, and not for repositories
    intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

       >>> butler = makeTestRepo(
               "testdir", {"instrument": ["notACam"], "detector": [1]})
       >>> expandUniqueId(butler, {"detector": 1})
       DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O.
    registry = butler.registry
    dimensions = registry.dimensions.extract(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")

def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, Union[str, int]], dimension: Dimension
) -> tuple[Union[str, int], bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        return matches[0].dataId[dimension.name], False

358 

359def _makeDimensionRecordDict(data_id: dict[str, Union[str, int]], dimension: Dimension) -> dict[str, Any]: 

360 """Create a dictionary that can be used to build a `DimensionRecord` that 

361 is consistent with the given data ID. 

362 

363 Parameters 

364 ---------- 

365 data_id : `dict` [ `str`, `str` or `int` ] 

366 Dictionary that contains values for at least all of 

367 ``dimension.dimensions.names`` (the main dimension, its recursive 

368 required dependencies, and its non-recursive implied dependencies). 

369 dimension : `Dimension` 

370 Dimension to build a record dictionary for. 

371 

372 Returns 

373 ------- 

374 record_dict : `dict` [ `str`, `object` ] 

375 Dictionary that can be passed as ``**kwargs`` to this dimensions 

376 record class constructor. 

377 """ 

378 # Add the primary key field for this dimension. 

379 record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]} 

380 # Define secondary keys (e.g., detector name given detector id) 

381 record_dict.update(_fillAllKeys(dimension, data_id[dimension.name])) 

382 # Set the foreign key values for any related dimensions that should 

383 # appear in the record. 

384 for related_dimension in dimension.dimensions: 

385 if related_dimension.name != dimension.name: 

386 record_dict[related_dimension.name] = data_id[related_dimension.name] 

387 return record_dict 

def addDataIdValue(butler: Butler, dimension: str, value: Union[str, int], **related: Union[str, int]):
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value
        The value to register for the dimension.
    **related
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may
    be specified using ``related``, which requires a value for those
    dimensions to have been added to the repository already (generally with a
    previous call to `addDataIdValue`). Any dependencies of the given
    dimension that are not included in ``related`` will be linked to existing
    values arbitrarily, and (for implied dependencies only) created and also
    inserted into the registry if they do not exist. Values for required
    dimensions and those given in ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real
    data processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage
    examples.
    """
    # Example is not a doctest, because it's probably unsafe to create even
    # an in-memory butler in that environment.
    try:
        fullDimension = butler.registry.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys are otherwise silently ignored by registry code, so check
    # them here.
    extraKeys = related.keys() - fullDimension.graph.dimensions.names
    if extraKeys:
        raise ValueError(
            f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, Union[int, str]] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.registry.dimensions.extract(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data
    # IDs. This iteration is guaranteed to be in topological order, so we can
    # count on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_obj in all_dimensions:
        dimension_value = data_id.get(dimension_obj.name)
        if dimension_value is None:
            data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_obj.name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_obj != fullDimension and dimension_obj in all_dimensions.required:
            # We also don't want to automatically create new dimension
            # records for required dimensions (except for the main dimension
            # the caller asked for); those are also asserted by the caller to
            # already exist.
            continue
        if dimension_obj.viewOf is not None:
            # Don't need to bother generating full records for dimensions
            # whose records are just a view into some other's records anyway.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e
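
# Illustrative sketch (not part of the original module): building up related
# dimension values for a test. The instrument, filter, detector, and visit
# values are hypothetical; the point is that later calls pin their
# relationships to earlier values via keyword arguments instead of relying on
# arbitrary links.
def _example_addDataIdValue_usage(butler: Butler) -> None:
    """Sketch of registering a small, internally consistent set of data IDs."""
    addDataIdValue(butler, "instrument", "notACam")
    addDataIdValue(butler, "physical_filter", "k2022", instrument="notACam")
    addDataIdValue(butler, "detector", 101, instrument="notACam")
    # Without the keywords, the visit would be linked to an arbitrary existing
    # physical_filter; passing them keeps the test deterministic.
    addDataIdValue(butler, "visit", 42, instrument="notACam", physical_filter="k2022")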

def addDatasetType(butler: Butler, name: str, dimensions: Set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.registry.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e
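
# Illustrative sketch (not part of the original module): registering a dataset
# type once per repository and then writing a dataset into a per-test
# collection. The dataset type name, dimensions, storage class, and data ID
# values below are hypothetical choices for a test, not fixed conventions.
def _example_addDatasetType_usage(repo: Butler) -> DatasetRef:
    """Sketch of registering and using a simple dataset type."""
    addDatasetType(repo, "testMetrics", {"instrument", "detector"}, "StructuredDataDict")
    butler = makeTestCollection(repo)
    # The data ID values must already exist; see addDataIdValue above.
    return butler.put({"mean": 1.0}, "testMetrics", instrument="notACam", detector=101)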

class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions.
    """

    @staticmethod
    def apply(butler):
        """Apply datastore mocks to a butler."""
        butler.datastore.export = DatastoreMock._mock_export
        butler.datastore.get = DatastoreMock._mock_get
        butler.datastore.ingest = MagicMock()

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: Optional[str] = None, transfer: Optional[str] = None
    ) -> Iterable[FileDataset]:
        """A mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None
    ) -> Tuple[int, Optional[Mapping[str, Any]]]:
        """A mock of `Datastore.get` that just returns the integer dataset ID
        value and parameters it was given.
        """
        return (ref.id, parameters)
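
# Illustrative sketch (not part of the original module): when a test only
# exercises registry bookkeeping, DatastoreMock avoids any real I/O. The
# DatasetRef passed in is assumed to refer to a dataset already known to the
# registry (e.g., via a registry insert or repository import).
def _example_datastore_mock_usage(butler: Butler, ref: DatasetRef) -> None:
    """Sketch of reading a dataset back through the mocked datastore."""
    DatastoreMock.apply(butler)
    # Per _mock_get above, the mocked datastore hands back the dataset ID and
    # the parameters instead of a real in-memory object.
    dataset_id, params = butler.datastore.get(ref, parameters=None)
    assert dataset_id == ref.id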