Coverage for python/lsst/daf/butler/tests/_testRepo.py: 12%

147 statements  


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId


def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions
        only when needed. This parameter is provided for compatibility with
        old code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values,
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided
        only for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit
    tests that don't depend on complex data relationships. It is ill-suited
    for tests where the structure of the data matters. If you need such a
    dataset, create it directly or use a saved test dataset.
    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile.
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.registry.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.registry.dimensions[dimension].viewOf is None:
            butler.registry.insertDimensionData(dimension, *records)
    return butler

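# Illustrative usage sketch (not part of the original module): a common
# pattern is one repository per test class and one collection per test. The
# temporary-directory handling and dimension values below are hypothetical.
#
#     import tempfile
#
#     root = tempfile.mkdtemp()
#     repo_butler = makeTestRepo(root, {"instrument": ["notACam"], "detector": [1, 2]})
#
# Each test can then call makeTestCollection(repo_butler) to obtain an
# isolated run collection in this repository.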

def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned
        by `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in the repository backing
        ``repo``. The collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating
    an isolated test area, and not for repositories intended for real data
    processing or analysis.
    """
    if not uniqueId:
        # Create a "random" collection name.
        # Speed matters more than cryptographic guarantees.
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler(butler=repo, run=collection)

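# Illustrative sketch (not part of the original module): isolating test cases
# in separate run collections. ``repo_butler`` is a Butler such as the one
# returned by makeTestRepo, and the uniqueId values are hypothetical.
#
#     butler1 = makeTestCollection(repo_butler, uniqueId="case_1")
#     butler2 = makeTestCollection(repo_butler, uniqueId="case_2")
#
# Datasets written through butler1 land in run "test_case_1" and are not
# visible through butler2's default collection.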

def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """
    # Create values for all dimensions that are (recursive) required or
    # implied dependencies of the given ones.
    complete_data_id_values = {}
    for dimension in universe.extract(dataIds.keys()):
        if dimension.name in dataIds:
            complete_data_id_values[dimension.name] = list(dataIds[dimension.name])
        if dimension.name not in complete_data_id_values:
            complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names.
    record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension_el = universe[name]
        for value in values:
            # _fillAllKeys wants Dimension and not DimensionElement.
            # universe.__getitem__ says it returns DimensionElement but this
            # really does also seem to be a Dimension here.
            record_dicts_by_dimension_name[name].append(
                _fillAllKeys(dimension_el, value)  # type: ignore[arg-type]
            )

    # Pick cross-relationships arbitrarily.
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension_el = universe[name]
        for record_dict in record_dicts:
            for other in dimension_el.dimensions:
                if other != dimension_el:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }

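# Illustrative sketch (not in the original module) of the shape of the output;
# the exact record fields depend on the configured dimension universe:
#
#     records = _makeRecords({"instrument": ["notACam"], "detector": [1, 2]}, universe)
#     # records["instrument"] -> one instrument record for "notACam"
#     # records["detector"]   -> two detector records (1 and 2), each linked
#     #                          to the "notACam" instrument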

def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value : `str` or `int`
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. The primary key of
        ``dimension`` maps to ``value``, but all other mappings (e.g.,
        detector name) are arbitrary.
    """
    expandedValue: dict[str, str | int | bytes] = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for
            # the skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue

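# Illustrative sketch (not in the original module) of the bytes-casting lambda
# used above for non-nullable `bytes` fields such as a skymap hash:
#
#     castType = lambda *args: str(*args).encode()
#     castType(42)      # b'42'
#     castType("deep")  # b'deep'
#     castType()        # b'', as produced by the `except TypeError` fallback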

def _makeRandomDataIdValue(dimension: Dimension) -> int | str:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This method will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This
    function is only suitable for certain kinds of test repositories, and
    not for repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

        >>> butler = makeTestRepo(
                "testdir", {"instrument": ["notACam"], "detector": [1]})
        >>> expandUniqueId(butler, {"detector": 1})
        DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O.
    registry = butler.registry
    dimensions = registry.dimensions.extract(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g.
    # instrument) given something that depends on it (e.g. visit), hence
    # check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, str | int], dimension: Dimension
) -> tuple[str | int, bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        dim_value = matches[0].dataId[dimension.name]
        assert dim_value is not None
        return dim_value, False

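# Illustrative sketch (not in the original module) of the two outcomes, using
# a hypothetical physical_filter Dimension object and instrument name:
#
#     _findOrInventDataIdValue(butler, {"instrument": "notACam"}, physical_filter_dim)
#     # -> ("k2020", False) if a filter consistent with the data ID already exists
#     # -> ("542", True)    if nothing matches and a value of the right type is invented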

def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord` that
    is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id).
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict

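# Illustrative sketch (not in the original module): for a hypothetical data ID
# {"instrument": "notACam", "detector": 1}, the detector record dictionary has
# roughly the form
#
#     {"id": 1, "full_name": "1", ..., "instrument": "notACam"}
#
# where the primary and secondary keys come from _fillAllKeys and the final
# entry is the foreign key to the related instrument dimension.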

def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None:
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value : `str` or `int`
        The value to register for the dimension.
    **related
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may
    be specified using ``related``, which requires a value for those
    dimensions to have been added to the repository already (generally with a
    previous call to `addDataIdValue`). Any dependencies of the given
    dimension that are not included in ``related`` will be linked to existing
    values arbitrarily, and (for implied dependencies only) created and also
    inserted into the registry if they do not exist. Values for required
    dimensions and those given in ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real
    data processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage
    examples.
    """
    # Example is not doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        fullDimension = butler.registry.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys are ignored by registry code, so check for them here.
    extraKeys = related.keys() - fullDimension.graph.dimensions.names
    if extraKeys:
        raise ValueError(
            f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, int | str] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.registry.dimensions.extract(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data
    # IDs. This iteration is guaranteed to be in topological order, so we can
    # count on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_obj in all_dimensions:
        dimension_value = data_id.get(dimension_obj.name)
        if dimension_value is None:
            data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_obj.name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_obj != fullDimension and dimension_obj in all_dimensions.required:
            # We also don't want to automatically create new dimension
            # records for required dimensions (except for the main dimension
            # the caller asked for); those are also asserted by the caller to
            # already exist.
            continue
        if dimension_obj.viewOf is not None:
            # Don't need to bother generating full records for dimensions
            # whose records are just a view into some other's records anyway.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e

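# Illustrative sketch (not in the original module): building up related data
# ID values one call at a time. The instrument, detector, and filter names
# are hypothetical.
#
#     addDataIdValue(butler, "instrument", "notACam")
#     addDataIdValue(butler, "detector", 1, instrument="notACam")
#     addDataIdValue(butler, "physical_filter", "k2020", instrument="notACam")
#
# Omitting instrument="notACam" would still work, but the new detector would
# be linked to an arbitrary existing instrument.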

def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.registry.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e

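# Illustrative sketch (not in the original module): registering a dataset type
# and then writing to it. The dataset type name, dimensions, and storage class
# are hypothetical, and the data ID values are assumed to have been registered
# already (e.g., via addDataIdValue).
#
#     addDatasetType(butler, "calexp_mock", {"instrument", "detector"}, "StructuredDataDict")
#     butler.put({"key": "value"}, "calexp_mock", instrument="notACam", detector=1)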

class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions.
    """

    @staticmethod
    def apply(butler: Butler) -> None:
        """Apply datastore mocks to a butler."""
        butler.datastore.export = DatastoreMock._mock_export  # type: ignore
        butler.datastore.get = DatastoreMock._mock_get  # type: ignore
        butler.datastore.ingest = MagicMock()  # type: ignore

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None
    ) -> Iterable[FileDataset]:
        """A mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> tuple[DatasetId, Mapping[str, Any] | None]:
        """A mock of `Datastore.get` that just returns the dataset ID value
        and parameters it was given.
        """
        assert ref.id is not None
        return (ref.id, parameters)
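
# Illustrative sketch (not in the original module): applying the mock in a
# test that only cares about which dataset was requested, not its contents.
# ``some_ref`` is a hypothetical DatasetRef already known to the registry.
#
#     DatastoreMock.apply(butler)
#     dataset_id, params = butler.get(some_ref)  # no file I/O is performed
#
# With the mock applied, gets return the dataset ID and parameters, exports
# yield placeholder FileDataset entries, and ingests are recorded on a
# MagicMock instead of touching disk.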