Coverage for python/lsst/daf/butler/tests/_testRepo.py: 14% (144 statements)


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId


def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions
        only when needed. This parameter is provided for compatibility with
        old code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values,
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided
        only for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit
    tests that don't depend on complex data relationships. It is ill-suited
    for tests where the structure of the data matters. If you need such a
    dataset, create it directly or use a saved test dataset.
    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile.
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler.from_config(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.dimensions[dimension].viewOf is None:
            butler.registry.insertDimensionData(dimension, *records)
    return butler


def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned
        by `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in the repository ``repo``.
        The collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating
    an isolated test area, and not for repositories intended for real data
    processing or analysis. A brief usage sketch follows this function.
    """
    if not uniqueId:
        # Create a "random" collection name.
        # Speed matters more than cryptographic guarantees.
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler.from_config(butler=repo, run=collection)

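
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): the
# intended split between makeTestRepo (expensive, typically once per test
# class) and makeTestCollection (cheap, once per test). The helper below is
# hypothetical example code and is never called by this module.
def _example_test_repo_setup(root: str) -> Butler:
    """Sketch of per-class repository creation plus a per-test collection."""
    creator_butler = makeTestRepo(root)  # build the repository once
    # Each test would then ask for its own isolated RUN collection.
    return makeTestCollection(creator_butler, uniqueId="example_test_0")
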

def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """
    # Create values for all dimensions that are (recursive) required or
    # implied dependencies of the given ones.
    complete_data_id_values = {}
    for dimension in universe.extract(dataIds.keys()):
        if dimension.name in dataIds:
            complete_data_id_values[dimension.name] = list(dataIds[dimension.name])
        if dimension.name not in complete_data_id_values:
            complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names.
    record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension_el = universe[name]
        for value in values:
            # _fillAllKeys wants Dimension and not DimensionElement.
            # universe.__getitem__ says it returns DimensionElement but this
            # really does also seem to be a Dimension here.
            record_dicts_by_dimension_name[name].append(
                _fillAllKeys(dimension_el, value)  # type: ignore[arg-type]
            )

    # Pick cross-relationships arbitrarily.
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension_el = universe[name]
        for record_dict in record_dicts:
            for other in dimension_el.dimensions:
                if other != dimension_el:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }


def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. ``dimension``'s primary key
        maps to ``value``, but all other mappings (e.g., detector name)
        are arbitrary.
    """
    expandedValue: dict[str, str | int | bytes] = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need (see the illustration following this
            # function). This is particularly important for the skymap
            # dimension's bytes 'hash' field, which has a unique constraint;
            # without this, all skymaps would get a hash of b'' and end up
            # conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue

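
# Illustration (assumption, not part of the original module) of the casting
# behaviour that the bytes-handling lambda in _fillAllKeys relies on. This
# helper is never called; it only spells out the edge cases described in the
# comment above.
def _example_bytes_casting() -> None:
    cast = lambda *args: str(*args).encode()  # noqa: E731
    assert cast(42) == b"42"  # int values become their decimal digits
    assert cast("abc") == b"abc"  # str values are simply encoded
    assert cast() == b""  # no-argument form used after a TypeError
    # By contrast, bytes("abc") raises TypeError (no encoding given) and
    # bytes(42) yields 42 zero bytes rather than b"42".
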

def _makeRandomDataIdValue(dimension: Dimension) -> int | str:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This method will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This
    function is only suitable for certain kinds of test repositories, and not
    for repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

        >>> butler = makeTestRepo(
        ...     "testdir", {"instrument": ["notACam"], "detector": [1]})
        >>> expandUniqueId(butler, {"detector": 1})
        DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O.
    registry = butler.registry
    dimensions = registry.dimensions.extract(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, str | int], dimension: Dimension
) -> tuple[str | int, bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        dim_value = matches[0].dataId[dimension.name]
        assert dim_value is not None
        return dim_value, False


def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord` that
    is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id).
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict


def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None:
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value
        The value to register for the dimension.
    **related
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may
    be specified using ``related``, which requires a value for those
    dimensions to have been added to the repository already (generally with a
    previous call to `addDataIdValue`). Any dependencies of the given
    dimension that are not included in ``related`` will be linked to existing
    values arbitrarily, and (for implied dependencies only) created and also
    inserted into the registry if they do not exist. Values for required
    dimensions and those given in ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real
    data processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage
    examples; an illustrative sketch also follows this function.
    """
    # Example is not doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        fullDimension = butler.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys ignored by registry code.
    extraKeys = related.keys() - fullDimension.graph.dimensions.names
    if extraKeys:
        raise ValueError(
            f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, int | str] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.dimensions.extract(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data
    # IDs. This iteration is guaranteed to be in topological order, so we can
    # count on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_obj in all_dimensions:
        dimension_value = data_id.get(dimension_obj.name)
        if dimension_value is None:
            data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_obj.name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_obj != fullDimension and dimension_obj in all_dimensions.required:
            # We also don't want to automatically create new dimension
            # records for required dimensions (except for the main dimension
            # the caller asked for); those are also asserted by the caller to
            # already exist.
            continue
        if dimension_obj.viewOf is not None:
            # Don't need to bother generating full records for dimensions
            # whose records are just a view into some other's records anyway.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e

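
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module) of
# addDataIdValue: register an instrument, then a detector and a visit linked
# to it. The dimension values are arbitrary placeholders and this helper is
# never called by this module.
def _example_add_data_id_values(butler: Butler) -> None:
    addDataIdValue(butler, "instrument", "notACam")
    # Explicitly link the detector to the instrument registered above.
    addDataIdValue(butler, "detector", 1, instrument="notACam")
    # Implied dependencies not given here (e.g., the visit's physical_filter)
    # are linked or invented arbitrarily, as described in the Notes above.
    addDataIdValue(butler, "visit", 101, instrument="notACam")
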

def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection. A brief usage
    sketch follows this function.
    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e

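
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module) of
# addDatasetType: register a dataset type once per repository, then store a
# dataset against it. The dataset type name, dimensions, storage class, and
# data ID are assumed placeholders; this helper is never called.
def _example_register_and_put(butler: Butler) -> DatasetRef:
    datasetType = addDatasetType(
        butler, "DataType1", {"instrument", "detector"}, "StructuredDataDict"
    )
    dataId = {"instrument": "notACam", "detector": 1}
    # butler.put stores the in-memory object in the butler's run collection.
    return butler.put({"key": "value"}, datasetType, dataId)
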

class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions. A brief usage sketch follows this class.
    """

    @staticmethod
    def apply(butler: Butler) -> None:
        """Apply datastore mocks to a butler."""
        butler._datastore.export = DatastoreMock._mock_export  # type: ignore
        butler._datastore.get = DatastoreMock._mock_get  # type: ignore
        butler._datastore.ingest = MagicMock()  # type: ignore

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None
    ) -> Iterable[FileDataset]:
        """Mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> tuple[DatasetId, Mapping[str, Any] | None]:
        """Mock of `Datastore.get` that just returns the integer dataset ID
        value and parameters it was given.
        """
        return (ref.id, parameters)
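
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module) of
# DatastoreMock: after DatastoreMock.apply(butler), dataset retrieval no
# longer touches real storage; per _mock_get above, the "retrieved" value is
# the dataset ID plus any parameters forwarded to the datastore. The dataset
# type and data ID are assumed placeholders; this helper is never called.
def _example_datastore_mock(butler: Butler) -> None:
    DatastoreMock.apply(butler)
    datasetId, parameters = butler.get("DataType1", {"instrument": "notACam", "detector": 1})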