Coverage for python/lsst/daf/butler/tests/_testRepo.py: 14%

145 statements  


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId

def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions
        only when needed. This parameter is provided for compatibility with
        old code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values,
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided
        only for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit
    tests that don't depend on complex data relationships. It is ill-suited
    for tests where the structure of the data matters. If you need such a
    dataset, create it directly or use a saved test dataset.
    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler.from_config(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.dimensions[dimension].has_own_table:
            butler.registry.insertDimensionData(dimension, *records)
    return butler
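
# Illustrative usage sketch for makeTestRepo above (not part of the original
# module; "testdir" and the dimension values are arbitrary placeholders):
#
#     repo = makeTestRepo("testdir", {"instrument": ["notACam"], "detector": [1, 2]})
#
# The returned Butler is intended for one-time setup such as registering
# dataset types; individual tests should each create their own collection
# with makeTestCollection (defined below).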

def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned
        by `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in the repository ``repo``.
        The collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating
    an isolated test area, and not for repositories intended for real data
    processing or analysis.
    """
    if not uniqueId:
        # Create a "random" collection name
        # Speed matters more than cryptographic guarantees
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler.from_config(butler=repo, run=collection)
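
# Illustrative usage sketch for makeTestCollection above (hypothetical test
# fixture; "repo" is assumed to be a Butler returned by makeTestRepo):
#
#     class MyTestCase(unittest.TestCase):
#         def setUp(self):
#             # Each test gets its own run collection; without a uniqueId
#             # argument a pseudo-random "test_<number>" name is generated.
#             self.butler = makeTestCollection(repo)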

def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """
    # Create values for all dimensions that are (recursive) required or
    # implied dependencies of the given ones.
    complete_data_id_values = {}
    for dimension_name in universe.conform(dataIds.keys()).names:
        if dimension_name in dataIds:
            complete_data_id_values[dimension_name] = list(dataIds[dimension_name])
        if dimension_name not in complete_data_id_values:
            complete_data_id_values[dimension_name] = [
                _makeRandomDataIdValue(universe.dimensions[dimension_name])
            ]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names
    record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension_el = universe[name]
        for value in values:
            # _fillAllKeys wants Dimension and not DimensionElement.
            # universe.__getitem__ says it returns DimensionElement but this
            # really does also seem to be a Dimension here.
            record_dicts_by_dimension_name[name].append(
                _fillAllKeys(dimension_el, value)  # type: ignore[arg-type]
            )

    # Pick cross-relationships arbitrarily
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension_el = universe[name]
        for record_dict in record_dicts:
            for other in dimension_el.dimensions:
                if other != dimension_el:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }
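
# Illustrative input/output sketch for _makeRecords above (values are
# arbitrary; assumes a dimension universe containing "instrument" and
# "detector"):
#
#     records = _makeRecords({"instrument": ["notACam"], "detector": [1, 2]},
#                            butler.dimensions)
#
# The result maps each dimension name to a list of DimensionRecord objects,
# one per input value, with cross-links (e.g. detector -> instrument) chosen
# arbitrarily from the generated values.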

def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. The primary key of
        ``dimension`` maps to ``value``, but all other mappings (e.g.,
        detector name) are arbitrary.
    """
    expandedValue: dict[str, str | int | bytes] = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for
            # the skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue

def _makeRandomDataIdValue(dimension: Dimension) -> int | str:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)

def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This method will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This
    function is only suitable for certain kinds of test repositories, and
    not for repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

        >>> butler = makeTestRepo(
                "testdir", {"instrument": ["notACam"], "detector": [1]})
        >>> expandUniqueId(butler, {"detector": 1})
        DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.conform(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g.
    # instrument) given something that depends on it (e.g. visit), hence
    # check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")

def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, str | int], dimension: Dimension
) -> tuple[str | int, bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if
    no such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible
        value already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        dim_value = matches[0].dataId[dimension.name]
        assert dim_value is not None
        return dim_value, False

def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord`
    that is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict

def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None:
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value : `str` or `int`
        The value to register for the dimension.
    **related : `typing.Any`
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may
    be specified using ``related``, which requires a value for those
    dimensions to have been added to the repository already (generally with
    a previous call to `addDataIdValue`). Any dependencies of the given
    dimension that are not included in ``related`` will be linked to
    existing values arbitrarily, and (for implied dependencies only) created
    and also inserted into the registry if they do not exist. Values for
    required dimensions and those given in ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real
    data processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage
    examples.
    """
    # Example is not doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        full_dimension = butler.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys ignored by registry code
    extra_keys = related.keys() - full_dimension.minimal_group.names
    if extra_keys:
        raise ValueError(
            f"Unexpected keywords {extra_keys} not found in {full_dimension.minimal_group.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension
    # value and all of the related ones.
    data_id: dict[str, int | str] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.dimensions.conform(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data
    # IDs. This iteration is guaranteed to be in topological order, so we
    # can count on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_name in all_dimensions.names:
        dimension_obj = butler.dimensions.dimensions[dimension_name]
        dimension_value = data_id.get(dimension_name)
        if dimension_value is None:
            data_id[dimension_name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record
            # created for. That means they expect this record to already
            # exist.
            continue
        if dimension_name != dimension and dimension_name in all_dimensions.required:
            # We also don't want to automatically create new dimension
            # records for required dimensions (except for the main dimension
            # the caller asked for); those are also asserted by the caller
            # to already exist.
            continue
        if not dimension_obj.has_own_table:
            # Don't need to bother generating full records for dimensions
            # whose records are not actually stored.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e
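
# Illustrative usage sketch for addDataIdValue above (the dimension values
# "notACam" and 42 are arbitrary placeholders):
#
#     addDataIdValue(butler, "instrument", "notACam")
#     addDataIdValue(butler, "detector", 42, instrument="notACam")
#
# The keyword in the second call links the new detector to the instrument
# added first; without it, the detector would be linked to an arbitrary
# existing instrument.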

def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e
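
# Illustrative usage sketch for addDatasetType above (the dataset type name
# "calexp" and storage class "ExposureF" are examples; any dimensions and
# storage class known to the repository work):
#
#     datasetType = addDatasetType(
#         butler, "calexp", {"instrument", "visit", "detector"}, "ExposureF"
#     )
#
# Dataset types are shared across collections, so this only needs to run
# once per repository rather than once per test collection.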

class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the
    mock functions.
    """

    @staticmethod
    def apply(butler: Butler) -> None:
        """Apply datastore mocks to a butler.

        Parameters
        ----------
        butler : `~lsst.daf.butler.Butler`
            Butler to be modified.
        """
        butler._datastore.export = DatastoreMock._mock_export  # type: ignore
        butler._datastore.get = DatastoreMock._mock_get  # type: ignore
        butler._datastore.ingest = MagicMock()  # type: ignore

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None
    ) -> Iterable[FileDataset]:
        """Mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> tuple[DatasetId, Mapping[str, Any] | None]:
        """Mock of `Datastore.get` that just returns the dataset ID value
        and parameters it was given.
        """
        return (ref.id, parameters)
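
# Illustrative usage sketch for DatastoreMock above: after applying the mock,
# butler.get() no longer reads from a real datastore; the mocked _mock_get
# returns the (dataset ID, parameters) pair, which a test can assert on.
# (The dataset type name "myDataset" and the data ID are placeholders, and
# the exact value returned by butler.get() depends on how Butler delegates
# to its datastore.)
#
#     DatastoreMock.apply(butler)
#     result = butler.get("myDataset", dataId, parameters={"subset": 1})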