Coverage for python/lsst/daf/butler/tests/_testRepo.py: 12% (145 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "DatastoreMock",
    "addDataIdValue",
    "addDatasetType",
    "expandUniqueId",
    "makeTestCollection",
    "makeTestRepo",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy

from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId

def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` \
            [`str`, `~collections.abc.Iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        ``"detector"``). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions only
        when needed. This parameter is provided for compatibility with old
        code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided only
        for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit tests
    that don't depend on complex data relationships. It is ill-suited for tests
    where the structure of the data matters. If you need such a dataset, create
    it directly or use a saved test dataset.
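
    Examples
    --------
    A minimal usage sketch, not a doctest because it writes to disk; the
    instrument and detector values are arbitrary placeholders:

    .. code-block:: py

        butler = makeTestRepo(
            "testdir", {"instrument": ["notACam"], "detector": [1]})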

    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler.from_config(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.dimensions[dimension].has_own_table:
            butler.registry.insertDimensionData(dimension, *records)
    return butler

def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned by
        `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in ``repo``. The collection is
        (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating an
    isolated test area, and not for repositories intended for real data
    processing or analysis.
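
    Examples
    --------
    A minimal sketch, not a doctest; ``repo`` is a butler such as the one
    returned by `makeTestRepo`, and the ID is only a placeholder:

    .. code-block:: py

        butler = makeTestCollection(repo, uniqueId="my_test_1")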

    """
    if not uniqueId:
        # Create a "random" collection name
        # Speed matters more than cryptographic guarantees
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler.from_config(butler=repo, run=collection)

def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `~collections.abc.Iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        ``"detector"``).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `~collections.abc.Iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """
    # Create values for all dimensions that are (recursive) required or implied
    # dependencies of the given ones.
    complete_data_id_values = {}
    for dimension_name in universe.conform(dataIds.keys()).names:
        if dimension_name in dataIds:
            complete_data_id_values[dimension_name] = list(dataIds[dimension_name])
        if dimension_name not in complete_data_id_values:
            complete_data_id_values[dimension_name] = [
                _makeRandomDataIdValue(universe.dimensions[dimension_name])
            ]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names
    record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension_el = universe[name]
        for value in values:
            # _fillAllKeys wants Dimension and not DimensionElement.
            # universe.__getitem__ says it returns DimensionElement but this
            # really does also seem to be a Dimension here.
            record_dicts_by_dimension_name[name].append(
                _fillAllKeys(dimension_el, value)  # type: ignore[arg-type]
            )

    # Pick cross-relationships arbitrarily
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension_el = universe[name]
        for record_dict in record_dicts:
            for other in dimension_el.dimensions:
                if other != dimension_el:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }

def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value : `str` or `int`
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`, `str` or `int` or `bytes`]
        A mapping of dimension keys to values. The primary key of
        ``dimension`` maps to ``value``, but all other mappings (e.g.,
        detector name) are arbitrary.
    """
    expandedValue: dict[str, str | int | bytes] = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for the
            # skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue

def _makeRandomDataIdValue(dimension: Dimension) -> int | str:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)

def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This method will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This function
    is only suitable for certain kinds of test repositories, and not for
    repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

        >>> butler = makeTestRepo(
        ...     "testdir", {"instrument": ["notACam"], "detector": [1]})
        >>> expandUniqueId(butler, {"detector": 1})
        DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.conform(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")

def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, str | int], dimension: Dimension
) -> tuple[str | int, bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        dim_value = matches[0].dataId[dimension.name]
        assert dim_value is not None
        return dim_value, False

def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord` that
    is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict

def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None:
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value : `str` or `int`
        The value to register for the dimension.
    **related : `str` or `int`
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may be
    specified using ``related``, which requires a value for those dimensions to
    have been added to the repository already (generally with a previous call
    to `addDataIdValue`). Any dependencies of the given dimension that are not
    included in ``related`` will be linked to existing values arbitrarily, and
    (for implied dependencies only) created and also inserted into the registry
    if they do not exist. Values for required dimensions and those given in
    ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real data
    processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage examples.
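
    A minimal sketch, not a doctest; the dimension values are arbitrary
    placeholders, and the instrument must be added before the detector that
    refers to it:

    .. code-block:: py

        addDataIdValue(butler, "instrument", "notACam")
        addDataIdValue(butler, "detector", 1, instrument="notACam")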

    """
    # Example is not a doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        full_dimension = butler.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys ignored by registry code
    extra_keys = related.keys() - full_dimension.minimal_group.names
    if extra_keys:
        raise ValueError(
            f"Unexpected keywords {extra_keys} not found in {full_dimension.minimal_group.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, int | str] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.dimensions.conform(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data IDs.
    # This iteration is guaranteed to be in topological order, so we can count
    # on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_name in all_dimensions.names:
        dimension_obj = butler.dimensions.dimensions[dimension_name]
        dimension_value = data_id.get(dimension_name)
        if dimension_value is None:
            data_id[dimension_name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_name != dimension and dimension_name in all_dimensions.required:
            # We also don't want to automatically create new dimension records
            # for required dimensions (except for the main dimension the caller
            # asked for); those are also asserted by the caller to already
            # exist.
            continue
        if not dimension_obj.has_own_table:
            # Don't need to bother generating full records for dimensions whose
            # records are not actually stored.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e

def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
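
    Examples
    --------
    A minimal sketch, not a doctest; the dataset type name, dimensions, and
    storage class are arbitrary placeholders:

    .. code-block:: py

        datasetType = addDatasetType(
            butler, "DataType1", {"instrument", "detector"}, "NumpyArray")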

    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e

class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions.
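
    Examples
    --------
    A minimal sketch, not a doctest; it assumes a butler from
    `makeTestCollection` and an existing `DatasetRef` named ``ref``:

    .. code-block:: py

        DatastoreMock.apply(butler)
        # get() now returns the dataset ID and parameters, not real data.
        dataset_id, params = butler._datastore.get(ref)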

    """

    @staticmethod
    def apply(butler: Butler) -> None:
        """Apply datastore mocks to a butler.

        Parameters
        ----------
        butler : `~lsst.daf.butler.Butler`
            Butler to be modified.
        """
        butler._datastore.export = DatastoreMock._mock_export  # type: ignore
        butler._datastore.get = DatastoreMock._mock_get  # type: ignore
        butler._datastore.ingest = MagicMock()  # type: ignore

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None
    ) -> Iterable[FileDataset]:
        """Mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> tuple[DatasetId, Mapping[str, Any] | None]:
        """Mock of `Datastore.get` that just returns the dataset ID
        value and parameters it was given.
        """
        return (ref.id, parameters)