Coverage for python/lsst/obs/base/ingest_tests.py: 32%

215 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-03-23 10:53 +0000

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Base class for writing Gen3 raw data ingest tests. 

23""" 

24 

25__all__ = ("IngestTestBase",) 

26 

27import abc 

28import os 

29import shutil 

30import tempfile 

31import unittest 

32 

33import lsst.afw.cameraGeom 

34import lsst.afw.cameraGeom.testUtils # For assertDetectorsEqual 

35import lsst.obs.base 

36from lsst.daf.butler import Butler, Registry 

37from lsst.daf.butler.cli.butler import cli as butlerCli 

38from lsst.daf.butler.cli.utils import LogCliRunner 

39from lsst.pipe.base import Instrument 

40from lsst.resources import ResourcePath 

41from lsst.utils import doImportType 

42 

43from . import script 

44 

45 

class IngestTestBase(metaclass=abc.ABCMeta):
    """Base class for tests of gen3 ingest. Subclass from this, then
    `unittest.TestCase` to get a working test suite.

    A temporary butler repository is created once per test class (see
    `setUpClass`) and each test ingests into its own output run (see
    `setUp`).
    """

    ingestDir = ""
    """Root path to ingest files into. Typically `obs_package/tests/`; the
    actual directory will be a tempdir under this one.
    """

    ingestDatasetTypeName = "raw"
    """The DatasetType to use for the ingest.

    If this is not an Exposure dataset type the tests will be more limited.
    """

    dataIds = []
    """list of butler data IDs of files that should have been ingested."""

    file = ""
    """Full path to a file to ingest in tests."""

    filterLabel = None
    """The lsst.afw.image.FilterLabel that should be returned by the above
    file."""

    rawIngestTask = "lsst.obs.base.RawIngestTask"
    """The task to use in the Ingest test."""

    curatedCalibrationDatasetTypes = None
    """List or tuple of Datasets types that should be present after calling
    writeCuratedCalibrations. If `None` writeCuratedCalibrations will
    not be called and the test will be skipped."""

    defineVisitsTask = lsst.obs.base.DefineVisitsTask
    """The task to use to define visits from groups of exposures.
    This is ignored if ``visits`` is `None`.
    """

    visits = {}
    """A dictionary mapping visit data IDs to the lists of exposure data IDs
    that are associated with them.
    If this is empty (but not `None`), visit definition will be run but no
    visits will be expected (e.g. because no exposures are on-sky
    observations).
    """

    seed_config = None
    """Location of a seed configuration file to pass to butler create.

    Useful if additional formatters or storage classes need to be defined.
    """

    @property
    @abc.abstractmethod
    def instrumentClassName(self):
        """The fully qualified instrument class name.

        Returns
        -------
        `str`
            The fully qualified instrument class name.

        Notes
        -----
        `_registerInstrument` is a classmethod and reads this name through
        the class rather than an instance, so a subclass override needs to
        be usable that way (e.g. a class-level string attribute).
        """
        pass

    @property
    def instrumentClass(self):
        """The instrument class, imported from `instrumentClassName`."""
        return doImportType(self.instrumentClassName)

    @property
    def instrumentName(self):
        """The name of the instrument.

        Returns
        -------
        `str`
            The name of the instrument.
        """
        return self.instrumentClass.getName()

    @classmethod
    def setUpClass(cls):
        """Create the temporary repository shared by all tests in the class.

        Sets ``cls.root`` (the repository directory) and
        ``cls.datastore_root`` (the root of its single datastore).
        """
        # Use a temporary working directory.
        cls.root = tempfile.mkdtemp(dir=cls.ingestDir)
        try:
            cls._createRepo()

            # Register the instrument and its static metadata.
            cls._registerInstrument()

            # Determine the relevant datastore root to use for testing.
            butler = Butler(cls.root)
            roots = butler.get_datastore_roots()
            assert len(roots) == 1  # Only one datastore.
            cls.datastore_root = next(iter(roots.values()))
        except BaseException:
            # unittest does not call tearDownClass when setUpClass raises,
            # so remove the temporary directory here to avoid leaking it.
            shutil.rmtree(cls.root, ignore_errors=True)
            raise

    def setUp(self):
        """Choose the output run collection for this test."""
        # Want a unique run name per test.
        self.outputRun = "raw_ingest_" + self.id()

    @classmethod
    def tearDownClass(cls):
        """Remove the temporary repository created by `setUpClass`."""
        if os.path.exists(cls.root):
            shutil.rmtree(cls.root, ignore_errors=True)

    def verifyIngest(self, files=None, cli=False, fullCheck=False):
        """
        Test that RawIngestTask ingested the expected files.

        Parameters
        ----------
        files : `list` [`str`], or None
            List of files to be ingested, or None to use ``self.file``.
            Only forwarded to `checkRepo`.
        cli : `bool`, optional
            Not used by this implementation; retained so that callers
            passing it continue to work.
        fullCheck : `bool`, optional
            If `True`, read the full raw dataset and check component
            consistency. If `False` check that a component can be read
            but do not read the entire raw exposure.

        Notes
        -----
        Reading all the ingested test data can be expensive. The code paths
        for reading the second raw are the same as reading the first so
        we do not gain anything by doing full checks of everything.
        Only read full pixel data for first dataset from file.
        Don't even do that if we are requested not to by the caller.
        This only really affects files that contain multiple datasets.
        """
        butler = Butler(self.root, run=self.outputRun)
        datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun))
        self.assertEqual(len(datasets), len(self.dataIds))

        # Can check that the timespan in the day_obs matches the exposure
        # record.
        if "day_obs" in butler.dimensions:
            days = {
                (rec.instrument, rec.id): rec.timespan
                for rec in butler.registry.queryDimensionRecords("day_obs")
            }

            for exp in butler.registry.queryDimensionRecords("exposure"):
                day_span = days[exp.instrument, exp.day_obs]
                if day_span is not None:
                    self.assertTrue(
                        day_span.contains(exp.timespan.begin), f"Timespan mismatch of {exp} and {day_span}"
                    )

        # Get the URI to the first dataset and check it is inside the
        # datastore.
        datasetUri = butler.getURI(datasets[0])
        self.assertIsNotNone(datasetUri.relative_to(self.datastore_root))

        # Get the relevant dataset type. Whether it is an Exposure does not
        # change between data IDs, so decide once.
        datasetType = butler.get_dataset_type(self.ingestDatasetTypeName)
        isExposure = datasetType.storageClass.name.startswith("Exposure")

        for dataId in self.dataIds:
            # For testing we only read the entire dataset the first time
            # round if this is an Exposure. If it's not an Exposure
            # we always read it completely but we don't read components
            # because for an arbitrary dataset type we can't easily tell
            # what component to test.

            if not isExposure:
                # Could be anything so nothing to test by default beyond
                # the read itself succeeding.
                butler.get(self.ingestDatasetTypeName, dataId)
                continue

            # Check that we can read metadata from a raw.
            metadata = butler.get(f"{self.ingestDatasetTypeName}.metadata", dataId)
            if not fullCheck:
                continue
            # Only the first data ID gets the full (expensive) check.
            fullCheck = False
            exposure = butler.get(self.ingestDatasetTypeName, dataId)

            # Comparing headers will not work directly because of header
            # fix up provenance.
            metadata_headers = metadata.toDict()
            exposure_headers = exposure.getMetadata().toDict()
            metadata_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None)
            exposure_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None)
            self.assertEqual(metadata_headers, exposure_headers)

            # Since components follow a different code path we check that
            # WCS match and also we check that at least the shape
            # of the image is the same (rather than doing per-pixel equality)
            wcs = butler.get(f"{self.ingestDatasetTypeName}.wcs", dataId)
            self.assertEqual(wcs, exposure.getWcs())

            rawImage = butler.get(f"{self.ingestDatasetTypeName}.image", dataId)
            self.assertEqual(rawImage.getBBox(), exposure.getBBox())

            # Check that the filter label got the correct band.
            filterLabel = butler.get(f"{self.ingestDatasetTypeName}.filter", dataId)
            self.assertEqual(filterLabel, self.filterLabel)

            # Check that the exposure's Detector is the same as the component
            # we would read (this is tricky for LSST, which modifies its
            # detector at read time; for most other cameras it should be
            # trivially satisfied).
            detector = butler.get(f"{self.ingestDatasetTypeName}.detector", dataId)
            self.assertDetectorsEqual(detector, exposure.getDetector(), compareTransforms=False)

        self.checkRepo(files=files)

    def checkRepo(self, files=None):
        """Check the state of the repository after ingest.

        This is an optional hook provided for subclasses; by default it does
        nothing.

        Parameters
        ----------
        files : `list` [`str`], or None
            List of files to be ingested, or None to use ``self.file``
        """
        return

    @classmethod
    def _createRepo(cls):
        """Use the Click `testing` module to call the butler command line api
        to create a repository.

        ``cls.seed_config`` is passed as ``--seed-config`` when it is set.
        """
        runner = LogCliRunner()
        args = []
        if cls.seed_config:
            args.extend(["--seed-config", cls.seed_config])
        result = runner.invoke(butlerCli, ["create", cls.root, *args])
        # Classmethod so assertEqual does not work.
        assert result.exit_code == 0, f"output: {result.output} exception: {result.exception}"

    def _ingestRaws(self, transfer, file=None):
        """Use the Click `testing` module to call the butler command line api
        to ingest raws.

        Parameters
        ----------
        transfer : `str`
            The external data transfer type.
        file : `str`
            Path to a file to ingest instead of the default associated with
            the object.
        """
        if file is None:
            file = self.file
        runner = LogCliRunner()
        result = runner.invoke(
            butlerCli,
            [
                "ingest-raws",
                self.root,
                file,
                "--output-run",
                self.outputRun,
                "--transfer",
                transfer,
                "--ingest-task",
                self.rawIngestTask,
            ],
        )
        self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

    @classmethod
    def _registerInstrument(cls):
        """Use the Click `testing` module to call the butler command line api
        to register the instrument.
        """
        runner = LogCliRunner()
        result = runner.invoke(butlerCli, ["register-instrument", cls.root, cls.instrumentClassName])
        # Classmethod so assertEqual does not work.
        assert result.exit_code == 0, f"output: {result.output} exception: {result.exception}"

    def _writeCuratedCalibrations(self):
        """Use the Click `testing` module to call the butler command line api
        to write curated calibrations.
        """
        runner = LogCliRunner()
        result = runner.invoke(butlerCli, ["write-curated-calibrations", self.root, self.instrumentName])
        self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

    def testLink(self):
        """Ingest with ``link`` transfer and verify the result."""
        self._ingestRaws(transfer="link")
        self.verifyIngest()

    def testSymLink(self):
        """Ingest with ``symlink`` transfer and verify the result."""
        self._ingestRaws(transfer="symlink")
        self.verifyIngest()

    def testDirect(self):
        """Ingest with ``direct`` transfer and check the dataset URI is the
        source file itself.
        """
        self._ingestRaws(transfer="direct")

        # Check that it really did have a URI outside of datastore.
        srcUri = ResourcePath(self.file, forceAbsolute=True)
        butler = Butler(self.root, run=self.outputRun)
        datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun))
        datastoreUri = butler.getURI(datasets[0])
        self.assertEqual(datastoreUri, srcUri)

    def testCopy(self):
        """Ingest with ``copy`` transfer and run the full read check."""
        self._ingestRaws(transfer="copy")
        # Only test full read of raws for the copy test. No need to do it
        # in the other tests since the formatter will be the same in all
        # cases.
        self.verifyIngest(fullCheck=True)

    def testHardLink(self):
        """Ingest with ``hardlink`` transfer, skipping if ingest fails."""
        try:
            self._ingestRaws(transfer="hardlink")
            # Running ingest through the Click testing infrastructure causes
            # the original exception indicating that we can't hard-link
            # on this filesystem to be turned into a nonzero exit code, which
            # then trips the test assertion.
        except (AssertionError, PermissionError) as err:
            raise unittest.SkipTest(
                "Skipping hard-link test because input data is on a different filesystem."
            ) from err
        self.verifyIngest()

    def testInPlace(self):
        """Test that files already in the directory can be added to the
        registry in-place.
        """
        butler = Butler(self.root, run=self.outputRun)

        # If the test uses an index file the index file needs to also
        # appear in the datastore root along with the file to be ingested.
        # In that scenario the file name being used for ingest can not
        # be modified and must have the same name as found in the index
        # file itself.
        source_file_uri = ResourcePath(self.file)
        index_file = source_file_uri.dirname().join("_index.json")
        pathInStore = source_file_uri.basename()
        if index_file.exists():
            os.symlink(index_file.ospath, self.datastore_root.join("_index.json").ospath)
        else:
            # No index file so we are free to pick any name.
            pathInStore = "prefix-" + pathInStore

        # Create a symlink to the original file so that it looks like it
        # is now inside the datastore.
        newPath = self.datastore_root.join(pathInStore)
        os.symlink(os.path.abspath(self.file), newPath.ospath)

        # If there is a sidecar file it needs to be linked in as well
        # since ingest code does not follow symlinks.
        sidecar_uri = ResourcePath(source_file_uri).updatedExtension(".json")
        if sidecar_uri.exists():
            newSidecar = ResourcePath(newPath).updatedExtension(".json")
            os.symlink(sidecar_uri.ospath, newSidecar.ospath)

        # Run ingest with auto mode since that should automatically determine
        # that an in-place ingest is happening.
        self._ingestRaws(transfer="auto", file=newPath.ospath)
        self.verifyIngest()

        # Recreate a butler post-ingest (the earlier one won't see the
        # ingested files).
        butler = Butler(self.root, run=self.outputRun)

        # Check that the URI associated with this path is the right one.
        uri = butler.getURI(self.ingestDatasetTypeName, self.dataIds[0])
        self.assertEqual(uri.relative_to(self.datastore_root), pathInStore)

    def testFailOnConflict(self):
        """Re-ingesting the same data into the repository should fail."""
        self._ingestRaws(transfer="symlink")
        with self.assertRaises(AssertionError):
            self._ingestRaws(transfer="symlink")

    def testWriteCuratedCalibrations(self):
        """Test that we can ingest the curated calibrations, and read them
        with `loadCamera` both before and after.
        """
        if self.curatedCalibrationDatasetTypes is None:
            raise unittest.SkipTest("Class requests disabling of writeCuratedCalibrations test")

        butler = Butler(self.root, writeable=False)
        collection = self.instrumentClass().makeCalibrationCollectionName()

        # Trying to load a camera with a data ID not known to the registry
        # is an error, because we can't get any temporal information.
        with self.assertRaises(LookupError):
            lsst.obs.base.loadCamera(butler, {"exposure": 0}, collections=collection)

        # Ingest raws in order to get some exposure records.
        self._ingestRaws(transfer="auto")

        # Load camera should return an unversioned camera because there's
        # nothing in the repo.
        camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection)
        self.assertFalse(isVersioned)
        self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

        self._writeCuratedCalibrations()

        # Make a new butler instance to make sure we don't have any stale
        # caches (e.g. of DatasetTypes). Note that we didn't give
        # _writeCuratedCalibrations the butler instance we had, because it's
        # trying to test the CLI interface anyway.
        butler = Butler(self.root, writeable=False)

        instrumentClass = self.instrumentClass()
        calibration_names = instrumentClass.getCuratedCalibrationNames()

        for datasetTypeName in self.curatedCalibrationDatasetTypes:
            with self.subTest(dtype=datasetTypeName):
                found = list(
                    butler.registry.queryDatasetAssociations(
                        datasetTypeName,
                        collections=collection,
                    )
                )
                self.assertGreater(len(found), 0, f"Checking {datasetTypeName}")
                self.assertIn(datasetTypeName, calibration_names)

        # Load camera should return the versioned camera from the repo.
        camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection)
        self.assertTrue(isVersioned)
        self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

    def testDefineVisits(self):
        """Ingest raws, define visits and compare them with ``self.visits``.

        Skipped when ``self.visits`` is `None`.
        """
        if self.visits is None:
            self.skipTest("Expected visits were not defined.")
        self._ingestRaws(transfer="link")

        # Check that obscore table (if configured) has correct contents.
        butler = Butler(self.root, run=self.outputRun)
        self._check_obscore(butler.registry, has_visits=False)

        # Calling defineVisits tests the implementation of the butler command
        # line interface "define-visits" subcommand. Functions in the script
        # folder are generally considered protected and should not be used
        # as public api.
        script.defineVisits(
            self.root,
            config_file=None,
            collections=self.outputRun,
            instrument=self.instrumentName,
            raw_name=self.ingestDatasetTypeName,
        )

        # Test that we got the visits we expected.
        visits = butler.registry.queryDataIds(["visit"]).expanded().toSet()
        self.assertCountEqual(visits, self.visits.keys())
        instr = Instrument.from_string(self.instrumentName, butler.registry)
        camera = instr.getCamera()
        for expectedVisit, expectedExposures in self.visits.items():
            # ``visits`` is a set, so its iteration order need not follow
            # ``self.visits``; pair each expected visit with the found visit
            # that compares equal to it instead of relying on position.
            foundVisit = next((visit for visit in visits if visit == expectedVisit), None)
            self.assertIsNotNone(foundVisit, f"No visit found matching {expectedVisit}")
            # Test that this visit is associated with the expected exposures.
            foundExposures = (
                butler.registry.queryDataIds(["exposure"], dataId=expectedVisit).expanded().toSet()
            )
            self.assertCountEqual(foundExposures, expectedExposures)
            # Test that we have a visit region, and that it contains all of the
            # detector+visit regions.
            self.assertIsNotNone(foundVisit.region)
            detectorVisitDataIds = (
                butler.registry.queryDataIds(["visit", "detector"], dataId=expectedVisit).expanded().toSet()
            )
            self.assertEqual(len(detectorVisitDataIds), len(camera))
            for dataId in detectorVisitDataIds:
                self.assertTrue(foundVisit.region.contains(dataId.region))

        # Check obscore table again.
        self._check_obscore(butler.registry, has_visits=True)

    def _check_obscore(self, registry: Registry, has_visits: bool) -> None:
        """Verify contents of obscore table.

        This default implementation does nothing. `testDefineVisits` calls
        it once before visits are defined (``has_visits=False``) and once
        after (``has_visits=True``).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry of the test repository.
        has_visits : `bool`
            Whether visits have been defined at the time of the call.
        """
        return

512 return