Coverage for python/lsst/obs/base/ingest_tests.py: 33%

208 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-27 11:09 +0000

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Base class for writing Gen3 raw data ingest tests. 

23""" 

24 

25__all__ = ("IngestTestBase",) 

26 

27import abc 

28import os 

29import shutil 

30import tempfile 

31import unittest 

32 

33import lsst.afw.cameraGeom 

34import lsst.afw.cameraGeom.testUtils # For assertDetectorsEqual 

35import lsst.obs.base 

36from lsst.daf.butler import Butler, Registry 

37from lsst.daf.butler.cli.butler import cli as butlerCli 

38from lsst.daf.butler.cli.utils import LogCliRunner 

39from lsst.pipe.base import Instrument 

40from lsst.resources import ResourcePath 

41from lsst.utils import doImportType 

42 

43from . import script 

44 

45 

class IngestTestBase(metaclass=abc.ABCMeta):
    """Base class for tests of gen3 ingest. Subclass from this, then
    `unittest.TestCase` to get a working test suite.
    """

    ingestDir = ""
    """Root path to ingest files into. Typically `obs_package/tests/`; the
    actual directory will be a tempdir under this one.
    """

    ingestDatasetTypeName = "raw"
    """The DatasetType to use for the ingest.

    If this is not an Exposure dataset type the tests will be more limited.
    """

    dataIds = []
    """List of butler data IDs of files that should have been ingested."""

    file = ""
    """Full path to a file to ingest in tests."""

    filterLabel = None
    """The lsst.afw.image.FilterLabel that should be returned by the above
    file."""

    rawIngestTask = "lsst.obs.base.RawIngestTask"
    """The task to use in the Ingest test."""

    curatedCalibrationDatasetTypes = None
    """List or tuple of dataset types that should be present after calling
    writeCuratedCalibrations. If `None` writeCuratedCalibrations will
    not be called and the test will be skipped."""

    defineVisitsTask = lsst.obs.base.DefineVisitsTask
    """The task to use to define visits from groups of exposures.
    This is ignored if ``visits`` is `None`.
    """

    visits = {}
    """A dictionary mapping visit data IDs to the lists of exposure data IDs
    that are associated with them.
    If this is empty (but not `None`), visit definition will be run but no
    visits will be expected (e.g. because no exposures are on-sky
    observations).
    """

    seed_config = None
    """Location of a seed configuration file to pass to butler create.

    Useful if additional formatters or storage classes need to be defined.
    """

    @property
    @abc.abstractmethod
    def instrumentClassName(self):
        """The fully qualified instrument class name.

        Returns
        -------
        `str`
            The fully qualified instrument class name.
        """
        pass

    @property
    def instrumentClass(self):
        """The instrument class, imported from ``instrumentClassName``."""
        return doImportType(self.instrumentClassName)

    @property
    def instrumentName(self):
        """The name of the instrument.

        Returns
        -------
        `str`
            The name of the instrument.
        """
        return self.instrumentClass.getName()

    @classmethod
    def setUpClass(cls):
        """Create a temporary repository shared by all tests in the class,
        register the instrument in it and record its datastore root.
        """
        # Use a temporary working directory.
        cls.root = tempfile.mkdtemp(dir=cls.ingestDir)
        try:
            cls._createRepo()

            # Register the instrument and its static metadata.
            cls._registerInstrument()

            # Determine the relevant datastore root to use for testing.
            butler = Butler(cls.root)
            roots = butler.get_datastore_roots()
            # Classmethod so assertEqual does not work; raise explicitly
            # rather than using ``assert`` so the check survives ``-O``.
            if len(roots) != 1:  # Only one datastore.
                raise AssertionError(f"Expected exactly one datastore root, got {len(roots)}")
            cls.datastore_root = list(roots.values())[0]
        except BaseException:
            # tearDownClass is not run when setUpClass fails, so remove the
            # temporary directory here to avoid leaving it behind.
            shutil.rmtree(cls.root, ignore_errors=True)
            raise

    def setUp(self):
        """Choose the output run collection name for this test."""
        # Want a unique run name per test.
        self.outputRun = "raw_ingest_" + self.id()

    @classmethod
    def tearDownClass(cls):
        """Remove the temporary repository created by `setUpClass`."""
        if os.path.exists(cls.root):
            shutil.rmtree(cls.root, ignore_errors=True)

    def verifyIngest(self, files=None, cli=False, fullCheck=False):
        """
        Test that RawIngestTask ingested the expected files.

        Parameters
        ----------
        files : `list` [`str`], or None
            List of files to be ingested, or None to use ``self.file``.
            Only forwarded to `checkRepo`.
        cli : `bool`, optional
            Not used by this implementation.
        fullCheck : `bool`, optional
            If `True`, read the full raw dataset and check component
            consistency. If `False` check that a component can be read
            but do not read the entire raw exposure.

        Notes
        -----
        Reading all the ingested test data can be expensive. The code paths
        for reading the second raw are the same as reading the first so
        we do not gain anything by doing full checks of everything.
        Only read full pixel data for first dataset from file.
        Don't even do that if we are requested not to by the caller.
        This only really affects files that contain multiple datasets.
        """
        butler = Butler(self.root, run=self.outputRun)
        datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun))
        self.assertEqual(len(datasets), len(self.dataIds))

        # Get the URI to the first dataset and check it is inside the
        # datastore.
        datasetUri = butler.getURI(datasets[0])
        self.assertIsNotNone(datasetUri.relative_to(self.datastore_root))

        # Get the relevant dataset type. Whether it is an Exposure does not
        # change between data IDs, so work it out once.
        datasetType = butler.registry.getDatasetType(self.ingestDatasetTypeName)
        isExposure = datasetType.storageClass.name.startswith("Exposure")

        for dataId in self.dataIds:
            # For testing we only read the entire dataset the first time
            # round if this is an Exposure. If it's not an Exposure
            # we always read it completely but we don't read components
            # because for an arbitrary dataset type we can't easily tell
            # what component to test.

            if not isExposure:
                # Could be anything so nothing to test by default beyond
                # the read itself succeeding.
                butler.get(self.ingestDatasetTypeName, dataId)
                continue

            # Check that we can read metadata from a raw.
            metadata = butler.get(f"{self.ingestDatasetTypeName}.metadata", dataId)
            if not fullCheck:
                continue
            # Only do the full read once, for the first data ID.
            fullCheck = False
            exposure = butler.get(self.ingestDatasetTypeName, dataId)

            # Comparing headers will not work directly because of header
            # fix up provenance.
            metadata_headers = metadata.toDict()
            exposure_headers = exposure.getMetadata().toDict()
            metadata_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None)
            exposure_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None)
            self.assertEqual(metadata_headers, exposure_headers)

            # Since components follow a different code path we check that
            # WCS match and also we check that at least the shape
            # of the image is the same (rather than doing per-pixel equality)
            wcs = butler.get(f"{self.ingestDatasetTypeName}.wcs", dataId)
            self.assertEqual(wcs, exposure.getWcs())

            rawImage = butler.get(f"{self.ingestDatasetTypeName}.image", dataId)
            self.assertEqual(rawImage.getBBox(), exposure.getBBox())

            # Check that the filter label got the correct band.
            filterLabel = butler.get(f"{self.ingestDatasetTypeName}.filter", dataId)
            self.assertEqual(filterLabel, self.filterLabel)

            # Check that the exposure's Detector is the same as the component
            # we would read (this is tricky for LSST, which modifies its
            # detector at read time; for most other cameras it should be
            # trivially satisfied).
            detector = butler.get(f"{self.ingestDatasetTypeName}.detector", dataId)
            self.assertDetectorsEqual(detector, exposure.getDetector(), compareTransforms=False)

        self.checkRepo(files=files)

    def checkRepo(self, files=None):
        """Check the state of the repository after ingest.

        This is an optional hook provided for subclasses; by default it does
        nothing.

        Parameters
        ----------
        files : `list` [`str`], or None
            List of files to be ingested, or None to use ``self.file``
        """
        return

    @classmethod
    def _createRepo(cls):
        """Use the Click `testing` module to call the butler command line api
        to create a repository.

        Raises
        ------
        AssertionError
            Raised if the command exits with a non-zero exit code.
        """
        runner = LogCliRunner()
        args = []
        if cls.seed_config:
            args.extend(["--seed-config", cls.seed_config])
        result = runner.invoke(butlerCli, ["create", cls.root, *args])
        # Classmethod so assertEqual does not work; raise explicitly rather
        # than using ``assert`` so the check survives ``-O``.
        if result.exit_code != 0:
            raise AssertionError(f"output: {result.output} exception: {result.exception}")

    def _ingestRaws(self, transfer, file=None):
        """Use the Click `testing` module to call the butler command line api
        to ingest raws.

        Parameters
        ----------
        transfer : `str`
            The external data transfer type.
        file : `str`
            Path to a file to ingest instead of the default associated with
            the object.
        """
        if file is None:
            file = self.file
        runner = LogCliRunner()
        result = runner.invoke(
            butlerCli,
            [
                "ingest-raws",
                self.root,
                file,
                "--output-run",
                self.outputRun,
                "--transfer",
                transfer,
                "--ingest-task",
                self.rawIngestTask,
            ],
        )
        self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

    @classmethod
    def _registerInstrument(cls):
        """Use the Click `testing` module to call the butler command line api
        to register the instrument.

        Raises
        ------
        AssertionError
            Raised if the command exits with a non-zero exit code.
        """
        runner = LogCliRunner()
        result = runner.invoke(butlerCli, ["register-instrument", cls.root, cls.instrumentClassName])
        # Classmethod so assertEqual does not work; raise explicitly rather
        # than using ``assert`` so the check survives ``-O``.
        if result.exit_code != 0:
            raise AssertionError(f"output: {result.output} exception: {result.exception}")

    def _writeCuratedCalibrations(self):
        """Use the Click `testing` module to call the butler command line api
        to write curated calibrations.
        """
        runner = LogCliRunner()
        result = runner.invoke(butlerCli, ["write-curated-calibrations", self.root, self.instrumentName])
        self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

    def testLink(self):
        # Ingest using the "link" transfer mode and verify the result.
        self._ingestRaws(transfer="link")
        self.verifyIngest()

    def testSymLink(self):
        # Ingest using the "symlink" transfer mode and verify the result.
        self._ingestRaws(transfer="symlink")
        self.verifyIngest()

    def testDirect(self):
        # Ingest using the "direct" transfer mode.
        self._ingestRaws(transfer="direct")

        # Check that it really did have a URI outside of datastore.
        srcUri = ResourcePath(self.file, forceAbsolute=True)
        butler = Butler(self.root, run=self.outputRun)
        datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun))
        datastoreUri = butler.getURI(datasets[0])
        self.assertEqual(datastoreUri, srcUri)

    def testCopy(self):
        self._ingestRaws(transfer="copy")
        # Only test full read of raws for the copy test. No need to do it
        # in the other tests since the formatter will be the same in all
        # cases.
        self.verifyIngest(fullCheck=True)

    def testHardLink(self):
        try:
            self._ingestRaws(transfer="hardlink")
            # Running ingest through the Click testing infrastructure causes
            # the original exception indicating that we can't hard-link
            # on this filesystem to be turned into a nonzero exit code, which
            # then trips the test assertion.
        except (AssertionError, PermissionError) as err:
            raise unittest.SkipTest(
                "Skipping hard-link test because input data is on a different filesystem."
            ) from err
        self.verifyIngest()

    def testInPlace(self):
        """Test that files already in the directory can be added to the
        registry in-place.
        """
        # NOTE(review): this instance is not used before it is replaced
        # further down; it is kept in case constructing a butler with a run
        # has side effects on the repository -- confirm before removing.
        butler = Butler(self.root, run=self.outputRun)

        # If the test uses an index file the index file needs to also
        # appear in the datastore root along with the file to be ingested.
        # In that scenario the file name being used for ingest can not
        # be modified and must have the same name as found in the index
        # file itself.
        source_file_uri = ResourcePath(self.file)
        index_file = source_file_uri.dirname().join("_index.json")
        pathInStore = source_file_uri.basename()
        if index_file.exists():
            os.symlink(index_file.ospath, self.datastore_root.join("_index.json").ospath)
        else:
            # No index file so we are free to pick any name.
            pathInStore = "prefix-" + pathInStore

        # Create a symlink to the original file so that it looks like it
        # is now inside the datastore.
        newPath = self.datastore_root.join(pathInStore)
        os.symlink(os.path.abspath(self.file), newPath.ospath)

        # If there is a sidecar file it needs to be linked in as well
        # since ingest code does not follow symlinks.
        sidecar_uri = ResourcePath(source_file_uri).updatedExtension(".json")
        if sidecar_uri.exists():
            newSidecar = ResourcePath(newPath).updatedExtension(".json")
            os.symlink(sidecar_uri.ospath, newSidecar.ospath)

        # Run ingest with auto mode since that should automatically determine
        # that an in-place ingest is happening.
        self._ingestRaws(transfer="auto", file=newPath.ospath)
        self.verifyIngest()

        # Recreate a butler post-ingest (the earlier one won't see the
        # ingested files).
        butler = Butler(self.root, run=self.outputRun)

        # Check that the URI associated with this path is the right one.
        uri = butler.getURI(self.ingestDatasetTypeName, self.dataIds[0])
        self.assertEqual(uri.relative_to(self.datastore_root), pathInStore)

    def testFailOnConflict(self):
        """Re-ingesting the same data into the repository should fail."""
        self._ingestRaws(transfer="symlink")
        with self.assertRaises(AssertionError):
            self._ingestRaws(transfer="symlink")

    def testWriteCuratedCalibrations(self):
        """Test that we can ingest the curated calibrations, and read them
        with `loadCamera` both before and after.
        """
        if self.curatedCalibrationDatasetTypes is None:
            raise unittest.SkipTest("Class requests disabling of writeCuratedCalibrations test")

        butler = Butler(self.root, writeable=False)
        collection = self.instrumentClass().makeCalibrationCollectionName()

        # Trying to load a camera with a data ID not known to the registry
        # is an error, because we can't get any temporal information.
        with self.assertRaises(LookupError):
            lsst.obs.base.loadCamera(butler, {"exposure": 0}, collections=collection)

        # Ingest raws in order to get some exposure records.
        self._ingestRaws(transfer="auto")

        # Load camera should return an unversioned camera because there's
        # nothing in the repo.
        camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection)
        self.assertFalse(isVersioned)
        self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

        self._writeCuratedCalibrations()

        # Make a new butler instance to make sure we don't have any stale
        # caches (e.g. of DatasetTypes). Note that we didn't give
        # _writeCuratedCalibrations the butler instance we had, because it's
        # trying to test the CLI interface anyway.
        butler = Butler(self.root, writeable=False)

        instrumentClass = self.instrumentClass()
        calibration_names = instrumentClass.getCuratedCalibrationNames()

        for datasetTypeName in self.curatedCalibrationDatasetTypes:
            with self.subTest(dtype=datasetTypeName):
                found = list(
                    butler.registry.queryDatasetAssociations(
                        datasetTypeName,
                        collections=collection,
                    )
                )
                self.assertGreater(len(found), 0, f"Checking {datasetTypeName}")
                self.assertIn(datasetTypeName, calibration_names)

        # Load camera should return the versioned camera from the repo.
        camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection)
        self.assertTrue(isVersioned)
        self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

    def testDefineVisits(self):
        # Ingest raws, define visits and compare them with ``self.visits``.
        if self.visits is None:
            self.skipTest("Expected visits were not defined.")
        self._ingestRaws(transfer="link")

        # Check that obscore table (if configured) has correct contents.
        butler = Butler(self.root, run=self.outputRun)
        self._check_obscore(butler.registry, has_visits=False)

        # Calling defineVisits tests the implementation of the butler command
        # line interface "define-visits" subcommand. Functions in the script
        # folder are generally considered protected and should not be used
        # as public api.
        script.defineVisits(
            self.root,
            config_file=None,
            collections=self.outputRun,
            instrument=self.instrumentName,
            raw_name=self.ingestDatasetTypeName,
        )

        # Test that we got the visits we expected.
        visits = butler.registry.queryDataIds(["visit"]).expanded().toSet()
        self.assertCountEqual(visits, self.visits.keys())
        instr = Instrument.from_string(self.instrumentName, butler.registry)
        camera = instr.getCamera()
        for expectedVisit, expectedExposures in self.visits.items():
            # The found visits were collected into a set, so their iteration
            # order need not follow ``self.visits``. Pair each expected visit
            # with the found visit that compares equal to it rather than
            # relying on position.
            matches = [visit for visit in visits if visit == expectedVisit]
            self.assertEqual(len(matches), 1, f"No unique defined visit matches {expectedVisit}")
            foundVisit = matches[0]

            # Test that this visit is associated with the expected exposures.
            foundExposures = (
                butler.registry.queryDataIds(["exposure"], dataId=expectedVisit).expanded().toSet()
            )
            self.assertCountEqual(foundExposures, expectedExposures)
            # Test that we have a visit region, and that it contains all of the
            # detector+visit regions.
            self.assertIsNotNone(foundVisit.region)
            detectorVisitDataIds = (
                butler.registry.queryDataIds(["visit", "detector"], dataId=expectedVisit).expanded().toSet()
            )
            self.assertEqual(len(detectorVisitDataIds), len(camera))
            for dataId in detectorVisitDataIds:
                self.assertTrue(foundVisit.region.contains(dataId.region))

        # Check obscore table again.
        self._check_obscore(butler.registry, has_visits=True)

    def _check_obscore(self, registry: Registry, has_visits: bool) -> None:
        """Verify contents of obscore table.

        The default implementation does nothing.

        Parameters
        ----------
        registry : `Registry`
            Registry of the repository being tested.
        has_visits : `bool`
            `testDefineVisits` passes `False` before visits are defined and
            `True` afterwards.
        """
        return
496 return