Coverage for python/lsst/obs/base/ingest_tests.py: 29%

191 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2024-01-25 04:32 +0000

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Base class for writing Gen3 raw data ingest tests. 

23""" 

24 

25__all__ = ("IngestTestBase",) 

26 

27import abc 

28import os 

29import shutil 

30import tempfile 

31import unittest 

32 

33import lsst.afw.cameraGeom 

34import lsst.obs.base 

35from lsst.daf.butler import Butler 

36from lsst.daf.butler.cli.butler import cli as butlerCli 

37from lsst.daf.butler.cli.utils import LogCliRunner 

38from lsst.pipe.base import Instrument 

39from lsst.resources import ResourcePath 

40from lsst.utils import doImportType 

41 

42from . import script 

43 

44 

45class IngestTestBase(metaclass=abc.ABCMeta): 

46 """Base class for tests of gen3 ingest. Subclass from this, then 

47 `unittest.TestCase` to get a working test suite. 

48 """ 

49 

50 ingestDir = "" 

51 """Root path to ingest files into. Typically `obs_package/tests/`; the 

52 actual directory will be a tempdir under this one. 

53 """ 

54 

55 ingestDatasetTypeName = "raw" 

56 """The DatasetType to use for the ingest. 

57 

58 If this is not an Exposure dataset type the tests will be more limited. 

59 """ 

60 

61 dataIds = [] 

62 """list of butler data IDs of files that should have been ingested.""" 

63 

64 file = "" 

65 """Full path to a file to ingest in tests.""" 

66 

67 filterLabel = None 

68 """The lsst.afw.image.FilterLabel that should be returned by the above 

69 file.""" 

70 

71 rawIngestTask = "lsst.obs.base.RawIngestTask" 

72 """The task to use in the Ingest test.""" 

73 

74 curatedCalibrationDatasetTypes = None 

75 """List or tuple of Datasets types that should be present after calling 

76 writeCuratedCalibrations. If `None` writeCuratedCalibrations will 

77 not be called and the test will be skipped.""" 

78 

79 defineVisitsTask = lsst.obs.base.DefineVisitsTask 

80 """The task to use to define visits from groups of exposures. 

81 This is ignored if ``visits`` is `None`. 

82 """ 

83 

84 visits = {} 

85 """A dictionary mapping visit data IDs the lists of exposure data IDs that 

86 are associated with them. 

87 If this is empty (but not `None`), visit definition will be run but no 

88 visits will be expected (e.g. because no exposures are on-sky 

89 observations). 

90 """ 

91 

92 @property 

93 @abc.abstractmethod 

94 def instrumentClassName(self): 

95 """The fully qualified instrument class name. 

96 

97 Returns 

98 ------- 

99 `str` 

100 The fully qualified instrument class name. 

101 """ 

102 pass 

103 

104 @property 

105 def instrumentClass(self): 

106 """The instrument class.""" 

107 return doImportType(self.instrumentClassName) 

108 

109 @property 

110 def instrumentName(self): 

111 """The name of the instrument. 

112 

113 Returns 

114 ------- 

115 `str` 

116 The name of the instrument. 

117 """ 

118 return self.instrumentClass.getName() 

119 

120 @classmethod 

121 def setUpClass(cls): 

122 # Use a temporary working directory. 

123 cls.root = tempfile.mkdtemp(dir=cls.ingestDir) 

124 cls._createRepo() 

125 

126 # Register the instrument and its static metadata. 

127 cls._registerInstrument() 

128 

129 def setUp(self): 

130 # Want a unique run name per test. 

131 self.outputRun = "raw_ingest_" + self.id() 

132 

133 @classmethod 

134 def tearDownClass(cls): 

135 if os.path.exists(cls.root): 

136 shutil.rmtree(cls.root, ignore_errors=True) 

137 

138 def verifyIngest(self, files=None, cli=False, fullCheck=False): 

139 """ 

140 Test that RawIngestTask ingested the expected files. 

141 

142 Parameters 

143 ---------- 

144 files : `list` [`str`], or None 

145 List of files to be ingested, or None to use ``self.file`` 

146 fullCheck : `bool`, optional 

147 If `True`, read the full raw dataset and check component 

148 consistency. If `False` check that a component can be read 

149 but do not read the entire raw exposure. 

150 

151 Notes 

152 ----- 

153 Reading all the ingested test data can be expensive. The code paths 

154 for reading the second raw are the same as reading the first so 

155 we do not gain anything by doing full checks of everything. 

156 Only read full pixel data for first dataset from file. 

157 Don't even do that if we are requested not to by the caller. 

158 This only really affects files that contain multiple datasets. 

159 """ 

160 butler = Butler(self.root, run=self.outputRun) 

161 datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun)) 

162 self.assertEqual(len(datasets), len(self.dataIds)) 

163 

164 # Get the URI to the first dataset and check it is inside the 

165 # datastore. 

166 datasetUri = butler.getURI(datasets[0]) 

167 self.assertIsNotNone(datasetUri.relative_to(butler.datastore.root)) 

168 

169 # Get the relevant dataset type. 

170 datasetType = butler.registry.getDatasetType(self.ingestDatasetTypeName) 

171 

172 for dataId in self.dataIds: 

173 # For testing we only read the entire dataset the first time 

174 # round if this is an Exposure. If it's not an Exposure 

175 # we always read it completely but we don't read components 

176 # because for an arbitrary dataset type we can't easily tell 

177 # what component to test. 

178 

179 if not datasetType.storageClass.name.startswith("Exposure"): 

180 exposure = butler.get(self.ingestDatasetTypeName, dataId) 

181 # Could be anything so nothing to test by default 

182 continue 

183 

184 # Check that we can read metadata from a raw. 

185 metadata = butler.get(f"{self.ingestDatasetTypeName}.metadata", dataId) 

186 if not fullCheck: 

187 continue 

188 fullCheck = False 

189 exposure = butler.get(self.ingestDatasetTypeName, dataId) 

190 

191 # Comparing headers will not work directly because of header 

192 # fix up provenance. 

193 metadata_headers = metadata.toDict() 

194 exposure_headers = exposure.getMetadata().toDict() 

195 metadata_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None) 

196 exposure_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None) 

197 self.assertEqual(metadata_headers, exposure_headers) 

198 

199 # Since components follow a different code path we check that 

200 # WCS match and also we check that at least the shape 

201 # of the image is the same (rather than doing per-pixel equality) 

202 wcs = butler.get(f"{self.ingestDatasetTypeName}.wcs", dataId) 

203 self.assertEqual(wcs, exposure.getWcs()) 

204 

205 rawImage = butler.get(f"{self.ingestDatasetTypeName}.image", dataId) 

206 self.assertEqual(rawImage.getBBox(), exposure.getBBox()) 

207 

208 # Check that the filter label got the correct band. 

209 filterLabel = butler.get(f"{self.ingestDatasetTypeName}.filter", dataId) 

210 self.assertEqual(filterLabel, self.filterLabel) 

211 

212 # Check that the exposure's Detector is the same as the component 

213 # we would read (this is tricky for LSST, which modifies its 

214 # detector at read time; for most other cameras it should be 

215 # trivially satisfied. 

216 detector = butler.get(f"{self.ingestDatasetTypeName}.detector", dataId) 

217 self.assertDetectorsEqual(detector, exposure.getDetector(), compareTransforms=False) 

218 

219 self.checkRepo(files=files) 

220 

221 def checkRepo(self, files=None): 

222 """Check the state of the repository after ingest. 

223 

224 This is an optional hook provided for subclasses; by default it does 

225 nothing. 

226 

227 Parameters 

228 ---------- 

229 files : `list` [`str`], or None 

230 List of files to be ingested, or None to use ``self.file`` 

231 """ 

232 pass 

233 

234 @classmethod 

235 def _createRepo(cls): 

236 """Use the Click `testing` module to call the butler command line api 

237 to create a repository.""" 

238 runner = LogCliRunner() 

239 result = runner.invoke(butlerCli, ["create", cls.root]) 

240 # Classmethod so assertEqual does not work. 

241 assert result.exit_code == 0, f"output: {result.output} exception: {result.exception}" 

242 

243 def _ingestRaws(self, transfer, file=None): 

244 """Use the Click `testing` module to call the butler command line api 

245 to ingest raws. 

246 

247 Parameters 

248 ---------- 

249 transfer : `str` 

250 The external data transfer type. 

251 file : `str` 

252 Path to a file to ingest instead of the default associated with 

253 the object. 

254 """ 

255 if file is None: 

256 file = self.file 

257 runner = LogCliRunner() 

258 result = runner.invoke( 

259 butlerCli, 

260 [ 

261 "ingest-raws", 

262 self.root, 

263 file, 

264 "--output-run", 

265 self.outputRun, 

266 "--transfer", 

267 transfer, 

268 "--ingest-task", 

269 self.rawIngestTask, 

270 ], 

271 ) 

272 self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}") 

273 

274 @classmethod 

275 def _registerInstrument(cls): 

276 """Use the Click `testing` module to call the butler command line api 

277 to register the instrument.""" 

278 runner = LogCliRunner() 

279 result = runner.invoke(butlerCli, ["register-instrument", cls.root, cls.instrumentClassName]) 

280 # Classmethod so assertEqual does not work. 

281 assert result.exit_code == 0, f"output: {result.output} exception: {result.exception}" 

282 

283 def _writeCuratedCalibrations(self): 

284 """Use the Click `testing` module to call the butler command line api 

285 to write curated calibrations.""" 

286 runner = LogCliRunner() 

287 result = runner.invoke(butlerCli, ["write-curated-calibrations", self.root, self.instrumentName]) 

288 self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}") 

289 

290 def testLink(self): 

291 self._ingestRaws(transfer="link") 

292 self.verifyIngest() 

293 

294 def testSymLink(self): 

295 self._ingestRaws(transfer="symlink") 

296 self.verifyIngest() 

297 

298 def testDirect(self): 

299 self._ingestRaws(transfer="direct") 

300 

301 # Check that it really did have a URI outside of datastore. 

302 srcUri = ResourcePath(self.file, forceAbsolute=True) 

303 butler = Butler(self.root, run=self.outputRun) 

304 datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun)) 

305 datastoreUri = butler.getURI(datasets[0]) 

306 self.assertEqual(datastoreUri, srcUri) 

307 

308 def testCopy(self): 

309 self._ingestRaws(transfer="copy") 

310 # Only test full read of raws for the copy test. No need to do it 

311 # in the other tests since the formatter will be the same in all 

312 # cases. 

313 self.verifyIngest(fullCheck=True) 

314 

315 def testHardLink(self): 

316 try: 

317 self._ingestRaws(transfer="hardlink") 

318 # Running ingest through the Click testing infrastructure causes 

319 # the original exception indicating that we can't hard-link 

320 # on this filesystem to be turned into a nonzero exit code, which 

321 # then trips the test assertion. 

322 except (AssertionError, PermissionError) as err: 

323 raise unittest.SkipTest( 

324 "Skipping hard-link test because input data is on a different filesystem." 

325 ) from err 

326 self.verifyIngest() 

327 

328 def testInPlace(self): 

329 """Test that files already in the directory can be added to the 

330 registry in-place. 

331 """ 

332 butler = Butler(self.root, run=self.outputRun) 

333 

334 # If the test uses an index file the index file needs to also 

335 # appear in the datastore root along with the file to be ingested. 

336 # In that scenario the file name being used for ingest can not 

337 # be modified and must have the same name as found in the index 

338 # file itself. 

339 source_file_uri = ResourcePath(self.file) 

340 index_file = source_file_uri.dirname().join("_index.json") 

341 pathInStore = source_file_uri.basename() 

342 if index_file.exists(): 

343 os.symlink(index_file.ospath, butler.datastore.root.join("_index.json").ospath) 

344 else: 

345 # No index file so we are free to pick any name. 

346 pathInStore = "prefix-" + pathInStore 

347 

348 # Create a symlink to the original file so that it looks like it 

349 # is now inside the datastore. 

350 newPath = butler.datastore.root.join(pathInStore) 

351 os.symlink(os.path.abspath(self.file), newPath.ospath) 

352 

353 # If there is a sidecar file it needs to be linked in as well 

354 # since ingest code does not follow symlinks. 

355 sidecar_uri = ResourcePath(source_file_uri).updatedExtension(".json") 

356 if sidecar_uri.exists(): 

357 newSidecar = ResourcePath(newPath).updatedExtension(".json") 

358 os.symlink(sidecar_uri.ospath, newSidecar.ospath) 

359 

360 # Run ingest with auto mode since that should automatically determine 

361 # that an in-place ingest is happening. 

362 self._ingestRaws(transfer="auto", file=newPath.ospath) 

363 self.verifyIngest() 

364 

365 # Recreate a butler post-ingest (the earlier one won't see the 

366 # ingested files). 

367 butler = Butler(self.root, run=self.outputRun) 

368 

369 # Check that the URI associated with this path is the right one. 

370 uri = butler.getURI(self.ingestDatasetTypeName, self.dataIds[0]) 

371 self.assertEqual(uri.relative_to(butler.datastore.root), pathInStore) 

372 

373 def testFailOnConflict(self): 

374 """Re-ingesting the same data into the repository should fail.""" 

375 self._ingestRaws(transfer="symlink") 

376 with self.assertRaises(Exception): 

377 self._ingestRaws(transfer="symlink") 

378 

379 def testWriteCuratedCalibrations(self): 

380 """Test that we can ingest the curated calibrations, and read them 

381 with `loadCamera` both before and after. 

382 """ 

383 if self.curatedCalibrationDatasetTypes is None: 

384 raise unittest.SkipTest("Class requests disabling of writeCuratedCalibrations test") 

385 

386 butler = Butler(self.root, writeable=False) 

387 collection = self.instrumentClass().makeCalibrationCollectionName() 

388 

389 # Trying to load a camera with a data ID not known to the registry 

390 # is an error, because we can't get any temporal information. 

391 with self.assertRaises(LookupError): 

392 lsst.obs.base.loadCamera(butler, {"exposure": 0}, collections=collection) 

393 

394 # Ingest raws in order to get some exposure records. 

395 self._ingestRaws(transfer="auto") 

396 

397 # Load camera should returned an unversioned camera because there's 

398 # nothing in the repo. 

399 camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection) 

400 self.assertFalse(isVersioned) 

401 self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera) 

402 

403 self._writeCuratedCalibrations() 

404 

405 # Make a new butler instance to make sure we don't have any stale 

406 # caches (e.g. of DatasetTypes). Note that we didn't give 

407 # _writeCuratedCalibrations the butler instance we had, because it's 

408 # trying to test the CLI interface anyway. 

409 butler = Butler(self.root, writeable=False) 

410 

411 for datasetTypeName in self.curatedCalibrationDatasetTypes: 

412 with self.subTest(dtype=datasetTypeName): 

413 found = list( 

414 butler.registry.queryDatasetAssociations( 

415 datasetTypeName, 

416 collections=collection, 

417 ) 

418 ) 

419 self.assertGreater(len(found), 0, f"Checking {datasetTypeName}") 

420 

421 # Load camera should returned the versioned camera from the repo. 

422 camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection) 

423 self.assertTrue(isVersioned) 

424 self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera) 

425 

426 def testDefineVisits(self): 

427 if self.visits is None: 

428 self.skipTest("Expected visits were not defined.") 

429 self._ingestRaws(transfer="link") 

430 

431 # Calling defineVisits tests the implementation of the butler command 

432 # line interface "define-visits" subcommand. Functions in the script 

433 # folder are generally considered protected and should not be used 

434 # as public api. 

435 script.defineVisits( 

436 self.root, 

437 config_file=None, 

438 collections=self.outputRun, 

439 instrument=self.instrumentName, 

440 raw_name=self.ingestDatasetTypeName, 

441 ) 

442 

443 # Test that we got the visits we expected. 

444 butler = Butler(self.root, run=self.outputRun) 

445 visits = butler.registry.queryDataIds(["visit"]).expanded().toSet() 

446 self.assertCountEqual(visits, self.visits.keys()) 

447 instr = Instrument.from_string(self.instrumentName, butler.registry) 

448 camera = instr.getCamera() 

449 for foundVisit, (expectedVisit, expectedExposures) in zip(visits, self.visits.items()): 

450 # Test that this visit is associated with the expected exposures. 

451 foundExposures = ( 

452 butler.registry.queryDataIds(["exposure"], dataId=expectedVisit).expanded().toSet() 

453 ) 

454 self.assertCountEqual(foundExposures, expectedExposures) 

455 # Test that we have a visit region, and that it contains all of the 

456 # detector+visit regions. 

457 self.assertIsNotNone(foundVisit.region) 

458 detectorVisitDataIds = ( 

459 butler.registry.queryDataIds(["visit", "detector"], dataId=expectedVisit).expanded().toSet() 

460 ) 

461 self.assertEqual(len(detectorVisitDataIds), len(camera)) 

462 for dataId in detectorVisitDataIds: 

463 self.assertTrue(foundVisit.region.contains(dataId.region))