Coverage for python/lsst/obs/base/ingest_tests.py: 29%

201 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-27 09:52 +0000

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Base class for writing Gen3 raw data ingest tests. 

23""" 

24 

25__all__ = ("IngestTestBase",) 

26 

27import abc 

28import os 

29import shutil 

30import tempfile 

31import unittest 

32 

33import lsst.afw.cameraGeom 

34import lsst.obs.base 

35from lsst.daf.butler import Butler 

36from lsst.daf.butler.cli.butler import cli as butlerCli 

37from lsst.daf.butler.cli.utils import LogCliRunner 

38from lsst.pipe.base import Instrument 

39from lsst.resources import ResourcePath 

40from lsst.utils import doImportType 

41 

42from . import script 

43 

44 

class IngestTestBase(metaclass=abc.ABCMeta):
    """Base class for tests of gen3 ingest.

    Subclass from this, then `unittest.TestCase`, to get a working test
    suite.

    Notes
    -----
    The class attributes below are configuration hooks that concrete test
    cases override. ``dataIds`` and ``visits`` default to mutable containers
    that are shared by every subclass which does not override them, so
    assign a new value in the subclass instead of mutating them in place.
    """

    ingestDir = ""
    """Root path to ingest files into. Typically ``obs_package/tests/``; the
    actual directory will be a tempdir under this one.
    """

    ingestDatasetTypeName = "raw"
    """The DatasetType to use for the ingest.

    If this is not an Exposure dataset type the tests will be more limited.
    """

    dataIds = []
    """List of butler data IDs of files that should have been ingested."""

    file = ""
    """Full path to a file to ingest in tests."""

    filterLabel = None
    """The `lsst.afw.image.FilterLabel` that should be returned by the above
    file.
    """

    rawIngestTask = "lsst.obs.base.RawIngestTask"
    """Fully qualified name of the task to use in the ingest test; it is
    passed to the ``ingest-raws`` command line as ``--ingest-task``.
    """

    curatedCalibrationDatasetTypes = None
    """List or tuple of dataset type names that should be present after
    calling writeCuratedCalibrations. If `None`, writeCuratedCalibrations
    will not be called and the test will be skipped.
    """

    # NOTE(review): nothing visible in this class reads ``defineVisitsTask``;
    # confirm whether subclasses or external callers still rely on it.
    defineVisitsTask = lsst.obs.base.DefineVisitsTask
    """The task to use to define visits from groups of exposures.
    This is ignored if ``visits`` is `None`.
    """

    visits = {}
    """A dictionary mapping visit data IDs to the lists of exposure data IDs
    that are associated with them.
    If this is empty (but not `None`), visit definition will be run but no
    visits will be expected (e.g. because no exposures are on-sky
    observations).
    """

    seed_config = None
    """Location of a seed configuration file to pass to butler create.

    Useful if additional formatters or storage classes need to be defined.
    """

97 

@property
@abc.abstractmethod
def instrumentClassName(self):
    """Fully qualified name of the instrument class under test.

    This is abstract: every concrete test case has to provide it.

    Returns
    -------
    `str`
        The fully qualified instrument class name.
    """
    return None

109 

@property
def instrumentClass(self):
    """The instrument class, imported from ``instrumentClassName``."""
    qualifiedName = self.instrumentClassName
    return doImportType(qualifiedName)

114 

@property
def instrumentName(self):
    """Short name of the instrument, as reported by the instrument class.

    Returns
    -------
    `str`
        The name of the instrument.
    """
    instrumentType = self.instrumentClass
    return instrumentType.getName()

125 

@classmethod
def setUpClass(cls):
    """Create the scratch repository shared by all tests in the class.

    A temporary directory is created under ``ingestDir`` and stored as
    ``cls.root``; a butler repository is then created in it and the
    instrument is registered.

    Raises
    ------
    Exception
        Anything raised while creating the repository or registering the
        instrument is re-raised after the temporary directory has been
        removed.
    """
    # Use a temporary working directory.
    cls.root = tempfile.mkdtemp(dir=cls.ingestDir)
    try:
        cls._createRepo()

        # Register the instrument and its static metadata.
        cls._registerInstrument()
    except BaseException:
        # unittest does not run tearDownClass when setUpClass fails, so the
        # scratch directory has to be removed here or it would be leaked.
        shutil.rmtree(cls.root, ignore_errors=True)
        raise

134 

def setUp(self):
    """Choose the output run collection for the test about to execute."""
    # The test id is unique per test, so the run name is unique as well.
    testId = self.id()
    self.outputRun = "raw_ingest_" + testId

138 

@classmethod
def tearDownClass(cls):
    """Remove the scratch repository directory if it is still present.

    Errors raised while deleting the tree are ignored.
    """
    scratchDir = cls.root
    if not os.path.exists(scratchDir):
        return
    shutil.rmtree(scratchDir, ignore_errors=True)

143 

def verifyIngest(self, files=None, cli=False, fullCheck=False):
    """Test that RawIngestTask ingested the expected files.

    Parameters
    ----------
    files : `list` [`str`], or None
        List of files to be ingested, or None to use ``self.file``. The
        value is only forwarded to `checkRepo`.
    cli : `bool`, optional
        Accepted so that existing callers keep working; this
        implementation does not use it.
    fullCheck : `bool`, optional
        If `True`, read the full raw dataset and check component
        consistency. If `False` check that a component can be read
        but do not read the entire raw exposure.

    Notes
    -----
    Reading all the ingested test data can be expensive. The code paths
    for reading the second raw are the same as reading the first so
    we do not gain anything by doing full checks of everything.
    Only read full pixel data for first dataset from file.
    Don't even do that if we are requested not to by the caller.
    This only really affects files that contain multiple datasets.
    """
    butler = Butler(self.root, run=self.outputRun)
    datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun))
    self.assertEqual(len(datasets), len(self.dataIds))

    # Get the URI to the first dataset and check it is inside the
    # datastore.
    datasetUri = butler.getURI(datasets[0])
    self.assertIsNotNone(datasetUri.relative_to(butler.datastore.root))

    # The storage class decides how much can be checked; it is the same for
    # every data ID, so look it up once.
    datasetType = butler.registry.getDatasetType(self.ingestDatasetTypeName)
    isExposure = datasetType.storageClass.name.startswith("Exposure")

    # At most one dataset gets the expensive full read, and only on request.
    fullReadPending = fullCheck

    for dataId in self.dataIds:
        if not isExposure:
            # An arbitrary dataset type is always read completely, but there
            # is no known component to test and nothing to compare against;
            # being readable is the whole check.
            butler.get(self.ingestDatasetTypeName, dataId)
            continue

        # Check that we can read metadata from a raw.
        metadata = butler.get(f"{self.ingestDatasetTypeName}.metadata", dataId)
        if not fullReadPending:
            continue
        fullReadPending = False
        exposure = butler.get(self.ingestDatasetTypeName, dataId)

        # Comparing headers will not work directly because of header
        # fix up provenance.
        metadata_headers = metadata.toDict()
        exposure_headers = exposure.getMetadata().toDict()
        for headers in (metadata_headers, exposure_headers):
            headers.pop("HIERARCH ASTRO METADATA FIX DATE", None)
        self.assertEqual(metadata_headers, exposure_headers)

        # Since components follow a different code path we check that
        # WCS match and also we check that at least the shape
        # of the image is the same (rather than doing per-pixel equality)
        wcs = butler.get(f"{self.ingestDatasetTypeName}.wcs", dataId)
        self.assertEqual(wcs, exposure.getWcs())

        rawImage = butler.get(f"{self.ingestDatasetTypeName}.image", dataId)
        self.assertEqual(rawImage.getBBox(), exposure.getBBox())

        # Check that the filter label got the correct band.
        filterLabel = butler.get(f"{self.ingestDatasetTypeName}.filter", dataId)
        self.assertEqual(filterLabel, self.filterLabel)

        # Check that the exposure's Detector is the same as the component
        # we would read (this is tricky for LSST, which modifies its
        # detector at read time; for most other cameras it should be
        # trivially satisfied).
        detector = butler.get(f"{self.ingestDatasetTypeName}.detector", dataId)
        self.assertDetectorsEqual(detector, exposure.getDetector(), compareTransforms=False)

    self.checkRepo(files=files)

226 

def checkRepo(self, files=None):
    """Hook for subclass-specific checks of the repository after ingest.

    The base implementation performs no checks; subclasses may override it.

    Parameters
    ----------
    files : `list` [`str`], or None
        List of files to be ingested, or None to use ``self.file``
    """
    return None

239 

@classmethod
def _createRepo(cls):
    """Use the Click `testing` module to call the butler command line api
    to create a repository.

    The repository is created at ``cls.root``; ``cls.seed_config``, when
    set, is passed as ``--seed-config``.

    Raises
    ------
    AssertionError
        Raised if the ``create`` command exits with a non-zero code.
    """
    runner = LogCliRunner()
    args = []
    if cls.seed_config:
        args.extend(["--seed-config", cls.seed_config])
    result = runner.invoke(butlerCli, ["create", cls.root, *args])
    # Classmethod so assertEqual does not work. Raise explicitly instead of
    # using an ``assert`` statement, which is removed under ``python -O`` and
    # would let a failed repository creation go unnoticed.
    if result.exit_code != 0:
        raise AssertionError(f"output: {result.output} exception: {result.exception}")

251 

def _ingestRaws(self, transfer, file=None):
    """Run the butler ``ingest-raws`` command through the Click `testing`
    module and require that it succeeds.

    Parameters
    ----------
    transfer : `str`
        The external data transfer type.
    file : `str`
        Path to a file to ingest instead of the default associated with
        the object.
    """
    target = self.file if file is None else file
    runner = LogCliRunner()
    cliArgs = ["ingest-raws", self.root, target]
    cliArgs += ["--output-run", self.outputRun]
    cliArgs += ["--transfer", transfer]
    cliArgs += ["--ingest-task", self.rawIngestTask]
    result = runner.invoke(butlerCli, cliArgs)
    self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

282 

@classmethod
def _registerInstrument(cls):
    """Use the Click `testing` module to call the butler command line api
    to register the instrument.

    Raises
    ------
    AssertionError
        Raised if the ``register-instrument`` command exits with a
        non-zero code.
    """
    runner = LogCliRunner()
    result = runner.invoke(butlerCli, ["register-instrument", cls.root, cls.instrumentClassName])
    # Classmethod so assertEqual does not work. Raise explicitly instead of
    # using an ``assert`` statement, which is removed under ``python -O`` and
    # would let a failed registration go unnoticed.
    if result.exit_code != 0:
        raise AssertionError(f"output: {result.output} exception: {result.exception}")

291 

def _writeCuratedCalibrations(self):
    """Run the butler ``write-curated-calibrations`` command through the
    Click `testing` module and require that it succeeds.
    """
    cliArgs = ["write-curated-calibrations", self.root, self.instrumentName]
    result = LogCliRunner().invoke(butlerCli, cliArgs)
    self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

298 

299 def testLink(self): 

300 self._ingestRaws(transfer="link") 

301 self.verifyIngest() 

302 

303 def testSymLink(self): 

304 self._ingestRaws(transfer="symlink") 

305 self.verifyIngest() 

306 

def testDirect(self):
    """Ingest with the ``direct`` transfer mode and check that the dataset
    URI is the original file rather than a location inside the datastore.
    """
    self._ingestRaws(transfer="direct")

    # The URI recorded by the datastore must be the absolute source path.
    expectedUri = ResourcePath(self.file, forceAbsolute=True)
    butler = Butler(self.root, run=self.outputRun)
    found = butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun)
    firstRef = list(found)[0]
    self.assertEqual(butler.getURI(firstRef), expectedUri)

316 

317 def testCopy(self): 

318 self._ingestRaws(transfer="copy") 

319 # Only test full read of raws for the copy test. No need to do it 

320 # in the other tests since the formatter will be the same in all 

321 # cases. 

322 self.verifyIngest(fullCheck=True) 

323 

324 def testHardLink(self): 

325 try: 

326 self._ingestRaws(transfer="hardlink") 

327 # Running ingest through the Click testing infrastructure causes 

328 # the original exception indicating that we can't hard-link 

329 # on this filesystem to be turned into a nonzero exit code, which 

330 # then trips the test assertion. 

331 except (AssertionError, PermissionError) as err: 

332 raise unittest.SkipTest( 

333 "Skipping hard-link test because input data is on a different filesystem." 

334 ) from err 

335 self.verifyIngest() 

336 

def testInPlace(self):
    """Check that a file which already sits under the datastore root can
    be registered in-place.
    """
    butler = Butler(self.root, run=self.outputRun)

    # When the test data comes with an index file, that index has to be
    # visible in the datastore root next to the file being ingested, and the
    # ingested file must keep the name recorded in the index.
    sourceUri = ResourcePath(self.file)
    indexUri = sourceUri.dirname().join("_index.json")
    pathInStore = sourceUri.basename()
    if not indexUri.exists():
        # Without an index any name will do.
        pathInStore = "prefix-" + pathInStore
    else:
        os.symlink(indexUri.ospath, butler.datastore.root.join("_index.json").ospath)

    # A symlink to the original makes the file look like it already lives
    # inside the datastore.
    newPath = butler.datastore.root.join(pathInStore)
    os.symlink(os.path.abspath(self.file), newPath.ospath)

    # A sidecar file, if present, must be linked too because the ingest
    # code does not follow symlinks.
    sidecarUri = ResourcePath(sourceUri).updatedExtension(".json")
    if sidecarUri.exists():
        linkedSidecar = ResourcePath(newPath).updatedExtension(".json")
        os.symlink(sidecarUri.ospath, linkedSidecar.ospath)

    # "auto" transfer is expected to work out for itself that this is an
    # in-place ingest.
    self._ingestRaws(transfer="auto", file=newPath.ospath)
    self.verifyIngest()

    # The butler created before the ingest will not see the new datasets,
    # so build a fresh one.
    butler = Butler(self.root, run=self.outputRun)

    # The dataset's URI, relative to the datastore root, must be the path
    # chosen above.
    uri = butler.getURI(self.ingestDatasetTypeName, self.dataIds[0])
    relativePath = uri.relative_to(butler.datastore.root)
    self.assertEqual(relativePath, pathInStore)

381 

382 def testFailOnConflict(self): 

383 """Re-ingesting the same data into the repository should fail.""" 

384 self._ingestRaws(transfer="symlink") 

385 with self.assertRaises(Exception): 

386 self._ingestRaws(transfer="symlink") 

387 

def testWriteCuratedCalibrations(self):
    """Check that curated calibrations can be written, and that
    `loadCamera` works both before and after they are written.
    """
    expectedTypes = self.curatedCalibrationDatasetTypes
    if expectedTypes is None:
        raise unittest.SkipTest("Class requests disabling of writeCuratedCalibrations test")

    butler = Butler(self.root, writeable=False)
    collection = self.instrumentClass().makeCalibrationCollectionName()

    # A data ID the registry does not know carries no temporal information,
    # so loading a camera with it has to fail.
    with self.assertRaises(LookupError):
        lsst.obs.base.loadCamera(butler, {"exposure": 0}, collections=collection)

    # Ingest raws in order to get some exposure records.
    self._ingestRaws(transfer="auto")

    # Nothing has been written to the repo yet, so loadCamera should return
    # an unversioned camera.
    firstDataId = self.dataIds[0]
    camera, isVersioned = lsst.obs.base.loadCamera(butler, firstDataId, collections=collection)
    self.assertFalse(isVersioned)
    self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

    self._writeCuratedCalibrations()

    # Use a new butler instance so that no stale caches (e.g. of
    # DatasetTypes) are involved. The existing butler was deliberately not
    # handed to _writeCuratedCalibrations, because that helper exercises the
    # CLI interface.
    butler = Butler(self.root, writeable=False)

    calibration_names = self.instrumentClass().getCuratedCalibrationNames()

    for datasetTypeName in expectedTypes:
        with self.subTest(dtype=datasetTypeName):
            associations = butler.registry.queryDatasetAssociations(datasetTypeName, collections=collection)
            found = list(associations)
            self.assertGreater(len(found), 0, f"Checking {datasetTypeName}")
            self.assertIn(datasetTypeName, calibration_names)

    # Now loadCamera should return the versioned camera from the repo.
    camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection)
    self.assertTrue(isVersioned)
    self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

438 

def testDefineVisits(self):
    """Define visits from the ingested exposures and check them against
    ``self.visits``.

    The test is skipped when ``self.visits`` is `None`. For every expected
    visit it checks the associated exposures, that the visit has a region,
    that there is one visit+detector data ID per camera detector, and that
    the visit region contains each of those detector regions.
    """
    if self.visits is None:
        self.skipTest("Expected visits were not defined.")
    self._ingestRaws(transfer="link")

    # Calling defineVisits tests the implementation of the butler command
    # line interface "define-visits" subcommand. Functions in the script
    # folder are generally considered protected and should not be used
    # as public api.
    script.defineVisits(
        self.root,
        config_file=None,
        collections=self.outputRun,
        instrument=self.instrumentName,
        raw_name=self.ingestDatasetTypeName,
    )

    # Test that we got the visits we expected.
    butler = Butler(self.root, run=self.outputRun)
    visits = butler.registry.queryDataIds(["visit"]).expanded().toSet()
    self.assertCountEqual(visits, self.visits.keys())
    instr = Instrument.from_string(self.instrumentName, butler.registry)
    camera = instr.getCamera()

    # ``visits`` is a set, so its iteration order need not match the order
    # of ``self.visits``. Look each expected visit up by data ID instead of
    # pairing the two collections by position; the assertCountEqual above
    # has already established that every expected visit was found.
    foundByDataId = {visit: visit for visit in visits}
    for expectedVisit, expectedExposures in self.visits.items():
        foundVisit = foundByDataId[expectedVisit]

        # Test that this visit is associated with the expected exposures.
        foundExposures = (
            butler.registry.queryDataIds(["exposure"], dataId=expectedVisit).expanded().toSet()
        )
        self.assertCountEqual(foundExposures, expectedExposures)

        # Test that we have a visit region, and that it contains all of the
        # detector+visit regions.
        self.assertIsNotNone(foundVisit.region)
        detectorVisitDataIds = (
            butler.registry.queryDataIds(["visit", "detector"], dataId=expectedVisit).expanded().toSet()
        )
        self.assertEqual(len(detectorVisitDataIds), len(camera))
        for dataId in detectorVisitDataIds:
            self.assertTrue(foundVisit.region.contains(dataId.region))

            # Each visit+detector data ID must leave unused bits in its
            # exposure ID.
            idInfo = lsst.obs.base.ExposureIdInfo.fromDataId(dataId)
            self.assertGreater(idInfo.unusedBits, 0)