Coverage for python/lsst/obs/base/ingest_tests.py: 27%

Shortcuts on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

181 statements  

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <https://www.gnu.org/licenses/>. 

21 

22"""Base class for writing Gen3 raw data ingest tests. 

23""" 

24 

25__all__ = ("IngestTestBase",) 

26 

27import abc 

28import os 

29import shutil 

30import tempfile 

31import unittest 

32 

33import lsst.afw.cameraGeom 

34import lsst.obs.base 

35from lsst.daf.butler import Butler, ButlerURI 

36from lsst.daf.butler.cli.butler import cli as butlerCli 

37from lsst.daf.butler.cli.utils import LogCliRunner 

38from lsst.utils import doImport 

39 

40from . import script 

41from .utils import getInstrument 

42 

43 

class IngestTestBase(metaclass=abc.ABCMeta):
    """Base class for tests of gen3 ingest. Subclass from this, then
    `unittest.TestCase` to get a working test suite.

    Each test ingests ``file`` into a butler repository created once per
    class in a temporary directory under ``ingestDir``, using a run
    collection that is unique to the test.
    """

    ingestDir = ""
    """Root path to ingest files into. Typically `obs_package/tests/`; the
    actual directory will be a tempdir under this one.
    """

    ingestDatasetTypeName = "raw"
    """The DatasetType to use for the ingest.

    If this is not an Exposure dataset type the tests will be more limited.
    """

    dataIds = []
    """list of butler data IDs of files that should have been ingested."""

    file = ""
    """Full path to a file to ingest in tests."""

    filterLabel = None
    """The lsst.afw.image.FilterLabel that should be returned by the above
    file."""

    rawIngestTask = "lsst.obs.base.RawIngestTask"
    """The task to use in the Ingest test."""

    curatedCalibrationDatasetTypes = None
    """List or tuple of dataset types that should be present after calling
    writeCuratedCalibrations. If `None` writeCuratedCalibrations will
    not be called and the test will be skipped."""

    defineVisitsTask = lsst.obs.base.DefineVisitsTask
    """The task to use to define visits from groups of exposures.
    This is ignored if ``visits`` is `None`.
    """

    visits = {}
    """A dictionary mapping visit data IDs to the lists of exposure data IDs
    that are associated with them.
    If this is empty (but not `None`), visit definition will be run but no
    visits will be expected (e.g. because no exposures are on-sky
    observations).
    """

    @property
    @abc.abstractmethod
    def instrumentClassName(self):
        """The fully qualified instrument class name.

        Notes
        -----
        `_registerInstrument` is a classmethod and reads this name from the
        class rather than from an instance, so subclasses should override it
        with a plain class attribute.

        Returns
        -------
        `str`
            The fully qualified instrument class name.
        """
        pass

    @property
    def instrumentClass(self):
        """The instrument class, imported from ``instrumentClassName``."""
        return doImport(self.instrumentClassName)

    @property
    def instrumentName(self):
        """The name of the instrument.

        Returns
        -------
        `str`
            The name of the instrument.
        """
        return self.instrumentClass.getName()

    @classmethod
    def setUpClass(cls):
        """Create a repository in a temporary directory and register the
        instrument in it.
        """
        # Use a temporary working directory.
        cls.root = tempfile.mkdtemp(dir=cls.ingestDir)
        try:
            cls._createRepo()

            # Register the instrument and its static metadata.
            cls._registerInstrument()
        except BaseException:
            # unittest does not run tearDownClass when setUpClass fails, so
            # the temporary directory has to be removed here or it leaks.
            shutil.rmtree(cls.root, ignore_errors=True)
            raise

    def setUp(self):
        """Choose the output run collection for this test."""
        # Want a unique run name per test.
        self.outputRun = "raw_ingest_" + self.id()

    @classmethod
    def tearDownClass(cls):
        """Remove the temporary repository directory."""
        if os.path.exists(cls.root):
            shutil.rmtree(cls.root, ignore_errors=True)

    def verifyIngest(self, files=None, cli=False, fullCheck=False):
        """Test that RawIngestTask ingested the expected files.

        Parameters
        ----------
        files : `list` [`str`], or None
            List of files to be ingested, or None to use ``self.file``.
            This is only forwarded to `checkRepo`.
        cli : `bool`, optional
            Not used by this implementation; kept so that existing callers
            that pass it continue to work.
        fullCheck : `bool`, optional
            If `True`, read the full raw dataset and check component
            consistency. If `False` check that a component can be read
            but do not read the entire raw exposure.

        Notes
        -----
        Reading all the ingested test data can be expensive. The code paths
        for reading the second raw are the same as reading the first so
        we do not gain anything by doing full checks of everything.
        Only read full pixel data for first dataset from file.
        Don't even do that if we are requested not to by the caller.
        This only really affects files that contain multiple datasets.
        """
        butler = Butler(self.root, run=self.outputRun)
        datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun))
        self.assertEqual(len(datasets), len(self.dataIds))

        # Get the URI to the first dataset and check it is inside the
        # datastore.
        datasetUri = butler.getURI(datasets[0])
        self.assertIsNotNone(datasetUri.relative_to(butler.datastore.root))

        # Get the relevant dataset type.
        datasetType = butler.registry.getDatasetType(self.ingestDatasetTypeName)

        for dataId in self.dataIds:
            # For testing we only read the entire dataset the first time
            # round if this is an Exposure. If it's not an Exposure
            # we always read it completely but we don't read components
            # because for an arbitrary dataset type we can't easily tell
            # what component to test.

            if not datasetType.storageClass.name.startswith("Exposure"):
                # The read itself is the test; the result could be anything
                # so there is nothing further to check by default.
                butler.get(self.ingestDatasetTypeName, dataId)
                continue

            # Check that we can read metadata from a raw.
            metadata = butler.get(f"{self.ingestDatasetTypeName}.metadata", dataId)
            if not fullCheck:
                continue
            # Only the first data ID gets the full check; later iterations
            # stop after the metadata read above.
            fullCheck = False
            exposure = butler.get(self.ingestDatasetTypeName, dataId)

            # Comparing headers will not work directly because of header
            # fix up provenance.
            metadata_headers = metadata.toDict()
            exposure_headers = exposure.getMetadata().toDict()
            metadata_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None)
            exposure_headers.pop("HIERARCH ASTRO METADATA FIX DATE", None)
            self.assertEqual(metadata_headers, exposure_headers)

            # Since components follow a different code path we check that
            # WCS match and also we check that at least the shape
            # of the image is the same (rather than doing per-pixel equality)
            wcs = butler.get(f"{self.ingestDatasetTypeName}.wcs", dataId)
            self.assertEqual(wcs, exposure.getWcs())

            rawImage = butler.get(f"{self.ingestDatasetTypeName}.image", dataId)
            self.assertEqual(rawImage.getBBox(), exposure.getBBox())

            # Check that the filter label got the correct band.
            filterLabel = butler.get(f"{self.ingestDatasetTypeName}.filterLabel", dataId)
            self.assertEqual(filterLabel, self.filterLabel)

            # Check that the exposure's Detector is the same as the component
            # we would read (this is tricky for LSST, which modifies its
            # detector at read time; for most other cameras it should be
            # trivially satisfied).
            detector = butler.get(f"{self.ingestDatasetTypeName}.detector", dataId)
            self.assertDetectorsEqual(detector, exposure.getDetector(), compareTransforms=False)

        self.checkRepo(files=files)

    def checkRepo(self, files=None):
        """Check the state of the repository after ingest.

        This is an optional hook provided for subclasses; by default it does
        nothing.

        Parameters
        ----------
        files : `list` [`str`], or None
            List of files to be ingested, or None to use ``self.file``
        """
        pass

    @classmethod
    def _createRepo(cls):
        """Use the Click `testing` module to call the butler command line api
        to create a repository."""
        runner = LogCliRunner()
        result = runner.invoke(butlerCli, ["create", cls.root])
        # Classmethod so assertEqual does not work.
        assert result.exit_code == 0, f"output: {result.output} exception: {result.exception}"

    def _ingestRaws(self, transfer, file=None):
        """Use the Click `testing` module to call the butler command line api
        to ingest raws.

        Parameters
        ----------
        transfer : `str`
            The external data transfer type.
        file : `str`, optional
            Path to a file to ingest instead of the default associated with
            the object.
        """
        if file is None:
            file = self.file
        runner = LogCliRunner()
        result = runner.invoke(
            butlerCli,
            [
                "ingest-raws",
                self.root,
                file,
                "--output-run",
                self.outputRun,
                "--transfer",
                transfer,
                "--ingest-task",
                self.rawIngestTask,
            ],
        )
        self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

    @classmethod
    def _registerInstrument(cls):
        """Use the Click `testing` module to call the butler command line api
        to register the instrument."""
        runner = LogCliRunner()
        result = runner.invoke(butlerCli, ["register-instrument", cls.root, cls.instrumentClassName])
        # Classmethod so assertEqual does not work.
        assert result.exit_code == 0, f"output: {result.output} exception: {result.exception}"

    def _writeCuratedCalibrations(self):
        """Use the Click `testing` module to call the butler command line api
        to write curated calibrations."""
        runner = LogCliRunner()
        result = runner.invoke(butlerCli, ["write-curated-calibrations", self.root, self.instrumentName])
        self.assertEqual(result.exit_code, 0, f"output: {result.output} exception: {result.exception}")

    def testLink(self):
        """Test ingest with the ``link`` transfer mode."""
        self._ingestRaws(transfer="link")
        self.verifyIngest()

    def testSymLink(self):
        """Test ingest with the ``symlink`` transfer mode."""
        self._ingestRaws(transfer="symlink")
        self.verifyIngest()

    def testDirect(self):
        """Test ingest with the ``direct`` transfer mode."""
        self._ingestRaws(transfer="direct")

        # Check that it really did have a URI outside of datastore.
        srcUri = ButlerURI(self.file, forceAbsolute=True)
        butler = Butler(self.root, run=self.outputRun)
        datasets = list(butler.registry.queryDatasets(self.ingestDatasetTypeName, collections=self.outputRun))
        datastoreUri = butler.getURI(datasets[0])
        self.assertEqual(datastoreUri, srcUri)

    def testCopy(self):
        """Test ingest with the ``copy`` transfer mode, including a full
        read of the first raw.
        """
        self._ingestRaws(transfer="copy")
        # Only test full read of raws for the copy test. No need to do it
        # in the other tests since the formatter will be the same in all
        # cases.
        self.verifyIngest(fullCheck=True)

    def testHardLink(self):
        """Test ingest with the ``hardlink`` transfer mode, skipping if the
        ingest fails.
        """
        try:
            self._ingestRaws(transfer="hardlink")
        # Running ingest through the Click testing infrastructure causes
        # the original exception indicating that we can't hard-link
        # on this filesystem to be turned into a nonzero exit code, which
        # then trips the test assertion.
        except (AssertionError, PermissionError) as err:
            raise unittest.SkipTest(
                "Skipping hard-link test because input data is on a different filesystem."
            ) from err
        self.verifyIngest()

    def testInPlace(self):
        """Test that files already in the directory can be added to the
        registry in-place.
        """
        butler = Butler(self.root, run=self.outputRun)

        # If the test uses an index file the index file needs to also
        # appear in the datastore root along with the file to be ingested.
        # In that scenario the file name being used for ingest can not
        # be modified and must have the same name as found in the index
        # file itself.
        source_file_uri = ButlerURI(self.file)
        index_file = source_file_uri.dirname().join("_index.json")
        pathInStore = source_file_uri.basename()
        if index_file.exists():
            os.symlink(index_file.ospath, butler.datastore.root.join("_index.json").ospath)
        else:
            # No index file so we are free to pick any name.
            pathInStore = "prefix-" + pathInStore

        # Create a symlink to the original file so that it looks like it
        # is now inside the datastore.
        newPath = butler.datastore.root.join(pathInStore)
        os.symlink(os.path.abspath(self.file), newPath.ospath)

        # If there is a sidecar file it needs to be linked in as well
        # since ingest code does not follow symlinks.
        sidecar_uri = ButlerURI(source_file_uri).updatedExtension(".json")
        if sidecar_uri.exists():
            newSidecar = ButlerURI(newPath).updatedExtension(".json")
            os.symlink(sidecar_uri.ospath, newSidecar.ospath)

        # Run ingest with auto mode since that should automatically determine
        # that an in-place ingest is happening.
        self._ingestRaws(transfer="auto", file=newPath.ospath)
        self.verifyIngest()

        # Recreate a butler post-ingest (the earlier one won't see the
        # ingested files).
        butler = Butler(self.root, run=self.outputRun)

        # Check that the URI associated with this path is the right one.
        uri = butler.getURI(self.ingestDatasetTypeName, self.dataIds[0])
        self.assertEqual(uri.relative_to(butler.datastore.root), pathInStore)

    def testFailOnConflict(self):
        """Re-ingesting the same data into the repository should fail."""
        self._ingestRaws(transfer="symlink")
        with self.assertRaises(Exception):
            self._ingestRaws(transfer="symlink")

    def testWriteCuratedCalibrations(self):
        """Test that we can ingest the curated calibrations, and read them
        with `loadCamera` both before and after.
        """
        if self.curatedCalibrationDatasetTypes is None:
            self.skipTest("Class requests disabling of writeCuratedCalibrations test")

        butler = Butler(self.root, writeable=False)
        collection = self.instrumentClass().makeCalibrationCollectionName()

        # Trying to load a camera with a data ID not known to the registry
        # is an error, because we can't get any temporal information.
        with self.assertRaises(LookupError):
            lsst.obs.base.loadCamera(butler, {"exposure": 0}, collections=collection)

        # Ingest raws in order to get some exposure records.
        self._ingestRaws(transfer="auto")

        # Load camera should return an unversioned camera because there's
        # nothing in the repo.
        camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection)
        self.assertFalse(isVersioned)
        self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

        self._writeCuratedCalibrations()

        # Make a new butler instance to make sure we don't have any stale
        # caches (e.g. of DatasetTypes). Note that we didn't give
        # _writeCuratedCalibrations the butler instance we had, because it's
        # trying to test the CLI interface anyway.
        butler = Butler(self.root, writeable=False)

        for datasetTypeName in self.curatedCalibrationDatasetTypes:
            with self.subTest(dtype=datasetTypeName):
                found = list(
                    butler.registry.queryDatasetAssociations(
                        datasetTypeName,
                        collections=collection,
                    )
                )
                self.assertGreater(len(found), 0, f"Checking {datasetTypeName}")

        # Load camera should return the versioned camera from the repo.
        camera, isVersioned = lsst.obs.base.loadCamera(butler, self.dataIds[0], collections=collection)
        self.assertTrue(isVersioned)
        self.assertIsInstance(camera, lsst.afw.cameraGeom.Camera)

    def testDefineVisits(self):
        """Test that defining visits from the ingested raws yields the
        visits, exposure associations and regions described by ``visits``.
        """
        if self.visits is None:
            self.skipTest("Expected visits were not defined.")
        self._ingestRaws(transfer="link")

        # Calling defineVisits tests the implementation of the butler command
        # line interface "define-visits" subcommand. Functions in the script
        # folder are generally considered protected and should not be used
        # as public api.
        script.defineVisits(
            self.root,
            config_file=None,
            collections=self.outputRun,
            instrument=self.instrumentName,
            raw_name=self.ingestDatasetTypeName,
        )

        # Test that we got the visits we expected.
        butler = Butler(self.root, run=self.outputRun)
        visits = butler.registry.queryDataIds(["visit"]).expanded().toSet()
        self.assertCountEqual(visits, self.visits.keys())
        instr = getInstrument(self.instrumentName, butler.registry)
        camera = instr.getCamera()
        for expectedVisit, expectedExposures in self.visits.items():
            # The found visits come from a set, so their iteration order is
            # unrelated to the order of the expected visits. Look up the
            # matching one explicitly; the assertCountEqual above guarantees
            # that it exists.
            foundVisit = next(visit for visit in visits if visit == expectedVisit)

            # Test that this visit is associated with the expected exposures.
            foundExposures = (
                butler.registry.queryDataIds(["exposure"], dataId=expectedVisit).expanded().toSet()
            )
            self.assertCountEqual(foundExposures, expectedExposures)
            # Test that we have a visit region, and that it contains all of the
            # detector+visit regions.
            self.assertIsNotNone(foundVisit.region)
            detectorVisitDataIds = (
                butler.registry.queryDataIds(["visit", "detector"], dataId=expectedVisit).expanded().toSet()
            )
            self.assertEqual(len(detectorVisitDataIds), len(camera))
            for dataId in detectorVisitDataIds:
                self.assertTrue(foundVisit.region.contains(dataId.region))

457 detectorVisitDataIds = ( 

458 butler.registry.queryDataIds(["visit", "detector"], dataId=expectedVisit).expanded().toSet() 

459 ) 

460 self.assertEqual(len(detectorVisitDataIds), len(camera)) 

461 for dataId in detectorVisitDataIds: 

462 self.assertTrue(foundVisit.region.contains(dataId.region))