Coverage for tests/test_ingest.py: 18% (249 statements) — coverage.py v7.5.0, report created at 2024-05-02 10:58 +0000.

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22import json 

23import os 

24import pickle 

25import shutil 

26import tempfile 

27import unittest 

28 

29import lsst.daf.butler.tests as butlerTests 

30from lsst.daf.butler import Butler, Config, DataCoordinate, Registry 

31from lsst.daf.butler.registry import ConflictingDefinitionError 

32from lsst.obs.base import RawIngestTask 

33from lsst.obs.base.ingest_tests import IngestTestBase 

34from lsst.obs.base.instrument_tests import DummyCam 

35from lsst.utils.introspection import get_full_type_name 

36 

# Directory containing this test module; test data paths are built from it.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
# Root directory of the raw-ingest test data used throughout this module.
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")

39 

40 

class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files.

    All concrete behavior comes from the ``IngestTestBase`` harness; this
    class only supplies the configuration attributes it consumes.
    """

    # Dataset type name the harness expects the ingest task to register.
    ingestDatasetTypeName = "raw_dict"
    # Fully-qualified name of the ingest task class under test.
    rawIngestTask = get_full_type_name(RawIngestTask)
    # Curated calibration dataset types exercised by
    # testWriteCuratedCalibrations.
    curatedCalibrationDatasetTypes = ("testCalib",)
    # Directory under which the harness creates its temporary repositories.
    ingestDir = TESTDIR
    # Instrument implementation used for all ingests in this test case.
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    # File to ingest; metadata comes from a JSON sidecar file next to it.
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    # Data ID expected to be created for the ingested file.
    dataIds = [{"instrument": "DummyCam", "exposure": 100, "detector": 0}]
    # Seed configuration for the curated-calibration ingest.
    seed_config = os.path.join(TESTDIR, "data", "curated", "seed.yaml")

    @property
    def visits(self):
        """Mapping of visit data ID to its exposure data IDs.

        The single test dataset maps visit 100 to exposure 100.
        """
        butler = Butler(self.root, collections=[self.outputRun])
        return {
            DataCoordinate.standardize(instrument="DummyCam", visit=100, universe=butler.dimensions): [
                DataCoordinate.standardize(instrument="DummyCam", exposure=100, universe=butler.dimensions)
            ]
        }

    def testWriteCuratedCalibrations(self):
        """Run the inherited curated-calibration test with the local data
        package location injected into DummyCam.
        """
        # Inject the "data package" location.
        DummyCam.dataPackageDir = os.path.join(TESTDIR, "data", "curated")
        return super().testWriteCuratedCalibrations()

    def _check_obscore(self, registry: Registry, has_visits: bool) -> None:
        # Docstring inherited from base class.
        assert registry.obsCoreTableManager is not None
        with registry.obsCoreTableManager.query(lsst_run=self.outputRun) as result:
            rows = list(result)
            # Exactly one raw was ingested, so exactly one obscore row.
            self.assertEqual(len(rows), 1)
            row = rows[0]

        # No spatial information until visits are defined
        if not has_visits:
            self.assertIsNone(row.s_ra)
            self.assertIsNone(row.s_dec)
            self.assertIsNone(row.s_fov)
            self.assertIsNone(row.s_region)
        else:
            self.assertIsNotNone(row.s_ra)
            self.assertIsNotNone(row.s_dec)
            self.assertIsNotNone(row.s_fov)
            # Region is serialized as an ADQL/STC-S style polygon string.
            self.assertRegex(row.s_region, "POLYGON ICRS .*")

87 

class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files.

    Identical to the sidecar test case except that the dataset lives in a
    directory whose metadata is provided by a ``_index.json`` index file
    rather than per-file sidecars.
    """

    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")

93 

class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures.

    Must create a new butler for each test because dimension records are
    globals.
    """

    def setUp(self):
        """Create a fresh file-backed butler repo, collection, and task."""
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        self.root = tempfile.mkdtemp(dir=TESTDIR)
        self.creatorButler = butlerTests.makeTestRepo(self.root, {}, config=Config.fromYaml(butlerConfig))
        DummyCam().register(self.creatorButler.registry)

        # Per-test collection so tests in this case do not interfere.
        self.butler = butlerTests.makeTestCollection(self.creatorButler, uniqueId=self.id())
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(config=config, butler=self.butler)

        # Different test files.
        # Metadata extraction fails for this one (see testBadFile).
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        # Translates cleanly but reports instrument HSC, which is not
        # registered in this repo (see testBadFile).
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def tearDown(self):
        """Remove the per-test repository directory."""
        if self.root is not None:
            shutil.rmtree(self.root, ignore_errors=True)

    def testSimpleIngest(self):
        """Ingest a single good file, then two files in parallel."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testTimeStampWarning(self):
        """Check the warning issued when an exposure's end time precedes its
        begin time, and that the record is corrected to a zero-length span.
        """
        # NOTE(review): this test was previously disabled with a bare
        # ``return``, which made everything below unreachable while still
        # reporting success.  Skip explicitly so test reports show it is not
        # running; remove the skip once the reason it was disabled is
        # resolved (not recorded in the source).
        self.skipTest("Disabled by an unconditional return; reason not recorded.")
        # Now ingest a dataset which should generate a warning because of
        # the end time being before the begin time.
        files = [os.path.join(INGESTDIR, "sidecar_data", "dataset_end.yaml")]
        with self.assertLogs("lsst.obs.base._instrument", level="WARNING") as cm:
            self.task.run(files, run=self.outputRun)

        self.assertIn("has end time before begin time", cm.output[0])
        records = list(
            self.butler.registry.queryDimensionRecords(
                "exposure",
                where="exposure = exp AND instrument = inst",
                bind={"exp": 3000, "inst": "DummyCam"},
            )
        )
        record = records[0]
        timespan = record.timespan
        # The bad end time is clamped so the span collapses to zero length.
        self.assertEqual(timespan.begin.isot, timespan.end.isot)

    def testExplicitIndex(self):
        """Ingest via explicit index files, including duplicate and
        conflicting index combinations and malformed indexes.
        """
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # NOTE(review): this queries the original run rather than new_run —
        # confirm that was intended and not a copy-paste of the query above.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        # NOTE(review): as above, this queries self.outputRun, not new_run.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """
        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError):
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""
        # Define the callbacks.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance with the callbacks attached.
        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to have a way to keep duplicate ingests
        from being treated as an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with ``skip_existing_exposures=True``.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = RawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json")) as fd:
            metadata = json.load(fd)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        # Create the temporary directory before entering the try block so the
        # finally clause can never see an unbound tmp_dir if mkdtemp fails.
        tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
        try:
            # Copy the original file to be ingested (.yaml) to the temporary
            # directory, and write the new metadata next to it.
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

442 

class TestRawIngestTaskPickle(unittest.TestCase):
    """Check that a RawIngestTask survives a pickle round trip intact."""

    @classmethod
    def setUpClass(cls):
        # One shared repository for every test in this case.
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # Fresh collection and task per test.
        self.butler = butlerTests.makeTestCollection(self.creatorButler, uniqueId=self.id())

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        """Round-trip the task through pickle and compare key attributes."""
        restored = pickle.loads(pickle.dumps(self.task))
        self.assertEqual(self.task.getFullName(), restored.getFullName())
        self.assertEqual(self.task.log.name, restored.log.name)
        self.assertEqual(self.task.config, restored.config)
        self.assertEqual(self.task.butler._config, restored.butler._config)
        self.assertEqual(self.task.butler.collections, restored.butler.collections)
        self.assertEqual(self.task.butler.run, restored.butler.run)
        self.assertEqual(self.task.universe, restored.universe)
        self.assertEqual(self.task.datasetType, restored.datasetType)

475 

# Allow the tests to be run directly as a script as well as via pytest.
if __name__ == "__main__":
    unittest.main()