Coverage for tests/test_ingest.py: 17%

250 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-16 09:07 +0000

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22import json 

23import os 

24import pickle 

25import shutil 

26import tempfile 

27import unittest 

28 

29import lsst.daf.butler.tests as butlerTests 

30from lsst.daf.butler import Butler, Config, DataCoordinate, Registry 

31from lsst.daf.butler.registry import ConflictingDefinitionError 

32from lsst.obs.base import RawIngestTask 

33from lsst.obs.base.ingest_tests import IngestTestBase 

34from lsst.obs.base.instrument_tests import DummyCam 

35from lsst.utils.introspection import get_full_type_name 

36 

# Directory containing this test module; all test data paths are relative
# to it so the tests work regardless of the current working directory.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
# Root of the raw-ingest test data tree (sidecar and index fixtures).
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")

39 

40 

class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    # Configuration consumed by IngestTestBase.
    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(RawIngestTask)
    curatedCalibrationDatasetTypes = ("testCalib",)
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]
    seed_config = os.path.join(TESTDIR, "data", "curated", "seed.yaml")

    @property
    def visits(self):
        """Mapping of visit data ID to the exposure data IDs expected to be
        associated with it after visit definition.
        """
        butler = Butler(self.root, collections=[self.outputRun])
        return {
            DataCoordinate.standardize(instrument="DummyCam", visit=100, universe=butler.dimensions): [
                DataCoordinate.standardize(instrument="DummyCam", exposure=100, universe=butler.dimensions)
            ]
        }

    def testWriteCuratedCalibrations(self):
        # Inject the "data package" location so DummyCam can find its
        # curated calibration files, then delegate to the base class test.
        DummyCam.dataPackageDir = os.path.join(TESTDIR, "data", "curated")
        return super().testWriteCuratedCalibrations()

    def _check_obscore(self, registry: Registry, has_visits: bool) -> None:
        # Docstring inherited from base class.
        assert registry.obsCoreTableManager is not None
        with registry.obsCoreTableManager.query(lsst_run=self.outputRun) as result:
            # Materialize rows inside the context so they remain usable
            # after the query resource is released.
            rows = list(result)
        self.assertEqual(len(rows), 1)
        row = rows[0]

        # No spatial information until visits are defined.
        if not has_visits:
            self.assertIsNone(row.s_ra)
            self.assertIsNone(row.s_dec)
            self.assertIsNone(row.s_fov)
            self.assertIsNone(row.s_region)
        else:
            self.assertIsNotNone(row.s_ra)
            self.assertIsNotNone(row.s_dec)
            self.assertIsNotNone(row.s_fov)
            self.assertRegex(row.s_region, "POLYGON ICRS .*")

86 

87 

class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files.

    Identical to `RawIngestTestCase` except that the data file lives in a
    directory with an ``_index.json`` file rather than per-file sidecars.
    """

    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")

92 

93 

class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        # Ingest moves/links real files, so an in-memory datastore
        # cannot be used for this repository.
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # Each test method gets its own output collection in the shared repo.
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(config=config, butler=self.butler)

        # Different test files.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        """Ingest a good file serially and then in parallel."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        """Ingest via explicitly-named index files, including duplicate and
        conflicting index combinations.
        """
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # NOTE(review): this queries self.outputRun (not new_run) — looks
        # like it is re-checking the first run is unchanged; confirm intent.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler
        registry.
        """
        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file; only the exception type is
        # checked here, so no need to capture the context manager result.
        with self.assertRaises(RuntimeError):
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""
        # Define the callbacks; each records its argument so the counts can
        # be checked after each run.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance to attach the callbacks.
        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures.clear()
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures.clear()
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures.clear()
        successes.clear()
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from
        trying to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just
        behavior we can live with in order to have a way to avoid keep
        duplicate ingests from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with `skip_existing_exposures=True`.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = RawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), "r") as fd:
            metadata = json.load(fd)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        # Create the temporary directory before entering the try block so
        # the finally clause never sees an unbound tmp_dir if mkdtemp fails.
        tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

422 

423 

class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        """Round-trip the task through pickle and compare the state that
        should survive serialization.
        """
        stream = pickle.dumps(self.task)
        copy = pickle.loads(stream)
        self.assertEqual(self.task.getFullName(), copy.getFullName())
        self.assertEqual(self.task.log.name, copy.log.name)
        self.assertEqual(self.task.config, copy.config)
        self.assertEqual(self.task.butler._config, copy.butler._config)
        self.assertEqual(self.task.butler.collections, copy.butler.collections)
        self.assertEqual(self.task.butler.run, copy.butler.run)
        self.assertEqual(self.task.universe, copy.universe)
        self.assertEqual(self.task.datasetType, copy.datasetType)

455 

456 

if __name__ == "__main__":
    unittest.main()