Coverage for tests/test_ingest.py: 17% — 235 statements (coverage.py v6.5.0, created at 2023-03-11 02:11 -0800)

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22import json 

23import os 

24import pickle 

25import shutil 

26import tempfile 

27import unittest 

28 

29import lsst.daf.butler.tests as butlerTests 

30from lsst.daf.butler import Butler, Config, DataCoordinate 

31from lsst.daf.butler.registry import ConflictingDefinitionError 

32from lsst.obs.base import RawIngestTask 

33from lsst.obs.base.ingest_tests import IngestTestBase 

34from lsst.obs.base.instrument_tests import DummyCam 

35from lsst.utils.introspection import get_full_type_name 

36 

# Directory containing this test module; used to anchor all test-data paths.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
# Root of the ingest test-data tree (sidecar and index fixtures live here).
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")

39 

40 

class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Exercise raw ingest where per-file metadata comes from JSON sidecar
    files.

    Most of the actual test methods are inherited from ``IngestTestBase``;
    this class supplies the configuration attributes that point the shared
    machinery at the DummyCam test instrument and its data files.
    """

    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(RawIngestTask)
    curatedCalibrationDatasetTypes = ("testCalib",)
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]
    seed_config = os.path.join(TESTDIR, "data", "curated", "seed.yaml")

    @property
    def visits(self):
        """Mapping of the single test visit to its single exposure, with
        data IDs standardized against the repository's dimension universe.
        """
        repo_butler = Butler(self.root, collections=[self.outputRun])
        universe = repo_butler.registry.dimensions
        visit_id = DataCoordinate.standardize(instrument="DummyCam", visit=100, universe=universe)
        exposure_id = DataCoordinate.standardize(instrument="DummyCam", exposure=100, universe=universe)
        return {visit_id: [exposure_id]}

    def testWriteCuratedCalibrations(self):
        # Point DummyCam at the test "data package" location before
        # delegating to the base-class implementation.
        DummyCam.dataPackageDir = os.path.join(TESTDIR, "data", "curated")
        return super().testWriteCuratedCalibrations()

70 

71 

class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Exercise raw ingest where metadata is discovered via a JSON index
    file rather than per-file sidecars.
    """

    # Same dataset name as the parent class, but located in the directory
    # whose metadata lives in an ``_index.json`` file.
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")

76 

77 

class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures.

    All tests in this case share a single file-backed butler repository;
    each test method gets its own output collection via ``setUp``.
    """

    @classmethod
    def setUpClass(cls):
        # A file-backed datastore is required because these tests ingest
        # real files from disk.
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # Fresh output collection per test so ingests don't collide.
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(config=config, butler=self.butler)

        # Different test files:
        # - a FITS file whose metadata cannot be extracted,
        # - a known-good dataset with a JSON sidecar,
        # - a file whose metadata names an instrument (HSC) not registered
        #   in this repository.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        """Ingest a good file serially and then in parallel."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        """Ingest via explicitly-listed index files, including duplicate and
        conflicting index content and malformed indexes.
        """
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # NOTE(review): this (and the check after run "c" below) queries
        # self.outputRun rather than new_run — possibly intentional (checking
        # that the original run is untouched) but looks like it may have meant
        # to verify the new run; confirm against the task's dedup behavior.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning.
        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """

        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        # Only the good file should have been ingested.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""

        # Define the callbacks; each records its argument so the test can
        # count how often it fired.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to have a way to avoid keep duplicate ingests
        from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with ``skip_existing_exposures=True``.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = RawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), "r") as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        # Create the temporary directory before entering the try block so
        # that tmp_dir is always bound when the finally clause runs
        # (previously a failure in mkdtemp would have caused a NameError
        # in the cleanup).
        tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

406 

407 

class TestRawIngestTaskPickle(unittest.TestCase):
    """Verify that a RawIngestTask survives a pickle round trip intact."""

    @classmethod
    def setUpClass(cls):
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        # Round-trip the task through pickle and compare the salient
        # attributes of the original and the reconstituted copy.
        unpickled = pickle.loads(pickle.dumps(self.task))
        self.assertEqual(self.task.getFullName(), unpickled.getFullName())
        self.assertEqual(self.task.log.name, unpickled.log.name)
        self.assertEqual(self.task.config, unpickled.config)
        self.assertEqual(self.task.butler._config, unpickled.butler._config)
        self.assertEqual(self.task.butler.collections, unpickled.butler.collections)
        self.assertEqual(self.task.butler.run, unpickled.butler.run)
        self.assertEqual(self.task.universe, unpickled.universe)
        self.assertEqual(self.task.datasetType, unpickled.datasetType)

439 

440 

# Allow the tests to be run directly with ``python test_ingest.py``.
if __name__ == "__main__":
    unittest.main()