Coverage for tests/test_ingest.py: 18%

238 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-12-09 10:06 +0000

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22import json 

23import os 

24import pickle 

25import shutil 

26import tempfile 

27import unittest 

28 

29import lsst.daf.butler.tests as butlerTests 

30from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetType 

31from lsst.daf.butler.registry import ConflictingDefinitionError 

32from lsst.obs.base import RawIngestTask 

33from lsst.obs.base.ingest_tests import IngestTestBase 

34from lsst.obs.base.instrument_tests import DummyCam 

35from lsst.utils.introspection import get_full_type_name 

36 

37TESTDIR = os.path.abspath(os.path.dirname(__file__)) 

38INGESTDIR = os.path.join(TESTDIR, "data", "ingest") 

39 

40 

class DummyCamRawIngestTask(RawIngestTask):
    """Ingest task variant for DummyCam.

    DummyCam ingests a different dataset type so that the ingested
    dataset can be something other than an ``Exposure``.
    """

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        # "raw_dict" uses a plain dict storage class rather than Exposure.
        exposure_dimensions = ("instrument", "detector", "exposure")
        return DatasetType(
            "raw_dict",
            exposure_dimensions,
            "StructuredDataDict",
            universe=self.butler.registry.dimensions,
        )

53 

54 

class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(DummyCamRawIngestTask)
    curatedCalibrationDatasetTypes = ("testCalib",)
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]
    seed_config = os.path.join(TESTDIR, "data", "curated", "seed.yaml")

    @property
    def visits(self):
        """Mapping from visit data ID to the exposure data IDs it contains."""
        butler = Butler(self.root, collections=[self.outputRun])
        universe = butler.registry.dimensions
        visit_data_id = DataCoordinate.standardize(
            instrument="DummyCam", visit=100, universe=universe
        )
        exposure_data_id = DataCoordinate.standardize(
            instrument="DummyCam", exposure=100, universe=universe
        )
        return {visit_data_id: [exposure_data_id]}

    def testWriteCuratedCalibrations(self):
        # Point DummyCam at the test "data package" location before
        # delegating to the base class implementation.
        DummyCam.dataPackageDir = os.path.join(TESTDIR, "data", "curated")
        return super().testWriteCuratedCalibrations()

84 

85 

class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""

    # Same tests as the base class, but the dataset lives in a directory
    # where its metadata comes from a shared JSON index file rather than
    # a per-file sidecar.
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")

90 

91 

class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        # Configuration overriding the default datastore: ingesting real
        # files requires a file-backed datastore.
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        # Register the DummyCam instrument so its data IDs resolve.
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # Fresh collection and task per test; tests share the class-level
        # repository created in setUpClass.
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler)

        # Different test files.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        """Ingest sidecar-described files serially and in parallel."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        """Ingest with index files named explicitly, including duplicate
        and malformed indexes.
        """
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # NOTE(review): this queries self.outputRun rather than new_run —
        # presumably checking the original run is unchanged; confirm intent.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """

        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        # Only the good file should have been ingested.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""

        # Define the callbacks.  Each records what it was called with so
        # the assertions below can count invocations.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to have a way to avoid keep duplicate ingests
        from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with `skip_existing_exposures=True.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = DummyCamRawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), "r") as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

420 

421 

class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        """Round-trip the task through pickle and compare key state."""
        restored = pickle.loads(pickle.dumps(self.task))
        self.assertEqual(self.task.getFullName(), restored.getFullName())
        self.assertEqual(self.task.log.name, restored.log.name)
        self.assertEqual(self.task.config, restored.config)
        self.assertEqual(self.task.butler._config, restored.butler._config)
        self.assertEqual(self.task.butler.collections, restored.butler.collections)
        self.assertEqual(self.task.butler.run, restored.butler.run)
        self.assertEqual(self.task.universe, restored.universe)
        self.assertEqual(self.task.datasetType, restored.datasetType)

453 

454 

# Allow the tests to be run directly with ``python test_ingest.py``.
if __name__ == "__main__":
    unittest.main()