Coverage for tests/test_ingest.py: 20%

236 statements  


# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import json
import os
import pickle
import shutil
import tempfile
import unittest

import lsst.daf.butler.tests as butlerTests
from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetType
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.obs.base import RawIngestTask
from lsst.obs.base.ingest_tests import IngestTestBase
from lsst.obs.base.instrument_tests import DummyCam
from lsst.utils.introspection import get_full_type_name

TESTDIR = os.path.abspath(os.path.dirname(__file__))
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")

class DummyCamRawIngestTask(RawIngestTask):
    """For DummyCam we ingest a different dataset type that can return
    a non-Exposure."""

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        return DatasetType(
            "raw_dict",
            ("instrument", "detector", "exposure"),
            "StructuredDataDict",
            universe=self.butler.registry.dimensions,
        )

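# The "raw_dict" dataset type uses the StructuredDataDict storage class, so the
# plain YAML files under data/ingest can stand in for raw images and no real
# Exposure data are needed by these tests.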
class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(DummyCamRawIngestTask)
    curatedCalibrationDatasetTypes = ()
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]

    @property
    def visits(self):
        """Return the expected mapping from visit data ID to exposure data
        IDs for the ingested file."""
        butler = Butler(self.root, collections=[self.outputRun])
        return {
            DataCoordinate.standardize(
                instrument="DummyCam", visit=100, universe=butler.registry.dimensions
            ): [
                DataCoordinate.standardize(
                    instrument="DummyCam", exposure=100, universe=butler.registry.dimensions
                )
            ]
        }

    def testWriteCuratedCalibrations(self):
        """There are no curated calibrations in this test instrument."""
        pass

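# The subclass below reruns all of the inherited ingest tests, changing only the
# input file so that observation metadata is found through the _index.json file
# in the indexed_data directory rather than a per-file JSON sidecar.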
class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""

    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")

class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler)

        # Different test files.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

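    # The indexed_data directory holds an _index.json that references its data
    # files; ingesting the index ingests everything it lists. Subdirectories
    # provide index variants (raw metadata, translated metadata, deliberately
    # broken) used below to exercise warnings and failure handling.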
    def testExplicitIndex(self):
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning: with the translated
        # index first, the metadata index entries are ignored rather than
        # overridden.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """

        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

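    # For the failure tests below: small.fits has headers from which no usable
    # observation metadata can be extracted, while calexp.fits translates but
    # reports an instrument (HSC) that is not registered in this test repository.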
    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file means that file is ingested, but the task still
        # raises (we might want to move this behavior to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a runtime error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""

        # Define the callbacks.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance.
        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from
        trying to ingest the same file twice.

        Notes
        -----
        This option also prevents not-yet-ingested raws from being ingested
        when the exposure already exists, but that's (A) hard to test given
        the test data we have now and (B) not really ideal behavior, just
        behavior we can live with in order to keep duplicate ingests from
        being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with skip_existing_exposures=True.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

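    # The next test pairs skip_existing_exposures=True with
    # update_exposure_records=True so that re-ingesting the same file leaves the
    # raw dataset alone while the exposure dimension record is rewritten from the
    # new sidecar metadata.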
    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = DummyCamRawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), "r") as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

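# Pickling matters because RawIngestTask.run() can fan ingest out over multiple
# processes (processes=2 in testSimpleIngest above), which requires the task to
# survive a pickle round trip with its configuration and butler intact.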
class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        stream = pickle.dumps(self.task)
        copy = pickle.loads(stream)
        self.assertEqual(self.task.getFullName(), copy.getFullName())
        self.assertEqual(self.task.log.name, copy.log.name)
        self.assertEqual(self.task.config, copy.config)
        self.assertEqual(self.task.butler._config, copy.butler._config)
        self.assertEqual(self.task.butler.collections, copy.butler.collections)
        self.assertEqual(self.task.butler.run, copy.butler.run)
        self.assertEqual(self.task.universe, copy.universe)
        self.assertEqual(self.task.datasetType, copy.datasetType)

if __name__ == "__main__":
    unittest.main()