# Coverage report header (coverage.py v7.2.7, created at 2023-06-06 02:59 -0700):
# tests/test_ingest.py — 17% of 250 statements covered.

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22import json 

23import os 

24import pickle 

25import shutil 

26import tempfile 

27import unittest 

28 

29import lsst.daf.butler.tests as butlerTests 

30from lsst.daf.butler import Butler, Config, DataCoordinate, Registry 

31from lsst.daf.butler.registry import ConflictingDefinitionError 

32from lsst.obs.base import RawIngestTask 

33from lsst.obs.base.ingest_tests import IngestTestBase 

34from lsst.obs.base.instrument_tests import DummyCam 

35from lsst.utils.introspection import get_full_type_name 

36 

# Absolute path to the directory containing this test module.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
# Root of the ingest test-data tree used throughout these tests.
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")

39 

40 

class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Exercise raw ingest driven by per-file JSON sidecar metadata."""

    # Configuration consumed by the IngestTestBase machinery.
    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(RawIngestTask)
    curatedCalibrationDatasetTypes = ("testCalib",)
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [{"instrument": "DummyCam", "exposure": 100, "detector": 0}]
    seed_config = os.path.join(TESTDIR, "data", "curated", "seed.yaml")

    @property
    def visits(self):
        """Expected visit data ID mapped to its constituent exposures."""
        butler = Butler(self.root, collections=[self.outputRun])
        universe = butler.registry.dimensions
        visit_key = DataCoordinate.standardize(instrument="DummyCam", visit=100, universe=universe)
        exposure = DataCoordinate.standardize(instrument="DummyCam", exposure=100, universe=universe)
        return {visit_key: [exposure]}

    def testWriteCuratedCalibrations(self):
        # Point DummyCam at the test "data package" location before
        # delegating to the base-class implementation.
        DummyCam.dataPackageDir = os.path.join(TESTDIR, "data", "curated")
        return super().testWriteCuratedCalibrations()

    def _check_obscore(self, registry: Registry, has_visits: bool) -> None:
        # Docstring inherited from base class.
        assert registry.obsCoreTableManager is not None
        with registry.obsCoreTableManager.query(lsst_run=self.outputRun) as result:
            rows = list(result)
            self.assertEqual(len(rows), 1)
            row = rows[0]

        if has_visits:
            # Spatial columns are filled in once visits exist.
            self.assertIsNotNone(row.s_ra)
            self.assertIsNotNone(row.s_dec)
            self.assertIsNotNone(row.s_fov)
            self.assertRegex(row.s_region, "POLYGON ICRS .*")
        else:
            # No spatial information until visits are defined.
            self.assertIsNone(row.s_ra)
            self.assertIsNone(row.s_dec)
            self.assertIsNone(row.s_fov)
            self.assertIsNone(row.s_region)

90 

91 

class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files.

    Runs the full `RawIngestTestCase` suite, but with a test file whose
    metadata comes from a directory-level ``_index.json`` discovered
    implicitly rather than from a per-file sidecar.
    """

    # Only the ingested file changes; all other configuration is inherited.
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")

96 

97 

class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        """Create a shared file-backed test repository with DummyCam registered."""
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        """Remove the shared repository directory."""
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        """Give each test its own collection, task instance, and test files."""
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(config=config, butler=self.butler)

        # Different test files.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        """Ingest well-formed files serially and in parallel."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file, os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        """Ingest via explicitly-listed ``_index.json`` files, including
        duplicate, conflicting, and malformed indexes.
        """
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json"),
        ]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
        ]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning.
        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [
            os.path.join(INGESTDIR, "indexed_data", "_index.json"),
            os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
        ]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """

        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""

        # Define the callbacks.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = RawIngestTask(
            config=config,
            butler=self.butler,
            on_metadata_failure=on_metadata_failure,
            on_success=on_success,
            on_ingest_failure=on_ingest_failure,
        )

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to have a way to avoid keep duplicate ingests
        from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with ``skip_existing_exposures=True``.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = RawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), "r") as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        # Create the temporary directory *before* entering the try block so
        # that the finally clause can never hit a NameError on tmp_dir if
        # mkdtemp itself raises.
        tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run(
                [raw_filename], run=self.outputRun, skip_existing_exposures=True, update_exposure_records=True
            )
            (record2,) = set(
                self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam", exposure=500)
            )
            self.assertEqual(record2.exposure_time, record1.exposure_time * 2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

426 

427 

class TestRawIngestTaskPickle(unittest.TestCase):
    """Verify that a RawIngestTask survives a pickle round trip intact."""

    @classmethod
    def setUpClass(cls):
        # One shared repository for the whole test case.
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        """Round-trip the task through pickle and compare its state."""
        restored = pickle.loads(pickle.dumps(self.task))
        self.assertEqual(self.task.getFullName(), restored.getFullName())
        self.assertEqual(self.task.log.name, restored.log.name)
        # Compare simple attributes in a loop; each must survive unchanged.
        for attribute in ("config", "universe", "datasetType"):
            self.assertEqual(getattr(self.task, attribute), getattr(restored, attribute))
        self.assertEqual(self.task.butler._config, restored.butler._config)
        self.assertEqual(self.task.butler.collections, restored.butler.collections)
        self.assertEqual(self.task.butler.run, restored.butler.run)

459 

460 

# Coverage-report annotation text was fused into this line in the extracted
# source; restore the plain script entry-point guard.
if __name__ == "__main__":
    unittest.main()