Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22import json 

23import os 

24import pickle 

25import shutil 

26import tempfile 

27import unittest 

28 

29import lsst.log 

30import lsst.daf.butler.tests as butlerTests 

31from lsst.daf.butler import DatasetType, Butler, DataCoordinate, Config 

32from lsst.daf.butler.registry import ConflictingDefinitionError 

33from lsst.daf.butler.core.utils import getFullTypeName 

34 

35from lsst.obs.base.ingest_tests import IngestTestBase 

36from lsst.obs.base.instrument_tests import DummyCam 

37from lsst.obs.base import RawIngestTask 

38 

39 

# Directory containing this test module, and the ingest test data below it.
TESTDIR = os.path.dirname(os.path.abspath(__file__))
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")

42 

43 

class DummyCamRawIngestTask(RawIngestTask):
    """Ingest task variant for DummyCam.

    For DummyCam a different dataset type is ingested, one whose
    "StructuredDataDict" storage class can return a non-Exposure.
    """

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        dimensions = ("instrument", "detector", "exposure")
        universe = self.butler.registry.dimensions
        return DatasetType("raw_dict", dimensions, "StructuredDataDict",
                           universe=universe)

53 

54 

class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    # Configuration consumed by the IngestTestBase harness.
    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = getFullTypeName(DummyCamRawIngestTask)
    curatedCalibrationDatasetTypes = ()
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]

    @property
    def visits(self):
        """Mapping from the visit data ID to the exposure data IDs in it."""
        butler = Butler(self.root, collections=[self.outputRun])
        universe = butler.registry.dimensions
        visit_data_id = DataCoordinate.standardize(instrument="DummyCam",
                                                   visit=100,
                                                   universe=universe)
        exposure_data_id = DataCoordinate.standardize(instrument="DummyCam",
                                                      exposure=100,
                                                      universe=universe)
        return {visit_data_id: [exposure_data_id]}

    def testWriteCuratedCalibrations(self):
        """There are no curated calibrations in this test instrument"""
        pass

86 

87 

class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""

    # Same tests as the sidecar case, but this dataset's metadata comes
    # from a sibling _index.json file rather than a per-file sidecar.
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")

91 

92 

class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        # A file-backed datastore is required because real on-disk files
        # are ingested below.
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        # Register the test instrument so its dimension records exist.
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # Each test gets a fresh collection, but all tests share the
        # class-level repository (and therefore its exposure records).
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler)

        # Different test files.
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        """Ingest a good file serially, then two files in parallel."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file,
                 os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        """Ingest via explicitly-named _index.json files, including
        duplicated, conflicting, and malformed indexes.
        """
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # The original run should still contain exactly the first two
        # datasets.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json")]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning: the translated
        # index is seen first, so the metadata index entry is ignored.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with lsst.log.UsePythonLogging():
                with self.assertRaises(RuntimeError):
                    self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """

        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        # Only the good file should have been ingested.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""

        # Define the callbacks.  Each simply records what it was given so
        # the counts can be asserted below.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler,
                                          on_metadata_failure=on_metadata_failure,
                                          on_success=on_success,
                                          on_ingest_failure=on_ingest_failure)

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # One good file; the bad-metadata and bad-instrument files both
        # count as metadata failures.
        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when the exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to keep duplicate ingests
        from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with ``skip_existing_exposures=True``.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = DummyCamRawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), 'r') as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            # NOTE(review): if mkdtemp itself raised, tmp_dir would be
            # unbound in the finally clause -- consider assigning it
            # before the try.
            tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam",
                                                                        exposure=500))
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun, skip_existing_exposures=True,
                     update_exposure_records=True)
            (record2,) = set(self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam",
                                                                        exposure=500))
            self.assertEqual(record2.exposure_time, record1.exposure_time*2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

414 

415 

class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        # A safe non-default value, so the round-trip check below is
        # comparing something other than the default config.
        self.config.transfer = "copy"
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        """Round-trip the task through pickle and compare key attributes."""
        unpickled = pickle.loads(pickle.dumps(self.task))
        self.assertEqual(self.task.getFullName(), unpickled.getFullName())
        self.assertEqual(self.task.log.name, unpickled.log.name)
        self.assertEqual(self.task.config, unpickled.config)
        self.assertEqual(self.task.butler._config, unpickled.butler._config)
        self.assertEqual(self.task.butler.collections, unpickled.butler.collections)
        self.assertEqual(self.task.butler.run, unpickled.butler.run)
        self.assertEqual(self.task.universe, unpickled.universe)
        self.assertEqual(self.task.datasetType, unpickled.datasetType)

447 

448 

if __name__ == "__main__":
    unittest.main()