Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of obs_base. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (https://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22import json 

23import os 

24import pickle 

25import shutil 

26import tempfile 

27import unittest 

28 

29import lsst.daf.butler.tests as butlerTests 

30from lsst.daf.butler import DatasetType, Butler, DataCoordinate, Config 

31from lsst.daf.butler.registry import ConflictingDefinitionError 

32from lsst.utils.introspection import get_full_type_name 

33 

34from lsst.obs.base.ingest_tests import IngestTestBase 

35from lsst.obs.base.instrument_tests import DummyCam 

36from lsst.obs.base import RawIngestTask 

37 

38 

# Directory containing this test module; anchors all test-data paths.
TESTDIR = os.path.abspath(os.path.dirname(__file__))
# Root of the raw-ingest fixture tree (sidecar and index test data).
INGESTDIR = os.path.join(TESTDIR, "data", "ingest")

41 

42 

class DummyCamRawIngestTask(RawIngestTask):
    """Ingest task variant for DummyCam.

    For DummyCam we ingest a different dataset type ("raw_dict") whose
    stored form can be something other than an Exposure.
    """

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this task."""
        # Dimensions come from the registry of the butler this task was
        # constructed with.
        universe = self.butler.registry.dimensions
        dimensions = ("instrument", "detector", "exposure")
        return DatasetType("raw_dict", dimensions, "StructuredDataDict",
                           universe=universe)

52 

53 

class RawIngestTestCase(IngestTestBase, unittest.TestCase):
    """Test ingest using JSON sidecar files."""

    # Configuration consumed by IngestTestBase.
    ingestDatasetTypeName = "raw_dict"
    rawIngestTask = get_full_type_name(DummyCamRawIngestTask)
    curatedCalibrationDatasetTypes = ()
    ingestDir = TESTDIR
    instrumentClassName = "lsst.obs.base.instrument_tests.DummyCam"
    file = os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")
    dataIds = [dict(instrument="DummyCam", exposure=100, detector=0)]

    @property
    def visits(self):
        """Mapping of visit data ID to the exposure data IDs it contains."""
        butler = Butler(self.root, collections=[self.outputRun])
        universe = butler.registry.dimensions
        visit_id = DataCoordinate.standardize(
            instrument="DummyCam",
            visit=100,
            universe=universe,
        )
        exposure_id = DataCoordinate.standardize(
            instrument="DummyCam",
            exposure=100,
            universe=universe,
        )
        return {visit_id: [exposure_id]}

    def testWriteCuratedCalibrations(self):
        """There are no curated calibrations in this test instrument"""
        pass

85 

86 

class RawIngestImpliedIndexTestCase(RawIngestTestCase):
    """Test ingest using JSON index files."""

    # Same battery of tests as the sidecar case, but this data file lives
    # in a directory holding a JSON index file instead of a JSON sidecar.
    file = os.path.join(INGESTDIR, "indexed_data", "dataset_1.yaml")

90 

91 

class RawIngestEdgeCaseTestCase(unittest.TestCase):
    """Test ingest using non-standard approaches including failures."""

    @classmethod
    def setUpClass(cls):
        # A real on-disk datastore is required because ingest transfers files.
        butlerConfig = """
datastore:
  # Want to ingest real files so can't use in-memory datastore
  cls: lsst.daf.butler.datastores.fileDatastore.FileDatastore
"""
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {}, config=Config.fromYaml(butlerConfig))
        # Register the DummyCam instrument once; registrations are shared by
        # every test method in this case.
        DummyCam().register(cls.creatorButler.registry)

    @classmethod
    def tearDownClass(cls):
        # Remove the temporary repository created in setUpClass.
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # Each test gets a fresh output collection in the shared repository.
        self.butler = butlerTests.makeTestCollection(self.creatorButler)
        self.outputRun = self.butler.run

        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler)

        # Different test files.
        # - bad_metadata_file: metadata extraction fails.
        # - good_file: ingests cleanly with a JSON sidecar.
        # - bad_instrument_file: valid metadata but unknown instrument (HSC).
        self.bad_metadata_file = os.path.join(TESTDIR, "data", "small.fits")
        self.good_file = os.path.join(INGESTDIR, "sidecar_data", "dataset_2.yaml")
        self.bad_instrument_file = os.path.join(TESTDIR, "data", "calexp.fits")

    def testSimpleIngest(self):
        """Ingest good files serially and then with multiple processes."""
        # Use the default per-instrument run for this.
        self.task.run([self.good_file])
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections="DummyCam/raw/all"))
        self.assertEqual(len(datasets), 1)

        # Now parallelized.
        files = [self.good_file,
                 os.path.join(INGESTDIR, "sidecar_data", "dataset_1.yaml")]
        self.task.run(files, processes=2, run=self.outputRun)
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

    def testExplicitIndex(self):
        """Ingest via explicitly named index files, including duplicate and
        conflicting index combinations.
        """
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        self.task.run(files, run=self.outputRun)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Try again with an explicit index and a file that is in that index.
        files.append(os.path.join(INGESTDIR, "indexed_data", "dataset_2.yaml"))
        new_run = self.outputRun + "b"
        self.task.run(files, run=new_run)

        # NOTE(review): this queries self.outputRun, not new_run, so it
        # re-checks the first ingest rather than the one just performed —
        # confirm whether new_run was intended here.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Now with two index files that point to the same files.
        # Look for the warning from duplication.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "translated_subdir", "_index.json")]
        new_run = self.outputRun + "c"

        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring content", cm.output[0])

        # NOTE(review): queries self.outputRun rather than new_run — verify.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 2)

        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "_index.json")]
        new_run = self.outputRun + "d"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file but overriding", cm.output[0])

        # Reversing the order should change the warning.
        # Again with an index file of metadata and one of translated.
        # Translated should win.
        # Put the metadata one first to test that order is preserved.
        files = [os.path.join(INGESTDIR, "indexed_data", "_index.json"),
                 os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]

        new_run = self.outputRun + "e"
        with self.assertLogs(level="WARNING") as cm:
            self.task.run(files, run=new_run)
        self.assertIn("already specified in an index file, ignoring", cm.output[0])

        # Bad index file.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # Bad index file due to bad instrument.
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_instrument", "_index.json")]
        with self.assertLogs(level="WARNING") as cm:
            with self.assertRaises(RuntimeError):
                self.task.run(files, run=self.outputRun)
        self.assertIn("Instrument HSC for file", cm.output[0])

    def testBadExposure(self):
        """Test that bad exposures trigger the correct failure modes.

        This is the only test that uses the bad definition of dataset 4
        because exposure definitions are defined globally in a butler registry.
        """

        # Ingest 3 files. 2 of them will implicitly find an index and one
        # will use a sidecar.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3)]
        new_run = self.outputRun
        self.task.run(files, run=new_run)

        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=new_run))
        self.assertEqual(len(datasets), 3)

        # Test fail fast.
        self.task.config.failFast = True

        # Ingest files with conflicting exposure definitions.
        # Ingest 3 files. One of them will implicitly find an index and one
        # will use a sidecar. The 3rd will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 3, 4)]
        new_run = self.outputRun + "_bad_exposure"
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run(files, run=new_run)

    def testBadFile(self):
        """Try to ingest a bad file."""
        files = [self.bad_metadata_file]

        with self.assertRaises(RuntimeError) as cm:
            # Default is to raise an error at the end.
            self.task.run(files, run=self.outputRun)
        self.assertIn("Some failures", str(cm.exception))

        # Including a good file will result in ingest working but still
        # raises (we might want to move this to solely happen in the
        # command line invocation).
        files.append(self.good_file)

        # Also include a file with unknown instrument.
        files.append(self.bad_instrument_file)

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        # Only the good file should have been ingested.
        datasets = list(self.butler.registry.queryDatasets("raw_dict", collections=self.outputRun))
        self.assertEqual(len(datasets), 1)

        # Fail fast will trigger a run time error with different text.
        # Use a different output run to be sure we are not failing because
        # of the attempt to ingest twice.
        self.task.config.failFast = True
        new_run = self.outputRun + "b"
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_metadata_file, self.good_file], run=new_run)
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Attempt to ingest good file again -- this will fail for a different
        # reason than failed metadata extraction.
        with self.assertRaises(ConflictingDefinitionError):
            self.task.run([self.good_file], run=self.outputRun)

        # Ingest a file with good metadata but unknown instrument.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([self.bad_instrument_file], run=self.outputRun)
        self.assertIn("Instrument HSC", str(cm.exception))

        # Ingest of a metadata index file that will fail translation.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")])
        self.assertIn("Problem extracting metadata", str(cm.exception))

        # Ingest of a bad index file.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")])
        self.assertIn("Problem reading index file", str(cm.exception))

        # Ingest of an implied bad index file.
        # Only the raise is checked here; no message assertion.
        with self.assertRaises(RuntimeError) as cm:
            self.task.run([os.path.join(INGESTDIR, "indexed_data", "bad_implied", "dataset_2.yaml")])

    def testCallbacks(self):
        """Test the callbacks for failures."""

        # Define the callbacks.
        # Each records its argument so counts can be asserted later.
        metadata_failures = []
        successes = []
        ingest_failures = []

        def on_metadata_failure(filename, exc):
            metadata_failures.append(filename)

        def on_success(datasets):
            successes.append(datasets)

        def on_ingest_failure(exposure, exc):
            ingest_failures.append(exposure)

        # Need our own task instance
        config = RawIngestTask.ConfigClass()
        self.task = DummyCamRawIngestTask(config=config, butler=self.butler,
                                          on_metadata_failure=on_metadata_failure,
                                          on_success=on_success,
                                          on_ingest_failure=on_ingest_failure)

        files = [self.good_file, self.bad_metadata_file, self.bad_instrument_file]

        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)

        # One good file, two metadata failures, nothing reached ingest stage.
        self.assertEqual(len(successes), 1)
        self.assertEqual(len(metadata_failures), 2)
        self.assertEqual(len(ingest_failures), 0)

        # Try the good one a second time.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)

        self.assertEqual(len(successes), 1)
        self.assertEqual(len(ingest_failures), 1)

        # An index file with metadata that won't translate.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "metadata_subdir", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 2)

        # Bad index file.
        metadata_failures[:] = []
        files = [os.path.join(INGESTDIR, "indexed_data", "bad_index", "_index.json")]
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=self.outputRun)
        self.assertEqual(len(metadata_failures), 1)

        # Ingest two files that have conflicting exposure metadata.
        ingest_failures[:] = []
        successes[:] = []
        # Ingest 4 files. 2 of them will implicitly find an index and one
        # will use a sidecar. The 4th will fail due to exposure conflict.
        files = [os.path.join(INGESTDIR, "indexed_data", f"dataset_{n}.yaml") for n in (1, 2, 3, 4)]
        new_run = self.outputRun + "_fail"
        with self.assertRaises(RuntimeError):
            self.task.run(files, run=new_run)
        self.assertEqual(len(ingest_failures), 1)
        self.assertEqual(len(successes), 3)

    def testSkipExistingExposures(self):
        """Test that skip_existing_exposures=True avoids exceptions from trying
        to ingest the same file twice.

        Notes
        -----
        This option also prevents not-ingested-yet raws from being ingested
        when exposure already exists, but that's (A) hard to test given the
        test data we have now and (B) not really ideal behavior, just behavior
        we can live with in order to have a way to avoid keep duplicate ingests
        from being an error.
        """
        # Ingest the first time.
        self.task.run([self.good_file], run=self.outputRun)
        # Attempt to ingest a second time with skip_existing_exposures=False
        # (default). This should fail.
        with self.assertRaises(RuntimeError):
            self.task.run([self.good_file], run=self.outputRun)
        # Try again with ``skip_existing_exposures=True``.
        self.task.run([self.good_file], run=self.outputRun, skip_existing_exposures=True)

    def testUpdateExposureRecords(self):
        """Test that update_exposure_records=True allows metadata to be
        modified.
        """
        config = RawIngestTask.ConfigClass(failFast=True)
        task = DummyCamRawIngestTask(config=config, butler=self.butler)
        with open(os.path.join(INGESTDIR, "sidecar_data", "dataset_1.json"), 'r') as file:
            metadata = json.load(file)
        # Modify unique identifiers to avoid clashes with ingests from
        # other test methods in this test case, because those share a
        # data repository.
        metadata["observation_id"] = "DummyDataset_testUpdateExposureRecords"
        metadata["observation_counter"] = 10
        metadata["exposure_id"] = 500
        metadata["exposure_group"] = "50"
        metadata["visit_id"] = 500
        base_filename = "dataset"
        # NOTE(review): tmp_dir is assigned inside the try; if mkdtemp were to
        # raise, the finally clause would hit a NameError — consider moving
        # mkdtemp above the try.
        try:
            # Copy the original file to be ingested (.yaml) to a temporary
            # directory, and write the new metadata next to it.
            tmp_dir = tempfile.mkdtemp(dir=TESTDIR)
            raw_filename = os.path.join(tmp_dir, f"{base_filename}.yaml")
            sidecar_filename = os.path.join(tmp_dir, f"{base_filename}.json")
            shutil.copy(self.good_file, raw_filename)
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun)
            (record1,) = set(self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam",
                                                                        exposure=500))
            self.assertEqual(record1.exposure_time, metadata["exposure_time"])
            # Modify some metadata and repeat the process to update the
            # exposure.
            metadata["exposure_time"] *= 2.0
            with open(sidecar_filename, "w") as sidecar_file:
                json.dump(metadata, sidecar_file)
            task.run([raw_filename], run=self.outputRun, skip_existing_exposures=True,
                     update_exposure_records=True)
            (record2,) = set(self.butler.registry.queryDimensionRecords("exposure", instrument="DummyCam",
                                                                        exposure=500))
            self.assertEqual(record2.exposure_time, record1.exposure_time*2)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

409 

410 

class TestRawIngestTaskPickle(unittest.TestCase):
    """Test that pickling of the RawIngestTask works properly."""

    @classmethod
    def setUpClass(cls):
        # One shared repository for the whole test case.
        cls.root = tempfile.mkdtemp(dir=TESTDIR)
        cls.creatorButler = butlerTests.makeTestRepo(cls.root, {})

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        self.butler = butlerTests.makeTestCollection(self.creatorButler)

        self.config = RawIngestTask.ConfigClass()
        self.config.transfer = "copy"  # safe non-default value
        self.task = RawIngestTask(config=self.config, butler=self.butler)

    def testPickleTask(self):
        """Round-trip the task through pickle and compare key state."""
        clone = pickle.loads(pickle.dumps(self.task))
        self.assertEqual(self.task.getFullName(), clone.getFullName())
        self.assertEqual(self.task.log.name, clone.log.name)
        self.assertEqual(self.task.config, clone.config)
        # Butler state must survive the round trip too.
        self.assertEqual(self.task.butler._config, clone.butler._config)
        self.assertEqual(self.task.butler.collections, clone.butler.collections)
        self.assertEqual(self.task.butler.run, clone.butler.run)
        self.assertEqual(self.task.universe, clone.universe)
        self.assertEqual(self.task.datasetType, clone.datasetType)

442 

443 

# Allow the test module to be run directly with ``python test_ingest.py``.
if __name__ == "__main__":
    unittest.main()